Spaces:

techfreakworm
/

ACE-Music-Studio

Running on Zero

App Files Files Community

techfreakworm committed 1 day ago

Commit

c287b6a

unverified ·

1 Parent(s): 7dd8eb5

feat(ui): add advanced controls accordion — inference steps, cfg, infer method, seed, lm cot, schedule, metadata

Browse files

User feedback: outputs feel samey regardless of prompt. Root cause:
GenerationParams.inference_steps defaults to 8 (ACE-Step turbo), too
few for the XL SFT model to actually express variation. Also,
guidance_scale, infer_method, shift, ADG, and the CoT flags were all
left at dataclass defaults.

New ``Advanced ▼`` accordion under every song mode (Generate / Cover
/ Extend / Edit; lyrics mode is its own Qwen path and doesn't need
it). Four groups:

- Diffusion: inference_steps (8-80, default 27), guidance_scale
(1-15, default 7.0), infer_method (ode|sde), seed (number,
-1=random).
- CFG schedule + shift + ADG.
- 5Hz LM (CoT): thinking + use_cot_caption + use_cot_metas +
use_cot_language now defaulted ON; LM temperature / top_p / top_k /
cfg / negative_prompt sliders.
- Music metadata: bpm / keyscale / timesignature / vocal_language
manual overrides.

ace_pipeline.generate passes all of these through to GenerationParams.
The output metadata JSON now echoes the advanced + lm dicts so the user
can see which knobs were active for a given output and lock-iterate
from there (the seed shown is the ACTUAL seed used, not -1).

Files changed (6) hide show

ace_pipeline.py +27 -8
app.py +331 -12
backend.py +4 -0
tests/test_ace_pipeline_lazy.py +11 -1
theme.py +79 -0
ui.py +173 -0

ace_pipeline.py CHANGED Viewed

@@ -226,6 +226,21 @@ class ACEStepStudio:
         )
         duration_s = int(params.get("duration_s") or params.get("extra_duration_s") or 30)
         gen_params = GenerationParams(
             task_type=task_type,
             caption=caption,
@@ -233,8 +248,10 @@ class ACEStepStudio:
             instrumental=instrumental,
             duration=duration_s,
             seed=int(params.get("seed", -1)),
-            inference_steps=int(advanced.get("steps", 32)),
-            guidance_scale=float(advanced.get("cfg", 4.0)),
             shift=float(advanced.get("shift", 1.0)),
             bpm=advanced.get("bpm"),
             keyscale=advanced.get("keyscale", ""),
@@ -248,16 +265,18 @@ class ACEStepStudio:
             audio_cover_strength=audio_cover_strength,
             repainting_start=repainting_start,
             repainting_end=repainting_end,
-            # 5Hz language model knobs
-            thinking=bool(lm_opts.get("thinking", False)),
             lm_temperature=float(lm_opts.get("temperature", 0.85)),
             lm_cfg_scale=float(lm_opts.get("cfg", 2.0)),
             lm_top_k=int(lm_opts.get("top_k", 0)),
             lm_top_p=float(lm_opts.get("top_p", 0.9)),
-            lm_negative_prompt=lm_opts.get("negative_prompt", ""),
-            use_cot_metas=bool(lm_opts.get("cot_metas", False)),
-            use_cot_caption=bool(lm_opts.get("cot_caption", False)),
-            use_cot_language=bool(lm_opts.get("cot_language", False)),
         )
         gen_config = GenerationConfig(

         )
         duration_s = int(params.get("duration_s") or params.get("extra_duration_s") or 30)
+        # ``advanced``/``lm`` dicts are sent by app.py's
+        # ``_build_advanced_params``. Key changes from the prior contract:
+        # - ``inference_steps`` (was ``steps``, fallback 32; the dataclass
+        #   default of 8 is turbo-ish for the XL SFT model; new default 27).
+        # - ``guidance_scale`` (was ``cfg``, fallback 4.0; new default 7.0
+        #   for stronger prompt adherence).
+        # - ``infer_method`` (new — ``"ode"`` deterministic / ``"sde"``
+        #   stochastic; the user can now flip to ``sde`` to actually get
+        #   different output each click even with the same seed).
+        # - ``use_adg`` (new — Adaptive Dual Guidance; experimental).
+        # - ``thinking`` (5Hz LM CoT — default flips to True so the LM can
+        #   reason about caption + metadata, which is the actual source of
+        #   the "no matter what prompt the style barely changes" symptom).
+        # - ``use_cot_metas`` / ``use_cot_caption`` / ``use_cot_language``
+        #   keys renamed from ``cot_*`` for consistency with the dataclass.
         gen_params = GenerationParams(
             task_type=task_type,
             caption=caption,
             instrumental=instrumental,
             duration=duration_s,
             seed=int(params.get("seed", -1)),
+            inference_steps=int(advanced.get("inference_steps", 27)),
+            guidance_scale=float(advanced.get("guidance_scale", 7.0)),
+            infer_method=str(advanced.get("infer_method", "ode")),
+            use_adg=bool(advanced.get("use_adg", False)),
             shift=float(advanced.get("shift", 1.0)),
             bpm=advanced.get("bpm"),
             keyscale=advanced.get("keyscale", ""),
             audio_cover_strength=audio_cover_strength,
             repainting_start=repainting_start,
             repainting_end=repainting_end,
+            # 5Hz language model knobs — defaults flipped to True so the
+            # LM actually reasons about each prompt instead of returning
+            # blank captions / metadata back to the DiT.
+            thinking=bool(lm_opts.get("thinking", True)),
             lm_temperature=float(lm_opts.get("temperature", 0.85)),
             lm_cfg_scale=float(lm_opts.get("cfg", 2.0)),
             lm_top_k=int(lm_opts.get("top_k", 0)),
             lm_top_p=float(lm_opts.get("top_p", 0.9)),
+            lm_negative_prompt=lm_opts.get("negative_prompt", "NO USER INPUT"),
+            use_cot_metas=bool(lm_opts.get("use_cot_metas", True)),
+            use_cot_caption=bool(lm_opts.get("use_cot_caption", True)),
+            use_cot_language=bool(lm_opts.get("use_cot_language", True)),
         )
         gen_config = GenerationConfig(

app.py CHANGED Viewed

@@ -436,6 +436,65 @@ def on_lora_strength_change(state, strength: float):
     return new_state, _active_md(new_state["name"], float(strength), kind)
 @_maybe_spaces_gpu("generate")
 def on_generate_click(
     prompt: str,
@@ -443,9 +502,53 @@ def on_generate_click(
     duration_s: float,
     instrumental_label: str,
     lora_state,
     progress=gr.Progress(track_tqdm=True),  # noqa: B008
 ):
     loras = [lora_state] if lora_state else []
     out_path, meta = _safe_call(
         modes.generate,
         get_backend(),
@@ -454,10 +557,10 @@ def on_generate_click(
             "lyrics": lyrics,
             "duration_s": int(duration_s),
             "instrumental": instrumental_label == "Instrumental",
-            "seed": random.randint(1, 2_147_483_647),
             "loras": loras,
-            "advanced": {},
-            "lm": {},
             "dcw": {},
         },
     )
@@ -473,10 +576,54 @@ def on_cover_click(
     duration_s: float,
     audio_cover_strength: float,
     lora_state,
     progress=gr.Progress(track_tqdm=True),  # noqa: B008
 ):
     """Cover-mode click. ref_audio is a filepath from gr.Audio(type='filepath')."""
     loras = [lora_state] if lora_state else []
     out_path, meta = _safe_call(
         modes.cover,
         get_backend(),
@@ -486,10 +633,10 @@ def on_cover_click(
             "lyrics": lyrics,
             "duration_s": int(duration_s),
             "audio_cover_strength": float(audio_cover_strength),
-            "seed": random.randint(1, 2_147_483_647),
             "loras": loras,
-            "advanced": {},
-            "lm": {},
             "dcw": {},
         },
     )
@@ -509,10 +656,54 @@ def on_extend_click(
     latent_crossfade_frames: float,
     chunk_mask_mode: str,
     lora_state,
     progress=gr.Progress(track_tqdm=True),  # noqa: B008
 ):
     """Extend-mode click. seed_audio is a filepath from gr.Audio(type='filepath')."""
     loras = [lora_state] if lora_state else []
     out_path, meta = _safe_call(
         modes.extend,
         get_backend(),
@@ -526,10 +717,10 @@ def on_extend_click(
             "repaint_strength": float(repaint_strength),
             "latent_crossfade_frames": int(latent_crossfade_frames),
             "chunk_mask_mode": chunk_mask_mode,
-            "seed": random.randint(1, 2_147_483_647),
             "loras": loras,
-            "advanced": {},
-            "lm": {},
             "dcw": {},
         },
     )
@@ -632,10 +823,54 @@ def on_edit_click(
     flow_n_max: float,
     flow_n_avg: float,
     lora_state,
     progress=gr.Progress(track_tqdm=True),  # noqa: B008
 ):
     """Edit-mode click. source_audio is a filepath from gr.Audio(type='filepath')."""
     loras = [lora_state] if lora_state else []
     out_path, meta = _safe_call(
         modes.edit,
         get_backend(),
@@ -652,10 +887,10 @@ def on_edit_click(
             "flow_n_min": float(flow_n_min),
             "flow_n_max": float(flow_n_max),
             "flow_n_avg": int(flow_n_avg),
-            "seed": random.randint(1, 2_147_483_647),
             "loras": loras,
-            "advanced": {},
-            "lm": {},
             "dcw": {},
         },
     )
@@ -799,6 +1034,27 @@ def build_app() -> gr.Blocks:
                             g["duration_s"],
                             g["instrumental"],
                             g["lora_state"],
                         ],
                         outputs=[g["output_audio"], g["output_meta"], history_html],
                     )
@@ -844,6 +1100,27 @@ def build_app() -> gr.Blocks:
                             c["duration_s"],
                             c["audio_cover_strength"],
                             c["lora_state"],
                         ],
                         outputs=[c["output_audio"], c["output_meta"], history_html],
                     )
@@ -893,6 +1170,27 @@ def build_app() -> gr.Blocks:
                             x["latent_crossfade_frames"],
                             x["chunk_mask_mode"],
                             x["lora_state"],
                         ],
                         outputs=[x["output_audio"], x["output_meta"], history_html],
                     )
@@ -945,6 +1243,27 @@ def build_app() -> gr.Blocks:
                             e["flow_n_max"],
                             e["flow_n_avg"],
                             e["lora_state"],
                         ],
                         outputs=[e["output_audio"], e["output_meta"], history_html],
                     )

     return new_state, _active_md(new_state["name"], float(strength), kind)
+def _build_advanced_params(
+    adv_inference_steps,
+    adv_guidance_scale,
+    adv_infer_method,
+    adv_seed,
+    adv_cfg_interval_start,
+    adv_cfg_interval_end,
+    adv_shift,
+    adv_use_adg,
+    adv_thinking,
+    adv_use_cot_caption,
+    adv_use_cot_metas,
+    adv_use_cot_language,
+    adv_lm_temperature,
+    adv_lm_top_p,
+    adv_lm_top_k,
+    adv_lm_cfg_scale,
+    adv_lm_negative_prompt,
+    adv_bpm,
+    adv_keyscale,
+    adv_timesignature,
+    adv_vocal_language,
+):
+    """Pack the 21 Advanced-accordion inputs into the ``advanced`` + ``lm``
+    dicts that ``ace_pipeline.ACEStepStudio.generate`` consumes.
+    Centralising this avoids repeating the same dict-construction in each
+    of the four song-mode click handlers. Returns ``(seed, advanced, lm)``.
+    ``seed`` is the resolved seed (-1 / 0 / None → random 32-bit positive).
+    """
+    seed_raw = int(adv_seed) if adv_seed is not None else -1  # None counts as -1
+    seed = seed_raw if seed_raw > 0 else random.randint(1, 2_147_483_647)  # any value <= 0 is re-drawn
+    advanced = {
+        "inference_steps": int(adv_inference_steps),
+        "guidance_scale": float(adv_guidance_scale),
+        "infer_method": adv_infer_method,  # NOTE(review): not validated here — confirm never None
+        "cfg_interval_start": float(adv_cfg_interval_start),  # NOTE(review): reader not in view — confirm
+        "cfg_interval_end": float(adv_cfg_interval_end),  # NOTE(review): reader not in view — confirm
+        "shift": float(adv_shift),
+        "use_adg": bool(adv_use_adg),
+        "bpm": int(adv_bpm) if adv_bpm else None,  # falsy (None / 0) → None
+        "keyscale": adv_keyscale or "",
+        "timesignature": adv_timesignature or "",  # NOTE(review): reader not in view — confirm
+        "vocal_language": adv_vocal_language or "unknown",  # NOTE(review): reader not in view — confirm
+    }
+    lm = {  # short key names, as read via lm_opts.get(...) in ace_pipeline
+        "thinking": bool(adv_thinking),
+        "use_cot_caption": bool(adv_use_cot_caption),
+        "use_cot_metas": bool(adv_use_cot_metas),
+        "use_cot_language": bool(adv_use_cot_language),
+        "temperature": float(adv_lm_temperature),
+        "top_p": float(adv_lm_top_p),
+        "top_k": int(adv_lm_top_k) if adv_lm_top_k else 0,  # falsy (None / 0) → 0
+        "cfg": float(adv_lm_cfg_scale),
+        "negative_prompt": adv_lm_negative_prompt or "NO USER INPUT",  # blank → "NO USER INPUT"
+    }
+    return seed, advanced, lm
 @_maybe_spaces_gpu("generate")
 def on_generate_click(
     prompt: str,
     duration_s: float,
     instrumental_label: str,
     lora_state,
+    adv_inference_steps,
+    adv_guidance_scale,
+    adv_infer_method,
+    adv_seed,
+    adv_cfg_interval_start,
+    adv_cfg_interval_end,
+    adv_shift,
+    adv_use_adg,
+    adv_thinking,
+    adv_use_cot_caption,
+    adv_use_cot_metas,
+    adv_use_cot_language,
+    adv_lm_temperature,
+    adv_lm_top_p,
+    adv_lm_top_k,
+    adv_lm_cfg_scale,
+    adv_lm_negative_prompt,
+    adv_bpm,
+    adv_keyscale,
+    adv_timesignature,
+    adv_vocal_language,
     progress=gr.Progress(track_tqdm=True),  # noqa: B008
 ):
     loras = [lora_state] if lora_state else []
+    seed, advanced, lm = _build_advanced_params(
+        adv_inference_steps,
+        adv_guidance_scale,
+        adv_infer_method,
+        adv_seed,
+        adv_cfg_interval_start,
+        adv_cfg_interval_end,
+        adv_shift,
+        adv_use_adg,
+        adv_thinking,
+        adv_use_cot_caption,
+        adv_use_cot_metas,
+        adv_use_cot_language,
+        adv_lm_temperature,
+        adv_lm_top_p,
+        adv_lm_top_k,
+        adv_lm_cfg_scale,
+        adv_lm_negative_prompt,
+        adv_bpm,
+        adv_keyscale,
+        adv_timesignature,
+        adv_vocal_language,
+    )
     out_path, meta = _safe_call(
         modes.generate,
         get_backend(),
             "lyrics": lyrics,
             "duration_s": int(duration_s),
             "instrumental": instrumental_label == "Instrumental",
+            "seed": seed,
             "loras": loras,
+            "advanced": advanced,
+            "lm": lm,
             "dcw": {},
         },
     )
     duration_s: float,
     audio_cover_strength: float,
     lora_state,
+    adv_inference_steps,
+    adv_guidance_scale,
+    adv_infer_method,
+    adv_seed,
+    adv_cfg_interval_start,
+    adv_cfg_interval_end,
+    adv_shift,
+    adv_use_adg,
+    adv_thinking,
+    adv_use_cot_caption,
+    adv_use_cot_metas,
+    adv_use_cot_language,
+    adv_lm_temperature,
+    adv_lm_top_p,
+    adv_lm_top_k,
+    adv_lm_cfg_scale,
+    adv_lm_negative_prompt,
+    adv_bpm,
+    adv_keyscale,
+    adv_timesignature,
+    adv_vocal_language,
     progress=gr.Progress(track_tqdm=True),  # noqa: B008
 ):
     """Cover-mode click. ref_audio is a filepath from gr.Audio(type='filepath')."""
     loras = [lora_state] if lora_state else []
+    seed, advanced, lm = _build_advanced_params(
+        adv_inference_steps,
+        adv_guidance_scale,
+        adv_infer_method,
+        adv_seed,
+        adv_cfg_interval_start,
+        adv_cfg_interval_end,
+        adv_shift,
+        adv_use_adg,
+        adv_thinking,
+        adv_use_cot_caption,
+        adv_use_cot_metas,
+        adv_use_cot_language,
+        adv_lm_temperature,
+        adv_lm_top_p,
+        adv_lm_top_k,
+        adv_lm_cfg_scale,
+        adv_lm_negative_prompt,
+        adv_bpm,
+        adv_keyscale,
+        adv_timesignature,
+        adv_vocal_language,
+    )
     out_path, meta = _safe_call(
         modes.cover,
         get_backend(),
             "lyrics": lyrics,
             "duration_s": int(duration_s),
             "audio_cover_strength": float(audio_cover_strength),
+            "seed": seed,
             "loras": loras,
+            "advanced": advanced,
+            "lm": lm,
             "dcw": {},
         },
     )
     latent_crossfade_frames: float,
     chunk_mask_mode: str,
     lora_state,
+    adv_inference_steps,
+    adv_guidance_scale,
+    adv_infer_method,
+    adv_seed,
+    adv_cfg_interval_start,
+    adv_cfg_interval_end,
+    adv_shift,
+    adv_use_adg,
+    adv_thinking,
+    adv_use_cot_caption,
+    adv_use_cot_metas,
+    adv_use_cot_language,
+    adv_lm_temperature,
+    adv_lm_top_p,
+    adv_lm_top_k,
+    adv_lm_cfg_scale,
+    adv_lm_negative_prompt,
+    adv_bpm,
+    adv_keyscale,
+    adv_timesignature,
+    adv_vocal_language,
     progress=gr.Progress(track_tqdm=True),  # noqa: B008
 ):
     """Extend-mode click. seed_audio is a filepath from gr.Audio(type='filepath')."""
     loras = [lora_state] if lora_state else []
+    seed, advanced, lm = _build_advanced_params(
+        adv_inference_steps,
+        adv_guidance_scale,
+        adv_infer_method,
+        adv_seed,
+        adv_cfg_interval_start,
+        adv_cfg_interval_end,
+        adv_shift,
+        adv_use_adg,
+        adv_thinking,
+        adv_use_cot_caption,
+        adv_use_cot_metas,
+        adv_use_cot_language,
+        adv_lm_temperature,
+        adv_lm_top_p,
+        adv_lm_top_k,
+        adv_lm_cfg_scale,
+        adv_lm_negative_prompt,
+        adv_bpm,
+        adv_keyscale,
+        adv_timesignature,
+        adv_vocal_language,
+    )
     out_path, meta = _safe_call(
         modes.extend,
         get_backend(),
             "repaint_strength": float(repaint_strength),
             "latent_crossfade_frames": int(latent_crossfade_frames),
             "chunk_mask_mode": chunk_mask_mode,
+            "seed": seed,
             "loras": loras,
+            "advanced": advanced,
+            "lm": lm,
             "dcw": {},
         },
     )
     flow_n_max: float,
     flow_n_avg: float,
     lora_state,
+    adv_inference_steps,
+    adv_guidance_scale,
+    adv_infer_method,
+    adv_seed,
+    adv_cfg_interval_start,
+    adv_cfg_interval_end,
+    adv_shift,
+    adv_use_adg,
+    adv_thinking,
+    adv_use_cot_caption,
+    adv_use_cot_metas,
+    adv_use_cot_language,
+    adv_lm_temperature,
+    adv_lm_top_p,
+    adv_lm_top_k,
+    adv_lm_cfg_scale,
+    adv_lm_negative_prompt,
+    adv_bpm,
+    adv_keyscale,
+    adv_timesignature,
+    adv_vocal_language,
     progress=gr.Progress(track_tqdm=True),  # noqa: B008
 ):
     """Edit-mode click. source_audio is a filepath from gr.Audio(type='filepath')."""
     loras = [lora_state] if lora_state else []
+    seed, advanced, lm = _build_advanced_params(
+        adv_inference_steps,
+        adv_guidance_scale,
+        adv_infer_method,
+        adv_seed,
+        adv_cfg_interval_start,
+        adv_cfg_interval_end,
+        adv_shift,
+        adv_use_adg,
+        adv_thinking,
+        adv_use_cot_caption,
+        adv_use_cot_metas,
+        adv_use_cot_language,
+        adv_lm_temperature,
+        adv_lm_top_p,
+        adv_lm_top_k,
+        adv_lm_cfg_scale,
+        adv_lm_negative_prompt,
+        adv_bpm,
+        adv_keyscale,
+        adv_timesignature,
+        adv_vocal_language,
+    )
     out_path, meta = _safe_call(
         modes.edit,
         get_backend(),
             "flow_n_min": float(flow_n_min),
             "flow_n_max": float(flow_n_max),
             "flow_n_avg": int(flow_n_avg),
+            "seed": seed,
             "loras": loras,
+            "advanced": advanced,
+            "lm": lm,
             "dcw": {},
         },
     )
                             g["duration_s"],
                             g["instrumental"],
                             g["lora_state"],
+                            g["adv_inference_steps"],
+                            g["adv_guidance_scale"],
+                            g["adv_infer_method"],
+                            g["adv_seed"],
+                            g["adv_cfg_interval_start"],
+                            g["adv_cfg_interval_end"],
+                            g["adv_shift"],
+                            g["adv_use_adg"],
+                            g["adv_thinking"],
+                            g["adv_use_cot_caption"],
+                            g["adv_use_cot_metas"],
+                            g["adv_use_cot_language"],
+                            g["adv_lm_temperature"],
+                            g["adv_lm_top_p"],
+                            g["adv_lm_top_k"],
+                            g["adv_lm_cfg_scale"],
+                            g["adv_lm_negative_prompt"],
+                            g["adv_bpm"],
+                            g["adv_keyscale"],
+                            g["adv_timesignature"],
+                            g["adv_vocal_language"],
                         ],
                         outputs=[g["output_audio"], g["output_meta"], history_html],
                     )
                             c["duration_s"],
                             c["audio_cover_strength"],
                             c["lora_state"],
+                            c["adv_inference_steps"],
+                            c["adv_guidance_scale"],
+                            c["adv_infer_method"],
+                            c["adv_seed"],
+                            c["adv_cfg_interval_start"],
+                            c["adv_cfg_interval_end"],
+                            c["adv_shift"],
+                            c["adv_use_adg"],
+                            c["adv_thinking"],
+                            c["adv_use_cot_caption"],
+                            c["adv_use_cot_metas"],
+                            c["adv_use_cot_language"],
+                            c["adv_lm_temperature"],
+                            c["adv_lm_top_p"],
+                            c["adv_lm_top_k"],
+                            c["adv_lm_cfg_scale"],
+                            c["adv_lm_negative_prompt"],
+                            c["adv_bpm"],
+                            c["adv_keyscale"],
+                            c["adv_timesignature"],
+                            c["adv_vocal_language"],
                         ],
                         outputs=[c["output_audio"], c["output_meta"], history_html],
                     )
                             x["latent_crossfade_frames"],
                             x["chunk_mask_mode"],
                             x["lora_state"],
+                            x["adv_inference_steps"],
+                            x["adv_guidance_scale"],
+                            x["adv_infer_method"],
+                            x["adv_seed"],
+                            x["adv_cfg_interval_start"],
+                            x["adv_cfg_interval_end"],
+                            x["adv_shift"],
+                            x["adv_use_adg"],
+                            x["adv_thinking"],
+                            x["adv_use_cot_caption"],
+                            x["adv_use_cot_metas"],
+                            x["adv_use_cot_language"],
+                            x["adv_lm_temperature"],
+                            x["adv_lm_top_p"],
+                            x["adv_lm_top_k"],
+                            x["adv_lm_cfg_scale"],
+                            x["adv_lm_negative_prompt"],
+                            x["adv_bpm"],
+                            x["adv_keyscale"],
+                            x["adv_timesignature"],
+                            x["adv_vocal_language"],
                         ],
                         outputs=[x["output_audio"], x["output_meta"], history_html],
                     )
                             e["flow_n_max"],
                             e["flow_n_avg"],
                             e["lora_state"],
+                            e["adv_inference_steps"],
+                            e["adv_guidance_scale"],
+                            e["adv_infer_method"],
+                            e["adv_seed"],
+                            e["adv_cfg_interval_start"],
+                            e["adv_cfg_interval_end"],
+                            e["adv_shift"],
+                            e["adv_use_adg"],
+                            e["adv_thinking"],
+                            e["adv_use_cot_caption"],
+                            e["adv_use_cot_metas"],
+                            e["adv_use_cot_language"],
+                            e["adv_lm_temperature"],
+                            e["adv_lm_top_p"],
+                            e["adv_lm_top_k"],
+                            e["adv_lm_cfg_scale"],
+                            e["adv_lm_negative_prompt"],
+                            e["adv_bpm"],
+                            e["adv_keyscale"],
+                            e["adv_timesignature"],
+                            e["adv_vocal_language"],
                         ],
                         outputs=[e["output_audio"], e["output_meta"], history_html],
                     )

backend.py CHANGED Viewed

@@ -64,6 +64,10 @@ class ACEStepStudioBackend:
                 {"name": lora.get("name"), "scale": lora.get("scale"), "sha256": lora.get("sha256")}
                 for lora in params.get("loras", [])
             ],
             "lm": params.get("lm", {}),
             "dcw": params.get("dcw", {}),
         }

                 {"name": lora.get("name"), "scale": lora.get("scale"), "sha256": lora.get("sha256")}
                 for lora in params.get("loras", [])
             ],
+            # Echo the advanced + lm dicts back so the user can see which
+            # knobs were active for a given output and lock-iterate from
+            # there. The "seed" above is the resolved seed (never -1).
+            "advanced": params.get("advanced", {}),
             "lm": params.get("lm", {}),
             "dcw": params.get("dcw", {}),
         }

tests/test_ace_pipeline_lazy.py CHANGED Viewed

@@ -106,7 +106,15 @@ def test_studio_generate_builds_params_and_calls_generate_music(monkeypatch, tmp
             "instrumental": False,
             "seed": 42,
             "loras": [],
-            "advanced": {"steps": 32, "cfg": 4.0, "bpm": 135},
             "lm": {"thinking": False},
             "dcw": {},
         }
@@ -118,6 +126,8 @@ def test_studio_generate_builds_params_and_calls_generate_music(monkeypatch, tmp
     assert captured["gp"]["duration"] == 30
     assert captured["gp"]["seed"] == 42
     assert captured["gp"]["inference_steps"] == 32
     assert captured["gp"]["bpm"] == 135

             "instrumental": False,
             "seed": 42,
             "loras": [],
+            # New advanced contract: ``inference_steps`` + ``guidance_scale``
+            # + ``infer_method`` replace the old ``steps`` + ``cfg`` keys.
+            # See ace_pipeline.ACEStepStudio.generate for the full schema.
+            "advanced": {
+                "inference_steps": 32,
+                "guidance_scale": 4.0,
+                "infer_method": "ode",
+                "bpm": 135,
+            },
             "lm": {"thinking": False},
             "dcw": {},
         }
     assert captured["gp"]["duration"] == 30
     assert captured["gp"]["seed"] == 42
     assert captured["gp"]["inference_steps"] == 32
+    assert captured["gp"]["guidance_scale"] == 4.0
+    assert captured["gp"]["infer_method"] == "ode"
     assert captured["gp"]["bpm"] == 135

theme.py CHANGED Viewed

@@ -989,6 +989,85 @@ main, .contain {{
   padding:0 12px 12px 12px !important;
 }}
 /* ============================================================
  * Post-process action row (M5/G2) — sits below the Output Audio.
  * Three compact mono pills (separate stems / normalise / mp3 export)

   padding:0 12px 12px 12px !important;
 }}
+/* ============================================================
+ * Advanced controls accordion (M0-X)
+ * Bordered chrome matching the LoRA + LM + experimental accordions so
+ * the four song-mode panes read consistently. Inside the accordion we
+ * additionally render small <h>/<p strong> section headers (Diffusion,
+ * CFG schedule, 5Hz LM, Music metadata) to chunk the 21 knobs into
+ * logical groups; those need their own mono-uppercase-faint treatment
+ * so they don't compete with the form labels for visual weight.
+ * ============================================================ */
+.ams-content .ams-advanced {{
+  border:1px solid {BORDER} !important;
+  border-radius:3px !important;
+  background:{SURFACE_STRONG} !important;
+  margin-top:10px !important;
+  padding:0 !important;
+}}
+.ams-content .ams-advanced > .label-wrap,
+.ams-content .ams-advanced summary,
+.ams-content .ams-advanced > button {{
+  font-family: {FONT_MONO} !important;
+  font-size:10px !important;
+  letter-spacing:0.08em !important;
+  text-transform:uppercase !important;
+  color:{INK_MUTED} !important;
+  padding:10px 12px !important;
+  background:transparent !important;
+  border:none !important;
+}}
+.ams-content .ams-advanced > .label-wrap span,
+.ams-content .ams-advanced summary span,
+.ams-content .ams-advanced > button span {{
+  color:{INK_MUTED} !important;
+  font-family: {FONT_MONO} !important;
+  font-size:10px !important;
+  letter-spacing:0.08em !important;
+  text-transform:uppercase !important;
+}}
+.ams-content .ams-advanced > div:not(.label-wrap):not(summary) {{
+  padding:0 12px 12px 12px !important;
+}}
+/* Section divider Markdown headers inside the accordion. We render them
+   as **Diffusion** (etc) via gr.Markdown — Gradio wraps that in
+   ``.prose strong``. Treat the strong tag as a small mono uppercase
+   header with a subtle underline so the four groups have clear visual
+   boundaries without competing with the actual form labels. */
+.ams-content .ams-advanced .ams-adv-section .prose p {{
+  margin:14px 0 4px 0 !important;
+  padding:0 0 4px 0 !important;
+  border-bottom:1px solid {BORDER} !important;
+}}
+.ams-content .ams-advanced .ams-adv-section .prose p:first-child {{
+  margin-top:6px !important;
+}}
+.ams-content .ams-advanced .ams-adv-section .prose strong {{
+  font-family: {FONT_MONO} !important;
+  font-size:10px !important;
+  letter-spacing:0.12em !important;
+  text-transform:uppercase !important;
+  color:{INK} !important;
+  font-weight:600 !important;
+}}
+.ams-content .ams-advanced .ams-adv-section .prose {{
+  background:transparent !important;
+}}
+@media (max-width: 640px) {{
+  .ams-content .ams-advanced > .label-wrap,
+  .ams-content .ams-advanced summary,
+  .ams-content .ams-advanced > button {{
+    font-size:9px !important;
+    padding:8px 10px !important;
+  }}
+  .ams-content .ams-advanced > div:not(.label-wrap):not(summary) {{
+    padding:0 10px 10px 10px !important;
+  }}
+  .ams-content .ams-advanced .ams-adv-section .prose strong {{
+    font-size:9px !important;
+  }}
+}}
 /* ============================================================
  * Post-process action row (M5/G2) — sits below the Output Audio.
  * Three compact mono pills (separate stems / normalise / mp3 export)

ui.py CHANGED Viewed

@@ -16,6 +16,175 @@ import lora_stack
 import tooltips
 def _build_lora_accordion(components: dict[str, gr.components.Component]) -> None:
     """LoRA accordion with single-LoRA semantics. Mutates ``components``.
@@ -179,6 +348,7 @@ def build_generate_tab() -> dict[str, gr.components.Component]:
                 )
             _build_lora_accordion(components)
             components["generate_btn"] = gr.Button(
                 "▶ Generate",
@@ -240,6 +410,7 @@ def build_cover_tab() -> dict[str, gr.components.Component]:
                 )
             _build_lora_accordion(components)
             components["generate_btn"] = gr.Button(
                 "▶ Generate cover",
@@ -341,6 +512,7 @@ def build_extend_tab() -> dict[str, gr.components.Component]:
                 )
             _build_lora_accordion(components)
             components["generate_btn"] = gr.Button(
                 "▶ Extend",
@@ -455,6 +627,7 @@ def build_edit_tab() -> dict[str, gr.components.Component]:
                 components["flow_n_avg"] = gr.Slider(minimum=1, maximum=5, value=1, step=1, label="n_avg")
             _build_lora_accordion(components)
             components["generate_btn"] = gr.Button(
                 "▶ Apply edit",

 import tooltips
+def _build_advanced_accordion(components: dict[str, gr.components.Component]) -> None:
+    """Advanced controls accordion shared by all four song modes.
+    User complaint: "no matter what prompt I write, style is not deviating
+    by a lot". Root cause: ``GenerationParams.inference_steps`` defaults
+    to 8 (ACE-Step turbo) — too few for the XL SFT model to actually
+    express prompt variation. ``guidance_scale``, ``infer_method``,
+    ``shift``, ``use_adg``, and the CoT flags were all left at dataclass
+    defaults too. This accordion surfaces the ~21 most useful knobs in
+    four logical groups so the user can lock-and-iterate.
+    Each song-mode pane (Generate / Cover / Extend / Edit) calls this
+    right after ``_build_lora_accordion(components)`` so the layout is
+    consistent. The Lyrics tab does NOT get this — it's a Qwen path with
+    its own LM-params accordion already.
+    """
+    with gr.Accordion(
+        label="Advanced",
+        open=False,
+        elem_classes=["ams-advanced"],
+    ):
+        # --- Group A — Diffusion (most impactful) ---
+        gr.Markdown("**Diffusion**", elem_classes=["ams-adv-section"])
+        components["adv_inference_steps"] = gr.Slider(
+            minimum=8,
+            maximum=80,
+            value=27,
+            step=1,
+            label="Inference steps",
+            info="More steps → richer detail. 8 is turbo, 27-60 is the sweet spot for XL SFT.",
+        )
+        components["adv_guidance_scale"] = gr.Slider(
+            minimum=1.0,
+            maximum=15.0,
+            value=7.0,
+            step=0.5,
+            label="Guidance scale (CFG)",
+            info="Higher = follow the prompt more strictly. Lower = more creative / weirder.",
+        )
+        components["adv_infer_method"] = gr.Radio(
+            choices=["ode", "sde"],
+            value="ode",
+            label="Inference method",
+            info="ode = deterministic per seed. sde = injects stochastic noise per step → genuinely different outputs each run.",
+        )
+        components["adv_seed"] = gr.Number(
+            value=-1,
+            precision=0,
+            label="Seed",
+            info="-1 = randomize each run. Set a number to lock-and-iterate.",
+        )
+        # --- Group B — CFG schedule + shift + ADG ---
+        gr.Markdown("**CFG schedule + shift**", elem_classes=["ams-adv-section"])
+        components["adv_cfg_interval_start"] = gr.Slider(
+            minimum=0.0,
+            maximum=1.0,
+            value=0.0,
+            step=0.05,
+            label="CFG interval start",
+            info="Fraction of diffusion at which CFG kicks in.",
+        )
+        components["adv_cfg_interval_end"] = gr.Slider(
+            minimum=0.0,
+            maximum=1.0,
+            value=1.0,
+            step=0.05,
+            label="CFG interval end",
+            info="Fraction of diffusion at which CFG stops.",
+        )
+        components["adv_shift"] = gr.Slider(
+            minimum=0.5,
+            maximum=3.0,
+            value=1.0,
+            step=0.1,
+            label="Shift",
+            info="Timestep shift. Try 0.7-1.3 for different feel.",
+        )
+        components["adv_use_adg"] = gr.Checkbox(
+            value=False,
+            label="Use Adaptive Dual Guidance (ADG)",
+            info="Experimental — sometimes improves base model output.",
+        )
+        # --- Group C — 5Hz Language Model (CoT reasoning) ---
+        gr.Markdown("**5Hz LM (CoT)**", elem_classes=["ams-adv-section"])
+        components["adv_thinking"] = gr.Checkbox(
+            value=True,
+            label="Enable thinking (CoT)",
+            info="Let the 5Hz LM reason before generating. Recommended ON.",
+        )
+        components["adv_use_cot_caption"] = gr.Checkbox(
+            value=True,
+            label="Let LM rewrite caption",
+            info="LM expands/rephrases your prompt. Adds variety.",
+        )
+        components["adv_use_cot_metas"] = gr.Checkbox(
+            value=True,
+            label="Let LM infer metadata (bpm/key/time)",
+            info="LM picks musical metadata. Turn off to force your manual values below.",
+        )
+        components["adv_use_cot_language"] = gr.Checkbox(
+            value=True,
+            label="Let LM detect vocal language",
+            info="LM picks vocal language from caption + lyrics.",
+        )
+        components["adv_lm_temperature"] = gr.Slider(
+            minimum=0.0,
+            maximum=2.0,
+            value=0.85,
+            step=0.05,
+            label="LM temperature",
+            info="Higher = more creative metadata/structure.",
+        )
+        components["adv_lm_top_p"] = gr.Slider(
+            minimum=0.0,
+            maximum=1.0,
+            value=0.9,
+            step=0.05,
+            label="LM top-p",
+            info="Nucleus sampling.",
+        )
+        components["adv_lm_top_k"] = gr.Number(
+            value=0,
+            precision=0,
+            label="LM top-k",
+            info="0 = disabled.",
+        )
+        components["adv_lm_cfg_scale"] = gr.Slider(
+            minimum=1.0,
+            maximum=10.0,
+            value=2.0,
+            step=0.5,
+            label="LM CFG scale",
+            info="5Hz LM classifier-free guidance.",
+        )
+        components["adv_lm_negative_prompt"] = gr.Textbox(
+            value="NO USER INPUT",
+            label="LM negative prompt",
+            info="Steer the LM AWAY from these traits.",
+        )
+        # --- Group D — Music metadata (manual overrides) ---
+        gr.Markdown("**Music metadata**", elem_classes=["ams-adv-section"])
+        components["adv_bpm"] = gr.Number(
+            value=None,
+            precision=0,
+            label="BPM",
+            info="Empty = auto. 30-300.",
+        )
+        components["adv_keyscale"] = gr.Textbox(
+            value="",
+            label="Key / scale",
+            info="e.g. 'C Major', 'Am'. Empty = auto.",
+        )
+        components["adv_timesignature"] = gr.Dropdown(
+            choices=["", "2", "3", "4", "6"],
+            value="",
+            label="Time signature",
+            info="2=2/4, 3=3/4, 4=4/4, 6=6/8. Empty = auto.",
+        )
+        components["adv_vocal_language"] = gr.Dropdown(
+            choices=["unknown", "en", "zh", "ja", "ko", "es", "fr", "de", "it", "pt", "ru"],
+            value="unknown",
+            label="Vocal language",
+            info="Hint for the 5Hz LM. unknown = auto.",
+        )
 def _build_lora_accordion(components: dict[str, gr.components.Component]) -> None:
     """LoRA accordion with single-LoRA semantics. Mutates ``components``.
                 )
             _build_lora_accordion(components)
+            _build_advanced_accordion(components)
             components["generate_btn"] = gr.Button(
                 "▶ Generate",
                 )
             _build_lora_accordion(components)
+            _build_advanced_accordion(components)
             components["generate_btn"] = gr.Button(
                 "▶ Generate cover",
                 )
             _build_lora_accordion(components)
+            _build_advanced_accordion(components)
             components["generate_btn"] = gr.Button(
                 "▶ Extend",
                 components["flow_n_avg"] = gr.Slider(minimum=1, maximum=5, value=1, step=1, label="n_avg")
             _build_lora_accordion(components)
+            _build_advanced_accordion(components)
             components["generate_btn"] = gr.Button(
                 "▶ Apply edit",