Spaces:
Running on Zero
feat(ui): add advanced controls accordion — inference steps, cfg, infer method, seed, lm cot, schedule, metadata
Browse files
User feedback: outputs feel samey regardless of prompt. Root cause:
GenerationParams.inference_steps defaults to 8 (ACE-Step turbo), too
few for the XL SFT model to actually express variation. Also,
guidance_scale, infer_method, shift, ADG, and the CoT flags were all left at
their dataclass defaults.
New ``Advanced ▼`` accordion under every song mode (Generate / Cover
/ Extend / Edit; lyrics mode is its own Qwen path and doesn't need
it). Four groups:
- Diffusion: inference_steps (8-80, default 27), guidance_scale
(1-15, default 7.0), infer_method (ode|sde), seed (number,
-1=random).
- CFG schedule + shift + ADG.
- 5Hz LM (CoT): thinking + use_cot_caption + use_cot_metas +
use_cot_language now default to ON; LM temperature / top_p / top_k /
cfg / negative_prompt controls.
- Music metadata: bpm / keyscale / timesignature / vocal_language
manual overrides.
ace_pipeline.generate passes all of these through to GenerationParams.
Output metadata JSON now echoes the advanced + lm dicts so the user can
see which knobs were active for a given output and lock-iterate from
there (the seed shown is the ACTUAL seed used, not -1).
- ace_pipeline.py +27 -8
- app.py +331 -12
- backend.py +4 -0
- tests/test_ace_pipeline_lazy.py +11 -1
- theme.py +79 -0
- ui.py +173 -0
|
@@ -226,6 +226,21 @@ class ACEStepStudio:
|
|
| 226 |
)
|
| 227 |
duration_s = int(params.get("duration_s") or params.get("extra_duration_s") or 30)
|
| 228 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
gen_params = GenerationParams(
|
| 230 |
task_type=task_type,
|
| 231 |
caption=caption,
|
|
@@ -233,8 +248,10 @@ class ACEStepStudio:
|
|
| 233 |
instrumental=instrumental,
|
| 234 |
duration=duration_s,
|
| 235 |
seed=int(params.get("seed", -1)),
|
| 236 |
-
inference_steps=int(advanced.get("
|
| 237 |
-
guidance_scale=float(advanced.get("
|
|
|
|
|
|
|
| 238 |
shift=float(advanced.get("shift", 1.0)),
|
| 239 |
bpm=advanced.get("bpm"),
|
| 240 |
keyscale=advanced.get("keyscale", ""),
|
|
@@ -248,16 +265,18 @@ class ACEStepStudio:
|
|
| 248 |
audio_cover_strength=audio_cover_strength,
|
| 249 |
repainting_start=repainting_start,
|
| 250 |
repainting_end=repainting_end,
|
| 251 |
-
# 5Hz language model knobs
|
| 252 |
-
|
|
|
|
|
|
|
| 253 |
lm_temperature=float(lm_opts.get("temperature", 0.85)),
|
| 254 |
lm_cfg_scale=float(lm_opts.get("cfg", 2.0)),
|
| 255 |
lm_top_k=int(lm_opts.get("top_k", 0)),
|
| 256 |
lm_top_p=float(lm_opts.get("top_p", 0.9)),
|
| 257 |
-
lm_negative_prompt=lm_opts.get("negative_prompt", ""),
|
| 258 |
-
use_cot_metas=bool(lm_opts.get("
|
| 259 |
-
use_cot_caption=bool(lm_opts.get("
|
| 260 |
-
use_cot_language=bool(lm_opts.get("
|
| 261 |
)
|
| 262 |
|
| 263 |
gen_config = GenerationConfig(
|
|
|
|
| 226 |
)
|
| 227 |
duration_s = int(params.get("duration_s") or params.get("extra_duration_s") or 30)
|
| 228 |
|
| 229 |
+
# ``advanced``/``lm`` dicts are sent by app.py's
|
| 230 |
+
# ``_build_advanced_params``. Key changes from the prior contract:
|
| 231 |
+
# - ``inference_steps`` (was ``steps``, defaulted to 8 which made the
|
| 232 |
+
# XL SFT model behave too turbo-ish; new default 27).
|
| 233 |
+
# - ``guidance_scale`` (was ``cfg``, default 7.0 for stronger prompt
|
| 234 |
+
# adherence).
|
| 235 |
+
# - ``infer_method`` (new — ``"ode"`` deterministic / ``"sde"``
|
| 236 |
+
# stochastic; the user can now flip to ``sde`` to actually get
|
| 237 |
+
# different output each click even with the same seed).
|
| 238 |
+
# - ``use_adg`` (new — Adaptive Dual Guidance; experimental).
|
| 239 |
+
# - ``thinking`` (5Hz LM CoT — default flips to True so the LM can
|
| 240 |
+
# reason about caption + metadata, which is the actual source of
|
| 241 |
+
# the "no matter what prompt the style barely changes" symptom).
|
| 242 |
+
# - ``use_cot_metas`` / ``use_cot_caption`` / ``use_cot_language``
|
| 243 |
+
# keys renamed from ``cot_*`` for consistency with the dataclass.
|
| 244 |
gen_params = GenerationParams(
|
| 245 |
task_type=task_type,
|
| 246 |
caption=caption,
|
|
|
|
| 248 |
instrumental=instrumental,
|
| 249 |
duration=duration_s,
|
| 250 |
seed=int(params.get("seed", -1)),
|
| 251 |
+
inference_steps=int(advanced.get("inference_steps", 27)),
|
| 252 |
+
guidance_scale=float(advanced.get("guidance_scale", 7.0)),
|
| 253 |
+
infer_method=str(advanced.get("infer_method", "ode")),
|
| 254 |
+
use_adg=bool(advanced.get("use_adg", False)),
|
| 255 |
shift=float(advanced.get("shift", 1.0)),
|
| 256 |
bpm=advanced.get("bpm"),
|
| 257 |
keyscale=advanced.get("keyscale", ""),
|
|
|
|
| 265 |
audio_cover_strength=audio_cover_strength,
|
| 266 |
repainting_start=repainting_start,
|
| 267 |
repainting_end=repainting_end,
|
| 268 |
+
# 5Hz language model knobs — defaults flipped to True so the
|
| 269 |
+
# LM actually reasons about each prompt instead of returning
|
| 270 |
+
# blank captions / metadata back to the DiT.
|
| 271 |
+
thinking=bool(lm_opts.get("thinking", True)),
|
| 272 |
lm_temperature=float(lm_opts.get("temperature", 0.85)),
|
| 273 |
lm_cfg_scale=float(lm_opts.get("cfg", 2.0)),
|
| 274 |
lm_top_k=int(lm_opts.get("top_k", 0)),
|
| 275 |
lm_top_p=float(lm_opts.get("top_p", 0.9)),
|
| 276 |
+
lm_negative_prompt=lm_opts.get("negative_prompt", "NO USER INPUT"),
|
| 277 |
+
use_cot_metas=bool(lm_opts.get("use_cot_metas", True)),
|
| 278 |
+
use_cot_caption=bool(lm_opts.get("use_cot_caption", True)),
|
| 279 |
+
use_cot_language=bool(lm_opts.get("use_cot_language", True)),
|
| 280 |
)
|
| 281 |
|
| 282 |
gen_config = GenerationConfig(
|
|
@@ -436,6 +436,65 @@ def on_lora_strength_change(state, strength: float):
|
|
| 436 |
return new_state, _active_md(new_state["name"], float(strength), kind)
|
| 437 |
|
| 438 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 439 |
@_maybe_spaces_gpu("generate")
|
| 440 |
def on_generate_click(
|
| 441 |
prompt: str,
|
|
@@ -443,9 +502,53 @@ def on_generate_click(
|
|
| 443 |
duration_s: float,
|
| 444 |
instrumental_label: str,
|
| 445 |
lora_state,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 446 |
progress=gr.Progress(track_tqdm=True), # noqa: B008
|
| 447 |
):
|
| 448 |
loras = [lora_state] if lora_state else []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 449 |
out_path, meta = _safe_call(
|
| 450 |
modes.generate,
|
| 451 |
get_backend(),
|
|
@@ -454,10 +557,10 @@ def on_generate_click(
|
|
| 454 |
"lyrics": lyrics,
|
| 455 |
"duration_s": int(duration_s),
|
| 456 |
"instrumental": instrumental_label == "Instrumental",
|
| 457 |
-
"seed":
|
| 458 |
"loras": loras,
|
| 459 |
-
"advanced":
|
| 460 |
-
"lm":
|
| 461 |
"dcw": {},
|
| 462 |
},
|
| 463 |
)
|
|
@@ -473,10 +576,54 @@ def on_cover_click(
|
|
| 473 |
duration_s: float,
|
| 474 |
audio_cover_strength: float,
|
| 475 |
lora_state,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 476 |
progress=gr.Progress(track_tqdm=True), # noqa: B008
|
| 477 |
):
|
| 478 |
"""Cover-mode click. ref_audio is a filepath from gr.Audio(type='filepath')."""
|
| 479 |
loras = [lora_state] if lora_state else []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 480 |
out_path, meta = _safe_call(
|
| 481 |
modes.cover,
|
| 482 |
get_backend(),
|
|
@@ -486,10 +633,10 @@ def on_cover_click(
|
|
| 486 |
"lyrics": lyrics,
|
| 487 |
"duration_s": int(duration_s),
|
| 488 |
"audio_cover_strength": float(audio_cover_strength),
|
| 489 |
-
"seed":
|
| 490 |
"loras": loras,
|
| 491 |
-
"advanced":
|
| 492 |
-
"lm":
|
| 493 |
"dcw": {},
|
| 494 |
},
|
| 495 |
)
|
|
@@ -509,10 +656,54 @@ def on_extend_click(
|
|
| 509 |
latent_crossfade_frames: float,
|
| 510 |
chunk_mask_mode: str,
|
| 511 |
lora_state,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 512 |
progress=gr.Progress(track_tqdm=True), # noqa: B008
|
| 513 |
):
|
| 514 |
"""Extend-mode click. seed_audio is a filepath from gr.Audio(type='filepath')."""
|
| 515 |
loras = [lora_state] if lora_state else []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 516 |
out_path, meta = _safe_call(
|
| 517 |
modes.extend,
|
| 518 |
get_backend(),
|
|
@@ -526,10 +717,10 @@ def on_extend_click(
|
|
| 526 |
"repaint_strength": float(repaint_strength),
|
| 527 |
"latent_crossfade_frames": int(latent_crossfade_frames),
|
| 528 |
"chunk_mask_mode": chunk_mask_mode,
|
| 529 |
-
"seed":
|
| 530 |
"loras": loras,
|
| 531 |
-
"advanced":
|
| 532 |
-
"lm":
|
| 533 |
"dcw": {},
|
| 534 |
},
|
| 535 |
)
|
|
@@ -632,10 +823,54 @@ def on_edit_click(
|
|
| 632 |
flow_n_max: float,
|
| 633 |
flow_n_avg: float,
|
| 634 |
lora_state,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 635 |
progress=gr.Progress(track_tqdm=True), # noqa: B008
|
| 636 |
):
|
| 637 |
"""Edit-mode click. source_audio is a filepath from gr.Audio(type='filepath')."""
|
| 638 |
loras = [lora_state] if lora_state else []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 639 |
out_path, meta = _safe_call(
|
| 640 |
modes.edit,
|
| 641 |
get_backend(),
|
|
@@ -652,10 +887,10 @@ def on_edit_click(
|
|
| 652 |
"flow_n_min": float(flow_n_min),
|
| 653 |
"flow_n_max": float(flow_n_max),
|
| 654 |
"flow_n_avg": int(flow_n_avg),
|
| 655 |
-
"seed":
|
| 656 |
"loras": loras,
|
| 657 |
-
"advanced":
|
| 658 |
-
"lm":
|
| 659 |
"dcw": {},
|
| 660 |
},
|
| 661 |
)
|
|
@@ -799,6 +1034,27 @@ def build_app() -> gr.Blocks:
|
|
| 799 |
g["duration_s"],
|
| 800 |
g["instrumental"],
|
| 801 |
g["lora_state"],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 802 |
],
|
| 803 |
outputs=[g["output_audio"], g["output_meta"], history_html],
|
| 804 |
)
|
|
@@ -844,6 +1100,27 @@ def build_app() -> gr.Blocks:
|
|
| 844 |
c["duration_s"],
|
| 845 |
c["audio_cover_strength"],
|
| 846 |
c["lora_state"],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 847 |
],
|
| 848 |
outputs=[c["output_audio"], c["output_meta"], history_html],
|
| 849 |
)
|
|
@@ -893,6 +1170,27 @@ def build_app() -> gr.Blocks:
|
|
| 893 |
x["latent_crossfade_frames"],
|
| 894 |
x["chunk_mask_mode"],
|
| 895 |
x["lora_state"],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 896 |
],
|
| 897 |
outputs=[x["output_audio"], x["output_meta"], history_html],
|
| 898 |
)
|
|
@@ -945,6 +1243,27 @@ def build_app() -> gr.Blocks:
|
|
| 945 |
e["flow_n_max"],
|
| 946 |
e["flow_n_avg"],
|
| 947 |
e["lora_state"],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 948 |
],
|
| 949 |
outputs=[e["output_audio"], e["output_meta"], history_html],
|
| 950 |
)
|
|
|
|
| 436 |
return new_state, _active_md(new_state["name"], float(strength), kind)
|
| 437 |
|
| 438 |
|
| 439 |
+
def _build_advanced_params(
|
| 440 |
+
adv_inference_steps,
|
| 441 |
+
adv_guidance_scale,
|
| 442 |
+
adv_infer_method,
|
| 443 |
+
adv_seed,
|
| 444 |
+
adv_cfg_interval_start,
|
| 445 |
+
adv_cfg_interval_end,
|
| 446 |
+
adv_shift,
|
| 447 |
+
adv_use_adg,
|
| 448 |
+
adv_thinking,
|
| 449 |
+
adv_use_cot_caption,
|
| 450 |
+
adv_use_cot_metas,
|
| 451 |
+
adv_use_cot_language,
|
| 452 |
+
adv_lm_temperature,
|
| 453 |
+
adv_lm_top_p,
|
| 454 |
+
adv_lm_top_k,
|
| 455 |
+
adv_lm_cfg_scale,
|
| 456 |
+
adv_lm_negative_prompt,
|
| 457 |
+
adv_bpm,
|
| 458 |
+
adv_keyscale,
|
| 459 |
+
adv_timesignature,
|
| 460 |
+
adv_vocal_language,
|
| 461 |
+
):
|
| 462 |
+
"""Pack the 21 Advanced-accordion inputs into the ``advanced`` + ``lm``
|
| 463 |
+
dicts that ``ace_pipeline.ACEStepStudio.generate`` consumes.
|
| 464 |
+
|
| 465 |
+
Centralising this avoids repeating the same dict-construction in each
|
| 466 |
+
of the four song-mode click handlers. Returns ``(seed, advanced, lm)``.
|
| 467 |
+
``seed`` is the resolved seed (-1 / 0 / None → random 32-bit positive).
|
| 468 |
+
"""
|
| 469 |
+
seed_raw = int(adv_seed) if adv_seed is not None else -1
|
| 470 |
+
seed = seed_raw if seed_raw > 0 else random.randint(1, 2_147_483_647)
|
| 471 |
+
advanced = {
|
| 472 |
+
"inference_steps": int(adv_inference_steps),
|
| 473 |
+
"guidance_scale": float(adv_guidance_scale),
|
| 474 |
+
"infer_method": adv_infer_method,
|
| 475 |
+
"cfg_interval_start": float(adv_cfg_interval_start),
|
| 476 |
+
"cfg_interval_end": float(adv_cfg_interval_end),
|
| 477 |
+
"shift": float(adv_shift),
|
| 478 |
+
"use_adg": bool(adv_use_adg),
|
| 479 |
+
"bpm": int(adv_bpm) if adv_bpm else None,
|
| 480 |
+
"keyscale": adv_keyscale or "",
|
| 481 |
+
"timesignature": adv_timesignature or "",
|
| 482 |
+
"vocal_language": adv_vocal_language or "unknown",
|
| 483 |
+
}
|
| 484 |
+
lm = {
|
| 485 |
+
"thinking": bool(adv_thinking),
|
| 486 |
+
"use_cot_caption": bool(adv_use_cot_caption),
|
| 487 |
+
"use_cot_metas": bool(adv_use_cot_metas),
|
| 488 |
+
"use_cot_language": bool(adv_use_cot_language),
|
| 489 |
+
"temperature": float(adv_lm_temperature),
|
| 490 |
+
"top_p": float(adv_lm_top_p),
|
| 491 |
+
"top_k": int(adv_lm_top_k) if adv_lm_top_k else 0,
|
| 492 |
+
"cfg": float(adv_lm_cfg_scale),
|
| 493 |
+
"negative_prompt": adv_lm_negative_prompt or "NO USER INPUT",
|
| 494 |
+
}
|
| 495 |
+
return seed, advanced, lm
|
| 496 |
+
|
| 497 |
+
|
| 498 |
@_maybe_spaces_gpu("generate")
|
| 499 |
def on_generate_click(
|
| 500 |
prompt: str,
|
|
|
|
| 502 |
duration_s: float,
|
| 503 |
instrumental_label: str,
|
| 504 |
lora_state,
|
| 505 |
+
adv_inference_steps,
|
| 506 |
+
adv_guidance_scale,
|
| 507 |
+
adv_infer_method,
|
| 508 |
+
adv_seed,
|
| 509 |
+
adv_cfg_interval_start,
|
| 510 |
+
adv_cfg_interval_end,
|
| 511 |
+
adv_shift,
|
| 512 |
+
adv_use_adg,
|
| 513 |
+
adv_thinking,
|
| 514 |
+
adv_use_cot_caption,
|
| 515 |
+
adv_use_cot_metas,
|
| 516 |
+
adv_use_cot_language,
|
| 517 |
+
adv_lm_temperature,
|
| 518 |
+
adv_lm_top_p,
|
| 519 |
+
adv_lm_top_k,
|
| 520 |
+
adv_lm_cfg_scale,
|
| 521 |
+
adv_lm_negative_prompt,
|
| 522 |
+
adv_bpm,
|
| 523 |
+
adv_keyscale,
|
| 524 |
+
adv_timesignature,
|
| 525 |
+
adv_vocal_language,
|
| 526 |
progress=gr.Progress(track_tqdm=True), # noqa: B008
|
| 527 |
):
|
| 528 |
loras = [lora_state] if lora_state else []
|
| 529 |
+
seed, advanced, lm = _build_advanced_params(
|
| 530 |
+
adv_inference_steps,
|
| 531 |
+
adv_guidance_scale,
|
| 532 |
+
adv_infer_method,
|
| 533 |
+
adv_seed,
|
| 534 |
+
adv_cfg_interval_start,
|
| 535 |
+
adv_cfg_interval_end,
|
| 536 |
+
adv_shift,
|
| 537 |
+
adv_use_adg,
|
| 538 |
+
adv_thinking,
|
| 539 |
+
adv_use_cot_caption,
|
| 540 |
+
adv_use_cot_metas,
|
| 541 |
+
adv_use_cot_language,
|
| 542 |
+
adv_lm_temperature,
|
| 543 |
+
adv_lm_top_p,
|
| 544 |
+
adv_lm_top_k,
|
| 545 |
+
adv_lm_cfg_scale,
|
| 546 |
+
adv_lm_negative_prompt,
|
| 547 |
+
adv_bpm,
|
| 548 |
+
adv_keyscale,
|
| 549 |
+
adv_timesignature,
|
| 550 |
+
adv_vocal_language,
|
| 551 |
+
)
|
| 552 |
out_path, meta = _safe_call(
|
| 553 |
modes.generate,
|
| 554 |
get_backend(),
|
|
|
|
| 557 |
"lyrics": lyrics,
|
| 558 |
"duration_s": int(duration_s),
|
| 559 |
"instrumental": instrumental_label == "Instrumental",
|
| 560 |
+
"seed": seed,
|
| 561 |
"loras": loras,
|
| 562 |
+
"advanced": advanced,
|
| 563 |
+
"lm": lm,
|
| 564 |
"dcw": {},
|
| 565 |
},
|
| 566 |
)
|
|
|
|
| 576 |
duration_s: float,
|
| 577 |
audio_cover_strength: float,
|
| 578 |
lora_state,
|
| 579 |
+
adv_inference_steps,
|
| 580 |
+
adv_guidance_scale,
|
| 581 |
+
adv_infer_method,
|
| 582 |
+
adv_seed,
|
| 583 |
+
adv_cfg_interval_start,
|
| 584 |
+
adv_cfg_interval_end,
|
| 585 |
+
adv_shift,
|
| 586 |
+
adv_use_adg,
|
| 587 |
+
adv_thinking,
|
| 588 |
+
adv_use_cot_caption,
|
| 589 |
+
adv_use_cot_metas,
|
| 590 |
+
adv_use_cot_language,
|
| 591 |
+
adv_lm_temperature,
|
| 592 |
+
adv_lm_top_p,
|
| 593 |
+
adv_lm_top_k,
|
| 594 |
+
adv_lm_cfg_scale,
|
| 595 |
+
adv_lm_negative_prompt,
|
| 596 |
+
adv_bpm,
|
| 597 |
+
adv_keyscale,
|
| 598 |
+
adv_timesignature,
|
| 599 |
+
adv_vocal_language,
|
| 600 |
progress=gr.Progress(track_tqdm=True), # noqa: B008
|
| 601 |
):
|
| 602 |
"""Cover-mode click. ref_audio is a filepath from gr.Audio(type='filepath')."""
|
| 603 |
loras = [lora_state] if lora_state else []
|
| 604 |
+
seed, advanced, lm = _build_advanced_params(
|
| 605 |
+
adv_inference_steps,
|
| 606 |
+
adv_guidance_scale,
|
| 607 |
+
adv_infer_method,
|
| 608 |
+
adv_seed,
|
| 609 |
+
adv_cfg_interval_start,
|
| 610 |
+
adv_cfg_interval_end,
|
| 611 |
+
adv_shift,
|
| 612 |
+
adv_use_adg,
|
| 613 |
+
adv_thinking,
|
| 614 |
+
adv_use_cot_caption,
|
| 615 |
+
adv_use_cot_metas,
|
| 616 |
+
adv_use_cot_language,
|
| 617 |
+
adv_lm_temperature,
|
| 618 |
+
adv_lm_top_p,
|
| 619 |
+
adv_lm_top_k,
|
| 620 |
+
adv_lm_cfg_scale,
|
| 621 |
+
adv_lm_negative_prompt,
|
| 622 |
+
adv_bpm,
|
| 623 |
+
adv_keyscale,
|
| 624 |
+
adv_timesignature,
|
| 625 |
+
adv_vocal_language,
|
| 626 |
+
)
|
| 627 |
out_path, meta = _safe_call(
|
| 628 |
modes.cover,
|
| 629 |
get_backend(),
|
|
|
|
| 633 |
"lyrics": lyrics,
|
| 634 |
"duration_s": int(duration_s),
|
| 635 |
"audio_cover_strength": float(audio_cover_strength),
|
| 636 |
+
"seed": seed,
|
| 637 |
"loras": loras,
|
| 638 |
+
"advanced": advanced,
|
| 639 |
+
"lm": lm,
|
| 640 |
"dcw": {},
|
| 641 |
},
|
| 642 |
)
|
|
|
|
| 656 |
latent_crossfade_frames: float,
|
| 657 |
chunk_mask_mode: str,
|
| 658 |
lora_state,
|
| 659 |
+
adv_inference_steps,
|
| 660 |
+
adv_guidance_scale,
|
| 661 |
+
adv_infer_method,
|
| 662 |
+
adv_seed,
|
| 663 |
+
adv_cfg_interval_start,
|
| 664 |
+
adv_cfg_interval_end,
|
| 665 |
+
adv_shift,
|
| 666 |
+
adv_use_adg,
|
| 667 |
+
adv_thinking,
|
| 668 |
+
adv_use_cot_caption,
|
| 669 |
+
adv_use_cot_metas,
|
| 670 |
+
adv_use_cot_language,
|
| 671 |
+
adv_lm_temperature,
|
| 672 |
+
adv_lm_top_p,
|
| 673 |
+
adv_lm_top_k,
|
| 674 |
+
adv_lm_cfg_scale,
|
| 675 |
+
adv_lm_negative_prompt,
|
| 676 |
+
adv_bpm,
|
| 677 |
+
adv_keyscale,
|
| 678 |
+
adv_timesignature,
|
| 679 |
+
adv_vocal_language,
|
| 680 |
progress=gr.Progress(track_tqdm=True), # noqa: B008
|
| 681 |
):
|
| 682 |
"""Extend-mode click. seed_audio is a filepath from gr.Audio(type='filepath')."""
|
| 683 |
loras = [lora_state] if lora_state else []
|
| 684 |
+
seed, advanced, lm = _build_advanced_params(
|
| 685 |
+
adv_inference_steps,
|
| 686 |
+
adv_guidance_scale,
|
| 687 |
+
adv_infer_method,
|
| 688 |
+
adv_seed,
|
| 689 |
+
adv_cfg_interval_start,
|
| 690 |
+
adv_cfg_interval_end,
|
| 691 |
+
adv_shift,
|
| 692 |
+
adv_use_adg,
|
| 693 |
+
adv_thinking,
|
| 694 |
+
adv_use_cot_caption,
|
| 695 |
+
adv_use_cot_metas,
|
| 696 |
+
adv_use_cot_language,
|
| 697 |
+
adv_lm_temperature,
|
| 698 |
+
adv_lm_top_p,
|
| 699 |
+
adv_lm_top_k,
|
| 700 |
+
adv_lm_cfg_scale,
|
| 701 |
+
adv_lm_negative_prompt,
|
| 702 |
+
adv_bpm,
|
| 703 |
+
adv_keyscale,
|
| 704 |
+
adv_timesignature,
|
| 705 |
+
adv_vocal_language,
|
| 706 |
+
)
|
| 707 |
out_path, meta = _safe_call(
|
| 708 |
modes.extend,
|
| 709 |
get_backend(),
|
|
|
|
| 717 |
"repaint_strength": float(repaint_strength),
|
| 718 |
"latent_crossfade_frames": int(latent_crossfade_frames),
|
| 719 |
"chunk_mask_mode": chunk_mask_mode,
|
| 720 |
+
"seed": seed,
|
| 721 |
"loras": loras,
|
| 722 |
+
"advanced": advanced,
|
| 723 |
+
"lm": lm,
|
| 724 |
"dcw": {},
|
| 725 |
},
|
| 726 |
)
|
|
|
|
| 823 |
flow_n_max: float,
|
| 824 |
flow_n_avg: float,
|
| 825 |
lora_state,
|
| 826 |
+
adv_inference_steps,
|
| 827 |
+
adv_guidance_scale,
|
| 828 |
+
adv_infer_method,
|
| 829 |
+
adv_seed,
|
| 830 |
+
adv_cfg_interval_start,
|
| 831 |
+
adv_cfg_interval_end,
|
| 832 |
+
adv_shift,
|
| 833 |
+
adv_use_adg,
|
| 834 |
+
adv_thinking,
|
| 835 |
+
adv_use_cot_caption,
|
| 836 |
+
adv_use_cot_metas,
|
| 837 |
+
adv_use_cot_language,
|
| 838 |
+
adv_lm_temperature,
|
| 839 |
+
adv_lm_top_p,
|
| 840 |
+
adv_lm_top_k,
|
| 841 |
+
adv_lm_cfg_scale,
|
| 842 |
+
adv_lm_negative_prompt,
|
| 843 |
+
adv_bpm,
|
| 844 |
+
adv_keyscale,
|
| 845 |
+
adv_timesignature,
|
| 846 |
+
adv_vocal_language,
|
| 847 |
progress=gr.Progress(track_tqdm=True), # noqa: B008
|
| 848 |
):
|
| 849 |
"""Edit-mode click. source_audio is a filepath from gr.Audio(type='filepath')."""
|
| 850 |
loras = [lora_state] if lora_state else []
|
| 851 |
+
seed, advanced, lm = _build_advanced_params(
|
| 852 |
+
adv_inference_steps,
|
| 853 |
+
adv_guidance_scale,
|
| 854 |
+
adv_infer_method,
|
| 855 |
+
adv_seed,
|
| 856 |
+
adv_cfg_interval_start,
|
| 857 |
+
adv_cfg_interval_end,
|
| 858 |
+
adv_shift,
|
| 859 |
+
adv_use_adg,
|
| 860 |
+
adv_thinking,
|
| 861 |
+
adv_use_cot_caption,
|
| 862 |
+
adv_use_cot_metas,
|
| 863 |
+
adv_use_cot_language,
|
| 864 |
+
adv_lm_temperature,
|
| 865 |
+
adv_lm_top_p,
|
| 866 |
+
adv_lm_top_k,
|
| 867 |
+
adv_lm_cfg_scale,
|
| 868 |
+
adv_lm_negative_prompt,
|
| 869 |
+
adv_bpm,
|
| 870 |
+
adv_keyscale,
|
| 871 |
+
adv_timesignature,
|
| 872 |
+
adv_vocal_language,
|
| 873 |
+
)
|
| 874 |
out_path, meta = _safe_call(
|
| 875 |
modes.edit,
|
| 876 |
get_backend(),
|
|
|
|
| 887 |
"flow_n_min": float(flow_n_min),
|
| 888 |
"flow_n_max": float(flow_n_max),
|
| 889 |
"flow_n_avg": int(flow_n_avg),
|
| 890 |
+
"seed": seed,
|
| 891 |
"loras": loras,
|
| 892 |
+
"advanced": advanced,
|
| 893 |
+
"lm": lm,
|
| 894 |
"dcw": {},
|
| 895 |
},
|
| 896 |
)
|
|
|
|
| 1034 |
g["duration_s"],
|
| 1035 |
g["instrumental"],
|
| 1036 |
g["lora_state"],
|
| 1037 |
+
g["adv_inference_steps"],
|
| 1038 |
+
g["adv_guidance_scale"],
|
| 1039 |
+
g["adv_infer_method"],
|
| 1040 |
+
g["adv_seed"],
|
| 1041 |
+
g["adv_cfg_interval_start"],
|
| 1042 |
+
g["adv_cfg_interval_end"],
|
| 1043 |
+
g["adv_shift"],
|
| 1044 |
+
g["adv_use_adg"],
|
| 1045 |
+
g["adv_thinking"],
|
| 1046 |
+
g["adv_use_cot_caption"],
|
| 1047 |
+
g["adv_use_cot_metas"],
|
| 1048 |
+
g["adv_use_cot_language"],
|
| 1049 |
+
g["adv_lm_temperature"],
|
| 1050 |
+
g["adv_lm_top_p"],
|
| 1051 |
+
g["adv_lm_top_k"],
|
| 1052 |
+
g["adv_lm_cfg_scale"],
|
| 1053 |
+
g["adv_lm_negative_prompt"],
|
| 1054 |
+
g["adv_bpm"],
|
| 1055 |
+
g["adv_keyscale"],
|
| 1056 |
+
g["adv_timesignature"],
|
| 1057 |
+
g["adv_vocal_language"],
|
| 1058 |
],
|
| 1059 |
outputs=[g["output_audio"], g["output_meta"], history_html],
|
| 1060 |
)
|
|
|
|
| 1100 |
c["duration_s"],
|
| 1101 |
c["audio_cover_strength"],
|
| 1102 |
c["lora_state"],
|
| 1103 |
+
c["adv_inference_steps"],
|
| 1104 |
+
c["adv_guidance_scale"],
|
| 1105 |
+
c["adv_infer_method"],
|
| 1106 |
+
c["adv_seed"],
|
| 1107 |
+
c["adv_cfg_interval_start"],
|
| 1108 |
+
c["adv_cfg_interval_end"],
|
| 1109 |
+
c["adv_shift"],
|
| 1110 |
+
c["adv_use_adg"],
|
| 1111 |
+
c["adv_thinking"],
|
| 1112 |
+
c["adv_use_cot_caption"],
|
| 1113 |
+
c["adv_use_cot_metas"],
|
| 1114 |
+
c["adv_use_cot_language"],
|
| 1115 |
+
c["adv_lm_temperature"],
|
| 1116 |
+
c["adv_lm_top_p"],
|
| 1117 |
+
c["adv_lm_top_k"],
|
| 1118 |
+
c["adv_lm_cfg_scale"],
|
| 1119 |
+
c["adv_lm_negative_prompt"],
|
| 1120 |
+
c["adv_bpm"],
|
| 1121 |
+
c["adv_keyscale"],
|
| 1122 |
+
c["adv_timesignature"],
|
| 1123 |
+
c["adv_vocal_language"],
|
| 1124 |
],
|
| 1125 |
outputs=[c["output_audio"], c["output_meta"], history_html],
|
| 1126 |
)
|
|
|
|
| 1170 |
x["latent_crossfade_frames"],
|
| 1171 |
x["chunk_mask_mode"],
|
| 1172 |
x["lora_state"],
|
| 1173 |
+
x["adv_inference_steps"],
|
| 1174 |
+
x["adv_guidance_scale"],
|
| 1175 |
+
x["adv_infer_method"],
|
| 1176 |
+
x["adv_seed"],
|
| 1177 |
+
x["adv_cfg_interval_start"],
|
| 1178 |
+
x["adv_cfg_interval_end"],
|
| 1179 |
+
x["adv_shift"],
|
| 1180 |
+
x["adv_use_adg"],
|
| 1181 |
+
x["adv_thinking"],
|
| 1182 |
+
x["adv_use_cot_caption"],
|
| 1183 |
+
x["adv_use_cot_metas"],
|
| 1184 |
+
x["adv_use_cot_language"],
|
| 1185 |
+
x["adv_lm_temperature"],
|
| 1186 |
+
x["adv_lm_top_p"],
|
| 1187 |
+
x["adv_lm_top_k"],
|
| 1188 |
+
x["adv_lm_cfg_scale"],
|
| 1189 |
+
x["adv_lm_negative_prompt"],
|
| 1190 |
+
x["adv_bpm"],
|
| 1191 |
+
x["adv_keyscale"],
|
| 1192 |
+
x["adv_timesignature"],
|
| 1193 |
+
x["adv_vocal_language"],
|
| 1194 |
],
|
| 1195 |
outputs=[x["output_audio"], x["output_meta"], history_html],
|
| 1196 |
)
|
|
|
|
| 1243 |
e["flow_n_max"],
|
| 1244 |
e["flow_n_avg"],
|
| 1245 |
e["lora_state"],
|
| 1246 |
+
e["adv_inference_steps"],
|
| 1247 |
+
e["adv_guidance_scale"],
|
| 1248 |
+
e["adv_infer_method"],
|
| 1249 |
+
e["adv_seed"],
|
| 1250 |
+
e["adv_cfg_interval_start"],
|
| 1251 |
+
e["adv_cfg_interval_end"],
|
| 1252 |
+
e["adv_shift"],
|
| 1253 |
+
e["adv_use_adg"],
|
| 1254 |
+
e["adv_thinking"],
|
| 1255 |
+
e["adv_use_cot_caption"],
|
| 1256 |
+
e["adv_use_cot_metas"],
|
| 1257 |
+
e["adv_use_cot_language"],
|
| 1258 |
+
e["adv_lm_temperature"],
|
| 1259 |
+
e["adv_lm_top_p"],
|
| 1260 |
+
e["adv_lm_top_k"],
|
| 1261 |
+
e["adv_lm_cfg_scale"],
|
| 1262 |
+
e["adv_lm_negative_prompt"],
|
| 1263 |
+
e["adv_bpm"],
|
| 1264 |
+
e["adv_keyscale"],
|
| 1265 |
+
e["adv_timesignature"],
|
| 1266 |
+
e["adv_vocal_language"],
|
| 1267 |
],
|
| 1268 |
outputs=[e["output_audio"], e["output_meta"], history_html],
|
| 1269 |
)
|
|
@@ -64,6 +64,10 @@ class ACEStepStudioBackend:
|
|
| 64 |
{"name": lora.get("name"), "scale": lora.get("scale"), "sha256": lora.get("sha256")}
|
| 65 |
for lora in params.get("loras", [])
|
| 66 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
"lm": params.get("lm", {}),
|
| 68 |
"dcw": params.get("dcw", {}),
|
| 69 |
}
|
|
|
|
| 64 |
{"name": lora.get("name"), "scale": lora.get("scale"), "sha256": lora.get("sha256")}
|
| 65 |
for lora in params.get("loras", [])
|
| 66 |
],
|
| 67 |
+
# Echo the advanced + lm dicts back so the user can see which
|
| 68 |
+
# knobs were active for a given output and lock-iterate from
|
| 69 |
+
# there. The "seed" above is the resolved seed (never -1).
|
| 70 |
+
"advanced": params.get("advanced", {}),
|
| 71 |
"lm": params.get("lm", {}),
|
| 72 |
"dcw": params.get("dcw", {}),
|
| 73 |
}
|
|
@@ -106,7 +106,15 @@ def test_studio_generate_builds_params_and_calls_generate_music(monkeypatch, tmp
|
|
| 106 |
"instrumental": False,
|
| 107 |
"seed": 42,
|
| 108 |
"loras": [],
|
| 109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
"lm": {"thinking": False},
|
| 111 |
"dcw": {},
|
| 112 |
}
|
|
@@ -118,6 +126,8 @@ def test_studio_generate_builds_params_and_calls_generate_music(monkeypatch, tmp
|
|
| 118 |
assert captured["gp"]["duration"] == 30
|
| 119 |
assert captured["gp"]["seed"] == 42
|
| 120 |
assert captured["gp"]["inference_steps"] == 32
|
|
|
|
|
|
|
| 121 |
assert captured["gp"]["bpm"] == 135
|
| 122 |
|
| 123 |
|
|
|
|
| 106 |
"instrumental": False,
|
| 107 |
"seed": 42,
|
| 108 |
"loras": [],
|
| 109 |
+
# New advanced contract: ``inference_steps`` + ``guidance_scale``
|
| 110 |
+
# + ``infer_method`` replace the old ``steps`` + ``cfg`` keys.
|
| 111 |
+
# See ace_pipeline.ACEStepStudio.generate for the full schema.
|
| 112 |
+
"advanced": {
|
| 113 |
+
"inference_steps": 32,
|
| 114 |
+
"guidance_scale": 4.0,
|
| 115 |
+
"infer_method": "ode",
|
| 116 |
+
"bpm": 135,
|
| 117 |
+
},
|
| 118 |
"lm": {"thinking": False},
|
| 119 |
"dcw": {},
|
| 120 |
}
|
|
|
|
| 126 |
assert captured["gp"]["duration"] == 30
|
| 127 |
assert captured["gp"]["seed"] == 42
|
| 128 |
assert captured["gp"]["inference_steps"] == 32
|
| 129 |
+
assert captured["gp"]["guidance_scale"] == 4.0
|
| 130 |
+
assert captured["gp"]["infer_method"] == "ode"
|
| 131 |
assert captured["gp"]["bpm"] == 135
|
| 132 |
|
| 133 |
|
|
@@ -989,6 +989,85 @@ main, .contain {{
|
|
| 989 |
padding:0 12px 12px 12px !important;
|
| 990 |
}}
|
| 991 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 992 |
/* ============================================================
|
| 993 |
* Post-process action row (M5/G2) — sits below the Output Audio.
|
| 994 |
* Three compact mono pills (separate stems / normalise / mp3 export)
|
|
|
|
| 989 |
padding:0 12px 12px 12px !important;
|
| 990 |
}}
|
| 991 |
|
| 992 |
+
/* ============================================================
|
| 993 |
+
* Advanced controls accordion (M0-X)
|
| 994 |
+
* Bordered chrome matching the LoRA + LM + experimental accordions so
|
| 995 |
+
* the four song-mode panes read consistently. Inside the accordion we
|
| 996 |
+
* additionally render small <h>/<p strong> section headers (Diffusion,
|
| 997 |
+
* CFG schedule, 5Hz LM, Music metadata) to chunk the 21 knobs into
|
| 998 |
+
* logical groups; those need their own mono-uppercase-faint treatment
|
| 999 |
+
* so they don't compete with the form labels for visual weight.
|
| 1000 |
+
* ============================================================ */
|
| 1001 |
+
.ams-content .ams-advanced {{
|
| 1002 |
+
border:1px solid {BORDER} !important;
|
| 1003 |
+
border-radius:3px !important;
|
| 1004 |
+
background:{SURFACE_STRONG} !important;
|
| 1005 |
+
margin-top:10px !important;
|
| 1006 |
+
padding:0 !important;
|
| 1007 |
+
}}
|
| 1008 |
+
.ams-content .ams-advanced > .label-wrap,
|
| 1009 |
+
.ams-content .ams-advanced summary,
|
| 1010 |
+
.ams-content .ams-advanced > button {{
|
| 1011 |
+
font-family: {FONT_MONO} !important;
|
| 1012 |
+
font-size:10px !important;
|
| 1013 |
+
letter-spacing:0.08em !important;
|
| 1014 |
+
text-transform:uppercase !important;
|
| 1015 |
+
color:{INK_MUTED} !important;
|
| 1016 |
+
padding:10px 12px !important;
|
| 1017 |
+
background:transparent !important;
|
| 1018 |
+
border:none !important;
|
| 1019 |
+
}}
|
| 1020 |
+
.ams-content .ams-advanced > .label-wrap span,
|
| 1021 |
+
.ams-content .ams-advanced summary span,
|
| 1022 |
+
.ams-content .ams-advanced > button span {{
|
| 1023 |
+
color:{INK_MUTED} !important;
|
| 1024 |
+
font-family: {FONT_MONO} !important;
|
| 1025 |
+
font-size:10px !important;
|
| 1026 |
+
letter-spacing:0.08em !important;
|
| 1027 |
+
text-transform:uppercase !important;
|
| 1028 |
+
}}
|
| 1029 |
+
.ams-content .ams-advanced > div:not(.label-wrap):not(summary) {{
|
| 1030 |
+
padding:0 12px 12px 12px !important;
|
| 1031 |
+
}}
|
| 1032 |
+
/* Section divider Markdown headers inside the accordion. We render them
|
| 1033 |
+
as **Diffusion** (etc) via gr.Markdown — Gradio wraps that in
|
| 1034 |
+
``.prose strong``. Treat the strong tag as a small mono uppercase
|
| 1035 |
+
header with a subtle underline so the four groups have clear visual
|
| 1036 |
+
boundaries without competing with the actual form labels. */
|
| 1037 |
+
.ams-content .ams-advanced .ams-adv-section .prose p {{
|
| 1038 |
+
margin:14px 0 4px 0 !important;
|
| 1039 |
+
padding:0 0 4px 0 !important;
|
| 1040 |
+
border-bottom:1px solid {BORDER} !important;
|
| 1041 |
+
}}
|
| 1042 |
+
.ams-content .ams-advanced .ams-adv-section .prose p:first-child {{
|
| 1043 |
+
margin-top:6px !important;
|
| 1044 |
+
}}
|
| 1045 |
+
.ams-content .ams-advanced .ams-adv-section .prose strong {{
|
| 1046 |
+
font-family: {FONT_MONO} !important;
|
| 1047 |
+
font-size:10px !important;
|
| 1048 |
+
letter-spacing:0.12em !important;
|
| 1049 |
+
text-transform:uppercase !important;
|
| 1050 |
+
color:{INK} !important;
|
| 1051 |
+
font-weight:600 !important;
|
| 1052 |
+
}}
|
| 1053 |
+
.ams-content .ams-advanced .ams-adv-section .prose {{
|
| 1054 |
+
background:transparent !important;
|
| 1055 |
+
}}
|
| 1056 |
+
@media (max-width: 640px) {{
|
| 1057 |
+
.ams-content .ams-advanced > .label-wrap,
|
| 1058 |
+
.ams-content .ams-advanced summary,
|
| 1059 |
+
.ams-content .ams-advanced > button {{
|
| 1060 |
+
font-size:9px !important;
|
| 1061 |
+
padding:8px 10px !important;
|
| 1062 |
+
}}
|
| 1063 |
+
.ams-content .ams-advanced > div:not(.label-wrap):not(summary) {{
|
| 1064 |
+
padding:0 10px 10px 10px !important;
|
| 1065 |
+
}}
|
| 1066 |
+
.ams-content .ams-advanced .ams-adv-section .prose strong {{
|
| 1067 |
+
font-size:9px !important;
|
| 1068 |
+
}}
|
| 1069 |
+
}}
|
| 1070 |
+
|
| 1071 |
/* ============================================================
|
| 1072 |
* Post-process action row (M5/G2) — sits below the Output Audio.
|
| 1073 |
* Three compact mono pills (separate stems / normalise / mp3 export)
|
|
@@ -16,6 +16,175 @@ import lora_stack
|
|
| 16 |
import tooltips
|
| 17 |
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
def _build_lora_accordion(components: dict[str, gr.components.Component]) -> None:
|
| 20 |
"""LoRA accordion with single-LoRA semantics. Mutates ``components``.
|
| 21 |
|
|
@@ -179,6 +348,7 @@ def build_generate_tab() -> dict[str, gr.components.Component]:
|
|
| 179 |
)
|
| 180 |
|
| 181 |
_build_lora_accordion(components)
|
|
|
|
| 182 |
|
| 183 |
components["generate_btn"] = gr.Button(
|
| 184 |
"▶ Generate",
|
|
@@ -240,6 +410,7 @@ def build_cover_tab() -> dict[str, gr.components.Component]:
|
|
| 240 |
)
|
| 241 |
|
| 242 |
_build_lora_accordion(components)
|
|
|
|
| 243 |
|
| 244 |
components["generate_btn"] = gr.Button(
|
| 245 |
"▶ Generate cover",
|
|
@@ -341,6 +512,7 @@ def build_extend_tab() -> dict[str, gr.components.Component]:
|
|
| 341 |
)
|
| 342 |
|
| 343 |
_build_lora_accordion(components)
|
|
|
|
| 344 |
|
| 345 |
components["generate_btn"] = gr.Button(
|
| 346 |
"▶ Extend",
|
|
@@ -455,6 +627,7 @@ def build_edit_tab() -> dict[str, gr.components.Component]:
|
|
| 455 |
components["flow_n_avg"] = gr.Slider(minimum=1, maximum=5, value=1, step=1, label="n_avg")
|
| 456 |
|
| 457 |
_build_lora_accordion(components)
|
|
|
|
| 458 |
|
| 459 |
components["generate_btn"] = gr.Button(
|
| 460 |
"▶ Apply edit",
|
|
|
|
| 16 |
import tooltips
|
| 17 |
|
| 18 |
|
| 19 |
+
def _build_advanced_accordion(components: dict[str, gr.components.Component]) -> None:
    """Build the collapsed "Advanced" accordion shared by all four song modes.

    User complaint: "no matter what prompt I write, style is not deviating
    by a lot". Root cause: ``GenerationParams.inference_steps`` defaults
    to 8 (ACE-Step turbo) — too few for the XL SFT model to actually
    express prompt variation. ``guidance_scale``, ``infer_method``,
    ``shift``, ``use_adg``, and the CoT flags were all left at dataclass
    defaults too. This accordion surfaces the 21 most useful knobs in
    four logical groups so the user can lock-and-iterate.

    Each song-mode pane (Generate / Cover / Extend / Edit) calls this
    right after ``_build_lora_accordion(components)`` so the layout is
    consistent. The Lyrics tab does NOT get this — it's a Qwen path with
    its own LM-params accordion already.

    Must be called inside an active Gradio layout context: the widgets
    are created in the order they should appear on screen.

    Args:
        components: Registry of the current tab's widgets. Mutated in
            place: one ``adv_*`` entry is added per control created here.

    Returns:
        None. The created widgets are only reachable through ``components``.
    """
    with gr.Accordion(
        label="Advanced",
        open=False,
        elem_classes=["ams-advanced"],
    ):
        # --- Group A — Diffusion (most impactful) ---
        gr.Markdown("**Diffusion**", elem_classes=["ams-adv-section"])
        components["adv_inference_steps"] = gr.Slider(
            minimum=8,
            maximum=80,
            value=27,
            step=1,
            label="Inference steps",
            info="More steps → richer detail. 8 is turbo, 27-60 is the sweet spot for XL SFT.",
        )
        components["adv_guidance_scale"] = gr.Slider(
            minimum=1.0,
            maximum=15.0,
            value=7.0,
            step=0.5,
            label="Guidance scale (CFG)",
            info="Higher = follow the prompt more strictly. Lower = more creative / weirder.",
        )
        components["adv_infer_method"] = gr.Radio(
            choices=["ode", "sde"],
            value="ode",
            label="Inference method",
            info="ode = deterministic per seed. sde = injects stochastic noise per step → genuinely different outputs each run.",
        )
        components["adv_seed"] = gr.Number(
            value=-1,
            precision=0,
            label="Seed",
            info="-1 = randomize each run. Set a number to lock-and-iterate.",
        )

        # --- Group B — CFG schedule + shift + ADG ---
        gr.Markdown("**CFG schedule + shift**", elem_classes=["ams-adv-section"])
        components["adv_cfg_interval_start"] = gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=0.0,
            step=0.05,
            label="CFG interval start",
            info="Fraction of diffusion at which CFG kicks in.",
        )
        components["adv_cfg_interval_end"] = gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=1.0,
            step=0.05,
            label="CFG interval end",
            info="Fraction of diffusion at which CFG stops.",
        )
        components["adv_shift"] = gr.Slider(
            minimum=0.5,
            maximum=3.0,
            value=1.0,
            step=0.1,
            label="Shift",
            info="Timestep shift. Try 0.7-1.3 for different feel.",
        )
        components["adv_use_adg"] = gr.Checkbox(
            value=False,
            label="Use Adaptive Dual Guidance (ADG)",
            info="Experimental — sometimes improves base model output.",
        )

        # --- Group C — 5Hz Language Model (CoT reasoning) ---
        gr.Markdown("**5Hz LM (CoT)**", elem_classes=["ams-adv-section"])
        components["adv_thinking"] = gr.Checkbox(
            value=True,
            label="Enable thinking (CoT)",
            info="Let the 5Hz LM reason before generating. Recommended ON.",
        )
        components["adv_use_cot_caption"] = gr.Checkbox(
            value=True,
            label="Let LM rewrite caption",
            info="LM expands/rephrases your prompt. Adds variety.",
        )
        components["adv_use_cot_metas"] = gr.Checkbox(
            value=True,
            label="Let LM infer metadata (bpm/key/time)",
            info="LM picks musical metadata. Turn off to force your manual values below.",
        )
        components["adv_use_cot_language"] = gr.Checkbox(
            value=True,
            label="Let LM detect vocal language",
            info="LM picks vocal language from caption + lyrics.",
        )
        components["adv_lm_temperature"] = gr.Slider(
            minimum=0.0,
            maximum=2.0,
            value=0.85,
            step=0.05,
            label="LM temperature",
            info="Higher = more creative metadata/structure.",
        )
        components["adv_lm_top_p"] = gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=0.9,
            step=0.05,
            label="LM top-p",
            info="Nucleus sampling.",
        )
        components["adv_lm_top_k"] = gr.Number(
            value=0,
            precision=0,
            # 0 is the documented "disabled" sentinel; a negative top-k has
            # no meaning, so bound the field at 0 instead of forwarding it.
            minimum=0,
            label="LM top-k",
            info="0 = disabled.",
        )
        components["adv_lm_cfg_scale"] = gr.Slider(
            minimum=1.0,
            maximum=10.0,
            value=2.0,
            step=0.5,
            label="LM CFG scale",
            info="5Hz LM classifier-free guidance.",
        )
        components["adv_lm_negative_prompt"] = gr.Textbox(
            value="NO USER INPUT",
            label="LM negative prompt",
            info="Steer the LM AWAY from these traits.",
        )

        # --- Group D — Music metadata (manual overrides) ---
        gr.Markdown("**Music metadata**", elem_classes=["ams-adv-section"])
        components["adv_bpm"] = gr.Number(
            value=None,
            precision=0,
            # Enforce the range the help text already advertises. An empty
            # field (None) still means "auto".
            minimum=30,
            maximum=300,
            label="BPM",
            info="Empty = auto. 30-300.",
        )
        components["adv_keyscale"] = gr.Textbox(
            value="",
            label="Key / scale",
            info="e.g. 'C Major', 'Am'. Empty = auto.",
        )
        components["adv_timesignature"] = gr.Dropdown(
            choices=["", "2", "3", "4", "6"],
            value="",
            label="Time signature",
            info="2=2/4, 3=3/4, 4=4/4, 6=6/8. Empty = auto.",
        )
        components["adv_vocal_language"] = gr.Dropdown(
            choices=["unknown", "en", "zh", "ja", "ko", "es", "fr", "de", "it", "pt", "ru"],
            value="unknown",
            label="Vocal language",
            info="Hint for the 5Hz LM. unknown = auto.",
        )
|
| 186 |
+
|
| 187 |
+
|
| 188 |
def _build_lora_accordion(components: dict[str, gr.components.Component]) -> None:
|
| 189 |
"""LoRA accordion with single-LoRA semantics. Mutates ``components``.
|
| 190 |
|
|
|
|
| 348 |
)
|
| 349 |
|
| 350 |
_build_lora_accordion(components)
|
| 351 |
+
_build_advanced_accordion(components)
|
| 352 |
|
| 353 |
components["generate_btn"] = gr.Button(
|
| 354 |
"▶ Generate",
|
|
|
|
| 410 |
)
|
| 411 |
|
| 412 |
_build_lora_accordion(components)
|
| 413 |
+
_build_advanced_accordion(components)
|
| 414 |
|
| 415 |
components["generate_btn"] = gr.Button(
|
| 416 |
"▶ Generate cover",
|
|
|
|
| 512 |
)
|
| 513 |
|
| 514 |
_build_lora_accordion(components)
|
| 515 |
+
_build_advanced_accordion(components)
|
| 516 |
|
| 517 |
components["generate_btn"] = gr.Button(
|
| 518 |
"▶ Extend",
|
|
|
|
| 627 |
components["flow_n_avg"] = gr.Slider(minimum=1, maximum=5, value=1, step=1, label="n_avg")
|
| 628 |
|
| 629 |
_build_lora_accordion(components)
|
| 630 |
+
_build_advanced_accordion(components)
|
| 631 |
|
| 632 |
components["generate_btn"] = gr.Button(
|
| 633 |
"▶ Apply edit",
|