techfreakworm committed on
Commit
c287b6a
·
unverified ·
1 Parent(s): 7dd8eb5

feat(ui): add advanced controls accordion — inference steps, cfg, infer method, seed, lm cot, schedule, metadata

Browse files

User feedback: outputs feel samey regardless of prompt. Root cause:
GenerationParams.inference_steps defaults to 8 (ACE-Step turbo), too
few for the XL SFT model to actually express variation. In addition,
guidance_scale, infer_method, shift, ADG, and the CoT flags were all
left at their dataclass defaults.

New ``Advanced ▼`` accordion under every song mode (Generate / Cover
/ Extend / Edit; lyrics mode is its own Qwen path and doesn't need
it). Four groups:

- Diffusion: inference_steps (8-80, default 27), guidance_scale
(1-15, default 7.0), infer_method (ode|sde), seed (number,
-1=random).
- CFG schedule + shift + ADG.
- 5Hz LM (CoT): thinking + use_cot_caption + use_cot_metas +
use_cot_language now defaulted ON; LM temperature / top_p / top_k /
cfg / negative_prompt sliders.
- Music metadata: bpm / keyscale / timesignature / vocal_language
manual overrides.

ace_pipeline.generate passes all of these through to GenerationParams.
Output metadata JSON now echoes the advanced + lm dicts so the user can
see which knobs were active for a given output, lock them in, and iterate
from there (the seed shown is the ACTUAL seed used, not -1).

Files changed (6) hide show
  1. ace_pipeline.py +27 -8
  2. app.py +331 -12
  3. backend.py +4 -0
  4. tests/test_ace_pipeline_lazy.py +11 -1
  5. theme.py +79 -0
  6. ui.py +173 -0
ace_pipeline.py CHANGED
@@ -226,6 +226,21 @@ class ACEStepStudio:
226
  )
227
  duration_s = int(params.get("duration_s") or params.get("extra_duration_s") or 30)
228
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
  gen_params = GenerationParams(
230
  task_type=task_type,
231
  caption=caption,
@@ -233,8 +248,10 @@ class ACEStepStudio:
233
  instrumental=instrumental,
234
  duration=duration_s,
235
  seed=int(params.get("seed", -1)),
236
- inference_steps=int(advanced.get("steps", 32)),
237
- guidance_scale=float(advanced.get("cfg", 4.0)),
 
 
238
  shift=float(advanced.get("shift", 1.0)),
239
  bpm=advanced.get("bpm"),
240
  keyscale=advanced.get("keyscale", ""),
@@ -248,16 +265,18 @@ class ACEStepStudio:
248
  audio_cover_strength=audio_cover_strength,
249
  repainting_start=repainting_start,
250
  repainting_end=repainting_end,
251
- # 5Hz language model knobs
252
- thinking=bool(lm_opts.get("thinking", False)),
 
 
253
  lm_temperature=float(lm_opts.get("temperature", 0.85)),
254
  lm_cfg_scale=float(lm_opts.get("cfg", 2.0)),
255
  lm_top_k=int(lm_opts.get("top_k", 0)),
256
  lm_top_p=float(lm_opts.get("top_p", 0.9)),
257
- lm_negative_prompt=lm_opts.get("negative_prompt", ""),
258
- use_cot_metas=bool(lm_opts.get("cot_metas", False)),
259
- use_cot_caption=bool(lm_opts.get("cot_caption", False)),
260
- use_cot_language=bool(lm_opts.get("cot_language", False)),
261
  )
262
 
263
  gen_config = GenerationConfig(
 
226
  )
227
  duration_s = int(params.get("duration_s") or params.get("extra_duration_s") or 30)
228
 
229
+ # ``advanced``/``lm`` dicts are sent by app.py's
230
+ # ``_build_advanced_params``. Key changes from the prior contract:
231
+ # - ``inference_steps`` (was ``steps``, defaulted to 8 which made the
232
+ # XL SFT model behave too turbo-ish; new default 27).
233
+ # - ``guidance_scale`` (was ``cfg``, default 7.0 for stronger prompt
234
+ # adherence).
235
+ # - ``infer_method`` (new — ``"ode"`` deterministic / ``"sde"``
236
+ # stochastic; the user can now flip to ``sde`` to actually get
237
+ # different output each click even with the same seed).
238
+ # - ``use_adg`` (new — Adaptive Dual Guidance; experimental).
239
+ # - ``thinking`` (5Hz LM CoT — default flips to True so the LM can
240
+ # reason about caption + metadata, which is the actual source of
241
+ # the "no matter what prompt the style barely changes" symptom).
242
+ # - ``use_cot_metas`` / ``use_cot_caption`` / ``use_cot_language``
243
+ # keys renamed from ``cot_*`` for consistency with the dataclass.
244
  gen_params = GenerationParams(
245
  task_type=task_type,
246
  caption=caption,
 
248
  instrumental=instrumental,
249
  duration=duration_s,
250
  seed=int(params.get("seed", -1)),
251
+ inference_steps=int(advanced.get("inference_steps", 27)),
252
+ guidance_scale=float(advanced.get("guidance_scale", 7.0)),
253
+ infer_method=str(advanced.get("infer_method", "ode")),
254
+ use_adg=bool(advanced.get("use_adg", False)),
255
  shift=float(advanced.get("shift", 1.0)),
256
  bpm=advanced.get("bpm"),
257
  keyscale=advanced.get("keyscale", ""),
 
265
  audio_cover_strength=audio_cover_strength,
266
  repainting_start=repainting_start,
267
  repainting_end=repainting_end,
268
+ # 5Hz language model knobs — defaults flipped to True so the
269
+ # LM actually reasons about each prompt instead of returning
270
+ # blank captions / metadata back to the DiT.
271
+ thinking=bool(lm_opts.get("thinking", True)),
272
  lm_temperature=float(lm_opts.get("temperature", 0.85)),
273
  lm_cfg_scale=float(lm_opts.get("cfg", 2.0)),
274
  lm_top_k=int(lm_opts.get("top_k", 0)),
275
  lm_top_p=float(lm_opts.get("top_p", 0.9)),
276
+ lm_negative_prompt=lm_opts.get("negative_prompt", "NO USER INPUT"),
277
+ use_cot_metas=bool(lm_opts.get("use_cot_metas", True)),
278
+ use_cot_caption=bool(lm_opts.get("use_cot_caption", True)),
279
+ use_cot_language=bool(lm_opts.get("use_cot_language", True)),
280
  )
281
 
282
  gen_config = GenerationConfig(
app.py CHANGED
@@ -436,6 +436,65 @@ def on_lora_strength_change(state, strength: float):
436
  return new_state, _active_md(new_state["name"], float(strength), kind)
437
 
438
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
439
  @_maybe_spaces_gpu("generate")
440
  def on_generate_click(
441
  prompt: str,
@@ -443,9 +502,53 @@ def on_generate_click(
443
  duration_s: float,
444
  instrumental_label: str,
445
  lora_state,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
446
  progress=gr.Progress(track_tqdm=True), # noqa: B008
447
  ):
448
  loras = [lora_state] if lora_state else []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
449
  out_path, meta = _safe_call(
450
  modes.generate,
451
  get_backend(),
@@ -454,10 +557,10 @@ def on_generate_click(
454
  "lyrics": lyrics,
455
  "duration_s": int(duration_s),
456
  "instrumental": instrumental_label == "Instrumental",
457
- "seed": random.randint(1, 2_147_483_647),
458
  "loras": loras,
459
- "advanced": {},
460
- "lm": {},
461
  "dcw": {},
462
  },
463
  )
@@ -473,10 +576,54 @@ def on_cover_click(
473
  duration_s: float,
474
  audio_cover_strength: float,
475
  lora_state,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
476
  progress=gr.Progress(track_tqdm=True), # noqa: B008
477
  ):
478
  """Cover-mode click. ref_audio is a filepath from gr.Audio(type='filepath')."""
479
  loras = [lora_state] if lora_state else []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
480
  out_path, meta = _safe_call(
481
  modes.cover,
482
  get_backend(),
@@ -486,10 +633,10 @@ def on_cover_click(
486
  "lyrics": lyrics,
487
  "duration_s": int(duration_s),
488
  "audio_cover_strength": float(audio_cover_strength),
489
- "seed": random.randint(1, 2_147_483_647),
490
  "loras": loras,
491
- "advanced": {},
492
- "lm": {},
493
  "dcw": {},
494
  },
495
  )
@@ -509,10 +656,54 @@ def on_extend_click(
509
  latent_crossfade_frames: float,
510
  chunk_mask_mode: str,
511
  lora_state,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
512
  progress=gr.Progress(track_tqdm=True), # noqa: B008
513
  ):
514
  """Extend-mode click. seed_audio is a filepath from gr.Audio(type='filepath')."""
515
  loras = [lora_state] if lora_state else []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
516
  out_path, meta = _safe_call(
517
  modes.extend,
518
  get_backend(),
@@ -526,10 +717,10 @@ def on_extend_click(
526
  "repaint_strength": float(repaint_strength),
527
  "latent_crossfade_frames": int(latent_crossfade_frames),
528
  "chunk_mask_mode": chunk_mask_mode,
529
- "seed": random.randint(1, 2_147_483_647),
530
  "loras": loras,
531
- "advanced": {},
532
- "lm": {},
533
  "dcw": {},
534
  },
535
  )
@@ -632,10 +823,54 @@ def on_edit_click(
632
  flow_n_max: float,
633
  flow_n_avg: float,
634
  lora_state,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
635
  progress=gr.Progress(track_tqdm=True), # noqa: B008
636
  ):
637
  """Edit-mode click. source_audio is a filepath from gr.Audio(type='filepath')."""
638
  loras = [lora_state] if lora_state else []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
639
  out_path, meta = _safe_call(
640
  modes.edit,
641
  get_backend(),
@@ -652,10 +887,10 @@ def on_edit_click(
652
  "flow_n_min": float(flow_n_min),
653
  "flow_n_max": float(flow_n_max),
654
  "flow_n_avg": int(flow_n_avg),
655
- "seed": random.randint(1, 2_147_483_647),
656
  "loras": loras,
657
- "advanced": {},
658
- "lm": {},
659
  "dcw": {},
660
  },
661
  )
@@ -799,6 +1034,27 @@ def build_app() -> gr.Blocks:
799
  g["duration_s"],
800
  g["instrumental"],
801
  g["lora_state"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
802
  ],
803
  outputs=[g["output_audio"], g["output_meta"], history_html],
804
  )
@@ -844,6 +1100,27 @@ def build_app() -> gr.Blocks:
844
  c["duration_s"],
845
  c["audio_cover_strength"],
846
  c["lora_state"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
847
  ],
848
  outputs=[c["output_audio"], c["output_meta"], history_html],
849
  )
@@ -893,6 +1170,27 @@ def build_app() -> gr.Blocks:
893
  x["latent_crossfade_frames"],
894
  x["chunk_mask_mode"],
895
  x["lora_state"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
896
  ],
897
  outputs=[x["output_audio"], x["output_meta"], history_html],
898
  )
@@ -945,6 +1243,27 @@ def build_app() -> gr.Blocks:
945
  e["flow_n_max"],
946
  e["flow_n_avg"],
947
  e["lora_state"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
948
  ],
949
  outputs=[e["output_audio"], e["output_meta"], history_html],
950
  )
 
436
  return new_state, _active_md(new_state["name"], float(strength), kind)
437
 
438
 
439
+ def _build_advanced_params(
440
+ adv_inference_steps,
441
+ adv_guidance_scale,
442
+ adv_infer_method,
443
+ adv_seed,
444
+ adv_cfg_interval_start,
445
+ adv_cfg_interval_end,
446
+ adv_shift,
447
+ adv_use_adg,
448
+ adv_thinking,
449
+ adv_use_cot_caption,
450
+ adv_use_cot_metas,
451
+ adv_use_cot_language,
452
+ adv_lm_temperature,
453
+ adv_lm_top_p,
454
+ adv_lm_top_k,
455
+ adv_lm_cfg_scale,
456
+ adv_lm_negative_prompt,
457
+ adv_bpm,
458
+ adv_keyscale,
459
+ adv_timesignature,
460
+ adv_vocal_language,
461
+ ):
462
+ """Pack the 21 Advanced-accordion inputs into the ``advanced`` + ``lm``
463
+ dicts that ``ace_pipeline.ACEStepStudio.generate`` consumes.
464
+
465
+ Centralising this avoids repeating the same dict-construction in each
466
+ of the four song-mode click handlers. Returns ``(seed, advanced, lm)``.
467
+ ``seed`` is the resolved seed (None or any value <= 0 → random int in 1..2**31-1).
468
+ """
469
+ seed_raw = int(adv_seed) if adv_seed is not None else -1
470
+ seed = seed_raw if seed_raw > 0 else random.randint(1, 2_147_483_647)
471
+ advanced = {
472
+ "inference_steps": int(adv_inference_steps),
473
+ "guidance_scale": float(adv_guidance_scale),
474
+ "infer_method": adv_infer_method,
475
+ "cfg_interval_start": float(adv_cfg_interval_start),
476
+ "cfg_interval_end": float(adv_cfg_interval_end),
477
+ "shift": float(adv_shift),
478
+ "use_adg": bool(adv_use_adg),
479
+ "bpm": int(adv_bpm) if adv_bpm else None,
480
+ "keyscale": adv_keyscale or "",
481
+ "timesignature": adv_timesignature or "",
482
+ "vocal_language": adv_vocal_language or "unknown",
483
+ }
484
+ lm = {
485
+ "thinking": bool(adv_thinking),
486
+ "use_cot_caption": bool(adv_use_cot_caption),
487
+ "use_cot_metas": bool(adv_use_cot_metas),
488
+ "use_cot_language": bool(adv_use_cot_language),
489
+ "temperature": float(adv_lm_temperature),
490
+ "top_p": float(adv_lm_top_p),
491
+ "top_k": int(adv_lm_top_k) if adv_lm_top_k else 0,
492
+ "cfg": float(adv_lm_cfg_scale),
493
+ "negative_prompt": adv_lm_negative_prompt or "NO USER INPUT",
494
+ }
495
+ return seed, advanced, lm
496
+
497
+
498
  @_maybe_spaces_gpu("generate")
499
  def on_generate_click(
500
  prompt: str,
 
502
  duration_s: float,
503
  instrumental_label: str,
504
  lora_state,
505
+ adv_inference_steps,
506
+ adv_guidance_scale,
507
+ adv_infer_method,
508
+ adv_seed,
509
+ adv_cfg_interval_start,
510
+ adv_cfg_interval_end,
511
+ adv_shift,
512
+ adv_use_adg,
513
+ adv_thinking,
514
+ adv_use_cot_caption,
515
+ adv_use_cot_metas,
516
+ adv_use_cot_language,
517
+ adv_lm_temperature,
518
+ adv_lm_top_p,
519
+ adv_lm_top_k,
520
+ adv_lm_cfg_scale,
521
+ adv_lm_negative_prompt,
522
+ adv_bpm,
523
+ adv_keyscale,
524
+ adv_timesignature,
525
+ adv_vocal_language,
526
  progress=gr.Progress(track_tqdm=True), # noqa: B008
527
  ):
528
  loras = [lora_state] if lora_state else []
529
+ seed, advanced, lm = _build_advanced_params(
530
+ adv_inference_steps,
531
+ adv_guidance_scale,
532
+ adv_infer_method,
533
+ adv_seed,
534
+ adv_cfg_interval_start,
535
+ adv_cfg_interval_end,
536
+ adv_shift,
537
+ adv_use_adg,
538
+ adv_thinking,
539
+ adv_use_cot_caption,
540
+ adv_use_cot_metas,
541
+ adv_use_cot_language,
542
+ adv_lm_temperature,
543
+ adv_lm_top_p,
544
+ adv_lm_top_k,
545
+ adv_lm_cfg_scale,
546
+ adv_lm_negative_prompt,
547
+ adv_bpm,
548
+ adv_keyscale,
549
+ adv_timesignature,
550
+ adv_vocal_language,
551
+ )
552
  out_path, meta = _safe_call(
553
  modes.generate,
554
  get_backend(),
 
557
  "lyrics": lyrics,
558
  "duration_s": int(duration_s),
559
  "instrumental": instrumental_label == "Instrumental",
560
+ "seed": seed,
561
  "loras": loras,
562
+ "advanced": advanced,
563
+ "lm": lm,
564
  "dcw": {},
565
  },
566
  )
 
576
  duration_s: float,
577
  audio_cover_strength: float,
578
  lora_state,
579
+ adv_inference_steps,
580
+ adv_guidance_scale,
581
+ adv_infer_method,
582
+ adv_seed,
583
+ adv_cfg_interval_start,
584
+ adv_cfg_interval_end,
585
+ adv_shift,
586
+ adv_use_adg,
587
+ adv_thinking,
588
+ adv_use_cot_caption,
589
+ adv_use_cot_metas,
590
+ adv_use_cot_language,
591
+ adv_lm_temperature,
592
+ adv_lm_top_p,
593
+ adv_lm_top_k,
594
+ adv_lm_cfg_scale,
595
+ adv_lm_negative_prompt,
596
+ adv_bpm,
597
+ adv_keyscale,
598
+ adv_timesignature,
599
+ adv_vocal_language,
600
  progress=gr.Progress(track_tqdm=True), # noqa: B008
601
  ):
602
  """Cover-mode click. ref_audio is a filepath from gr.Audio(type='filepath')."""
603
  loras = [lora_state] if lora_state else []
604
+ seed, advanced, lm = _build_advanced_params(
605
+ adv_inference_steps,
606
+ adv_guidance_scale,
607
+ adv_infer_method,
608
+ adv_seed,
609
+ adv_cfg_interval_start,
610
+ adv_cfg_interval_end,
611
+ adv_shift,
612
+ adv_use_adg,
613
+ adv_thinking,
614
+ adv_use_cot_caption,
615
+ adv_use_cot_metas,
616
+ adv_use_cot_language,
617
+ adv_lm_temperature,
618
+ adv_lm_top_p,
619
+ adv_lm_top_k,
620
+ adv_lm_cfg_scale,
621
+ adv_lm_negative_prompt,
622
+ adv_bpm,
623
+ adv_keyscale,
624
+ adv_timesignature,
625
+ adv_vocal_language,
626
+ )
627
  out_path, meta = _safe_call(
628
  modes.cover,
629
  get_backend(),
 
633
  "lyrics": lyrics,
634
  "duration_s": int(duration_s),
635
  "audio_cover_strength": float(audio_cover_strength),
636
+ "seed": seed,
637
  "loras": loras,
638
+ "advanced": advanced,
639
+ "lm": lm,
640
  "dcw": {},
641
  },
642
  )
 
656
  latent_crossfade_frames: float,
657
  chunk_mask_mode: str,
658
  lora_state,
659
+ adv_inference_steps,
660
+ adv_guidance_scale,
661
+ adv_infer_method,
662
+ adv_seed,
663
+ adv_cfg_interval_start,
664
+ adv_cfg_interval_end,
665
+ adv_shift,
666
+ adv_use_adg,
667
+ adv_thinking,
668
+ adv_use_cot_caption,
669
+ adv_use_cot_metas,
670
+ adv_use_cot_language,
671
+ adv_lm_temperature,
672
+ adv_lm_top_p,
673
+ adv_lm_top_k,
674
+ adv_lm_cfg_scale,
675
+ adv_lm_negative_prompt,
676
+ adv_bpm,
677
+ adv_keyscale,
678
+ adv_timesignature,
679
+ adv_vocal_language,
680
  progress=gr.Progress(track_tqdm=True), # noqa: B008
681
  ):
682
  """Extend-mode click. seed_audio is a filepath from gr.Audio(type='filepath')."""
683
  loras = [lora_state] if lora_state else []
684
+ seed, advanced, lm = _build_advanced_params(
685
+ adv_inference_steps,
686
+ adv_guidance_scale,
687
+ adv_infer_method,
688
+ adv_seed,
689
+ adv_cfg_interval_start,
690
+ adv_cfg_interval_end,
691
+ adv_shift,
692
+ adv_use_adg,
693
+ adv_thinking,
694
+ adv_use_cot_caption,
695
+ adv_use_cot_metas,
696
+ adv_use_cot_language,
697
+ adv_lm_temperature,
698
+ adv_lm_top_p,
699
+ adv_lm_top_k,
700
+ adv_lm_cfg_scale,
701
+ adv_lm_negative_prompt,
702
+ adv_bpm,
703
+ adv_keyscale,
704
+ adv_timesignature,
705
+ adv_vocal_language,
706
+ )
707
  out_path, meta = _safe_call(
708
  modes.extend,
709
  get_backend(),
 
717
  "repaint_strength": float(repaint_strength),
718
  "latent_crossfade_frames": int(latent_crossfade_frames),
719
  "chunk_mask_mode": chunk_mask_mode,
720
+ "seed": seed,
721
  "loras": loras,
722
+ "advanced": advanced,
723
+ "lm": lm,
724
  "dcw": {},
725
  },
726
  )
 
823
  flow_n_max: float,
824
  flow_n_avg: float,
825
  lora_state,
826
+ adv_inference_steps,
827
+ adv_guidance_scale,
828
+ adv_infer_method,
829
+ adv_seed,
830
+ adv_cfg_interval_start,
831
+ adv_cfg_interval_end,
832
+ adv_shift,
833
+ adv_use_adg,
834
+ adv_thinking,
835
+ adv_use_cot_caption,
836
+ adv_use_cot_metas,
837
+ adv_use_cot_language,
838
+ adv_lm_temperature,
839
+ adv_lm_top_p,
840
+ adv_lm_top_k,
841
+ adv_lm_cfg_scale,
842
+ adv_lm_negative_prompt,
843
+ adv_bpm,
844
+ adv_keyscale,
845
+ adv_timesignature,
846
+ adv_vocal_language,
847
  progress=gr.Progress(track_tqdm=True), # noqa: B008
848
  ):
849
  """Edit-mode click. source_audio is a filepath from gr.Audio(type='filepath')."""
850
  loras = [lora_state] if lora_state else []
851
+ seed, advanced, lm = _build_advanced_params(
852
+ adv_inference_steps,
853
+ adv_guidance_scale,
854
+ adv_infer_method,
855
+ adv_seed,
856
+ adv_cfg_interval_start,
857
+ adv_cfg_interval_end,
858
+ adv_shift,
859
+ adv_use_adg,
860
+ adv_thinking,
861
+ adv_use_cot_caption,
862
+ adv_use_cot_metas,
863
+ adv_use_cot_language,
864
+ adv_lm_temperature,
865
+ adv_lm_top_p,
866
+ adv_lm_top_k,
867
+ adv_lm_cfg_scale,
868
+ adv_lm_negative_prompt,
869
+ adv_bpm,
870
+ adv_keyscale,
871
+ adv_timesignature,
872
+ adv_vocal_language,
873
+ )
874
  out_path, meta = _safe_call(
875
  modes.edit,
876
  get_backend(),
 
887
  "flow_n_min": float(flow_n_min),
888
  "flow_n_max": float(flow_n_max),
889
  "flow_n_avg": int(flow_n_avg),
890
+ "seed": seed,
891
  "loras": loras,
892
+ "advanced": advanced,
893
+ "lm": lm,
894
  "dcw": {},
895
  },
896
  )
 
1034
  g["duration_s"],
1035
  g["instrumental"],
1036
  g["lora_state"],
1037
+ g["adv_inference_steps"],
1038
+ g["adv_guidance_scale"],
1039
+ g["adv_infer_method"],
1040
+ g["adv_seed"],
1041
+ g["adv_cfg_interval_start"],
1042
+ g["adv_cfg_interval_end"],
1043
+ g["adv_shift"],
1044
+ g["adv_use_adg"],
1045
+ g["adv_thinking"],
1046
+ g["adv_use_cot_caption"],
1047
+ g["adv_use_cot_metas"],
1048
+ g["adv_use_cot_language"],
1049
+ g["adv_lm_temperature"],
1050
+ g["adv_lm_top_p"],
1051
+ g["adv_lm_top_k"],
1052
+ g["adv_lm_cfg_scale"],
1053
+ g["adv_lm_negative_prompt"],
1054
+ g["adv_bpm"],
1055
+ g["adv_keyscale"],
1056
+ g["adv_timesignature"],
1057
+ g["adv_vocal_language"],
1058
  ],
1059
  outputs=[g["output_audio"], g["output_meta"], history_html],
1060
  )
 
1100
  c["duration_s"],
1101
  c["audio_cover_strength"],
1102
  c["lora_state"],
1103
+ c["adv_inference_steps"],
1104
+ c["adv_guidance_scale"],
1105
+ c["adv_infer_method"],
1106
+ c["adv_seed"],
1107
+ c["adv_cfg_interval_start"],
1108
+ c["adv_cfg_interval_end"],
1109
+ c["adv_shift"],
1110
+ c["adv_use_adg"],
1111
+ c["adv_thinking"],
1112
+ c["adv_use_cot_caption"],
1113
+ c["adv_use_cot_metas"],
1114
+ c["adv_use_cot_language"],
1115
+ c["adv_lm_temperature"],
1116
+ c["adv_lm_top_p"],
1117
+ c["adv_lm_top_k"],
1118
+ c["adv_lm_cfg_scale"],
1119
+ c["adv_lm_negative_prompt"],
1120
+ c["adv_bpm"],
1121
+ c["adv_keyscale"],
1122
+ c["adv_timesignature"],
1123
+ c["adv_vocal_language"],
1124
  ],
1125
  outputs=[c["output_audio"], c["output_meta"], history_html],
1126
  )
 
1170
  x["latent_crossfade_frames"],
1171
  x["chunk_mask_mode"],
1172
  x["lora_state"],
1173
+ x["adv_inference_steps"],
1174
+ x["adv_guidance_scale"],
1175
+ x["adv_infer_method"],
1176
+ x["adv_seed"],
1177
+ x["adv_cfg_interval_start"],
1178
+ x["adv_cfg_interval_end"],
1179
+ x["adv_shift"],
1180
+ x["adv_use_adg"],
1181
+ x["adv_thinking"],
1182
+ x["adv_use_cot_caption"],
1183
+ x["adv_use_cot_metas"],
1184
+ x["adv_use_cot_language"],
1185
+ x["adv_lm_temperature"],
1186
+ x["adv_lm_top_p"],
1187
+ x["adv_lm_top_k"],
1188
+ x["adv_lm_cfg_scale"],
1189
+ x["adv_lm_negative_prompt"],
1190
+ x["adv_bpm"],
1191
+ x["adv_keyscale"],
1192
+ x["adv_timesignature"],
1193
+ x["adv_vocal_language"],
1194
  ],
1195
  outputs=[x["output_audio"], x["output_meta"], history_html],
1196
  )
 
1243
  e["flow_n_max"],
1244
  e["flow_n_avg"],
1245
  e["lora_state"],
1246
+ e["adv_inference_steps"],
1247
+ e["adv_guidance_scale"],
1248
+ e["adv_infer_method"],
1249
+ e["adv_seed"],
1250
+ e["adv_cfg_interval_start"],
1251
+ e["adv_cfg_interval_end"],
1252
+ e["adv_shift"],
1253
+ e["adv_use_adg"],
1254
+ e["adv_thinking"],
1255
+ e["adv_use_cot_caption"],
1256
+ e["adv_use_cot_metas"],
1257
+ e["adv_use_cot_language"],
1258
+ e["adv_lm_temperature"],
1259
+ e["adv_lm_top_p"],
1260
+ e["adv_lm_top_k"],
1261
+ e["adv_lm_cfg_scale"],
1262
+ e["adv_lm_negative_prompt"],
1263
+ e["adv_bpm"],
1264
+ e["adv_keyscale"],
1265
+ e["adv_timesignature"],
1266
+ e["adv_vocal_language"],
1267
  ],
1268
  outputs=[e["output_audio"], e["output_meta"], history_html],
1269
  )
backend.py CHANGED
@@ -64,6 +64,10 @@ class ACEStepStudioBackend:
64
  {"name": lora.get("name"), "scale": lora.get("scale"), "sha256": lora.get("sha256")}
65
  for lora in params.get("loras", [])
66
  ],
 
 
 
 
67
  "lm": params.get("lm", {}),
68
  "dcw": params.get("dcw", {}),
69
  }
 
64
  {"name": lora.get("name"), "scale": lora.get("scale"), "sha256": lora.get("sha256")}
65
  for lora in params.get("loras", [])
66
  ],
67
+ # Echo the advanced + lm dicts back so the user can see which
68
+ # knobs were active for a given output and lock-iterate from
69
+ # there. The "seed" above is the resolved seed (never -1).
70
+ "advanced": params.get("advanced", {}),
71
  "lm": params.get("lm", {}),
72
  "dcw": params.get("dcw", {}),
73
  }
tests/test_ace_pipeline_lazy.py CHANGED
@@ -106,7 +106,15 @@ def test_studio_generate_builds_params_and_calls_generate_music(monkeypatch, tmp
106
  "instrumental": False,
107
  "seed": 42,
108
  "loras": [],
109
- "advanced": {"steps": 32, "cfg": 4.0, "bpm": 135},
 
 
 
 
 
 
 
 
110
  "lm": {"thinking": False},
111
  "dcw": {},
112
  }
@@ -118,6 +126,8 @@ def test_studio_generate_builds_params_and_calls_generate_music(monkeypatch, tmp
118
  assert captured["gp"]["duration"] == 30
119
  assert captured["gp"]["seed"] == 42
120
  assert captured["gp"]["inference_steps"] == 32
 
 
121
  assert captured["gp"]["bpm"] == 135
122
 
123
 
 
106
  "instrumental": False,
107
  "seed": 42,
108
  "loras": [],
109
+ # New advanced contract: ``inference_steps`` + ``guidance_scale``
110
+ # + ``infer_method`` replace the old ``steps`` + ``cfg`` keys.
111
+ # See ace_pipeline.ACEStepStudio.generate for the full schema.
112
+ "advanced": {
113
+ "inference_steps": 32,
114
+ "guidance_scale": 4.0,
115
+ "infer_method": "ode",
116
+ "bpm": 135,
117
+ },
118
  "lm": {"thinking": False},
119
  "dcw": {},
120
  }
 
126
  assert captured["gp"]["duration"] == 30
127
  assert captured["gp"]["seed"] == 42
128
  assert captured["gp"]["inference_steps"] == 32
129
+ assert captured["gp"]["guidance_scale"] == 4.0
130
+ assert captured["gp"]["infer_method"] == "ode"
131
  assert captured["gp"]["bpm"] == 135
132
 
133
 
theme.py CHANGED
@@ -989,6 +989,85 @@ main, .contain {{
989
  padding:0 12px 12px 12px !important;
990
  }}
991
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
992
  /* ============================================================
993
  * Post-process action row (M5/G2) — sits below the Output Audio.
994
  * Three compact mono pills (separate stems / normalise / mp3 export)
 
989
  padding:0 12px 12px 12px !important;
990
  }}
991
 
992
+ /* ============================================================
993
+ * Advanced controls accordion (M0-X)
994
+ * Bordered chrome matching the LoRA + LM + experimental accordions so
995
+ * the four song-mode panes read consistently. Inside the accordion we
996
+ * additionally render small <h>/<p strong> section headers (Diffusion,
997
+ * CFG schedule, 5Hz LM, Music metadata) to chunk the 21 knobs into
998
+ * logical groups; those need their own mono-uppercase-faint treatment
999
+ * so they don't compete with the form labels for visual weight.
1000
+ * ============================================================ */
1001
+ .ams-content .ams-advanced {{
1002
+ border:1px solid {BORDER} !important;
1003
+ border-radius:3px !important;
1004
+ background:{SURFACE_STRONG} !important;
1005
+ margin-top:10px !important;
1006
+ padding:0 !important;
1007
+ }}
1008
+ .ams-content .ams-advanced > .label-wrap,
1009
+ .ams-content .ams-advanced summary,
1010
+ .ams-content .ams-advanced > button {{
1011
+ font-family: {FONT_MONO} !important;
1012
+ font-size:10px !important;
1013
+ letter-spacing:0.08em !important;
1014
+ text-transform:uppercase !important;
1015
+ color:{INK_MUTED} !important;
1016
+ padding:10px 12px !important;
1017
+ background:transparent !important;
1018
+ border:none !important;
1019
+ }}
1020
+ .ams-content .ams-advanced > .label-wrap span,
1021
+ .ams-content .ams-advanced summary span,
1022
+ .ams-content .ams-advanced > button span {{
1023
+ color:{INK_MUTED} !important;
1024
+ font-family: {FONT_MONO} !important;
1025
+ font-size:10px !important;
1026
+ letter-spacing:0.08em !important;
1027
+ text-transform:uppercase !important;
1028
+ }}
1029
+ .ams-content .ams-advanced > div:not(.label-wrap):not(summary) {{
1030
+ padding:0 12px 12px 12px !important;
1031
+ }}
1032
+ /* Section divider Markdown headers inside the accordion. We render them
1033
+ as **Diffusion** (etc) via gr.Markdown — Gradio wraps that in
1034
+ ``.prose strong``. Treat the strong tag as a small mono uppercase
1035
+ header with a subtle underline so the four groups have clear visual
1036
+ boundaries without competing with the actual form labels. */
1037
+ .ams-content .ams-advanced .ams-adv-section .prose p {{
1038
+ margin:14px 0 4px 0 !important;
1039
+ padding:0 0 4px 0 !important;
1040
+ border-bottom:1px solid {BORDER} !important;
1041
+ }}
1042
+ .ams-content .ams-advanced .ams-adv-section .prose p:first-child {{
1043
+ margin-top:6px !important;
1044
+ }}
1045
+ .ams-content .ams-advanced .ams-adv-section .prose strong {{
1046
+ font-family: {FONT_MONO} !important;
1047
+ font-size:10px !important;
1048
+ letter-spacing:0.12em !important;
1049
+ text-transform:uppercase !important;
1050
+ color:{INK} !important;
1051
+ font-weight:600 !important;
1052
+ }}
1053
+ .ams-content .ams-advanced .ams-adv-section .prose {{
1054
+ background:transparent !important;
1055
+ }}
1056
+ @media (max-width: 640px) {{
1057
+ .ams-content .ams-advanced > .label-wrap,
1058
+ .ams-content .ams-advanced summary,
1059
+ .ams-content .ams-advanced > button {{
1060
+ font-size:9px !important;
1061
+ padding:8px 10px !important;
1062
+ }}
1063
+ .ams-content .ams-advanced > div:not(.label-wrap):not(summary) {{
1064
+ padding:0 10px 10px 10px !important;
1065
+ }}
1066
+ .ams-content .ams-advanced .ams-adv-section .prose strong {{
1067
+ font-size:9px !important;
1068
+ }}
1069
+ }}
1070
+
1071
  /* ============================================================
1072
  * Post-process action row (M5/G2) — sits below the Output Audio.
1073
  * Three compact mono pills (separate stems / normalise / mp3 export)
ui.py CHANGED
@@ -16,6 +16,175 @@ import lora_stack
16
  import tooltips
17
 
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  def _build_lora_accordion(components: dict[str, gr.components.Component]) -> None:
20
  """LoRA accordion with single-LoRA semantics. Mutates ``components``.
21
 
@@ -179,6 +348,7 @@ def build_generate_tab() -> dict[str, gr.components.Component]:
179
  )
180
 
181
  _build_lora_accordion(components)
 
182
 
183
  components["generate_btn"] = gr.Button(
184
  "▶ Generate",
@@ -240,6 +410,7 @@ def build_cover_tab() -> dict[str, gr.components.Component]:
240
  )
241
 
242
  _build_lora_accordion(components)
 
243
 
244
  components["generate_btn"] = gr.Button(
245
  "▶ Generate cover",
@@ -341,6 +512,7 @@ def build_extend_tab() -> dict[str, gr.components.Component]:
341
  )
342
 
343
  _build_lora_accordion(components)
 
344
 
345
  components["generate_btn"] = gr.Button(
346
  "▶ Extend",
@@ -455,6 +627,7 @@ def build_edit_tab() -> dict[str, gr.components.Component]:
455
  components["flow_n_avg"] = gr.Slider(minimum=1, maximum=5, value=1, step=1, label="n_avg")
456
 
457
  _build_lora_accordion(components)
 
458
 
459
  components["generate_btn"] = gr.Button(
460
  "▶ Apply edit",
 
16
  import tooltips
17
 
18
 
19
def _build_advanced_accordion(components: dict[str, gr.components.Component]) -> None:
    """Render the collapsed ``Advanced`` accordion and register its controls.

    Background: users reported that outputs barely change with the prompt.
    The diagnosis recorded with this change is that
    ``GenerationParams.inference_steps`` defaults to 8 (ACE-Step turbo),
    which is too few for the XL SFT model, and that ``guidance_scale``,
    ``infer_method``, ``shift``, ``use_adg`` and the CoT flags were never
    exposed. This accordion surfaces 21 knobs so a user can lock a seed
    and iterate on the rest.

    The controls are created in four groups, in this order:

    * Diffusion -- inference steps, guidance scale, infer method, seed.
    * CFG schedule + shift -- CFG interval start/end, shift, ADG toggle.
    * 5Hz LM (CoT) -- four CoT toggles plus the LM sampling parameters
      and negative prompt.
    * Music metadata -- manual bpm / key / time signature / vocal
      language overrides.

    Every control is stored in ``components`` under an ``adv_``-prefixed
    key; nothing is returned. The song-mode tab builders call this right
    after ``_build_lora_accordion(components)`` so the layout is the same
    in each pane. The Lyrics tab is intentionally not wired to it (it is
    described as a separate Qwen path with its own LM-params accordion).

    Args:
        components: Per-tab registry of Gradio components; mutated in place.
    """

    def heading(title: str) -> None:
        # Bold markdown label that visually separates one group from the next.
        gr.Markdown(title, elem_classes=["ams-adv-section"])

    def slider(label: str, info: str, *, lo: float, hi: float, default: float, step: float):
        # Thin factory so each slider reads as a single declarative line.
        return gr.Slider(
            label=label,
            info=info,
            minimum=lo,
            maximum=hi,
            value=default,
            step=step,
        )

    def toggle(label: str, info: str, *, default: bool):
        # Same idea as ``slider`` for the boolean switches.
        return gr.Checkbox(label=label, info=info, value=default)

    # Components must be instantiated in display order inside the accordion
    # context: Gradio lays them out in creation order.
    with gr.Accordion(label="Advanced", open=False, elem_classes=["ams-advanced"]):
        # ---- Diffusion: the knobs with the largest audible effect ----
        heading("**Diffusion**")
        components["adv_inference_steps"] = slider(
            "Inference steps",
            "More steps → richer detail. 8 is turbo, 27-60 is the sweet spot for XL SFT.",
            lo=8,
            hi=80,
            default=27,
            step=1,
        )
        components["adv_guidance_scale"] = slider(
            "Guidance scale (CFG)",
            "Higher = follow the prompt more strictly. Lower = more creative / weirder.",
            lo=1.0,
            hi=15.0,
            default=7.0,
            step=0.5,
        )
        components["adv_infer_method"] = gr.Radio(
            label="Inference method",
            info="ode = deterministic per seed. sde = injects stochastic noise per step → genuinely different outputs each run.",
            choices=["ode", "sde"],
            value="ode",
        )
        components["adv_seed"] = gr.Number(
            label="Seed",
            info="-1 = randomize each run. Set a number to lock-and-iterate.",
            value=-1,
            precision=0,
        )

        # ---- CFG schedule, timestep shift and ADG ----
        heading("**CFG schedule + shift**")
        components["adv_cfg_interval_start"] = slider(
            "CFG interval start",
            "Fraction of diffusion at which CFG kicks in.",
            lo=0.0,
            hi=1.0,
            default=0.0,
            step=0.05,
        )
        components["adv_cfg_interval_end"] = slider(
            "CFG interval end",
            "Fraction of diffusion at which CFG stops.",
            lo=0.0,
            hi=1.0,
            default=1.0,
            step=0.05,
        )
        components["adv_shift"] = slider(
            "Shift",
            "Timestep shift. Try 0.7-1.3 for different feel.",
            lo=0.5,
            hi=3.0,
            default=1.0,
            step=0.1,
        )
        components["adv_use_adg"] = toggle(
            "Use Adaptive Dual Guidance (ADG)",
            "Experimental — sometimes improves base model output.",
            default=False,
        )

        # ---- 5Hz language model: CoT switches (all ON) + sampling ----
        heading("**5Hz LM (CoT)**")
        components["adv_thinking"] = toggle(
            "Enable thinking (CoT)",
            "Let the 5Hz LM reason before generating. Recommended ON.",
            default=True,
        )
        components["adv_use_cot_caption"] = toggle(
            "Let LM rewrite caption",
            "LM expands/rephrases your prompt. Adds variety.",
            default=True,
        )
        components["adv_use_cot_metas"] = toggle(
            "Let LM infer metadata (bpm/key/time)",
            "LM picks musical metadata. Turn off to force your manual values below.",
            default=True,
        )
        components["adv_use_cot_language"] = toggle(
            "Let LM detect vocal language",
            "LM picks vocal language from caption + lyrics.",
            default=True,
        )
        components["adv_lm_temperature"] = slider(
            "LM temperature",
            "Higher = more creative metadata/structure.",
            lo=0.0,
            hi=2.0,
            default=0.85,
            step=0.05,
        )
        components["adv_lm_top_p"] = slider(
            "LM top-p",
            "Nucleus sampling.",
            lo=0.0,
            hi=1.0,
            default=0.9,
            step=0.05,
        )
        components["adv_lm_top_k"] = gr.Number(
            label="LM top-k",
            info="0 = disabled.",
            value=0,
            precision=0,
        )
        components["adv_lm_cfg_scale"] = slider(
            "LM CFG scale",
            "5Hz LM classifier-free guidance.",
            lo=1.0,
            hi=10.0,
            default=2.0,
            step=0.5,
        )
        components["adv_lm_negative_prompt"] = gr.Textbox(
            label="LM negative prompt",
            info="Steer the LM AWAY from these traits.",
            value="NO USER INPUT",
        )

        # ---- Music metadata: blank / "unknown" leaves the choice automatic ----
        heading("**Music metadata**")
        components["adv_bpm"] = gr.Number(
            label="BPM",
            info="Empty = auto. 30-300.",
            value=None,
            precision=0,
        )
        components["adv_keyscale"] = gr.Textbox(
            label="Key / scale",
            info="e.g. 'C Major', 'Am'. Empty = auto.",
            value="",
        )
        components["adv_timesignature"] = gr.Dropdown(
            label="Time signature",
            info="2=2/4, 3=3/4, 4=4/4, 6=6/8. Empty = auto.",
            choices=["", "2", "3", "4", "6"],
            value="",
        )
        components["adv_vocal_language"] = gr.Dropdown(
            label="Vocal language",
            info="Hint for the 5Hz LM. unknown = auto.",
            choices=["unknown", "en", "zh", "ja", "ko", "es", "fr", "de", "it", "pt", "ru"],
            value="unknown",
        )
+
187
+
188
  def _build_lora_accordion(components: dict[str, gr.components.Component]) -> None:
189
  """LoRA accordion with single-LoRA semantics. Mutates ``components``.
190
 
 
348
  )
349
 
350
  _build_lora_accordion(components)
351
+ _build_advanced_accordion(components)
352
 
353
  components["generate_btn"] = gr.Button(
354
  "▶ Generate",
 
410
  )
411
 
412
  _build_lora_accordion(components)
413
+ _build_advanced_accordion(components)
414
 
415
  components["generate_btn"] = gr.Button(
416
  "▶ Generate cover",
 
512
  )
513
 
514
  _build_lora_accordion(components)
515
+ _build_advanced_accordion(components)
516
 
517
  components["generate_btn"] = gr.Button(
518
  "▶ Extend",
 
627
  components["flow_n_avg"] = gr.Slider(minimum=1, maximum=5, value=1, step=1, label="n_avg")
628
 
629
  _build_lora_accordion(components)
630
+ _build_advanced_accordion(components)
631
 
632
  components["generate_btn"] = gr.Button(
633
  "▶ Apply edit",