# Spaces: Running on Zero · File size: 8,756 Bytes
"""Pure mode handlers — one function per generation mode.
Each handler validates inputs, builds the ACE-Step kwargs for its mode, and
hands off to `backend.dispatch(...)`. Backend ownership of @spaces.GPU and
pipeline lifecycle keeps these handlers cheap to test.
The ``lyrics()`` handler is the odd one out: it does NOT touch the ACE-Step
backend at all. It calls ``lyrics_lm.generate_lyrics`` directly, since the
Qwen 2.5 7B LM is its own lazy singleton and doesn't share the DiT / 5Hz
pipeline lifecycle with the audio modes.
"""
from __future__ import annotations
from typing import Any
import lyrics_lm
def _require(params: dict[str, Any], field: str) -> Any:
v = params.get(field)
if v is None or (isinstance(v, str) and not v.strip()):
raise ValueError(f"Missing required field: {field}")
return v
def generate(backend, params: dict[str, Any]) -> tuple[str, dict[str, Any]]:
    """Text → song. Vocals + instruments in one stream.

    Args:
        backend: Object exposing ``dispatch(mode=..., params=...)``.
        params: Raw request fields. ``prompt`` is required; every other
            field falls back to the default shown below when absent.

    Returns:
        Whatever ``backend.dispatch`` returns for ``mode="generate"``.

    Raises:
        ValueError: If ``prompt`` is missing or blank.
    """
    request: dict[str, Any] = {
        "prompt": _require(params, "prompt"),
        "lyrics": params.get("lyrics", ""),
        "duration_s": int(params.get("duration_s", 30)),
        "instrumental": bool(params.get("instrumental", False)),
        "seed": params.get("seed"),
        "loras": params.get("loras", []),
        "advanced": params.get("advanced", {}),
        "lm": params.get("lm", {}),
        "dcw": params.get("dcw", {}),
    }
    return backend.dispatch(mode="generate", params=request)
def cover(backend, params: dict[str, Any]) -> tuple[str, dict[str, Any]]:
    """Audio-reference cover — reference audio + new prompt -> song in that style.

    Maps to ACE-Step's ``GenerationParams(task_type="cover")`` with
    ``reference_audio`` set to the uploaded clip and ``audio_cover_strength``
    controlling how tightly the new song hugs the reference timbre/structure.

    Args:
        backend: Object exposing ``dispatch(mode=..., params=...)``.
        params: Raw request fields. ``ref_audio`` is required; the remaining
            fields fall back to the defaults shown below when absent.

    Returns:
        Whatever ``backend.dispatch`` returns for ``mode="cover"``.

    Raises:
        ValueError: If ``ref_audio`` is missing or blank.
    """
    # Check the required field first so a bad request fails before any
    # numeric coercion is attempted.
    reference_clip = _require(params, "ref_audio")
    request: dict[str, Any] = {
        "prompt": params.get("prompt", ""),
        "ref_audio": reference_clip,
        "lyrics": params.get("lyrics", ""),
        "duration_s": int(params.get("duration_s", 30)),
        "audio_cover_strength": float(params.get("audio_cover_strength", 0.93)),
        "cover_noise_strength": float(params.get("cover_noise_strength", 0.0)),
        "seed": params.get("seed"),
        "loras": params.get("loras", []),
        "advanced": params.get("advanced", {}),
        "lm": params.get("lm", {}),
        "dcw": params.get("dcw", {}),
    }
    return backend.dispatch(mode="cover", params=request)
def extend(backend, params: dict[str, Any]) -> tuple[str, dict[str, Any]]:
    """Song continuation — seed audio + extension prompt -> extended song.

    Maps to ACE-Step's ``GenerationParams(task_type="repaint")`` with
    ``src_audio`` set to the seed and ``repainting_start``/``repainting_end``
    pointing past the end of the seed so the model paints new audio after it.

    Args:
        backend: Object exposing ``dispatch(mode=..., params=...)``.
        params: Raw request fields. ``seed_audio`` is required; the remaining
            fields fall back to the defaults shown below when absent.

    Returns:
        Whatever ``backend.dispatch`` returns for ``mode="extend"``.

    Raises:
        ValueError: If ``seed_audio`` is missing or blank.
    """
    request: dict[str, Any] = {
        "seed_audio": _require(params, "seed_audio"),
        "extra_prompt": params.get("extra_prompt", ""),
        "extension_lyrics": params.get("extension_lyrics", ""),
        "extra_duration_s": int(params.get("extra_duration_s", 60)),
        "repaint_mode": params.get("repaint_mode", "balanced"),
        "repaint_strength": float(params.get("repaint_strength", 0.5)),
        "wav_crossfade_s": float(params.get("wav_crossfade_s", 2.0)),
        "latent_crossfade_frames": int(params.get("latent_crossfade_frames", 10)),
        "chunk_mask_mode": params.get("chunk_mask_mode", "auto"),
        "seed": params.get("seed"),
        "loras": params.get("loras", []),
        "advanced": params.get("advanced", {}),
        "lm": params.get("lm", {}),
        "dcw": params.get("dcw", {}),
    }
    return backend.dispatch(mode="extend", params=request)
def edit(backend, params: dict[str, Any]) -> tuple[str, dict[str, Any]]:
    """Segment-level edit — repaint a region OR morph caption-to-caption.

    Two sub-modes:

    - ``"repaint"`` (default): paint over ``[segment_start_s, segment_end_s]``
      using ACE-Step's ``task_type="repaint"`` with the segment bounds wired
      into ``repainting_start`` / ``repainting_end``.
    - ``"flow_edit"``: caption-to-caption morph. The installed ACE-Step
      ``GenerationParams`` dataclass has no native ``flow_edit_*`` fields, so
      flow-edit is implemented downstream as a ``task_type="repaint"`` pass
      with a lower ``audio_cover_strength`` to allow more style drift. The
      ``flow_source_caption`` / ``flow_n_*`` knobs are carried through the
      internal params dict so the pipeline wrapper can use them if/when the
      upstream dataclass grows native support.

    Args:
        backend: Object exposing ``dispatch(mode=..., params=...)``.
        params: Raw request fields. ``source_audio`` is required; the
            remaining fields fall back to the defaults shown below.

    Returns:
        Whatever ``backend.dispatch`` returns for ``mode="edit"``.

    Raises:
        ValueError: If ``source_audio`` is missing or blank, or if
            ``sub_mode`` is neither ``"repaint"`` nor ``"flow_edit"``.
    """
    source_audio = _require(params, "source_audio")

    # A missing, None, or empty sub_mode falls back to the documented default.
    sub_mode = params.get("sub_mode") or "repaint"
    # Reject unknown sub-modes up front: previously a typo skipped both
    # branches and was dispatched without any mode-specific parameters.
    if sub_mode not in ("repaint", "flow_edit"):
        raise ValueError(f"Unknown edit sub_mode: {sub_mode!r}")

    out_params: dict[str, Any] = {
        "source_audio": source_audio,
        "source_lyrics": params.get("source_lyrics", ""),
        "target_lyrics": params.get("target_lyrics", ""),
        "segment_start_s": float(params.get("segment_start_s", 0.0)),
        "segment_end_s": float(params.get("segment_end_s", 30.0)),
        "sub_mode": sub_mode,
        "seed": params.get("seed"),
        "loras": params.get("loras", []),
        "advanced": params.get("advanced", {}),
        "lm": params.get("lm", {}),
        "dcw": params.get("dcw", {}),
    }

    if sub_mode == "repaint":
        out_params.update(
            {
                "repaint_mode": params.get("repaint_mode", "balanced"),
                "repaint_strength": float(params.get("repaint_strength", 0.5)),
                "chunk_mask_mode": params.get("chunk_mask_mode", "auto"),
                "latent_crossfade_frames": int(params.get("latent_crossfade_frames", 10)),
                "wav_crossfade_s": float(params.get("wav_crossfade_s", 0.0)),
            }
        )
    else:  # "flow_edit" — the only other value that passes validation
        out_params.update(
            {
                "flow_source_caption": params.get("flow_source_caption", ""),
                "flow_n_min": float(params.get("flow_n_min", 0.0)),
                "flow_n_max": float(params.get("flow_n_max", 1.0)),
                "flow_n_avg": int(params.get("flow_n_avg", 1)),
            }
        )

    return backend.dispatch(mode="edit", params=out_params)
def lyrics(backend, params: dict[str, Any]) -> tuple[str, dict[str, Any]]:
    """Lyrics-only mode. Returns ``(drafted_text, metadata_dict)``.

    Does NOT touch the ACE-Step backend — Qwen 2.5 7B Instruct is owned
    by ``lyrics_lm`` as its own lazy singleton. The ``backend`` argument
    is kept in the signature for parity with the other mode handlers but
    is unused here.

    Args:
        backend: Ignored.
        params: Raw request fields. ``brief`` is required; the remaining
            fields fall back to the defaults shown below when absent.

    Returns:
        A ``(text, meta)`` pair: the value returned by
        ``lyrics_lm.generate_lyrics`` and a dict echoing the settings used.

    Raises:
        ValueError: If ``brief`` is missing or blank.
    """
    del backend  # signature parity with generate/cover/extend/edit

    brief = _require(params, "brief")

    # Collected once so the LM call and the returned metadata cannot drift.
    options: dict[str, Any] = {
        "structure": params.get(
            "structure", "intro, verse, chorus, verse, chorus, bridge, chorus, outro"
        ),
        "language": params.get("language", "en"),
        "tone": params.get("tone", ""),
        "verse_lines": int(params.get("verse_lines", 6)),
        "chorus_lines": int(params.get("chorus_lines", 4)),
        "bridge_lines": int(params.get("bridge_lines", 2)),
        "rhyme": params.get("rhyme", "loose"),
        "temperature": float(params.get("temperature", 0.85)),
        "top_p": float(params.get("top_p", 0.9)),
        "top_k": int(params.get("top_k", 40)),
        "max_new_tokens": int(params.get("max_new_tokens", 600)),
        "seed": params.get("seed"),
    }

    text = lyrics_lm.generate_lyrics(brief=brief, **options)

    # Strip before splitting so a brief that starts with blank lines still
    # reports its first real line; str() keeps a non-string brief from
    # raising AttributeError after the generation has already run.
    brief_lines = str(brief).strip().splitlines() if brief else []

    meta: dict[str, Any] = {
        "mode": "lyrics",
        # NOTE(review): private attribute of lyrics_lm — confirm the name
        # `_DEFAULT_MAC_ID` matches what that module actually defines.
        "model": lyrics_lm._DEFAULT_MAC_ID,
        "brief_first_line": brief_lines[0] if brief_lines else "",
        **options,
    }
    return text, meta
|