ACE-Music-Studio / modes.py
techfreakworm's picture
feat(lyrics): add qwen 2.5 7b lazy loader with mlx and transformers backends
9c07a74 unverified
"""Pure mode handlers — one function per generation mode.
Each handler validates inputs, builds the ACE-Step kwargs for its mode, and
hands off to `backend.dispatch(...)`. Backend ownership of @spaces.GPU and
pipeline lifecycle keeps these handlers cheap to test.
The ``lyrics()`` handler is the odd one out: it does NOT touch the ACE-Step
backend at all. It calls ``lyrics_lm.generate_lyrics`` directly, since the
Qwen 2.5 7B LM is its own lazy singleton and doesn't share the DiT / 5Hz
pipeline lifecycle with the audio modes.
"""
from __future__ import annotations
from typing import Any
import lyrics_lm
def _require(params: dict[str, Any], field: str) -> Any:
v = params.get(field)
if v is None or (isinstance(v, str) and not v.strip()):
raise ValueError(f"Missing required field: {field}")
return v
def generate(backend, params: dict[str, Any]) -> tuple[str, dict[str, Any]]:
    """Text -> song: vocals and instruments produced in a single stream.

    ``prompt`` is mandatory (checked via ``_require``). Everything else falls
    back to a default: empty lyrics, a 30 second duration, non-instrumental,
    no seed, no LoRAs, and empty ``advanced`` / ``lm`` / ``dcw`` option dicts.

    Returns whatever ``backend.dispatch`` returns for mode ``"generate"``.
    """
    text_prompt = _require(params, "prompt")

    payload: dict[str, Any] = {
        "prompt": text_prompt,
        "lyrics": params.get("lyrics", ""),
        "duration_s": int(params.get("duration_s", 30)),
        "instrumental": bool(params.get("instrumental", False)),
    }

    # Options forwarded verbatim; the fallback containers are created fresh
    # on every call, so no default object is shared between requests.
    payload["seed"] = params.get("seed")
    for key, fallback in (("loras", []), ("advanced", {}), ("lm", {}), ("dcw", {})):
        payload[key] = params.get(key, fallback)

    return backend.dispatch(mode="generate", params=payload)
def cover(backend, params: dict[str, Any]) -> tuple[str, dict[str, Any]]:
    """Audio-reference cover: reference clip + new prompt -> song in that style.

    ``ref_audio`` is mandatory (checked via ``_require``); prompt and lyrics
    default to empty strings and the duration to 30 seconds.
    ``audio_cover_strength`` (default 0.93) and ``cover_noise_strength``
    (default 0.0) are coerced to ``float`` before dispatch.

    Per the design notes for this mode, the backend is expected to map it to
    ACE-Step's ``GenerationParams(task_type="cover")`` with ``reference_audio``
    set to the uploaded clip, and ``audio_cover_strength`` controlling how
    tightly the new song follows the reference timbre/structure.

    Returns whatever ``backend.dispatch`` returns for mode ``"cover"``.
    """
    reference_clip = _require(params, "ref_audio")

    payload: dict[str, Any] = {
        "prompt": params.get("prompt", ""),
        "ref_audio": reference_clip,
        "lyrics": params.get("lyrics", ""),
        "duration_s": int(params.get("duration_s", 30)),
    }

    # Float-valued strength knobs, each with its own default.
    for knob, fallback in (("audio_cover_strength", 0.93), ("cover_noise_strength", 0.0)):
        payload[knob] = float(params.get(knob, fallback))

    # Options forwarded verbatim.
    payload["seed"] = params.get("seed")
    for key, empty in (("loras", []), ("advanced", {}), ("lm", {}), ("dcw", {})):
        payload[key] = params.get(key, empty)

    return backend.dispatch(mode="cover", params=payload)
def extend(backend, params: dict[str, Any]) -> tuple[str, dict[str, Any]]:
    """Song continuation: seed audio + extension prompt -> extended song.

    ``seed_audio`` is mandatory (checked via ``_require``). The extension
    defaults to 60 extra seconds, ``"balanced"`` repaint mode at strength 0.5,
    a 2.0 second waveform crossfade, 10 latent crossfade frames and
    ``"auto"`` chunk masking.

    Per the design notes for this mode, the backend is expected to map it to
    ACE-Step's ``GenerationParams(task_type="repaint")`` with ``src_audio``
    set to the seed and ``repainting_start`` / ``repainting_end`` placed past
    the end of the seed, so new audio is painted after it.

    Returns whatever ``backend.dispatch`` returns for mode ``"extend"``.
    """
    seed_clip = _require(params, "seed_audio")

    payload: dict[str, Any] = {
        "seed_audio": seed_clip,
        "extra_prompt": params.get("extra_prompt", ""),
        "extension_lyrics": params.get("extension_lyrics", ""),
        "extra_duration_s": int(params.get("extra_duration_s", 60)),
        "repaint_mode": params.get("repaint_mode", "balanced"),
        "repaint_strength": float(params.get("repaint_strength", 0.5)),
        "wav_crossfade_s": float(params.get("wav_crossfade_s", 2.0)),
        "latent_crossfade_frames": int(params.get("latent_crossfade_frames", 10)),
        "chunk_mask_mode": params.get("chunk_mask_mode", "auto"),
    }

    # Options forwarded verbatim, appended after the mode-specific keys.
    payload.update(
        seed=params.get("seed"),
        loras=params.get("loras", []),
        advanced=params.get("advanced", {}),
        lm=params.get("lm", {}),
        dcw=params.get("dcw", {}),
    )

    return backend.dispatch(mode="extend", params=payload)
def edit(backend, params: dict[str, Any]) -> tuple[str, dict[str, Any]]:
    """Segment-level edit: repaint a region OR morph caption-to-caption.

    ``source_audio`` is mandatory (checked via ``_require``). ``sub_mode``
    selects which extra knobs are attached to the dispatched params:

    - ``"repaint"`` (default): adds ``repaint_mode``, ``repaint_strength``,
      ``chunk_mask_mode``, ``latent_crossfade_frames`` and
      ``wav_crossfade_s``. Per the design notes, the region
      ``[segment_start_s, segment_end_s]`` is painted over using ACE-Step's
      ``task_type="repaint"`` with the bounds wired into
      ``repainting_start`` / ``repainting_end``.
    - ``"flow_edit"``: adds ``flow_source_caption`` and the ``flow_n_*``
      knobs. Per the design notes, the installed ACE-Step
      ``GenerationParams`` has no native ``flow_edit_*`` fields, so the morph
      is implemented downstream as a ``task_type="repaint"`` pass with a
      lower ``audio_cover_strength``; the flow knobs ride along so the
      pipeline wrapper can use them if upstream gains native support.

    Any other ``sub_mode`` value is forwarded as-is with only the common
    fields attached.

    Returns whatever ``backend.dispatch`` returns for mode ``"edit"``.
    """
    audio_in = _require(params, "source_audio")
    variant = params.get("sub_mode", "repaint")

    # Fields shared by every sub-mode.
    payload: dict[str, Any] = {
        "source_audio": audio_in,
        "source_lyrics": params.get("source_lyrics", ""),
        "target_lyrics": params.get("target_lyrics", ""),
        "segment_start_s": float(params.get("segment_start_s", 0.0)),
        "segment_end_s": float(params.get("segment_end_s", 30.0)),
        "sub_mode": variant,
        "seed": params.get("seed"),
        "loras": params.get("loras", []),
        "advanced": params.get("advanced", {}),
        "lm": params.get("lm", {}),
        "dcw": params.get("dcw", {}),
    }

    if variant == "repaint":
        payload["repaint_mode"] = params.get("repaint_mode", "balanced")
        payload["repaint_strength"] = float(params.get("repaint_strength", 0.5))
        payload["chunk_mask_mode"] = params.get("chunk_mask_mode", "auto")
        payload["latent_crossfade_frames"] = int(params.get("latent_crossfade_frames", 10))
        payload["wav_crossfade_s"] = float(params.get("wav_crossfade_s", 0.0))
    elif variant == "flow_edit":
        payload["flow_source_caption"] = params.get("flow_source_caption", "")
        payload["flow_n_min"] = float(params.get("flow_n_min", 0.0))
        payload["flow_n_max"] = float(params.get("flow_n_max", 1.0))
        payload["flow_n_avg"] = int(params.get("flow_n_avg", 1))

    return backend.dispatch(mode="edit", params=payload)
def lyrics(backend, params: dict[str, Any]) -> tuple[str, dict[str, Any]]:
    """Lyrics-only mode. Returns ``(drafted_text, metadata_dict)``.

    Unlike the audio handlers this one never calls ``backend``: the text
    comes straight from ``lyrics_lm.generate_lyrics``. ``backend`` stays in
    the signature only so every mode handler shares the same call shape.

    Args:
        backend: Ignored.
        params: Request fields. ``brief`` is required. Optional fields and
            their defaults: ``structure`` (intro/verse/chorus/... layout),
            ``language`` (``"en"``), ``tone`` (``""``), ``verse_lines`` (6),
            ``chorus_lines`` (4), ``bridge_lines`` (2), ``rhyme``
            (``"loose"``), ``temperature`` (0.85), ``top_p`` (0.9),
            ``top_k`` (40), ``max_new_tokens`` (600), ``seed`` (``None``).

    Returns:
        The generated text and a metadata dict echoing every setting used,
        plus ``mode``, ``model`` and ``brief_first_line``.

    Raises:
        ValueError: if ``brief`` is missing, ``None`` or blank (raised by
            ``_require``), or a numeric field holds a non-numeric string.
    """
    del backend  # signature parity with generate/cover/extend/edit
    brief = _require(params, "brief")

    # Built once and reused for both the LM call and the returned metadata so
    # the two can never drift apart. Insertion order matches the metadata
    # layout callers already receive.
    settings: dict[str, Any] = {
        "structure": params.get(
            "structure", "intro, verse, chorus, verse, chorus, bridge, chorus, outro"
        ),
        "language": params.get("language", "en"),
        "tone": params.get("tone", ""),
        "verse_lines": int(params.get("verse_lines", 6)),
        "chorus_lines": int(params.get("chorus_lines", 4)),
        "bridge_lines": int(params.get("bridge_lines", 2)),
        "rhyme": params.get("rhyme", "loose"),
        "temperature": float(params.get("temperature", 0.85)),
        "top_p": float(params.get("top_p", 0.9)),
        "top_k": int(params.get("top_k", 40)),
        "max_new_tokens": int(params.get("max_new_tokens", 600)),
        "seed": params.get("seed"),
    }

    text = lyrics_lm.generate_lyrics(brief=brief, **settings)

    # Strip before splitting: a brief that begins with a newline or blank
    # line passes ``_require`` but would otherwise record "" here.
    first_line = brief.strip().splitlines()[0] if brief else ""

    meta: dict[str, Any] = {
        "mode": "lyrics",
        # NOTE(review): always reports the Mac default model id; presumably
        # the non-Mac backend loads a different id — confirm against lyrics_lm.
        "model": lyrics_lm._DEFAULT_MAC_ID,
        "brief_first_line": first_line,
        **settings,
    }
    return text, meta