Spaces:

techfreakworm
/

ACE-Music-Studio

Running on Zero

File size: 8,756 Bytes

"""Pure mode handlers — one function per generation mode.

Each handler validates inputs, builds the ACE-Step kwargs for its mode, and
hands off to `backend.dispatch(...)`. Backend ownership of @spaces.GPU and
pipeline lifecycle keeps these handlers cheap to test.

The ``lyrics()`` handler is the odd one out: it does NOT touch the ACE-Step
backend at all. It calls ``lyrics_lm.generate_lyrics`` directly, since the
Qwen 2.5 7B LM is its own lazy singleton and doesn't share the DiT / 5Hz
pipeline lifecycle with the audio modes.
"""

from __future__ import annotations

from typing import Any

import lyrics_lm


def _require(params: dict[str, Any], field: str) -> Any:
    """Fetch a mandatory entry from ``params``.

    A field counts as missing when the key is absent, when its value is
    ``None``, or when its value is a string containing only whitespace.
    Non-string values other than ``None`` (including ``0`` and empty
    containers) are accepted and returned unchanged.

    Args:
        params: Raw handler parameters.
        field: Key that must be present and non-blank.

    Returns:
        The stored value, untouched (strings are not stripped).

    Raises:
        ValueError: If the field is missing by the definition above.
    """
    value = params.get(field)
    blank_text = isinstance(value, str) and not value.strip()
    if value is None or blank_text:
        raise ValueError(f"Missing required field: {field}")
    return value


def generate(backend, params: dict[str, Any]) -> tuple[str, dict[str, Any]]:
    """Text → song. Vocals + instruments in one stream.

    Args:
        backend: Object exposing ``dispatch(mode=..., params=...)``.
        params: Raw request values. ``prompt`` is mandatory; ``lyrics``,
            ``duration_s``, ``instrumental``, ``seed``, ``loras``,
            ``advanced``, ``lm`` and ``dcw`` are optional.

    Returns:
        Whatever ``backend.dispatch`` returns for mode ``"generate"``.

    Raises:
        ValueError: If ``prompt`` is missing or blank.
    """
    fetch = params.get

    # The required field is checked first so a missing prompt is reported
    # before any numeric coercion can fail.
    payload: dict[str, Any] = {
        "prompt": _require(params, "prompt"),
        "lyrics": fetch("lyrics", ""),
        "duration_s": int(fetch("duration_s", 30)),
        "instrumental": bool(fetch("instrumental", False)),
        # Pass-through knobs: forwarded as given, with empty defaults.
        "seed": fetch("seed"),
        "loras": fetch("loras", []),
        "advanced": fetch("advanced", {}),
        "lm": fetch("lm", {}),
        "dcw": fetch("dcw", {}),
    }
    return backend.dispatch(mode="generate", params=payload)


def cover(backend, params: dict[str, Any]) -> tuple[str, dict[str, Any]]:
    """Audio-reference cover — reference clip + new prompt -> song in that style.

    Corresponds to ACE-Step's ``GenerationParams(task_type="cover")``:
    ``reference_audio`` is the uploaded clip and ``audio_cover_strength``
    governs how closely the result follows the reference timbre/structure.

    Args:
        backend: Object exposing ``dispatch(mode=..., params=...)``.
        params: Raw request values. ``ref_audio`` is mandatory; the prompt,
            lyrics, duration, strength knobs and shared pass-through
            settings are optional.

    Returns:
        Whatever ``backend.dispatch`` returns for mode ``"cover"``.

    Raises:
        ValueError: If ``ref_audio`` is missing or blank.
    """
    # Validate before building the payload; the payload lists "prompt" first,
    # but the missing-reference error must win over any coercion error.
    reference = _require(params, "ref_audio")
    fetch = params.get

    payload: dict[str, Any] = {
        "prompt": fetch("prompt", ""),
        "ref_audio": reference,
        "lyrics": fetch("lyrics", ""),
        "duration_s": int(fetch("duration_s", 30)),
        "audio_cover_strength": float(fetch("audio_cover_strength", 0.93)),
        "cover_noise_strength": float(fetch("cover_noise_strength", 0.0)),
        # Pass-through knobs: forwarded as given, with empty defaults.
        "seed": fetch("seed"),
        "loras": fetch("loras", []),
        "advanced": fetch("advanced", {}),
        "lm": fetch("lm", {}),
        "dcw": fetch("dcw", {}),
    }
    return backend.dispatch(mode="cover", params=payload)


def extend(backend, params: dict[str, Any]) -> tuple[str, dict[str, Any]]:
    """Song continuation — seed audio + extension prompt -> extended song.

    Corresponds to ACE-Step's ``GenerationParams(task_type="repaint")`` with
    ``src_audio`` set to the seed and ``repainting_start``/``repainting_end``
    placed past the end of the seed, so new audio is painted after it.

    Args:
        backend: Object exposing ``dispatch(mode=..., params=...)``.
        params: Raw request values. ``seed_audio`` is mandatory; the
            extension prompt/lyrics, extra duration, repaint and crossfade
            knobs, and shared pass-through settings are optional.

    Returns:
        Whatever ``backend.dispatch`` returns for mode ``"extend"``.

    Raises:
        ValueError: If ``seed_audio`` is missing or blank.
    """
    fetch = params.get

    payload: dict[str, Any] = {
        "seed_audio": _require(params, "seed_audio"),
        "extra_prompt": fetch("extra_prompt", ""),
        "extension_lyrics": fetch("extension_lyrics", ""),
        "extra_duration_s": int(fetch("extra_duration_s", 60)),
        # Repaint / crossfade controls.
        "repaint_mode": fetch("repaint_mode", "balanced"),
        "repaint_strength": float(fetch("repaint_strength", 0.5)),
        "wav_crossfade_s": float(fetch("wav_crossfade_s", 2.0)),
        "latent_crossfade_frames": int(fetch("latent_crossfade_frames", 10)),
        "chunk_mask_mode": fetch("chunk_mask_mode", "auto"),
        # Pass-through knobs: forwarded as given, with empty defaults.
        "seed": fetch("seed"),
        "loras": fetch("loras", []),
        "advanced": fetch("advanced", {}),
        "lm": fetch("lm", {}),
        "dcw": fetch("dcw", {}),
    }
    return backend.dispatch(mode="extend", params=payload)


def edit(backend, params: dict[str, Any]) -> tuple[str, dict[str, Any]]:
    """Segment-level edit — repaint a region OR morph caption-to-caption.

    Two sub-modes:

    - ``"repaint"`` (default): paint over ``[segment_start_s, segment_end_s]``
      using ACE-Step's ``task_type="repaint"`` with the segment bounds wired
      into ``repainting_start`` / ``repainting_end``.
    - ``"flow_edit"``: caption-to-caption morph. The installed ACE-Step
      ``GenerationParams`` dataclass has no native ``flow_edit_*`` fields, so
      flow-edit is implemented downstream as a ``task_type="repaint"`` pass
      with a lower ``audio_cover_strength`` to allow more style drift. The
      ``flow_source_caption`` / ``flow_n_*`` knobs are carried through the
      internal params dict so the pipeline wrapper can use them if/when the
      upstream dataclass grows native support.

    Args:
        backend: Object exposing ``dispatch(mode=..., params=...)``.
        params: Raw request values. ``source_audio`` is mandatory.
            ``sub_mode`` selects which knob set is forwarded; an absent,
            ``None`` or blank value means ``"repaint"``.

    Returns:
        Whatever ``backend.dispatch`` returns for mode ``"edit"``.

    Raises:
        ValueError: If ``source_audio`` is missing or blank, or if
            ``sub_mode`` is not one of the two supported values.
    """
    source_audio = _require(params, "source_audio")

    # Treat None / blank as "not provided", using the same notion of missing
    # that _require applies to mandatory fields.
    requested = params.get("sub_mode")
    if requested is None or (isinstance(requested, str) and not requested.strip()):
        sub_mode = "repaint"
    else:
        sub_mode = requested

    # Fail loudly on typos: an unknown sub_mode would otherwise skip both
    # branches below and dispatch a payload with no sub-mode knobs at all.
    if sub_mode not in ("repaint", "flow_edit"):
        raise ValueError(f"Unsupported edit sub_mode: {sub_mode!r}")

    out_params: dict[str, Any] = {
        "source_audio": source_audio,
        "source_lyrics": params.get("source_lyrics", ""),
        "target_lyrics": params.get("target_lyrics", ""),
        "segment_start_s": float(params.get("segment_start_s", 0.0)),
        "segment_end_s": float(params.get("segment_end_s", 30.0)),
        "sub_mode": sub_mode,
        "seed": params.get("seed"),
        "loras": params.get("loras", []),
        "advanced": params.get("advanced", {}),
        "lm": params.get("lm", {}),
        "dcw": params.get("dcw", {}),
    }
    if sub_mode == "repaint":
        out_params.update(
            {
                "repaint_mode": params.get("repaint_mode", "balanced"),
                "repaint_strength": float(params.get("repaint_strength", 0.5)),
                "chunk_mask_mode": params.get("chunk_mask_mode", "auto"),
                "latent_crossfade_frames": int(params.get("latent_crossfade_frames", 10)),
                "wav_crossfade_s": float(params.get("wav_crossfade_s", 0.0)),
            }
        )
    else:  # "flow_edit" — the only other value allowed by the check above
        out_params.update(
            {
                "flow_source_caption": params.get("flow_source_caption", ""),
                "flow_n_min": float(params.get("flow_n_min", 0.0)),
                "flow_n_max": float(params.get("flow_n_max", 1.0)),
                "flow_n_avg": int(params.get("flow_n_avg", 1)),
            }
        )

    return backend.dispatch(mode="edit", params=out_params)


def lyrics(backend, params: dict[str, Any]) -> tuple[str, dict[str, Any]]:
    """Lyrics-only mode. Returns ``(drafted_text, metadata_dict)``.

    Does NOT touch the ACE-Step backend — Qwen 2.5 7B Instruct is owned
    by ``lyrics_lm`` as its own lazy singleton. The ``backend`` argument
    is kept in the signature for parity with the other mode handlers but
    is unused here.

    Args:
        backend: Ignored.
        params: Raw request values. ``brief`` is mandatory; structure,
            language, tone, per-section line counts, rhyme and the sampling
            knobs (``temperature``, ``top_p``, ``top_k``,
            ``max_new_tokens``, ``seed``) are optional.

    Returns:
        The text returned by ``lyrics_lm.generate_lyrics`` and a metadata
        dict echoing the resolved settings.

    Raises:
        ValueError: If ``brief`` is missing or blank.
    """
    del backend  # signature parity with generate/cover/extend/edit
    brief = _require(params, "brief")
    structure = params.get("structure", "intro, verse, chorus, verse, chorus, bridge, chorus, outro")
    language = params.get("language", "en")
    tone = params.get("tone", "")
    verse_lines = int(params.get("verse_lines", 6))
    chorus_lines = int(params.get("chorus_lines", 4))
    bridge_lines = int(params.get("bridge_lines", 2))
    rhyme = params.get("rhyme", "loose")
    temperature = float(params.get("temperature", 0.85))
    top_p = float(params.get("top_p", 0.9))
    top_k = int(params.get("top_k", 40))
    max_new_tokens = int(params.get("max_new_tokens", 600))
    seed = params.get("seed")

    text = lyrics_lm.generate_lyrics(
        brief=brief,
        structure=structure,
        language=language,
        tone=tone,
        verse_lines=verse_lines,
        chorus_lines=chorus_lines,
        bridge_lines=bridge_lines,
        rhyme=rhyme,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        max_new_tokens=max_new_tokens,
        seed=seed,
    )

    # Use the first non-blank line: a brief that begins with a newline passes
    # _require (it is not blank overall), yet plain ``splitlines()[0]`` would
    # report an empty summary for it.
    if brief:
        brief_first_line = next((ln for ln in brief.splitlines() if ln.strip()), "")
    else:
        brief_first_line = ""

    meta = {
        "mode": "lyrics",
        # NOTE(review): this reads a private constant whose name suggests a
        # Mac-specific model id — confirm it matches the model actually loaded.
        "model": lyrics_lm._DEFAULT_MAC_ID,
        "brief_first_line": brief_first_line,
        "structure": structure,
        "language": language,
        "tone": tone,
        "verse_lines": verse_lines,
        "chorus_lines": chorus_lines,
        "bridge_lines": bridge_lines,
        "rhyme": rhyme,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "max_new_tokens": max_new_tokens,
        "seed": seed,
    }
    return text, meta