Spaces:

techfreakworm
/

ACE-Music-Studio

Running on Zero

App Files Files Community

techfreakworm committed 2 days ago

Commit

9c07a74

unverified ·

1 Parent(s): adb7693

feat(lyrics): add qwen 2.5 7b lazy loader with mlx and transformers backends

Browse files

Files changed (3) hide show

lyrics_lm.py +222 -0
modes.py +65 -0
tests/test_lyrics_lm.py +50 -0

lyrics_lm.py ADDED Viewed

	@@ -0,0 +1,222 @@

+"""Qwen 2.5 7B Instruct as the lyrics writer.
+Mac path: ``mlx-lm`` with the 4-bit MLX quantisation (``mlx-community/
+Qwen2.5-7B-Instruct-4bit``) for speed and a low VRAM footprint on Apple
+Silicon. The 4-bit pack is ~4 GB on disk and runs in ~8-12 s per draft
+on an M5 Max after the first warm-up.
+CUDA / CPU path: ``transformers`` with the full ``Qwen/Qwen2.5-7B-Instruct``
+checkpoint, ``apply_chat_template`` for the prompt, and ``do_sample=True``
+generation.
+Loading is lazy — the module-level ``_LM`` singleton is constructed on the
+first call to ``_get_lm()`` so module import stays fast for CI and so the
+~4 GB MLX download is only triggered when the user actually clicks
+"▶ Draft lyrics" in the Lyrics tab.
+Tests in ``tests/test_lyrics_lm.py`` mock ``_get_lm`` at the module
+boundary so the real Qwen weights are never loaded in CI.
+"""
+from __future__ import annotations
+import re
+from dataclasses import dataclass
+from typing import Any
+import ace_pipeline as ap
+_DEFAULT_MAC_ID = "mlx-community/Qwen2.5-7B-Instruct-4bit"
+_DEFAULT_CUDA_ID = "Qwen/Qwen2.5-7B-Instruct"
+_LM = None  # lazy module-level singleton
+def build_system_prompt() -> str:
+    """Return the fixed songwriter system prompt used by the Lyrics tab.
+    The prompt instructs the model to emit nothing but section-tagged
+    lyrics (``[intro]``, ``[verse 1]``, ``[chorus]`` and so on), gives
+    per-section line-count guidance, and forbids commentary, headers and
+    markdown. The text is constant: the function takes no arguments and
+    always returns the same string.
+    """
+    # Two newline-terminated instruction lines followed by one run-on
+    # paragraph; the pieces are concatenated with no extra separator.
+    fragments = (
+        "You are a songwriter. Output ONLY structured lyrics for an AI music generator.\n",
+        "Use these section tags exactly: [intro] [verse 1] [verse 2] [chorus] [bridge] [outro] (etc.)\n",
+        "Each section is on its own line, followed by the lyrics for that section. ",
+        "Keep verses 4-8 lines, choruses 4 lines, bridges 2-4 lines. ",
+        "Match the requested tone and language. ",
+        "Do not include commentary, headers, or markdown.",
+    )
+    return "".join(fragments)
+def _build_user_prompt(
+    brief: str,
+    structure: str,
+    language: str,
+    tone: str,
+    verse_lines: int,
+    chorus_lines: int,
+    bridge_lines: int,
+    rhyme: str,
+) -> str:
+    """Render the Lyrics-tab form fields into the user message for the LM.
+    The message states the requested structure, then language / tone /
+    rhyme, then the per-section line counts, and finally the free-text
+    brief under a ``Brief:`` heading. An empty ``tone`` is rendered as
+    ``neutral``.
+    """
+    structure_line = f"Write lyrics with this structure: {structure}.\n"
+    flavour_line = f"Language: {language}. Tone: {tone or 'neutral'}. Rhyme: {rhyme}.\n"
+    # The blank line after the counts separates the constraints from the brief.
+    counts_line = f"Verse: {verse_lines} lines. Chorus: {chorus_lines} lines. Bridge: {bridge_lines} lines.\n\n"
+    brief_block = f"Brief:\n{brief}\n"
+    return structure_line + flavour_line + counts_line + brief_block
+def _normalise(text: str) -> str:
+    """Lowercase every ``[bracketed]`` tag and trim surrounding whitespace.
+    The model sometimes answers with ``[Verse 1]`` or ``[CHORUS]`` even
+    though the system prompt lists lowercase tags. Coercing here means
+    callers always receive lowercase section tags. Text outside square
+    brackets is left untouched.
+    """
+    # Lowercasing the whole match is safe: the brackets themselves are
+    # unaffected by ``str.lower``.
+    lowered = re.sub(r"\[([^\]]+)\]", lambda found: "[" + found.group(1).lower() + "]", text)
+    return lowered.strip()
+def _get_lm():
+    """Return the process-wide LM wrapper, building it on first use.
+    The wrapper is cached in the module-level ``_LM`` variable, so
+    ``_load_lm()`` runs at most once per process through this function.
+    The test-suite replaces this function with a stub, which keeps the
+    real loader from running under pytest.
+    """
+    global _LM
+    if _LM is not None:
+        return _LM
+    # First call: build the wrapper and remember it for later calls.
+    _LM = _load_lm()
+    return _LM
+def _load_lm():
+    """Build the LM wrapper that matches the detected device.
+    ``ap.detect_device()`` decides the backend. ``"mps"`` loads the MLX
+    checkpoint through ``mlx_lm.load`` and returns an ``_MLXLM``. Any
+    other value loads the ``transformers`` checkpoint and returns an
+    ``_HFLM``: bfloat16 weights moved to the GPU when the device is
+    ``"cuda"``, float32 weights left where ``from_pretrained`` put them
+    otherwise. The heavy imports are local so importing this module
+    stays cheap.
+    """
+    target = ap.detect_device()
+    if target != "mps":
+        # CUDA / CPU fallback path. Use bfloat16 on CUDA, float32 on CPU.
+        import torch
+        from transformers import AutoModelForCausalLM, AutoTokenizer
+        on_cuda = target == "cuda"
+        hf_tokenizer = AutoTokenizer.from_pretrained(_DEFAULT_CUDA_ID)
+        precision = torch.bfloat16 if on_cuda else torch.float32
+        hf_model = AutoModelForCausalLM.from_pretrained(_DEFAULT_CUDA_ID, torch_dtype=precision)
+        if on_cuda:
+            hf_model = hf_model.to("cuda")
+        return _HFLM(model=hf_model, tokenizer=hf_tokenizer)
+    from mlx_lm import load  # type: ignore[import-not-found]
+    mlx_model, mlx_tokenizer = load(_DEFAULT_MAC_ID)
+    return _MLXLM(model=mlx_model, tokenizer=mlx_tokenizer)
+@dataclass
+class _MLXLM:
+    """mlx-lm backed wrapper; ``generate`` returns the decoded text as ``str``."""
+    # Model / tokenizer pair as returned by ``mlx_lm.load``.
+    model: Any
+    tokenizer: Any
+    def generate(self, system: str, user: str, **kw: Any) -> str:
+        """Run one chat turn and return whatever ``mlx_lm.generate`` returns.
+        NOTE(review): only ``max_new_tokens`` (default 600) is read from
+        ``kw``. Other sampling settings the caller supplies (temperature,
+        top_p, top_k, ...) are accepted but not forwarded to mlx-lm here.
+        """
+        from mlx_lm import generate as mlx_generate  # type: ignore[import-not-found]
+        token_budget = int(kw.get("max_new_tokens", 600))
+        # The prompt is assembled by hand in Qwen's ChatML layout: a system
+        # turn, a user turn, then an opened assistant turn for the model to
+        # continue.
+        chatml_prompt = "".join(
+            (
+                f"<|im_start|>system\n{system}<|im_end|>\n",
+                f"<|im_start|>user\n{user}<|im_end|>\n",
+                f"<|im_start|>assistant\n",
+            )
+        )
+        return mlx_generate(
+            self.model,
+            self.tokenizer,
+            prompt=chatml_prompt,
+            max_tokens=token_budget,
+        )
+@dataclass
+class _HFLM:
+    """transformers wrapper. ``generate`` returns only the assistant continuation."""
+    # Causal LM and its tokenizer, as loaded with ``from_pretrained``.
+    model: Any
+    tokenizer: Any
+    def generate(self, system: str, user: str, **kw: Any) -> str:
+        """Sample one assistant reply for the given system / user messages.
+        Recognised keyword overrides (with defaults): ``max_new_tokens``
+        (600), ``temperature`` (0.85), ``top_p`` (0.9), ``top_k`` (40) and
+        ``repetition_penalty`` (1.1). Sampling is always enabled.
+        Returns the decoded text of the newly generated tokens only; the
+        prompt is never part of the returned string.
+        """
+        msgs = [
+            {"role": "system", "content": system},
+            {"role": "user", "content": user},
+        ]
+        prompt = self.tokenizer.apply_chat_template(
+            msgs,
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
+        out = self.model.generate(
+            **inputs,
+            max_new_tokens=int(kw.get("max_new_tokens", 600)),
+            temperature=float(kw.get("temperature", 0.85)),
+            top_p=float(kw.get("top_p", 0.9)),
+            top_k=int(kw.get("top_k", 40)),
+            repetition_penalty=float(kw.get("repetition_penalty", 1.1)),
+            do_sample=True,
+        )
+        # The returned sequence starts with the prompt tokens. Drop them by
+        # token count rather than by string prefix: decoding with
+        # ``skip_special_tokens=True`` removes the chat-template markers, so
+        # the decoded text does not start with ``prompt`` and a string-prefix
+        # strip would leave the system and user messages in the lyrics.
+        prompt_len = inputs["input_ids"].shape[-1]
+        return self.tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)
+def generate_lyrics(
+    brief: str,
+    structure: str,
+    language: str,
+    tone: str,
+    verse_lines: int,
+    chorus_lines: int,
+    bridge_lines: int,
+    rhyme: str,
+    temperature: float,
+    top_p: float,
+    top_k: int,
+    max_new_tokens: int,
+    seed: int | None = None,
+) -> str:
+    """Draft section-tagged lyrics from the Lyrics-tab form values.
+    The form fields are rendered into a user message, the lazily loaded
+    LM is asked for a completion under the fixed songwriter system
+    prompt, and the raw completion is passed through ``_normalise()`` so
+    the returned text has lowercase tags and no outer whitespace.
+    ``seed`` exists only so the signature mirrors the UI; it is not
+    forwarded to the backend, so repeated calls with the same seed are
+    not guaranteed to produce the same draft.
+    """
+    backend = _get_lm()
+    sampling = {
+        "temperature": temperature,
+        "top_p": top_p,
+        "top_k": top_k,
+        "max_new_tokens": max_new_tokens,
+    }
+    user_message = _build_user_prompt(
+        brief, structure, language, tone, verse_lines, chorus_lines, bridge_lines, rhyme
+    )
+    draft = backend.generate(system=build_system_prompt(), user=user_message, **sampling)
+    return _normalise(draft)

modes.py CHANGED Viewed

@@ -3,12 +3,19 @@
 Each handler validates inputs, builds the ACE-Step kwargs for its mode, and
 hands off to `backend.dispatch(...)`. Backend ownership of @spaces.GPU and
 pipeline lifecycle keeps these handlers cheap to test.
 """
 from __future__ import annotations
 from typing import Any
 def _require(params: dict[str, Any], field: str) -> Any:
     v = params.get(field)
@@ -155,3 +162,61 @@ def edit(backend, params: dict[str, Any]) -> tuple[str, dict[str, Any]]:
         )
     return backend.dispatch(mode="edit", params=out_params)

 Each handler validates inputs, builds the ACE-Step kwargs for its mode, and
 hands off to `backend.dispatch(...)`. Backend ownership of @spaces.GPU and
 pipeline lifecycle keeps these handlers cheap to test.
+The ``lyrics()`` handler is the odd one out: it does NOT touch the ACE-Step
+backend at all. It calls ``lyrics_lm.generate_lyrics`` directly, since the
+Qwen 2.5 7B LM is its own lazy singleton and doesn't share the DiT / 5Hz
+pipeline lifecycle with the audio modes.
 """
 from __future__ import annotations
 from typing import Any
+import lyrics_lm
 def _require(params: dict[str, Any], field: str) -> Any:
     v = params.get(field)
         )
     return backend.dispatch(mode="edit", params=out_params)
+def lyrics(backend, params: dict[str, Any]) -> tuple[str, dict[str, Any]]:
+    """Lyrics-only mode. Returns ``(drafted_text, metadata_dict)``.
+    Does NOT touch the ACE-Step backend — Qwen 2.5 7B Instruct is owned
+    by ``lyrics_lm`` as its own lazy singleton. The ``backend`` argument
+    is kept in the signature for parity with the other mode handlers but
+    is unused here.
+    ``params`` must contain ``brief``; every other field is optional and
+    falls back to the default shown below. Numeric fields are coerced
+    with ``int`` / ``float``, so a value that cannot be converted raises
+    from the conversion itself.
+    The returned metadata echoes the effective settings plus the first
+    line of the brief. Its ``model`` entry names the transformers
+    checkpoint when that backend is the one currently loaded, and the
+    MLX checkpoint otherwise.
+    """
+    del backend  # signature parity with generate/cover/extend/edit
+    brief = _require(params, "brief")
+    structure = params.get("structure", "intro, verse, chorus, verse, chorus, bridge, chorus, outro")
+    language = params.get("language", "en")
+    tone = params.get("tone", "")
+    verse_lines = int(params.get("verse_lines", 6))
+    chorus_lines = int(params.get("chorus_lines", 4))
+    bridge_lines = int(params.get("bridge_lines", 2))
+    rhyme = params.get("rhyme", "loose")
+    temperature = float(params.get("temperature", 0.85))
+    top_p = float(params.get("top_p", 0.9))
+    top_k = int(params.get("top_k", 40))
+    max_new_tokens = int(params.get("max_new_tokens", 600))
+    seed = params.get("seed")
+    text = lyrics_lm.generate_lyrics(
+        brief=brief,
+        structure=structure,
+        language=language,
+        tone=tone,
+        verse_lines=verse_lines,
+        chorus_lines=chorus_lines,
+        bridge_lines=bridge_lines,
+        rhyme=rhyme,
+        temperature=temperature,
+        top_p=top_p,
+        top_k=top_k,
+        max_new_tokens=max_new_tokens,
+        seed=seed,
+    )
+    # Report the checkpoint that actually produced the draft. Only the
+    # already-cached singleton is inspected (never ``_get_lm()``), so
+    # building the metadata cannot trigger a model load, e.g. when
+    # ``generate_lyrics`` is stubbed out in tests.
+    loaded_lm = getattr(lyrics_lm, "_LM", None)
+    hf_wrapper = getattr(lyrics_lm, "_HFLM", None)
+    if isinstance(hf_wrapper, type) and isinstance(loaded_lm, hf_wrapper):
+        model_id = lyrics_lm._DEFAULT_CUDA_ID
+    else:
+        model_id = lyrics_lm._DEFAULT_MAC_ID
+    meta = {
+        "mode": "lyrics",
+        "model": model_id,
+        "brief_first_line": brief.splitlines()[0] if brief else "",
+        "structure": structure,
+        "language": language,
+        "tone": tone,
+        "verse_lines": verse_lines,
+        "chorus_lines": chorus_lines,
+        "bridge_lines": bridge_lines,
+        "rhyme": rhyme,
+        "temperature": temperature,
+        "top_p": top_p,
+        "top_k": top_k,
+        "max_new_tokens": max_new_tokens,
+        "seed": seed,
+    }
+    return text, meta

tests/test_lyrics_lm.py ADDED Viewed

	@@ -0,0 +1,50 @@

+"""L2 tests for lyrics LM — generation is mocked at the model boundary.
+The real Qwen 2.5 7B model is never loaded in CI. We only verify the prompt
+shape, the call boundary to ``_get_lm()``, and the normalisation pass that
+lowercases section tags before returning to the caller.
+"""
+from __future__ import annotations
+from unittest.mock import MagicMock
+import lyrics_lm as ll
+def test_build_system_prompt_includes_tag_format():
+    """The system prompt must mention the bracketed verse and chorus tags."""
+    prompt_lower = ll.build_system_prompt().lower()
+    for tag_prefix in ("[verse", "[chorus"):
+        assert tag_prefix in prompt_lower
+def test_generate_lyrics_calls_lm_and_returns_text(monkeypatch):
+    """``generate_lyrics`` makes one LM call with the right prompt and settings.
+    ``_get_lm`` is replaced with a stub so no weights are loaded. Beyond
+    checking that a call happened, the test pins what crosses the call
+    boundary: the fixed system prompt, a user message that carries the
+    brief and structure, and the sampling settings passed by the caller.
+    """
+    fake_lm = MagicMock()
+    fake_lm.generate.return_value = "[verse] x\n[chorus] y\n"
+    monkeypatch.setattr(ll, "_get_lm", lambda: fake_lm)
+    out = ll.generate_lyrics(
+        brief="a song",
+        structure="intro, verse, chorus, outro",
+        language="en",
+        tone="upbeat",
+        verse_lines=4,
+        chorus_lines=4,
+        bridge_lines=2,
+        rhyme="loose",
+        temperature=0.85,
+        top_p=0.9,
+        top_k=40,
+        max_new_tokens=200,
+        seed=42,
+    )
+    assert "[verse]" in out
+    # The trailing newline of the raw completion is stripped on the way out.
+    assert out == "[verse] x\n[chorus] y"
+    fake_lm.generate.assert_called_once()
+    sent = fake_lm.generate.call_args.kwargs
+    assert sent["system"] == ll.build_system_prompt()
+    assert "a song" in sent["user"]
+    assert "intro, verse, chorus, outro" in sent["user"]
+    assert sent["temperature"] == 0.85
+    assert sent["top_p"] == 0.9
+    assert sent["top_k"] == 40
+    assert sent["max_new_tokens"] == 200
+def test_normalise_lyrics_lowercases_tags():
+    """Mixed-case section tags come back lowercase after normalisation."""
+    cleaned = ll._normalise(" [Verse 1]\nhello\n[Chorus]\nworld ")
+    for expected_tag in ("[verse 1]", "[chorus]"):
+        assert expected_tag in cleaned
+    assert "[Verse" not in cleaned