Spaces:

techfreakworm
/

ACE-Music-Studio

Running on Zero

App Files Files Community

techfreakworm committed 2 days ago

Commit

26dc3a4

unverified ·

1 Parent(s): a2eae29

feat(pipeline): wire cover/extend/edit task_types in studio.generate

Browse files

Files changed (3) hide show

ace_pipeline.py +83 -10
backend.py +8 -5
tests/test_backend.py +48 -0

ace_pipeline.py CHANGED Viewed

@@ -135,12 +135,30 @@ class ACEStepStudio:
         self._llm = llm
     def generate(self, params: dict) -> str:
-        """Run a single text→song generation.
-        ``params`` is the dict produced by ``modes.generate``:
-        ``{"prompt", "lyrics", "duration_s", "instrumental", "seed",
-        "loras", "advanced", "lm", "dcw"}``. Returns the path to the
-        produced audio file.
         """
         self._ensure_loaded()
@@ -152,20 +170,68 @@ class ACEStepStudio:
         advanced = params.get("advanced", {}) or {}
         lm_opts = params.get("lm", {}) or {}
         # Map our internal dict to ACE-Step's GenerationParams.
         # Lyrics "[Instrumental]" is the ACE-Step convention for instrumental.
-        lyrics = params.get("lyrics", "") or ""
         instrumental = bool(params.get("instrumental", False))
         if instrumental and not lyrics:
             lyrics = "[Instrumental]"
         gen_params = GenerationParams(
-            task_type="text2music",
-            caption=params.get("prompt", ""),
             lyrics=lyrics,
             instrumental=instrumental,
-            duration=int(params.get("duration_s", 30)),
             seed=int(params.get("seed", -1)),
             inference_steps=int(advanced.get("steps", 32)),
             guidance_scale=float(advanced.get("cfg", 4.0)),
@@ -176,6 +242,13 @@ class ACEStepStudio:
             vocal_language=advanced.get("vocal_language", "unknown"),
             cfg_interval_start=float(advanced.get("cfg_interval_start", 0.0)),
             cfg_interval_end=float(advanced.get("cfg_interval_end", 1.0)),
             thinking=bool(lm_opts.get("thinking", False)),
             lm_temperature=float(lm_opts.get("temperature", 0.85)),
             lm_cfg_scale=float(lm_opts.get("cfg", 2.0)),

         self._llm = llm
     def generate(self, params: dict) -> str:
+        """Run a single song generation across all four modes.
+        ``params`` is the dict produced by the mode handlers in ``modes.py``.
+        The ``params["mode"]`` key (``generate`` | ``cover`` | ``extend`` |
+        ``edit``) selects the ACE-Step ``task_type`` and which audio inputs
+        get wired through to ``GenerationParams``:
+        - ``generate``: ``task_type="text2music"``
+        - ``cover``:    ``task_type="cover"`` + ``reference_audio`` +
+          ``audio_cover_strength``
+        - ``extend``:   ``task_type="repaint"`` + ``src_audio`` set to the
+          seed, with ``repainting_start=-1`` / ``repainting_end=-1`` as a
+          sentinel meaning "paint after the end of the seed". The actual
+          mask shaping ultimately lives inside ACE-Step's repaint path.
+        - ``edit``:     ``task_type="repaint"`` + ``src_audio`` + explicit
+          ``[segment_start_s, segment_end_s]`` segment bounds.
+        Flow-edit (``sub_mode="flow_edit"``) is implemented as a repaint
+        pass: the installed ACE-Step ``GenerationParams`` dataclass has no
+        native ``flow_edit_*`` fields, so the extra flow-edit knobs carried
+        in the internal params dict are ignored at the ``GenerationParams``
+        instantiation level and will need wiring once upstream grows them.
+        Returns the path to the produced audio file.
         """
         self._ensure_loaded()
         advanced = params.get("advanced", {}) or {}
         lm_opts = params.get("lm", {}) or {}
+        mode = params.get("mode", "generate")
         # Map our internal dict to ACE-Step's GenerationParams.
         # Lyrics "[Instrumental]" is the ACE-Step convention for instrumental.
+        lyrics = params.get("lyrics", "") or params.get("extension_lyrics", "") or ""
+        if mode == "edit":
+            lyrics = params.get("target_lyrics", "") or lyrics
         instrumental = bool(params.get("instrumental", False))
         if instrumental and not lyrics:
             lyrics = "[Instrumental]"
+        # Mode-specific task_type + audio inputs.
+        # All five fields below MUST resolve before we instantiate
+        # GenerationParams so that the dataclass ctor sees consistent values.
+        ref_audio: str | None = None
+        src_audio: str | None = None
+        audio_cover_strength = 0.0
+        repainting_start = 0.0
+        repainting_end = -1.0
+        if mode == "generate":
+            task_type = "text2music"
+        elif mode == "cover":
+            task_type = "cover"
+            ref_audio = params.get("ref_audio")
+            audio_cover_strength = float(params.get("audio_cover_strength", 0.93))
+        elif mode == "extend":
+            task_type = "repaint"
+            src_audio = params.get("seed_audio")
+            # Sentinel: -1 / -1 means "append after the seed audio's end".
+            # ACE-Step's repaint path interprets these bounds against the
+            # src_audio duration; the actual semantics need verifying once
+            # we run a full pass on real hardware (M3 GPU smoke).
+            repainting_start = -1.0
+            repainting_end = -1.0
+        elif mode == "edit":
+            task_type = "repaint"
+            src_audio = params.get("source_audio")
+            repainting_start = float(params.get("segment_start_s", 0.0))
+            repainting_end = float(params.get("segment_end_s", 30.0))
+            # flow_edit sub-mode: lower audio_cover_strength to allow style
+            # drift while still using the repaint task type. The extra
+            # flow_* fields in our internal params dict are kept around for
+            # future use but not forwarded to GenerationParams (no native
+            # support in the installed dataclass).
+            if params.get("sub_mode") == "flow_edit":
+                audio_cover_strength = 0.3
+        else:
+            raise ValueError(f"Unknown mode: {mode!r}")
+        # Caption can come from the per-mode handlers under different keys.
+        caption = (
+            params.get("prompt") or params.get("extra_prompt") or params.get("flow_source_caption") or ""
+        )
+        duration_s = int(params.get("duration_s") or params.get("extra_duration_s") or 30)
         gen_params = GenerationParams(
+            task_type=task_type,
+            caption=caption,
             lyrics=lyrics,
             instrumental=instrumental,
+            duration=duration_s,
             seed=int(params.get("seed", -1)),
             inference_steps=int(advanced.get("steps", 32)),
             guidance_scale=float(advanced.get("cfg", 4.0)),
             vocal_language=advanced.get("vocal_language", "unknown"),
             cfg_interval_start=float(advanced.get("cfg_interval_start", 0.0)),
             cfg_interval_end=float(advanced.get("cfg_interval_end", 1.0)),
+            # Mode-specific audio inputs + repaint bounds
+            reference_audio=ref_audio,
+            src_audio=src_audio,
+            audio_cover_strength=audio_cover_strength,
+            repainting_start=repainting_start,
+            repainting_end=repainting_end,
+            # 5Hz language model knobs
             thinking=bool(lm_opts.get("thinking", False)),
             lm_temperature=float(lm_opts.get("temperature", 0.85)),
             lm_cfg_scale=float(lm_opts.get("cfg", 2.0)),

backend.py CHANGED Viewed

@@ -77,10 +77,13 @@ class ACEStepStudioBackend:
         ``generate(params)`` method that handles the underlying
         AceStepHandler + LLMHandler + generate_music plumbing.
-        Cover / Extend / Edit / Lyrics task_types are mapped here at
-        M3 / M4 by switching ``params["task_type"]`` before calling.
         """
-        if mode == "generate":
-            return pipe.generate(params)
-        # cover / extend / edit / lyrics get filled in at M3 / M4
         raise NotImplementedError(f"Mode {mode!r} is not wired yet")

         ``generate(params)`` method that handles the underlying
         AceStepHandler + LLMHandler + generate_music plumbing.
+        All four song modes (``generate``, ``cover``, ``extend``, ``edit``)
+        flow through ``pipe.generate(params)``. The pipeline wrapper
+        switches its ``GenerationParams.task_type`` based on ``params["mode"]``
+        — see ``ace_pipeline.ACEStepStudio.generate`` for the mapping. The
+        ``lyrics`` mode is wired separately at M4.
         """
+        if mode in ("generate", "cover", "extend", "edit"):
+            params_with_mode = {**params, "mode": mode}
+            return pipe.generate(params_with_mode)
         raise NotImplementedError(f"Mode {mode!r} is not wired yet")

tests/test_backend.py CHANGED Viewed

@@ -4,6 +4,8 @@ from __future__ import annotations
 from unittest.mock import MagicMock
 import backend as be
@@ -98,3 +100,49 @@ def test_dispatch_applies_lora_stack(monkeypatch, tmp_path):
     )
     apply_mock.assert_called_once_with(fake_pipe, stack)

 from unittest.mock import MagicMock
+import pytest
 import backend as be
     )
     apply_mock.assert_called_once_with(fake_pipe, stack)
+@pytest.mark.parametrize(
+    "mode,extra",
+    [
+        ("cover", {"ref_audio": "/tmp/ref.wav", "audio_cover_strength": 0.9}),
+        ("extend", {"seed_audio": "/tmp/seed.wav", "extra_duration_s": 60}),
+        (
+            "edit",
+            {
+                "source_audio": "/tmp/src.wav",
+                "segment_start_s": 50.0,
+                "segment_end_s": 90.0,
+                "sub_mode": "repaint",
+            },
+        ),
+    ],
+)
+def test_dispatch_forwards_mode_to_pipe_generate(monkeypatch, tmp_path, mode, extra):
+    fake_pipe = MagicMock()
+    fake_pipe.generate.return_value = str(tmp_path / "x.wav")
+    (tmp_path / "x.wav").write_bytes(b"RIFF")
+    monkeypatch.setattr("ace_pipeline.get_pipeline", lambda: fake_pipe)
+    monkeypatch.setattr("lora_stack.apply_stack", MagicMock())
+    b = be.ACEStepStudioBackend()
+    params = {
+        "prompt": "p",
+        "lyrics": "",
+        "duration_s": 10,
+        "instrumental": True,
+        "seed": 42,
+        "loras": [],
+        "advanced": {},
+        "lm": {},
+        "dcw": {},
+        **extra,
+    }
+    b.dispatch(mode=mode, params=params)
+    fake_pipe.generate.assert_called_once()
+    sent_params = fake_pipe.generate.call_args.args[0]
+    assert sent_params["mode"] == mode
+    # Mode-specific keys propagate to pipe.generate
+    for k, v in extra.items():
+        assert sent_params[k] == v