Spaces:

techfreakworm
/

ACE-Music-Studio

Running on Zero

App Files Files Community

techfreakworm committed on 2 days ago

Commit

99375d0

unverified ·

1 Parent(s): 65ab3e7

refactor(pipeline): rewrite for real acestep AceStepHandler+LLMHandler api

Browse files

The original ace_pipeline assumed ``ACEStepPipeline.from_pretrained()`` —
a clean library entry point that does NOT exist in the installed
``acestep`` package (apple-silicon fork on Mac, upstream on CUDA). The
real API is a split-handler pattern:

from acestep.handler import AceStepHandler
from acestep.llm_inference import LLMHandler
from acestep.inference import GenerationParams, GenerationConfig, generate_music

dit = AceStepHandler(); dit.initialize_service(project_root, config_path, device)
lm = LLMHandler(); lm.initialize(checkpoint_dir, lm_model_path, backend, device)
result = generate_music(dit, lm, GenerationParams(...), GenerationConfig(...))

To keep ``modes.py`` and ``backend.py`` clean, ``ace_pipeline`` now
exposes a single ``ACEStepStudio`` wrapper that owns both handlers and
exposes ``generate(params: dict) -> str`` returning the audio path.

Defaults:
- DiT: ACE-Step/acestep-v15-xl-sft (~16 GB) → ``./checkpoints/acestep-v15-xl-sft/``
- LM: ACE-Step/acestep-5Hz-lm-0.6B (~1.4 GB) → ``./checkpoints/acestep-5Hz-lm-0.6B/``

The fork auto-routes ``backend='vllm'`` to ``mlx`` on ``device='mps'``
when mlx-lm is installed, so the same code path works on Mac and CUDA.

Tests updated to mock the wrapper interface: ``pipe.generate(params)``
instead of ``pipe(...)``. 17/17 L1+L2 pass; the GPU smoke (deselected
by default) exercises the real pipeline once checkpoints are downloaded.

Closes spec §14.1 open question (canonical ACE-Step Python API).

Files changed (5) hide show

ace_pipeline.py +173 -27
backend.py +11 -8
tests/test_ace_pipeline_lazy.py +147 -20
tests/test_backend.py +17 -7
tests/test_smoke_gpu.py +22 -15

ace_pipeline.py CHANGED Viewed

@@ -1,12 +1,49 @@
 """ACE-Step pipeline lifecycle: device autodetect, lazy load, cache mirror.
-Mirrors z-image-studio's `models.py` pattern. M0 only implements device
-detection — the pipeline class itself is filled in at M1.
 """
 from __future__ import annotations
 import os
 def detect_device() -> str:
@@ -27,9 +64,9 @@ def detect_device() -> str:
 def vram_limit_for(device: str) -> int | None:
     """Returns a VRAM cap in bytes for CUDA, None otherwise.
-    `torch.mps` has no `mem_get_info` — calling DiffSynth-style free-VRAM
-    gates with a numeric limit would crash on MPS. Returning None lets the
-    pipeline short-circuit those checks.
     """
     if device != "cuda":
         return None
@@ -43,30 +80,139 @@ def vram_limit_for(device: str) -> int | None:
         return None
-_PIPELINE = None  # module-level lazy singleton
-_DEFAULT_MODEL_ID = "ACE-Step/acestep-v15-xl-sft"
-def _load_pipeline(device: str, model_path: str):
-    """Construct the ACE-Step pipeline. Heavy import is local so unit tests can mock."""
-    from ace_step import ACEStepPipeline  # type: ignore[import-not-found]
-    # On Mac, the apple-silicon fork sets dtype + backend automatically.
-    # On CUDA we pass bf16 explicitly.
-    if device == "cuda":
-        pipe = ACEStepPipeline.from_pretrained(model_path, torch_dtype="bf16")
-    else:
-        pipe = ACEStepPipeline.from_pretrained(model_path)
-    pipe.to(device)
-    return pipe
-def get_pipeline():
-    """Lazy-load the ACE-Step pipeline once per process."""
     global _PIPELINE
     if _PIPELINE is None:
-        device = detect_device()
-        model_path = os.environ.get("ACE_MODEL_PATH", _DEFAULT_MODEL_ID)
-        _PIPELINE = _load_pipeline(device, model_path)
     return _PIPELINE

 """ACE-Step pipeline lifecycle: device autodetect, lazy load, cache mirror.
+The installed ``acestep`` package (apple-silicon fork on Mac, upstream on
+CUDA) does NOT expose a single ``ACEStepPipeline.from_pretrained`` entry
+point. The real API is a split-handler pattern:
+  from acestep.handler import AceStepHandler           # DiT side
+  from acestep.llm_inference import LLMHandler         # 5Hz LM planner
+  from acestep.inference import (
+      GenerationParams, GenerationConfig, generate_music,
+  )
+  dit = AceStepHandler()
+  dit.initialize_service(project_root=..., config_path="acestep-v15-xl-sft",
+                         device="mps")
+  lm = LLMHandler()
+  lm.initialize(checkpoint_dir=..., lm_model_path="acestep-5Hz-lm-0.6B",
+                backend="vllm",      # auto-routes to mlx on mps
+                device="mps")
+  params = GenerationParams(caption=..., lyrics=..., duration=..., seed=...)
+  cfg = GenerationConfig(batch_size=1, audio_format="wav")
+  result = generate_music(dit, lm, params, cfg)
+  # result.audios[0]["path"] is the WAV file
+To keep ``backend.py`` and ``modes.py`` clean, this module exposes a
+single ``ACEStepStudio`` wrapper that owns both handlers and exposes a
+``generate(params: dict) -> str`` method returning the audio path.
+``get_pipeline()`` returns the lazy singleton wrapper.
+Checkpoints live under ``{project_root}/checkpoints/{config_path}/``.
+On Mac with the apple-silicon fork, the fork auto-downloads from
+HuggingFace if a checkpoint is missing, but in practice we pre-download
+via ``hf download`` before the first inference call to avoid pytest
+timeouts.
 """
 from __future__ import annotations
 import os
+from pathlib import Path
+_REPO_ROOT = Path(__file__).resolve().parent
+_CHECKPOINTS_DIR = _REPO_ROOT / "checkpoints"
+_DEFAULT_DIT_CONFIG = "acestep-v15-xl-sft"
+_DEFAULT_LM_MODEL = "acestep-5Hz-lm-0.6B"
 def detect_device() -> str:
 def vram_limit_for(device: str) -> int | None:
     """Returns a VRAM cap in bytes for CUDA, None otherwise.
+    ``torch.mps`` has no ``mem_get_info`` — calling DiffSynth-style
+    free-VRAM gates with a numeric limit would crash on MPS. Returning
+    None lets the pipeline short-circuit those checks.
     """
     if device != "cuda":
         return None
         return None
+class ACEStepStudio:
+    """Wrapper around the apple-silicon fork's split-handler API.
+    Owns one ``AceStepHandler`` (DiT) and one ``LLMHandler`` (5Hz LM
+    planner). Both are lazy-loaded on the first ``generate(...)`` call.
+    """
+    def __init__(
+        self,
+        dit_config: str | None = None,
+        lm_model: str | None = None,
+        device: str | None = None,
+    ) -> None:
+        self._dit = None
+        self._llm = None
+        self._dit_config = dit_config or os.environ.get("ACE_DIT_CONFIG", _DEFAULT_DIT_CONFIG)
+        self._lm_model = lm_model or os.environ.get("ACE_LM_MODEL", _DEFAULT_LM_MODEL)
+        self._device = device or detect_device()
+    @property
+    def device(self) -> str:
+        return self._device
+    @property
+    def is_loaded(self) -> bool:
+        return self._dit is not None and self._llm is not None
+    def _ensure_loaded(self) -> None:
+        """First-call lazy load of both handlers. Heavy imports stay local."""
+        if self.is_loaded:
+            return
+        from acestep.handler import AceStepHandler
+        from acestep.llm_inference import LLMHandler
+        dit = AceStepHandler()
+        dit.initialize_service(
+            project_root=str(_REPO_ROOT),
+            config_path=self._dit_config,
+            device=self._device,
+        )
+        llm = LLMHandler()
+        llm.initialize(
+            checkpoint_dir=str(_CHECKPOINTS_DIR),
+            lm_model_path=self._lm_model,
+            backend="vllm",  # fork auto-routes to mlx on mps + mlx-lm installed
+            device=self._device,
+        )
+        self._dit = dit
+        self._llm = llm
+    def generate(self, params: dict) -> str:
+        """Run a single text→song generation.
+        ``params`` is the dict produced by ``modes.generate``:
+        ``{"prompt", "lyrics", "duration_s", "instrumental", "seed",
+        "loras", "advanced", "lm", "dcw"}``. Returns the path to the
+        produced audio file.
+        """
+        self._ensure_loaded()
+        from acestep.inference import (
+            GenerationConfig,
+            GenerationParams,
+            generate_music,
+        )
+        advanced = params.get("advanced", {}) or {}
+        lm_opts = params.get("lm", {}) or {}
+        # Map our internal dict to ACE-Step's GenerationParams.
+        # Lyrics "[Instrumental]" is the ACE-Step convention for instrumental.
+        lyrics = params.get("lyrics", "") or ""
+        instrumental = bool(params.get("instrumental", False))
+        if instrumental and not lyrics:
+            lyrics = "[Instrumental]"
+        gen_params = GenerationParams(
+            task_type="text2music",
+            caption=params.get("prompt", ""),
+            lyrics=lyrics,
+            instrumental=instrumental,
+            duration=int(params.get("duration_s", 30)),
+            seed=int(params.get("seed", -1)),
+            inference_steps=int(advanced.get("steps", 32)),
+            guidance_scale=float(advanced.get("cfg", 4.0)),
+            shift=float(advanced.get("shift", 1.0)),
+            bpm=advanced.get("bpm"),
+            keyscale=advanced.get("keyscale", ""),
+            timesignature=advanced.get("timesignature", ""),
+            vocal_language=advanced.get("vocal_language", "unknown"),
+            cfg_interval_start=float(advanced.get("cfg_interval_start", 0.0)),
+            cfg_interval_end=float(advanced.get("cfg_interval_end", 1.0)),
+            thinking=bool(lm_opts.get("thinking", False)),
+            lm_temperature=float(lm_opts.get("temperature", 0.85)),
+            lm_cfg_scale=float(lm_opts.get("cfg", 2.0)),
+            lm_top_k=int(lm_opts.get("top_k", 0)),
+            lm_top_p=float(lm_opts.get("top_p", 0.9)),
+            lm_negative_prompt=lm_opts.get("negative_prompt", ""),
+            use_cot_metas=bool(lm_opts.get("cot_metas", False)),
+            use_cot_caption=bool(lm_opts.get("cot_caption", False)),
+            use_cot_language=bool(lm_opts.get("cot_language", False)),
+        )
+        gen_config = GenerationConfig(
+            batch_size=1,
+            audio_format=advanced.get("audio_format", "wav"),
+            use_random_seed=False,
+            seeds=[int(params.get("seed", 1))],
+        )
+        result = generate_music(self._dit, self._llm, gen_params, gen_config)
+        if not result.success:
+            raise RuntimeError(f"ACE-Step generation failed: {result.error}")
+        if not result.audios:
+            raise RuntimeError("ACE-Step returned no audio outputs")
+        return result.audios[0]["path"]
+_PIPELINE: ACEStepStudio | None = None  # module-level lazy singleton
+def get_pipeline() -> ACEStepStudio:
+    """Lazy-construct the ACE Music Studio wrapper.
+    The wrapper itself is cheap to construct; both handlers (DiT, LM)
+    are only loaded on the first ``generate(...)`` call.
+    """
     global _PIPELINE
     if _PIPELINE is None:
+        _PIPELINE = ACEStepStudio()
     return _PIPELINE

backend.py CHANGED Viewed

@@ -68,14 +68,17 @@ class ACEStepStudioBackend:
         return out_path, meta
     def _call_pipe_for_mode(self, pipe, mode: str, params: dict[str, Any]) -> str:
-        """Mode-specific kwargs translation. Filled out per milestone."""
         if mode == "generate":
-            return pipe(
-                prompt=params["prompt"],
-                lyrics=params.get("lyrics", ""),
-                duration_s=params["duration_s"],
-                instrumental=params.get("instrumental", False),
-                seed=params["seed"],
-            )
         # cover / extend / edit / lyrics get filled in at M3 / M4
         raise NotImplementedError(f"Mode {mode!r} is not wired yet")

         return out_path, meta
     def _call_pipe_for_mode(self, pipe, mode: str, params: dict[str, Any]) -> str:
+        """Dispatch to the pipeline wrapper.
+        ``pipe`` is the ``ACEStepStudio`` wrapper returned by
+        ``ace_pipeline.get_pipeline()``. It exposes a single
+        ``generate(params)`` method that handles the underlying
+        AceStepHandler + LLMHandler + generate_music plumbing.
+        Cover / Extend / Edit / Lyrics task_types are mapped here at
+        M3 / M4 by switching ``params["task_type"]`` before calling.
+        """
         if mode == "generate":
+            return pipe.generate(params)
         # cover / extend / edit / lyrics get filled in at M3 / M4
         raise NotImplementedError(f"Mode {mode!r} is not wired yet")

tests/test_ace_pipeline_lazy.py CHANGED Viewed

@@ -1,39 +1,166 @@
-"""L2 tests for pipeline lazy load — mock the heavy ACE-Step import."""
 from __future__ import annotations
 from unittest.mock import MagicMock
 import ace_pipeline as ap
-def test_get_pipeline_loads_lazily_first_call_only(monkeypatch):
-    fake_pipe = MagicMock(name="fake_ace_pipeline")
-    loader = MagicMock(return_value=fake_pipe)
-    monkeypatch.setattr(ap, "_load_pipeline", loader)
     monkeypatch.setattr(ap, "_PIPELINE", None, raising=False)
     p1 = ap.get_pipeline()
     p2 = ap.get_pipeline()
-    assert p1 is fake_pipe
-    assert p2 is fake_pipe
-    assert loader.call_count == 1, "pipeline should load exactly once"
-def test_get_pipeline_uses_detected_device(monkeypatch):
-    monkeypatch.setattr(ap, "_PIPELINE", None, raising=False)
     monkeypatch.setattr(ap, "detect_device", lambda: "mps")
-    captured = {}
-    def fake_load(device, model_path):
-        captured["device"] = device
-        captured["model_path"] = model_path
-        return MagicMock()
-    monkeypatch.setattr(ap, "_load_pipeline", fake_load)
-    ap.get_pipeline()
-    assert captured["device"] == "mps"
-    assert captured["model_path"] is not None

+"""L2 tests for the ACEStepStudio wrapper — mocks the heavy acestep imports."""
 from __future__ import annotations
+import sys
 from unittest.mock import MagicMock
+import pytest
 import ace_pipeline as ap
+def test_get_pipeline_returns_singleton(monkeypatch):
     monkeypatch.setattr(ap, "_PIPELINE", None, raising=False)
     p1 = ap.get_pipeline()
     p2 = ap.get_pipeline()
+    assert p1 is p2
+    assert isinstance(p1, ap.ACEStepStudio)
+def test_studio_constructor_uses_detected_device(monkeypatch):
+    monkeypatch.setattr(ap, "detect_device", lambda: "mps")
+    studio = ap.ACEStepStudio()
+    assert studio.device == "mps"
+    assert studio.is_loaded is False  # handlers are lazy
+def test_studio_constructor_respects_env_overrides(monkeypatch):
+    monkeypatch.setenv("ACE_DIT_CONFIG", "custom-dit")
+    monkeypatch.setenv("ACE_LM_MODEL", "custom-lm")
+    monkeypatch.setattr(ap, "detect_device", lambda: "cpu")
+    studio = ap.ACEStepStudio()
+    assert studio._dit_config == "custom-dit"
+    assert studio._lm_model == "custom-lm"
+def test_studio_ensure_loaded_constructs_both_handlers(monkeypatch):
+    fake_dit_cls = MagicMock(name="AceStepHandler")
+    fake_lm_cls = MagicMock(name="LLMHandler")
+    fake_dit = MagicMock()
+    fake_lm = MagicMock()
+    fake_dit_cls.return_value = fake_dit
+    fake_lm_cls.return_value = fake_lm
+    handler_mod = MagicMock()
+    handler_mod.AceStepHandler = fake_dit_cls
+    llm_mod = MagicMock()
+    llm_mod.LLMHandler = fake_lm_cls
+    monkeypatch.setitem(sys.modules, "acestep.handler", handler_mod)
+    monkeypatch.setitem(sys.modules, "acestep.llm_inference", llm_mod)
     monkeypatch.setattr(ap, "detect_device", lambda: "mps")
+    studio = ap.ACEStepStudio()
+    studio._ensure_loaded()
+    fake_dit_cls.assert_called_once()
+    fake_lm_cls.assert_called_once()
+    fake_dit.initialize_service.assert_called_once()
+    fake_lm.initialize.assert_called_once()
+    assert fake_dit.initialize_service.call_args.kwargs["device"] == "mps"
+    assert fake_lm.initialize.call_args.kwargs["device"] == "mps"
+    assert fake_dit.initialize_service.call_args.kwargs["config_path"] == "acestep-v15-xl-sft"
+    assert fake_lm.initialize.call_args.kwargs["lm_model_path"] == "acestep-5Hz-lm-0.6B"
+def _install_fake_inference(monkeypatch, success=True, audios=None, error=None):
+    """Plant a fake ``acestep.inference`` module and return the spies."""
+    if audios is None:
+        audios = [{"path": "/tmp/x.wav"}]
+    fake_result = MagicMock(success=success, audios=audios, error=error)
+    fake_generate = MagicMock(return_value=fake_result)
+    captured = {"gp": {}, "gc": {}}
+    def fake_gp(**kw):
+        captured["gp"] = kw
+        return kw
+    def fake_gc(**kw):
+        captured["gc"] = kw
+        return kw
+    fake_inference = MagicMock()
+    fake_inference.generate_music = fake_generate
+    fake_inference.GenerationParams = MagicMock(side_effect=fake_gp)
+    fake_inference.GenerationConfig = MagicMock(side_effect=fake_gc)
+    monkeypatch.setitem(sys.modules, "acestep.inference", fake_inference)
+    return fake_generate, captured
+def test_studio_generate_builds_params_and_calls_generate_music(monkeypatch, tmp_path):
+    out_wav = tmp_path / "out.wav"
+    out_wav.write_bytes(b"RIFF" + b"\0" * 100)
+    fake_generate, captured = _install_fake_inference(monkeypatch, audios=[{"path": str(out_wav)}])
+    studio = ap.ACEStepStudio()
+    studio._dit = MagicMock(name="dit")
+    studio._llm = MagicMock(name="llm")
+    result_path = studio.generate(
+        {
+            "prompt": "psytrance",
+            "lyrics": "[verse]",
+            "duration_s": 30,
+            "instrumental": False,
+            "seed": 42,
+            "loras": [],
+            "advanced": {"steps": 32, "cfg": 4.0, "bpm": 135},
+            "lm": {"thinking": False},
+            "dcw": {},
+        }
+    )
+    assert result_path == str(out_wav)
+    fake_generate.assert_called_once()
+    assert captured["gp"]["caption"] == "psytrance"
+    assert captured["gp"]["duration"] == 30
+    assert captured["gp"]["seed"] == 42
+    assert captured["gp"]["inference_steps"] == 32
+    assert captured["gp"]["bpm"] == 135
+def test_studio_generate_raises_on_failure(monkeypatch):
+    _install_fake_inference(monkeypatch, success=False, audios=[], error="OOM")
+    studio = ap.ACEStepStudio()
+    studio._dit = MagicMock()
+    studio._llm = MagicMock()
+    with pytest.raises(RuntimeError, match="OOM"):
+        studio.generate(
+            {
+                "prompt": "p",
+                "lyrics": "",
+                "duration_s": 5,
+                "instrumental": True,
+                "seed": 1,
+                "advanced": {},
+                "lm": {},
+                "dcw": {},
+            }
+        )
+def test_studio_generate_uses_instrumental_marker_when_lyrics_empty(monkeypatch):
+    _fake_generate, captured = _install_fake_inference(monkeypatch)
+    studio = ap.ACEStepStudio()
+    studio._dit = MagicMock()
+    studio._llm = MagicMock()
+    studio.generate(
+        {
+            "prompt": "drone",
+            "lyrics": "",
+            "duration_s": 5,
+            "instrumental": True,
+            "seed": 1,
+            "advanced": {},
+            "lm": {},
+            "dcw": {},
+        }
+    )
+    # Instrumental + empty lyrics → ACE-Step convention is "[Instrumental]"
+    assert captured["gp"]["lyrics"] == "[Instrumental]"
+    assert captured["gp"]["instrumental"] is True

tests/test_backend.py CHANGED Viewed

@@ -1,4 +1,4 @@
-"""L2 tests for backend.dispatch — pipeline is mocked at the boundary."""
 from __future__ import annotations
@@ -7,12 +7,13 @@ from unittest.mock import MagicMock
 import backend as be
-def test_dispatch_generate_calls_pipeline_with_expected_kwargs(monkeypatch, tmp_path):
-    fake_pipe = MagicMock()
     fake_out = tmp_path / "out.wav"
     fake_out.write_bytes(b"RIFF" + b"\0" * 1000)
-    fake_pipe.return_value = str(fake_out)
     monkeypatch.setattr("ace_pipeline.get_pipeline", lambda: fake_pipe)
     b = be.ACEStepStudioBackend()
@@ -34,13 +35,19 @@ def test_dispatch_generate_calls_pipeline_with_expected_kwargs(monkeypatch, tmp_
     assert out_path == str(fake_out)
     assert meta["mode"] == "generate"
     assert meta["seed"] == 42
-    fake_pipe.assert_called_once()
 def test_dispatch_random_seed_if_zero(monkeypatch, tmp_path):
-    fake_pipe = MagicMock(return_value=str(tmp_path / "x.wav"))
     monkeypatch.setattr("ace_pipeline.get_pipeline", lambda: fake_pipe)
-    (tmp_path / "x.wav").write_bytes(b"RIFF")
     b = be.ACEStepStudioBackend()
     _, meta = b.dispatch(
@@ -59,3 +66,6 @@ def test_dispatch_random_seed_if_zero(monkeypatch, tmp_path):
     )
     assert 1 <= meta["seed"] <= 2_147_483_647

+"""L2 tests for backend.dispatch — pipeline is mocked at the wrapper boundary."""
 from __future__ import annotations
 import backend as be
+def test_dispatch_generate_calls_pipeline_generate(monkeypatch, tmp_path):
+    """Backend should call ``pipe.generate(params)`` and return its path."""
     fake_out = tmp_path / "out.wav"
     fake_out.write_bytes(b"RIFF" + b"\0" * 1000)
+    fake_pipe = MagicMock()
+    fake_pipe.generate.return_value = str(fake_out)
     monkeypatch.setattr("ace_pipeline.get_pipeline", lambda: fake_pipe)
     b = be.ACEStepStudioBackend()
     assert out_path == str(fake_out)
     assert meta["mode"] == "generate"
     assert meta["seed"] == 42
+    fake_pipe.generate.assert_called_once()
+    # The full params dict is forwarded to pipe.generate
+    sent_params = fake_pipe.generate.call_args.args[0]
+    assert sent_params["prompt"] == "psytrance"
+    assert sent_params["seed"] == 42
 def test_dispatch_random_seed_if_zero(monkeypatch, tmp_path):
+    out = tmp_path / "x.wav"
+    out.write_bytes(b"RIFF")
+    fake_pipe = MagicMock()
+    fake_pipe.generate.return_value = str(out)
     monkeypatch.setattr("ace_pipeline.get_pipeline", lambda: fake_pipe)
     b = be.ACEStepStudioBackend()
     _, meta = b.dispatch(
     )
     assert 1 <= meta["seed"] <= 2_147_483_647
+    # The seed-resolved value is the one forwarded to the wrapper
+    sent_params = fake_pipe.generate.call_args.args[0]
+    assert sent_params["seed"] == meta["seed"]

tests/test_smoke_gpu.py CHANGED Viewed

@@ -6,14 +6,16 @@ pipeline. Run before each release tag.
 Skipped automatically in CI by the pyproject ``addopts = -m 'not gpu'``
 default. Requires:
-- ``ace-step`` installed (Apple Silicon fork on Mac, upstream on CUDA)
-- First run downloads ACE-Step 1.5 XL SFT weights (~16 GB) into the HF cache
 - A real MPS / CUDA device — CPU inference is functionally untested
 """
 from __future__ import annotations
-import os
 from pathlib import Path
 import pytest
@@ -21,31 +23,36 @@ import pytest
 pytestmark = pytest.mark.gpu
-def test_generate_minimum_song(tmp_path):
-    """Smallest end-to-end: 5 s instrumental drone, seed=1."""
-    os.environ.setdefault("ACE_MODEL_PATH", "ACE-Step/acestep-v15-xl-sft")
     from backend import ACEStepStudioBackend
     b = ACEStepStudioBackend()
     out_path, meta = b.dispatch(
         mode="generate",
         params={
-            "prompt": "test tone, simple drone",
-            "lyrics": "[intro] tone",
-            "duration_s": 5,
             "instrumental": True,
             "seed": 1,
             "loras": [],
-            "advanced": {},
-            "lm": {},
             "dcw": {},
         },
     )
-    assert Path(out_path).exists()
-    assert Path(out_path).stat().st_size > 0
     assert meta["mode"] == "generate"
     assert meta["seed"] == 1
-    # Wall time should be < 5 min even on first cold run + 16 GB weight download.
-    # Subsequent runs should be < 30 s on M5 Max.
     assert meta["wall_seconds"] > 0

 Skipped automatically in CI by the pyproject ``addopts = -m 'not gpu'``
 default. Requires:
+- ``acestep`` package installed (Apple Silicon fork on Mac, upstream on CUDA)
+- DiT checkpoint at ``./checkpoints/acestep-v15-xl-sft/`` (~16 GB) — download via
+  ``hf download ACE-Step/acestep-v15-xl-sft --local-dir checkpoints/acestep-v15-xl-sft``
+- LM checkpoint at ``./checkpoints/acestep-5Hz-lm-0.6B/`` (~1.4 GB) — download via
+  ``hf download ACE-Step/acestep-5Hz-lm-0.6B --local-dir checkpoints/acestep-5Hz-lm-0.6B``
 - A real MPS / CUDA device — CPU inference is functionally untested
 """
 from __future__ import annotations
 from pathlib import Path
 import pytest
 pytestmark = pytest.mark.gpu
+def test_generate_minimum_song():
+    """Smallest end-to-end: 10 s instrumental drone, seed=1, 16 diffusion steps.
+    Asserts the pipeline produces a non-empty audio file. Wall time on
+    cold start (handlers + weight loading) should be < 5 min on M5 Max
+    with checkpoints pre-downloaded; subsequent calls in the same process
+    are bounded by the diffusion compute itself (~10-30 s for these settings).
+    """
     from backend import ACEStepStudioBackend
     b = ACEStepStudioBackend()
     out_path, meta = b.dispatch(
         mode="generate",
         params={
+            "prompt": "ambient drone, sine pad, slow swell",
+            "lyrics": "",
+            "duration_s": 10,
             "instrumental": True,
             "seed": 1,
             "loras": [],
+            # Tune for smoke speed: fewer steps, lower CFG, skip LM CoT
+            "advanced": {"steps": 16, "cfg": 3.0, "audio_format": "wav"},
+            "lm": {"thinking": False},
             "dcw": {},
         },
     )
+    p = Path(out_path)
+    assert p.exists(), f"generated file missing: {out_path}"
+    assert p.stat().st_size > 0, "generated file is empty"
     assert meta["mode"] == "generate"
     assert meta["seed"] == 1
     assert meta["wall_seconds"] > 0