Spaces:

techfreakworm
/

ACE-Music-Studio

Running on Zero

App Files Files Community

techfreakworm committed 2 days ago

Commit

b992e76

unverified ·

1 Parent(s): a81cc03

fix(post): use lower-level demucs API + stringify stem paths for gr.Files

Browse files

demucs 4.0.x has no demucs.api.Separator (added in 4.1). Refactor
separate_stems() to use demucs.pretrained.get_model +
demucs.apply.apply_model so the installed wheel works without a forced
upgrade of the apple-silicon torch stack. Also stringify PosixPath
values before handing to gr.Files — Gradio 6.14's pydantic FileData
model rejects PosixPath inputs.

Files changed (2) hide show

app.py +3 -1
post_process.py +44 -18

app.py CHANGED Viewed

@@ -529,7 +529,9 @@ def on_separate_stems(audio_path):
         stems = post_process.separate_stems(audio_path)
     except Exception as e:
         raise gr.Error(f"Demucs failed: {e}") from e
-    return gr.Files(value=list(stems.values()), visible=True)
 def on_normalise(audio_path):

         stems = post_process.separate_stems(audio_path)
     except Exception as e:
         raise gr.Error(f"Demucs failed: {e}") from e
+    # gr.Files's pydantic FileData model only accepts str paths in Gradio
+    # 6.14; PosixPath objects from separate_stems() trip its validator.
+    return gr.Files(value=[str(p) for p in stems.values()], visible=True)
 def on_normalise(audio_path):

post_process.py CHANGED Viewed

@@ -11,39 +11,65 @@ _DEMUCS = None
 def _get_demucs() -> Any:
     global _DEMUCS
     if _DEMUCS is None:
-        from demucs.api import Separator
-        _DEMUCS = Separator(model="htdemucs_ft")
     return _DEMUCS
 def separate_stems(audio_path: Path | str) -> dict[str, Path]:
-    """Split into vocals/drums/bass/other via htdemucs_ft.
-    Returns a dict mapping stem name to written file path.
     """
-    sep = _get_demucs()
-    result = sep.separate_audio_file(str(audio_path))
-    # `result` may be either {name: path} OR (origin, separated) tuple
-    # depending on demucs version. Normalise to dict[str, Path].
-    if isinstance(result, dict):
-        return {name: Path(p) for name, p in result.items()}
-    # Newer demucs returns (origin_tensor, separated_dict_of_tensors)
-    # We persist tensors next to the input file with stem suffixes.
     import soundfile as sf
-    _origin, sep_tensors = result
     base = Path(audio_path).with_suffix("")
     stems: dict[str, Path] = {}
-    for name, tensor in sep_tensors.items():
-        out = base.with_name(f"{base.name}.{name}.wav")
-        data = tensor.detach().cpu().numpy()
         if data.ndim == 2 and data.shape[0] in (1, 2):
             data = data.T
-        sf.write(str(out), data, sep.samplerate)
-        stems[name] = out
     return stems

 def _get_demucs() -> Any:
+    """Lazy-load the htdemucs model.
+    Demucs 4.0.x exposes ``demucs.pretrained.get_model`` and
+    ``demucs.apply.apply_model`` — the higher-level
+    ``demucs.api.Separator`` convenience wrapper only appears in 4.1+.
+    We pin to the lower-level API so this works across both pip-installable
+    lines without forcing an upgrade on the apple-silicon torch stack.
+    """
     global _DEMUCS
     if _DEMUCS is None:
+        from demucs.pretrained import get_model
+        _DEMUCS = get_model("htdemucs")
     return _DEMUCS
 def separate_stems(audio_path: Path | str) -> dict[str, Path]:
+    """Split into vocals/drums/bass/other via htdemucs.
+    Uses the lower-level ``demucs.apply.apply_model`` so we don't depend
+    on the ``demucs.api.Separator`` wrapper (which only ships with
+    demucs >= 4.1). Returns a dict mapping stem name to written file path.
     """
     import soundfile as sf
+    import torch
+    import torchaudio
+    from demucs.apply import apply_model
+    model = _get_demucs()
+    target_sr = int(getattr(model, "samplerate", 44100))
+    sources = list(getattr(model, "sources", ["drums", "bass", "other", "vocals"]))
+    audio_channels = int(getattr(model, "audio_channels", 2))
+    waveform, sr = torchaudio.load(str(audio_path))  # (channels, frames)
+    if sr != target_sr:
+        waveform = torchaudio.functional.resample(waveform, sr, target_sr)
+    # Match the model's expected channel count (htdemucs is stereo).
+    if waveform.shape[0] == 1 and audio_channels == 2:
+        waveform = waveform.repeat(2, 1)
+    elif waveform.shape[0] > audio_channels:
+        waveform = waveform[:audio_channels]
+    # apply_model expects shape (batch, channels, frames).
+    batch = waveform.unsqueeze(0)
+    with torch.no_grad():
+        # apply_model returns (batch, sources, channels, frames).
+        out = apply_model(model, batch, device="cpu", progress=False)
+    out = out[0]  # drop batch dim -> (sources, channels, frames)
     base = Path(audio_path).with_suffix("")
     stems: dict[str, Path] = {}
+    for idx, name in enumerate(sources):
+        out_path = base.with_name(f"{base.name}.{name}.wav")
+        data = out[idx].cpu().numpy()
+        # soundfile expects (frames, channels); demucs gives (channels, frames)
         if data.ndim == 2 and data.shape[0] in (1, 2):
             data = data.T
+        sf.write(str(out_path), data, target_sr)
+        stems[name] = out_path
     return stems