voice-clone-rvc / pipeline /separation.py
dimensionalpulsar's picture
feat: add robust logging and diagnostics for ZeroGPU troubleshooting; fix '0 seconds' issue with explicit checks
067e4a7
"""
Audio separation module: uses Demucs to separate vocals from instruments.
"""
import os
import logging
import torch
logger = logging.getLogger(__name__)
try:
    import spaces
except ImportError:
    class spaces:
        """No-op stand-in used when the optional ``spaces`` package is missing.

        Only the ``GPU`` decorator is provided; it returns the decorated
        function unchanged so the module still imports and runs without it.
        """

        @staticmethod
        def GPU(*args, duration=60, **kwargs):
            """Return a pass-through decorator (or the function itself).

            Supports both usages:

            * ``@spaces.GPU`` - the function arrives as the single positional
              argument and is returned as-is.
            * ``@spaces.GPU(duration=120)`` - a decorator is returned that
              hands the function back unchanged.

            ``duration`` and any extra keyword arguments are accepted for
            signature compatibility and ignored.
            """
            # Bare-decorator form: the decorated function is passed directly.
            if len(args) == 1 and callable(args[0]) and not kwargs:
                return args[0]

            def decorator(fn):
                return fn

            return decorator
from .constants import RESULTS_DIR
# Directory (under RESULTS_DIR) where the separated vocals/instruments WAV
# files are written; it is created on demand by _separate_audio_impl.
OUTPUT_DIR = os.path.join(RESULTS_DIR, "separation")
def _separate_audio_impl(audio_path: str, model_name: str = "htdemucs_ft"):
    """
    Core separation logic (no GPU decorator).
    Called directly from the master @spaces.GPU pipeline in app.py.

    Loads the requested Demucs model, splits the input into a vocals track
    and an instruments track (the sum of every non-vocal source), and writes
    both as WAV files into ``OUTPUT_DIR``.

    Args:
        audio_path: Path of the audio file to separate.
        model_name: Name of the pretrained Demucs model to load.

    Returns:
        Tuple ``(vocals_path, instruments_path)`` with the written WAV paths.

    Raises:
        ValueError: If the loaded audio contains no samples, or (from
            ``list.index``) if the model exposes no ``"vocals"`` source.
    """
    import torchaudio
    from demucs.pretrained import get_model
    from demucs.apply import apply_model

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    logger.info(f"Loading Demucs model '{model_name}'...")
    model = get_model(model_name)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Bound up front so the cleanup in ``finally`` can always delete it,
    # even when the failure happens before separation starts.
    sources = None

    try:
        model.to(device)

        logger.info(f"Loading audio: {audio_path}")
        waveform, sr = torchaudio.load(audio_path)

        # Fail fast on empty audio: the normalization statistics below are
        # undefined for zero samples and would slip past the silence guard.
        if waveform.numel() == 0:
            raise ValueError(f"Input audio contains no samples: {audio_path}")

        # Resample if needed
        if sr != model.samplerate:
            resampler = torchaudio.transforms.Resample(sr, model.samplerate)
            waveform = resampler(waveform)
            sr = model.samplerate

        # Ensure stereo: duplicate a mono channel, drop channels beyond two
        if waveform.shape[0] == 1:
            waveform = waveform.repeat(2, 1)
        elif waveform.shape[0] > 2:
            waveform = waveform[:2]

        # Apply model on a normalized signal; the same mean/std are used
        # afterwards to bring the separated sources back to the input scale.
        logger.info("Separating audio...")
        ref = waveform.mean(0)
        mean = ref.mean()
        std = ref.std()
        if std < 1e-6:
            # (Near-)silent input: avoid dividing by ~0
            std = torch.tensor(1e-6)
        waveform = (waveform - mean) / std

        sources = apply_model(
            model,
            waveform[None].to(device),
            device=device,
            progress=True,
            num_workers=0,
        )
        sources = sources * std + mean
        sources = sources[0]  # Remove batch dimension

        # Locate vocals by name rather than relying on a fixed source order
        source_names = model.sources
        vocals_idx = source_names.index("vocals")
        vocals = sources[vocals_idx].cpu()

        # Instruments = everything except vocals
        instruments = torch.zeros_like(vocals)
        for i, name in enumerate(source_names):
            if name != "vocals":
                instruments += sources[i].cpu()

        # Save outputs
        base_name = os.path.splitext(os.path.basename(audio_path))[0]
        vocals_path = os.path.join(OUTPUT_DIR, f"{base_name}_vocals.wav")
        instruments_path = os.path.join(OUTPUT_DIR, f"{base_name}_instruments.wav")

        logger.info(f"Saving separated vocals to {vocals_path} (shape: {vocals.shape})")
        if vocals.numel() == 0:
            logger.error("Vocals tensor is EMPTY!")

        torchaudio.save(vocals_path, vocals, sr)
        torchaudio.save(instruments_path, instruments, sr)
    finally:
        # Cleanup GPU memory on success AND on failure, so an error during
        # loading/separation/saving does not leave the model and source
        # tensors referenced by this frame.
        del sources, model
        torch.cuda.empty_cache()

    logger.info(f"Separation complete. Vocals: {vocals_path}, Instruments: {instruments_path}")
    return vocals_path, instruments_path
@spaces.GPU(duration=120)
def separate_audio(audio_path: str, model_name: str = "htdemucs_ft"):
    """
    Standalone, GPU-decorated entry point for audio separation.

    Forwards both arguments unchanged to _separate_audio_impl and returns
    whatever it returns. Intended for independent use; the app.py pipeline
    calls _separate_audio_impl directly instead.
    """
    result = _separate_audio_impl(audio_path=audio_path, model_name=model_name)
    return result