Spaces:

dimensionalpulsar
/

voice-clone-rvc

Sleeping

App Files Files Community

voice-clone-rvc / pipeline /separation.py

dimensionalpulsar

feat: add robust logging and diagnostics for ZeroGPU troubleshooting; fix '0 seconds' issue with explicit checks

067e4a7 23 days ago

raw

history blame contribute delete

3.33 kB

	"""
	Audio separation module: uses Demucs to separate vocals from instruments.
	"""

	import os
	import logging
	import torch

	logger = logging.getLogger(__name__)

	try:
	    import spaces
	except ImportError:
	    class spaces:
	        """No-op stand-in used when the `spaces` package cannot be imported.

	        Only the `GPU` decorator is provided. It hands the decorated function
	        back unchanged, so this module still imports and runs without the
	        package installed.
	        """

	        @staticmethod
	        def GPU(*args, duration=60, **kwargs):
	            """Return the decorated function untouched.

	            Both decorator spellings are accepted:

	            * ``@spaces.GPU`` -- the function arrives as the sole positional
	              argument and is returned as-is.
	            * ``@spaces.GPU(duration=120)`` (or the positional form
	              ``spaces.GPU(120)``) -- all options are ignored and an
	              identity decorator is returned.
	            """
	            # A callable passed positionally is the function being decorated
	            # (bare usage). Without this check the function would be
	            # swallowed as `duration` and replaced by the identity decorator.
	            # A callable given as the `duration=` keyword is left alone and
	            # treated as an option like any other.
	            if args and callable(args[0]):
	                return args[0]

	            def decorator(fn):
	                return fn

	            return decorator


	from .constants import RESULTS_DIR
	# Subdirectory of RESULTS_DIR where the separated vocals/instruments WAV files are written.
	OUTPUT_DIR = os.path.join(RESULTS_DIR, "separation")


	def _separate_audio_impl(audio_path: str, model_name: str = "htdemucs_ft"):
	    """
	    Split an audio file into a vocals stem and an instruments stem with Demucs.

	    Core separation logic (no GPU decorator).
	    Called directly from the master @spaces.GPU pipeline in app.py.

	    Args:
	        audio_path: Path of the audio file to separate.
	        model_name: Name of the pretrained Demucs model to load.

	    Returns:
	        Tuple ``(vocals_path, instruments_path)`` with the paths of the two
	        WAV files written under OUTPUT_DIR. File names are derived from the
	        base name of ``audio_path``.

	    Raises:
	        ValueError: If the loaded model has no "vocals" source.
	        Any exception raised by torchaudio or Demucs is propagated; model
	        and GPU cache are released before it leaves this function.
	    """
	    # Heavy third-party imports are kept local to the call.
	    import torchaudio
	    from demucs.pretrained import get_model
	    from demucs.apply import apply_model

	    os.makedirs(OUTPUT_DIR, exist_ok=True)

	    logger.info("Loading Demucs model '%s'...", model_name)
	    model = get_model(model_name)
	    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

	    # Bound up front so the `finally` clause can always delete it.
	    sources = None
	    try:
	        model.to(device)

	        logger.info("Loading audio: %s", audio_path)
	        waveform, sr = torchaudio.load(audio_path)

	        # Resample if needed
	        if sr != model.samplerate:
	            resampler = torchaudio.transforms.Resample(sr, model.samplerate)
	            waveform = resampler(waveform)
	            sr = model.samplerate

	        # Ensure stereo: duplicate a mono channel, keep only the first two
	        # channels of anything wider.
	        if waveform.shape[0] == 1:
	            waveform = waveform.repeat(2, 1)
	        elif waveform.shape[0] > 2:
	            waveform = waveform[:2]

	        # Normalise by the statistics of the channel-averaged signal; the
	        # same statistics are used to undo the scaling on the model output.
	        logger.info("Separating audio...")
	        ref = waveform.mean(0)
	        mean = ref.mean()
	        std = ref.std()
	        # std is NaN for a one-sample (or empty) signal, and NaN compares
	        # False against any threshold, so test finiteness explicitly.
	        if not torch.isfinite(std) or std < 1e-6:
	            std = torch.tensor(1e-6)
	        waveform = (waveform - mean) / std

	        sources = apply_model(
	            model,
	            waveform[None].to(device),
	            device=device,
	            progress=True,
	            num_workers=0,
	        )

	        # Undo the normalisation, remove the batch dimension, and bring all
	        # stems to the CPU in a single transfer.
	        sources = (sources * std + mean)[0].cpu()

	        # The vocals stem is located by name via model.sources.
	        source_names = model.sources
	        vocals_idx = source_names.index("vocals")
	        vocals = sources[vocals_idx]

	        # Instruments = everything except vocals
	        instruments = torch.zeros_like(vocals)
	        for i, name in enumerate(source_names):
	            if name != "vocals":
	                instruments += sources[i]

	        # Save outputs
	        base_name = os.path.splitext(os.path.basename(audio_path))[0]
	        vocals_path = os.path.join(OUTPUT_DIR, f"{base_name}_vocals.wav")
	        instruments_path = os.path.join(OUTPUT_DIR, f"{base_name}_instruments.wav")

	        logger.info(
	            "Saving separated vocals to %s (shape: %s)", vocals_path, vocals.shape
	        )
	        if vocals.numel() == 0:
	            # Diagnostic only: the (empty) file is still written below.
	            logger.error("Vocals tensor is EMPTY!")

	        torchaudio.save(vocals_path, vocals, sr)
	        torchaudio.save(instruments_path, instruments, sr)
	    finally:
	        # Cleanup GPU memory, also when loading/separation/saving failed.
	        del sources, model
	        torch.cuda.empty_cache()

	    logger.info(
	        "Separation complete. Vocals: %s, Instruments: %s",
	        vocals_path,
	        instruments_path,
	    )
	    return vocals_path, instruments_path


	@spaces.GPU(duration=120)
	def separate_audio(audio_path: str, model_name: str = "htdemucs_ft"):
	    """
	    Standalone entry point that runs _separate_audio_impl under @spaces.GPU.

	    Meant for callers that run separation on its own; the app.py pipeline
	    calls _separate_audio_impl directly instead.

	    Args:
	        audio_path: Path of the audio file to separate.
	        model_name: Name of the Demucs model, forwarded unchanged.

	    Returns:
	        Whatever _separate_audio_impl returns for the same arguments.
	    """
	    result = _separate_audio_impl(audio_path, model_name)
	    return result