Spaces:

dimensionalpulsar
/

voice-clone-rvc

Sleeping

App Files Files Community

dimensionalpulsar committed on 24 days ago

Commit

10addd5

1 Parent(s): b4b21bd

fix: consolidate Demucs+SeedVC+Mix into single @spaces.GPU function to fix ZeroGPU nested GPU call error; expose _separate_audio_impl; add libsox-dev to packages.txt

Browse files

Files changed (3) hide show

app.py +65 -25
packages.txt +1 -0
pipeline/separation.py +12 -4

app.py CHANGED Viewed

@@ -52,8 +52,60 @@ if HF_MODELS_REPO:
     logger.info("Almacenamiento de HuggingFace configurado: {}".format(HF_MODELS_REPO))
 from pipeline.training import save_voice_reference, _gpu_warmup
-from pipeline.separation import separate_audio
-from pipeline.inference import convert_voice
 def train_voice_model(audio_file, model_name, progress=gr.Progress()):
     """Controlador: guardar referencia de voz."""
@@ -104,17 +156,15 @@ def convert_song(
     instrumental_volume,
     progress=gr.Progress(),
 ):
-    """Pipeline completo: separar + convertir + mezclar."""
     if song_file is None:
         return "Error: Por favor, sube un archivo de audio.", None, None, None
     if model_choice == "(ningún modelo)" or not model_choice:
         return "Error: Por favor, guarda una referencia de voz primero.", None, None, None
-    from pipeline.mixing import mix_audio
     try:
-        progress(0.05, desc="Cargando el modelo...")
         pth_path, ref_or_index = download_model(model_choice)
         if not pth_path:
             return "Error: Modelo '{}' no encontrado.".format(model_choice), None, None, None
@@ -123,26 +173,16 @@ def convert_song(
         if not reference_path:
             return "Error: Audio de referencia no encontrado para '{}'.".format(model_choice), None, None, None
-        progress(0.10, desc="Separación de pistas (Demucs)...")
-        vocals_path, instruments_path = separate_audio(song_file)
-        progress(0.40, desc="Conversión de voz (Seed-VC)...")
-        converted_path = convert_voice(
-            audio_path=vocals_path,
             reference_path=reference_path,
-            pitch=int(pitch),
-            diffusion_steps=int(diffusion_steps),
-            similarity=float(similarity),
-        )
-        progress(0.85, desc="Mezcla final...")
-        final_path = mix_audio(
-            vocals_path=converted_path,
-            instruments_path=instruments_path,
-            vocal_volume=float(vocal_volume),
-            instrumental_volume=float(instrumental_volume),
         )
         progress(1.0, desc="¡Terminado!")
@@ -159,7 +199,7 @@ def convert_song(
         tb = traceback.format_exc()
         logger.error("Error en la conversión: {}".format(tb))
         return "Error : {}: {}\n\nDetalles:\n{}".format(
-            type(e).__name__, str(e), tb[-500:]
         ), None, None, None
 def refresh_models():

     logger.info("Almacenamiento de HuggingFace configurado: {}".format(HF_MODELS_REPO))
 from pipeline.training import save_voice_reference, _gpu_warmup
+from pipeline.separation import _separate_audio_impl
+from pipeline.inference import _convert_voice_impl
+from pipeline.mixing import mix_audio
+try:
+    import spaces
+except ImportError:  # `spaces` is not importable: fall back to a local stand-in
+    class spaces:  # stand-in exposing only the `GPU` decorator factory
+        @staticmethod
+        def GPU(duration=60, **kwargs):  # arguments are accepted and ignored
+            def decorator(fn):
+                return fn  # no-op: the decorated function is returned unchanged
+            return decorator  # NOTE(review): only the called form `@spaces.GPU(...)` works with this stub; bare `@spaces.GPU` would bind the function to `duration`
+@spaces.GPU(duration=600)
+def _full_pipeline_gpu(song_file, reference_path, pitch, diffusion_steps, similarity,
+                        vocal_volume, instrumental_volume):
+    """
+    Master ZeroGPU function: runs Demucs + Seed-VC + Mix in a single GPU session.
+    ZeroGPU does NOT allow calling one @spaces.GPU function from inside another,
+    so we consolidate the entire pipeline here.
+
+    Args:
+        song_file: Input audio, passed unchanged to _separate_audio_impl.
+        reference_path: Reference audio, passed unchanged to _convert_voice_impl.
+        pitch: Pitch setting; cast to int before conversion.
+        diffusion_steps: Diffusion step count; cast to int before conversion.
+        similarity: Similarity setting; cast to float before conversion.
+        vocal_volume: Vocal level for the mix; cast to float.
+        instrumental_volume: Instrumental level for the mix; cast to float.
+
+    Returns:
+        Tuple (vocals_path, converted_path, final_path). The instruments path
+        produced by the separation step is used for the mix but not returned.
+
+    Side effects:
+        Inserts the directory of this file at the front of sys.path (if absent)
+        and changes the process working directory to it.
+    """
+    import torch  # NOTE(review): not referenced in this function body; presumably imported for its side effects in the GPU worker — confirm
+    import os
+    import sys
+    # Make the app directory importable and current inside the ZeroGPU worker
+    app_dir = os.path.dirname(os.path.abspath(__file__))
+    if app_dir not in sys.path:
+        sys.path.insert(0, app_dir)
+    os.chdir(app_dir)  # process-wide change; it is not restored afterwards
+    # 1. Separate vocals / instruments (Demucs)
+    vocals_path, instruments_path = _separate_audio_impl(song_file)
+    # 2. Convert the separated vocals using the reference audio (Seed-VC)
+    converted_path = _convert_voice_impl(
+        audio_path=vocals_path,
+        reference_path=reference_path,
+        pitch=int(pitch),
+        diffusion_steps=int(diffusion_steps),
+        similarity=float(similarity),
+    )
+    # 3. Mix the converted vocals with the separated instruments
+    final_path = mix_audio(
+        vocals_path=converted_path,
+        instruments_path=instruments_path,
+        vocal_volume=float(vocal_volume),
+        instrumental_volume=float(instrumental_volume),
+    )
+    return vocals_path, converted_path, final_path
 def train_voice_model(audio_file, model_name, progress=gr.Progress()):
     """Controlador: guardar referencia de voz."""
     instrumental_volume,
     progress=gr.Progress(),
 ):
+    """Pipeline completo: separar + convertir + mezclar (single GPU session)."""
     if song_file is None:
         return "Error: Por favor, sube un archivo de audio.", None, None, None
     if model_choice == "(ningún modelo)" or not model_choice:
         return "Error: Por favor, guarda una referencia de voz primero.", None, None, None
     try:
+        progress(0.05, desc="Cargando modelo...")
         pth_path, ref_or_index = download_model(model_choice)
         if not pth_path:
             return "Error: Modelo '{}' no encontrado.".format(model_choice), None, None, None
         if not reference_path:
             return "Error: Audio de referencia no encontrado para '{}'.".format(model_choice), None, None, None
+        progress(0.10, desc="Iniciando pipeline GPU (Demucs + Seed-VC + Mezcla)...")
+        vocals_path, converted_path, final_path = _full_pipeline_gpu(
+            song_file=song_file,
             reference_path=reference_path,
+            pitch=pitch,
+            diffusion_steps=diffusion_steps,
+            similarity=similarity,
+            vocal_volume=vocal_volume,
+            instrumental_volume=instrumental_volume,
         )
         progress(1.0, desc="¡Terminado!")
         tb = traceback.format_exc()
         logger.error("Error en la conversión: {}".format(tb))
         return "Error : {}: {}\n\nDetalles:\n{}".format(
+            type(e).__name__, str(e), tb[-800:]
         ), None, None, None
 def refresh_models():

packages.txt CHANGED Viewed

@@ -1,2 +1,3 @@
 ffmpeg
 libsndfile1-dev

 ffmpeg
 libsndfile1-dev
+libsox-dev

pipeline/separation.py CHANGED Viewed

@@ -22,11 +22,10 @@ except ImportError:
 OUTPUT_DIR = "/tmp/demucs_output"
-@spaces.GPU(duration=60)
-def separate_audio(audio_path: str, model_name: str = "htdemucs_ft"):
     """
-    Separate audio into vocals and instruments using Demucs.
-    Returns (vocals_path, instruments_path).
     """
     import torchaudio
     from demucs.pretrained import get_model
@@ -96,3 +95,12 @@ def separate_audio(audio_path: str, model_name: str = "htdemucs_ft"):
     logger.info(f"Separation complete. Vocals: {vocals_path}, Instruments: {instruments_path}")
     return vocals_path, instruments_path

 OUTPUT_DIR = "/tmp/demucs_output"
+def _separate_audio_impl(audio_path: str, model_name: str = "htdemucs_ft"):
     """
+    Core separation logic (no GPU decorator).
+    Called directly from the master @spaces.GPU pipeline in app.py.
     """
     import torchaudio
     from demucs.pretrained import get_model
     logger.info(f"Separation complete. Vocals: {vocals_path}, Instruments: {instruments_path}")
     return vocals_path, instruments_path
+@spaces.GPU(duration=120)
+def separate_audio(audio_path: str, model_name: str = "htdemucs_ft"):
+    """
+    GPU-decorated standalone wrapper around _separate_audio_impl.
+    Use this only when calling separation independently (not from app.py pipeline).
+
+    Args:
+        audio_path: Path of the audio to separate; forwarded unchanged.
+        model_name: Model name forwarded to _separate_audio_impl
+            (defaults to "htdemucs_ft").
+
+    Returns:
+        Whatever _separate_audio_impl returns; its visible tail returns
+        (vocals_path, instruments_path).
+    """
+    return _separate_audio_impl(audio_path, model_name)