Spaces:
Sleeping
Sleeping
Commit ·
10addd5
1
Parent(s): b4b21bd
fix: consolidate Demucs+SeedVC+Mix into single @spaces.GPU function to fix ZeroGPU nested GPU call error; expose _separate_audio_impl; add libsox-dev to packages.txt
Browse files
- app.py +65 -25
- packages.txt +1 -0
- pipeline/separation.py +12 -4
app.py
CHANGED
|
@@ -52,8 +52,60 @@ if HF_MODELS_REPO:
|
|
| 52 |
logger.info("Almacenamiento de HuggingFace configurado: {}".format(HF_MODELS_REPO))
|
| 53 |
|
| 54 |
from pipeline.training import save_voice_reference, _gpu_warmup
|
| 55 |
-
from pipeline.separation import
|
| 56 |
-
from pipeline.inference import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
def train_voice_model(audio_file, model_name, progress=gr.Progress()):
|
| 59 |
"""Controlador: guardar referencia de voz."""
|
|
@@ -104,17 +156,15 @@ def convert_song(
|
|
| 104 |
instrumental_volume,
|
| 105 |
progress=gr.Progress(),
|
| 106 |
):
|
| 107 |
-
"""Pipeline completo: separar + convertir + mezclar."""
|
| 108 |
if song_file is None:
|
| 109 |
return "Error: Por favor, sube un archivo de audio.", None, None, None
|
| 110 |
|
| 111 |
if model_choice == "(ningún modelo)" or not model_choice:
|
| 112 |
return "Error: Por favor, guarda una referencia de voz primero.", None, None, None
|
| 113 |
|
| 114 |
-
from pipeline.mixing import mix_audio
|
| 115 |
-
|
| 116 |
try:
|
| 117 |
-
progress(0.05, desc="Cargando
|
| 118 |
pth_path, ref_or_index = download_model(model_choice)
|
| 119 |
if not pth_path:
|
| 120 |
return "Error: Modelo '{}' no encontrado.".format(model_choice), None, None, None
|
|
@@ -123,26 +173,16 @@ def convert_song(
|
|
| 123 |
if not reference_path:
|
| 124 |
return "Error: Audio de referencia no encontrado para '{}'.".format(model_choice), None, None, None
|
| 125 |
|
| 126 |
-
progress(0.10, desc="
|
| 127 |
-
vocals_path, instruments_path = separate_audio(song_file)
|
| 128 |
-
|
| 129 |
-
progress(0.40, desc="Conversión de voz (Seed-VC)...")
|
| 130 |
|
| 131 |
-
converted_path =
|
| 132 |
-
|
| 133 |
reference_path=reference_path,
|
| 134 |
-
pitch=
|
| 135 |
-
diffusion_steps=
|
| 136 |
-
similarity=
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
progress(0.85, desc="Mezcla final...")
|
| 140 |
-
|
| 141 |
-
final_path = mix_audio(
|
| 142 |
-
vocals_path=converted_path,
|
| 143 |
-
instruments_path=instruments_path,
|
| 144 |
-
vocal_volume=float(vocal_volume),
|
| 145 |
-
instrumental_volume=float(instrumental_volume),
|
| 146 |
)
|
| 147 |
|
| 148 |
progress(1.0, desc="¡Terminado!")
|
|
@@ -159,7 +199,7 @@ def convert_song(
|
|
| 159 |
tb = traceback.format_exc()
|
| 160 |
logger.error("Error en la conversión: {}".format(tb))
|
| 161 |
return "Error : {}: {}\n\nDetalles:\n{}".format(
|
| 162 |
-
type(e).__name__, str(e), tb[-
|
| 163 |
), None, None, None
|
| 164 |
|
| 165 |
def refresh_models():
|
|
|
|
| 52 |
logger.info("Almacenamiento de HuggingFace configurado: {}".format(HF_MODELS_REPO))
|
| 53 |
|
| 54 |
from pipeline.training import save_voice_reference, _gpu_warmup
|
| 55 |
+
from pipeline.separation import _separate_audio_impl
|
| 56 |
+
from pipeline.inference import _convert_voice_impl
|
| 57 |
+
from pipeline.mixing import mix_audio
|
| 58 |
+
|
| 59 |
+
try:
|
| 60 |
+
import spaces
|
| 61 |
+
except ImportError:
|
| 62 |
+
class spaces:
|
| 63 |
+
@staticmethod
|
| 64 |
+
def GPU(duration=60, **kwargs):
|
| 65 |
+
def decorator(fn):
|
| 66 |
+
return fn
|
| 67 |
+
return decorator
|
| 68 |
+
|
| 69 |
+
@spaces.GPU(duration=600)
|
| 70 |
+
def _full_pipeline_gpu(song_file, reference_path, pitch, diffusion_steps, similarity,
|
| 71 |
+
vocal_volume, instrumental_volume):
|
| 72 |
+
"""
|
| 73 |
+
Master ZeroGPU function: runs Demucs + Seed-VC + Mix in a single GPU session.
|
| 74 |
+
ZeroGPU does NOT allow calling one @spaces.GPU function from inside another,
|
| 75 |
+
so we consolidate the entire pipeline here.
|
| 76 |
+
"""
|
| 77 |
+
import torch
|
| 78 |
+
import os
|
| 79 |
+
import sys
|
| 80 |
+
|
| 81 |
+
# Ensure app dir is in path for the ZeroGPU worker
|
| 82 |
+
app_dir = os.path.dirname(os.path.abspath(__file__))
|
| 83 |
+
if app_dir not in sys.path:
|
| 84 |
+
sys.path.insert(0, app_dir)
|
| 85 |
+
os.chdir(app_dir)
|
| 86 |
+
|
| 87 |
+
# 1. Separate vocals / instruments (Demucs)
|
| 88 |
+
vocals_path, instruments_path = _separate_audio_impl(song_file)
|
| 89 |
+
|
| 90 |
+
# 2. Convert voice (Seed-VC)
|
| 91 |
+
converted_path = _convert_voice_impl(
|
| 92 |
+
audio_path=vocals_path,
|
| 93 |
+
reference_path=reference_path,
|
| 94 |
+
pitch=int(pitch),
|
| 95 |
+
diffusion_steps=int(diffusion_steps),
|
| 96 |
+
similarity=float(similarity),
|
| 97 |
+
)
|
| 98 |
+
|
| 99 |
+
# 3. Mix
|
| 100 |
+
final_path = mix_audio(
|
| 101 |
+
vocals_path=converted_path,
|
| 102 |
+
instruments_path=instruments_path,
|
| 103 |
+
vocal_volume=float(vocal_volume),
|
| 104 |
+
instrumental_volume=float(instrumental_volume),
|
| 105 |
+
)
|
| 106 |
+
|
| 107 |
+
return vocals_path, converted_path, final_path
|
| 108 |
+
|
| 109 |
|
| 110 |
def train_voice_model(audio_file, model_name, progress=gr.Progress()):
|
| 111 |
"""Controlador: guardar referencia de voz."""
|
|
|
|
| 156 |
instrumental_volume,
|
| 157 |
progress=gr.Progress(),
|
| 158 |
):
|
| 159 |
+
"""Pipeline completo: separar + convertir + mezclar (single GPU session)."""
|
| 160 |
if song_file is None:
|
| 161 |
return "Error: Por favor, sube un archivo de audio.", None, None, None
|
| 162 |
|
| 163 |
if model_choice == "(ningún modelo)" or not model_choice:
|
| 164 |
return "Error: Por favor, guarda una referencia de voz primero.", None, None, None
|
| 165 |
|
|
|
|
|
|
|
| 166 |
try:
|
| 167 |
+
progress(0.05, desc="Cargando modelo...")
|
| 168 |
pth_path, ref_or_index = download_model(model_choice)
|
| 169 |
if not pth_path:
|
| 170 |
return "Error: Modelo '{}' no encontrado.".format(model_choice), None, None, None
|
|
|
|
| 173 |
if not reference_path:
|
| 174 |
return "Error: Audio de referencia no encontrado para '{}'.".format(model_choice), None, None, None
|
| 175 |
|
| 176 |
+
progress(0.10, desc="Iniciando pipeline GPU (Demucs + Seed-VC + Mezcla)...")
|
|
|
|
|
|
|
|
|
|
| 177 |
|
| 178 |
+
vocals_path, converted_path, final_path = _full_pipeline_gpu(
|
| 179 |
+
song_file=song_file,
|
| 180 |
reference_path=reference_path,
|
| 181 |
+
pitch=pitch,
|
| 182 |
+
diffusion_steps=diffusion_steps,
|
| 183 |
+
similarity=similarity,
|
| 184 |
+
vocal_volume=vocal_volume,
|
| 185 |
+
instrumental_volume=instrumental_volume,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
)
|
| 187 |
|
| 188 |
progress(1.0, desc="¡Terminado!")
|
|
|
|
| 199 |
tb = traceback.format_exc()
|
| 200 |
logger.error("Error en la conversión: {}".format(tb))
|
| 201 |
return "Error : {}: {}\n\nDetalles:\n{}".format(
|
| 202 |
+
type(e).__name__, str(e), tb[-800:]
|
| 203 |
), None, None, None
|
| 204 |
|
| 205 |
def refresh_models():
|
packages.txt
CHANGED
|
@@ -1,2 +1,3 @@
|
|
| 1 |
ffmpeg
|
| 2 |
libsndfile1-dev
|
|
|
|
|
|
| 1 |
ffmpeg
|
| 2 |
libsndfile1-dev
|
| 3 |
+
libsox-dev
|
pipeline/separation.py
CHANGED
|
@@ -22,11 +22,10 @@ except ImportError:
|
|
| 22 |
OUTPUT_DIR = "/tmp/demucs_output"
|
| 23 |
|
| 24 |
|
| 25 |
-
|
| 26 |
-
def separate_audio(audio_path: str, model_name: str = "htdemucs_ft"):
|
| 27 |
"""
|
| 28 |
-
|
| 29 |
-
|
| 30 |
"""
|
| 31 |
import torchaudio
|
| 32 |
from demucs.pretrained import get_model
|
|
@@ -96,3 +95,12 @@ def separate_audio(audio_path: str, model_name: str = "htdemucs_ft"):
|
|
| 96 |
|
| 97 |
logger.info(f"Separation complete. Vocals: {vocals_path}, Instruments: {instruments_path}")
|
| 98 |
return vocals_path, instruments_path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
OUTPUT_DIR = "/tmp/demucs_output"
|
| 23 |
|
| 24 |
|
| 25 |
+
def _separate_audio_impl(audio_path: str, model_name: str = "htdemucs_ft"):
|
|
|
|
| 26 |
"""
|
| 27 |
+
Core separation logic (no GPU decorator).
|
| 28 |
+
Called directly from the master @spaces.GPU pipeline in app.py.
|
| 29 |
"""
|
| 30 |
import torchaudio
|
| 31 |
from demucs.pretrained import get_model
|
|
|
|
| 95 |
|
| 96 |
logger.info(f"Separation complete. Vocals: {vocals_path}, Instruments: {instruments_path}")
|
| 97 |
return vocals_path, instruments_path
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
@spaces.GPU(duration=120)
|
| 101 |
+
def separate_audio(audio_path: str, model_name: str = "htdemucs_ft"):
|
| 102 |
+
"""
|
| 103 |
+
GPU-decorated standalone wrapper around _separate_audio_impl.
|
| 104 |
+
Use this only when calling separation independently (not from app.py pipeline).
|
| 105 |
+
"""
|
| 106 |
+
return _separate_audio_impl(audio_path, model_name)
|