dimensionalpulsar committed on
Commit
10addd5
·
1 Parent(s): b4b21bd

fix: consolidate Demucs+SeedVC+Mix into single @spaces.GPU function to fix ZeroGPU nested GPU call error; expose _separate_audio_impl; add libsox-dev to packages.txt

Browse files
Files changed (3) hide show
  1. app.py +65 -25
  2. packages.txt +1 -0
  3. pipeline/separation.py +12 -4
app.py CHANGED
@@ -52,8 +52,60 @@ if HF_MODELS_REPO:
52
  logger.info("Almacenamiento de HuggingFace configurado: {}".format(HF_MODELS_REPO))
53
 
54
  from pipeline.training import save_voice_reference, _gpu_warmup
55
- from pipeline.separation import separate_audio
56
- from pipeline.inference import convert_voice
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
  def train_voice_model(audio_file, model_name, progress=gr.Progress()):
59
  """Controlador: guardar referencia de voz."""
@@ -104,17 +156,15 @@ def convert_song(
104
  instrumental_volume,
105
  progress=gr.Progress(),
106
  ):
107
- """Pipeline completo: separar + convertir + mezclar."""
108
  if song_file is None:
109
  return "Error: Por favor, sube un archivo de audio.", None, None, None
110
 
111
  if model_choice == "(ningún modelo)" or not model_choice:
112
  return "Error: Por favor, guarda una referencia de voz primero.", None, None, None
113
 
114
- from pipeline.mixing import mix_audio
115
-
116
  try:
117
- progress(0.05, desc="Cargando el modelo...")
118
  pth_path, ref_or_index = download_model(model_choice)
119
  if not pth_path:
120
  return "Error: Modelo '{}' no encontrado.".format(model_choice), None, None, None
@@ -123,26 +173,16 @@ def convert_song(
123
  if not reference_path:
124
  return "Error: Audio de referencia no encontrado para '{}'.".format(model_choice), None, None, None
125
 
126
- progress(0.10, desc="Separación de pistas (Demucs)...")
127
- vocals_path, instruments_path = separate_audio(song_file)
128
-
129
- progress(0.40, desc="Conversión de voz (Seed-VC)...")
130
 
131
- converted_path = convert_voice(
132
- audio_path=vocals_path,
133
  reference_path=reference_path,
134
- pitch=int(pitch),
135
- diffusion_steps=int(diffusion_steps),
136
- similarity=float(similarity),
137
- )
138
-
139
- progress(0.85, desc="Mezcla final...")
140
-
141
- final_path = mix_audio(
142
- vocals_path=converted_path,
143
- instruments_path=instruments_path,
144
- vocal_volume=float(vocal_volume),
145
- instrumental_volume=float(instrumental_volume),
146
  )
147
 
148
  progress(1.0, desc="¡Terminado!")
@@ -159,7 +199,7 @@ def convert_song(
159
  tb = traceback.format_exc()
160
  logger.error("Error en la conversión: {}".format(tb))
161
  return "Error : {}: {}\n\nDetalles:\n{}".format(
162
- type(e).__name__, str(e), tb[-500:]
163
  ), None, None, None
164
 
165
  def refresh_models():
 
52
  logger.info("Almacenamiento de HuggingFace configurado: {}".format(HF_MODELS_REPO))
53
 
54
  from pipeline.training import save_voice_reference, _gpu_warmup
55
+ from pipeline.separation import _separate_audio_impl
56
+ from pipeline.inference import _convert_voice_impl
57
+ from pipeline.mixing import mix_audio
58
+
59
+ try:
60
+ import spaces
61
+ except ImportError:
62
+ class spaces:
63
+ @staticmethod
64
+ def GPU(duration=60, **kwargs):
65
+ def decorator(fn):
66
+ return fn
67
+ return decorator
68
+
69
+ @spaces.GPU(duration=600)
70
+ def _full_pipeline_gpu(song_file, reference_path, pitch, diffusion_steps, similarity,
71
+ vocal_volume, instrumental_volume):
72
+ """
73
+ Master ZeroGPU function: runs Demucs + Seed-VC + Mix in a single GPU session.
74
+ ZeroGPU does NOT allow calling one @spaces.GPU function from inside another,
75
+ so we consolidate the entire pipeline here.
76
+ """
77
+ import torch
78
+ import os
79
+ import sys
80
+
81
+ # Ensure app dir is in path for the ZeroGPU worker
82
+ app_dir = os.path.dirname(os.path.abspath(__file__))
83
+ if app_dir not in sys.path:
84
+ sys.path.insert(0, app_dir)
85
+ os.chdir(app_dir)
86
+
87
+ # 1. Separate vocals / instruments (Demucs)
88
+ vocals_path, instruments_path = _separate_audio_impl(song_file)
89
+
90
+ # 2. Convert voice (Seed-VC)
91
+ converted_path = _convert_voice_impl(
92
+ audio_path=vocals_path,
93
+ reference_path=reference_path,
94
+ pitch=int(pitch),
95
+ diffusion_steps=int(diffusion_steps),
96
+ similarity=float(similarity),
97
+ )
98
+
99
+ # 3. Mix
100
+ final_path = mix_audio(
101
+ vocals_path=converted_path,
102
+ instruments_path=instruments_path,
103
+ vocal_volume=float(vocal_volume),
104
+ instrumental_volume=float(instrumental_volume),
105
+ )
106
+
107
+ return vocals_path, converted_path, final_path
108
+
109
 
110
  def train_voice_model(audio_file, model_name, progress=gr.Progress()):
111
  """Controlador: guardar referencia de voz."""
 
156
  instrumental_volume,
157
  progress=gr.Progress(),
158
  ):
159
+ """Pipeline completo: separar + convertir + mezclar (single GPU session)."""
160
  if song_file is None:
161
  return "Error: Por favor, sube un archivo de audio.", None, None, None
162
 
163
  if model_choice == "(ningún modelo)" or not model_choice:
164
  return "Error: Por favor, guarda una referencia de voz primero.", None, None, None
165
 
 
 
166
  try:
167
+ progress(0.05, desc="Cargando modelo...")
168
  pth_path, ref_or_index = download_model(model_choice)
169
  if not pth_path:
170
  return "Error: Modelo '{}' no encontrado.".format(model_choice), None, None, None
 
173
  if not reference_path:
174
  return "Error: Audio de referencia no encontrado para '{}'.".format(model_choice), None, None, None
175
 
176
+ progress(0.10, desc="Iniciando pipeline GPU (Demucs + Seed-VC + Mezcla)...")
 
 
 
177
 
178
+ vocals_path, converted_path, final_path = _full_pipeline_gpu(
179
+ song_file=song_file,
180
  reference_path=reference_path,
181
+ pitch=pitch,
182
+ diffusion_steps=diffusion_steps,
183
+ similarity=similarity,
184
+ vocal_volume=vocal_volume,
185
+ instrumental_volume=instrumental_volume,
 
 
 
 
 
 
 
186
  )
187
 
188
  progress(1.0, desc="¡Terminado!")
 
199
  tb = traceback.format_exc()
200
  logger.error("Error en la conversión: {}".format(tb))
201
  return "Error : {}: {}\n\nDetalles:\n{}".format(
202
+ type(e).__name__, str(e), tb[-800:]
203
  ), None, None, None
204
 
205
  def refresh_models():
packages.txt CHANGED
@@ -1,2 +1,3 @@
1
  ffmpeg
2
  libsndfile1-dev
 
 
1
  ffmpeg
2
  libsndfile1-dev
3
+ libsox-dev
pipeline/separation.py CHANGED
@@ -22,11 +22,10 @@ except ImportError:
22
  OUTPUT_DIR = "/tmp/demucs_output"
23
 
24
 
25
- @spaces.GPU(duration=60)
26
- def separate_audio(audio_path: str, model_name: str = "htdemucs_ft"):
27
  """
28
- Separate audio into vocals and instruments using Demucs.
29
- Returns (vocals_path, instruments_path).
30
  """
31
  import torchaudio
32
  from demucs.pretrained import get_model
@@ -96,3 +95,12 @@ def separate_audio(audio_path: str, model_name: str = "htdemucs_ft"):
96
 
97
  logger.info(f"Separation complete. Vocals: {vocals_path}, Instruments: {instruments_path}")
98
  return vocals_path, instruments_path
 
 
 
 
 
 
 
 
 
 
22
  OUTPUT_DIR = "/tmp/demucs_output"
23
 
24
 
25
+ def _separate_audio_impl(audio_path: str, model_name: str = "htdemucs_ft"):
 
26
  """
27
+ Core separation logic (no GPU decorator).
28
+ Called directly from the master @spaces.GPU pipeline in app.py.
29
  """
30
  import torchaudio
31
  from demucs.pretrained import get_model
 
95
 
96
  logger.info(f"Separation complete. Vocals: {vocals_path}, Instruments: {instruments_path}")
97
  return vocals_path, instruments_path
98
+
99
+
100
+ @spaces.GPU(duration=120)
101
+ def separate_audio(audio_path: str, model_name: str = "htdemucs_ft"):
102
+ """
103
+ GPU-decorated standalone wrapper around _separate_audio_impl.
104
+ Use this only when calling separation independently (not from app.py pipeline).
105
+ """
106
+ return _separate_audio_impl(audio_path, model_name)