dimensionalpulsar committed on
Commit
662c1d3
·
2 Parent(s): 73e9040 915c9a5

Resolve merge conflict: combine GPU optimization with Spanish UI and refresh button

Browse files
Files changed (3) hide show
  1. app.py +264 -481
  2. pipeline/inference.py +10 -2
  3. pipeline/separation.py +8 -0
app.py CHANGED
@@ -1,481 +1,264 @@
1
- import os
2
- import sys
3
- import logging
4
- import tempfile
5
- import shutil
6
- import gradio as gr
7
-
8
- try:
9
- import gradio_client.utils as _gc_utils
10
-
11
- _orig_get_type = _gc_utils.get_type
12
-
13
- def _patched_get_type(schema, *args, **kwargs):
14
- if not isinstance(schema, dict):
15
- return "Any"
16
- return _orig_get_type(schema, *args, **kwargs)
17
-
18
- _gc_utils.get_type = _patched_get_type
19
-
20
- _orig_json_schema = _gc_utils._json_schema_to_python_type
21
-
22
- def _patched_json_schema(schema, *args, **kwargs):
23
- if not isinstance(schema, dict):
24
- return "Any"
25
- return _orig_json_schema(schema, *args, **kwargs)
26
-
27
- _gc_utils._json_schema_to_python_type = _patched_json_schema
28
- _gc_utils.json_schema_to_python_type = lambda schema, defs=None: _patched_json_schema(
29
- schema, defs
30
- )
31
- except Exception:
32
- pass
33
-
34
- # Configuración de logs
35
- logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
36
- logger = logging.getLogger(__name__)
37
-
38
- # Inicio: clonar Seed-VC
39
- logger.info("Inicializando la aplicación...")
40
-
41
- from pipeline.setup import setup_seed_vc
42
- from pipeline.storage import init_storage, list_models, download_model, delete_model, get_reference_path
43
-
44
- try:
45
- setup_seed_vc()
46
- except Exception as e:
47
- logger.error("Error durante la configuración: {}".format(e))
48
-
49
- HF_MODELS_REPO = os.environ.get("HF_MODELS_REPO", "")
50
- if HF_MODELS_REPO:
51
- init_storage(HF_MODELS_REPO)
52
- logger.info("Almacenamiento de HuggingFace configurado: {}".format(HF_MODELS_REPO))
53
-
54
- from pipeline.training import save_voice_reference, _gpu_warmup
55
- from pipeline.separation import _separate_audio_impl
56
- from pipeline.inference import _convert_voice_impl
57
- from pipeline.mixing import mix_audio
58
-
59
- try:
60
- import spaces
61
- except ImportError:
62
- class spaces:
63
- @staticmethod
64
- def GPU(duration=60, **kwargs):
65
- def decorator(fn):
66
- return fn
67
- return decorator
68
-
69
- @spaces.GPU(duration=600)
70
- def _full_pipeline_gpu(song_file, reference_path, pitch, diffusion_steps, similarity,
71
- vocal_volume, instrumental_volume):
72
- """
73
- Master ZeroGPU function: runs Demucs + Seed-VC + Mix in a single GPU session.
74
- ZeroGPU does NOT allow calling one @spaces.GPU function from inside another,
75
- so we consolidate the entire pipeline here.
76
- """
77
- import torch
78
- import os
79
- import sys
80
-
81
- # Ensure app dir is in path for the ZeroGPU worker
82
- app_dir = os.path.dirname(os.path.abspath(__file__))
83
- if app_dir not in sys.path:
84
- sys.path.insert(0, app_dir)
85
- os.chdir(app_dir)
86
-
87
- # 1. Separate vocals / instruments (Demucs)
88
- vocals_path, instruments_path = _separate_audio_impl(song_file)
89
-
90
- # 2. Convert voice (Seed-VC)
91
- converted_path = _convert_voice_impl(
92
- audio_path=vocals_path,
93
- reference_path=reference_path,
94
- pitch=int(pitch),
95
- diffusion_steps=int(diffusion_steps),
96
- similarity=float(similarity),
97
- )
98
-
99
- # 3. Mix
100
- final_path = mix_audio(
101
- vocals_path=converted_path,
102
- instruments_path=instruments_path,
103
- vocal_volume=float(vocal_volume),
104
- instrumental_volume=float(instrumental_volume),
105
- )
106
-
107
- import librosa
108
- # Load back the audio data to return it directly.
109
- # This bypasses ZeroGPU filesystem sync issues.
110
- v_data, v_sr = librosa.load(vocals_path, sr=None)
111
- c_data, c_sr = librosa.load(converted_path, sr=None)
112
- f_data, f_sr = librosa.load(final_path, sr=None)
113
-
114
- return (v_sr, v_data), (c_sr, c_data), (f_sr, f_data)
115
-
116
-
117
- def train_voice_model(audio_file, model_name, progress=gr.Progress()):
118
- """Controlador: guardar referencia de voz."""
119
- if audio_file is None:
120
- return "Error: Por favor, sube un archivo de audio.", None
121
-
122
- if not model_name or not model_name.strip():
123
- return "Error: Por favor, ingresa un nombre para el modelo.", None
124
-
125
- model_name = model_name.strip().replace(" ", "_")
126
-
127
- def progress_callback(value, desc):
128
- progress(value, desc=desc)
129
-
130
- try:
131
- progress(0.0, desc="Iniciando...")
132
- pth_path, ref_path = save_voice_reference(
133
- audio_path=audio_file,
134
- model_name=model_name,
135
- progress_callback=progress_callback,
136
- )
137
-
138
- return "¡Referencia de voz '{}' guardada con éxito!".format(model_name), ref_path
139
-
140
- except Exception as e:
141
- import traceback
142
- tb = traceback.format_exc()
143
- logger.error("Error en el entrenamiento: {}".format(tb))
144
- return "Error : {}: {}\n\nDetalles:\n{}".format(
145
- type(e).__name__, str(e), tb[-500:]
146
- ), None
147
-
148
- def get_model_choices():
149
- """Obtener lista de nombres de modelos entrenados para el menú desplegable."""
150
- models = list_models()
151
- if not models:
152
- return ["(ningún modelo)"]
153
- return models
154
-
155
-
156
- def convert_song(
157
- model_choice,
158
- song_file,
159
- pitch,
160
- similarity,
161
- diffusion_steps,
162
- vocal_volume,
163
- instrumental_volume,
164
- progress=gr.Progress(),
165
- ):
166
- """Pipeline completo: separar + convertir + mezclar (single GPU session)."""
167
- if song_file is None:
168
- return "Error: Por favor, sube un archivo de audio.", None, None, None
169
-
170
- if model_choice == "(ningún modelo)" or not model_choice:
171
- return "Error: Por favor, guarda una referencia de voz primero.", None, None, None
172
-
173
- try:
174
- progress(0.05, desc="Cargando modelo...")
175
- pth_path, ref_or_index = download_model(model_choice)
176
- if not pth_path:
177
- return "Error: Modelo '{}' no encontrado.".format(model_choice), None, None, None
178
-
179
- reference_path = get_reference_path(model_choice)
180
- if not reference_path:
181
- return "Error: Audio de referencia no encontrado para '{}'.".format(model_choice), None, None, None
182
-
183
- progress(0.10, desc="Iniciando pipeline GPU (Demucs + Seed-VC + Mezcla)...")
184
-
185
- vocals_path, converted_path, final_path = _full_pipeline_gpu(
186
- song_file=song_file,
187
- reference_path=reference_path,
188
- pitch=pitch,
189
- diffusion_steps=diffusion_steps,
190
- similarity=similarity,
191
- vocal_volume=vocal_volume,
192
- instrumental_volume=instrumental_volume,
193
- )
194
-
195
- progress(1.0, desc="¡Terminado!")
196
-
197
- return (
198
- "¡Conversión completada con éxito!",
199
- vocals_path,
200
- converted_path,
201
- final_path,
202
- )
203
-
204
- except Exception as e:
205
- import traceback
206
- tb = traceback.format_exc()
207
- logger.error("Error en la conversión: {}".format(tb))
208
- return "Error : {}: {}\n\nDetalles:\n{}".format(
209
- type(e).__name__, str(e), tb[-800:]
210
- ), None, None, None
211
-
212
- def refresh_models():
213
- """Actualizar la lista de modelos como HTML."""
214
- models = list_models()
215
- if not models:
216
- return "<p style='color:gray;'>Ningún modelo guardado</p>"
217
- rows = "".join(
218
- "<tr><td>{}</td><td>Disponible</td></tr>".format(m) for m in models
219
- )
220
- return (
221
- "<table style='width:100%;border-collapse:collapse;'>"
222
- "<tr><th style='text-align:left;border-bottom:1px solid #555;padding:8px;'>Nombre</th>"
223
- "<th style='text-align:left;border-bottom:1px solid #555;padding:8px;'>Estado</th></tr>"
224
- "{}</table>".format(rows)
225
- )
226
-
227
-
228
- def delete_selected_model(model_name_to_delete):
229
- """Eliminar un modelo."""
230
- if not model_name_to_delete or model_name_to_delete == "(ningún modelo)":
231
- return "Por favor, selecciona un modelo para eliminar.", refresh_models()
232
- try:
233
- delete_model(model_name_to_delete)
234
- return "Modelo '{}' eliminado.".format(model_name_to_delete), refresh_models()
235
- except Exception as e:
236
- return "Error : {}".format(e), refresh_models()
237
-
238
- with gr.Blocks(
239
- title="Clon de Voz",
240
- theme=gr.themes.Soft(),
241
- ) as app:
242
-
243
- gr.Markdown(
244
- "# 🎤 Aplicación de Clonación de Voz (Seed-VC)\n"
245
- "> Powered by [Seed-VC](https://github.com/Plachta/seed-vc) + [Demucs](https://github.com/facebookresearch/demucs) · ZeroGPU · Zero-shot"
246
- )
247
-
248
- with gr.Tabs():
249
- # Pestaña 1: Referencia de voz
250
- with gr.TabItem("Mi voz"):
251
- gr.Markdown("### Guardar tu referencia de voz")
252
-
253
- with gr.Row():
254
- with gr.Column(scale=2):
255
- train_audio = gr.Audio(
256
- label="Extracto de tu voz (WAV o MP3, 3-30 segundos)",
257
- type="filepath",
258
- sources=["upload"],
259
- )
260
- train_model_name = gr.Textbox(
261
- label="Nombre del perfil",
262
- placeholder="ej: mi_voz",
263
- max_lines=1,
264
- )
265
- train_btn = gr.Button(
266
- "Guardar",
267
- variant="primary",
268
- size="lg",
269
- )
270
-
271
- with gr.Column(scale=1):
272
- train_status = gr.Textbox(
273
- label="Estado",
274
- interactive=False,
275
- lines=3,
276
- )
277
- train_download = gr.File(
278
- label="Archivo de referencia",
279
- interactive=False,
280
- )
281
-
282
- gr.Markdown(
283
- "**Consejos:**\n"
284
- "- Usa una grabación limpia (sin ruido de fondo, sin música)\n"
285
- "- Habla o canta naturalmente durante 3 a 30 segundos\n"
286
- "- Mientras más largo y variado sea el extracto, mejor será el resultado\n"
287
- "- Se aceptan formatos WAV o MP3"
288
- )
289
-
290
- with gr.Accordion("Perfiles guardados", open=False):
291
- clonacion_refresh_btn = gr.Button("🔄 Actualizar lista de perfiles", size="sm")
292
- clonacion_models_table = gr.HTML(value=refresh_models())
293
-
294
- train_btn.click(
295
- fn=train_voice_model,
296
- inputs=[train_audio, train_model_name],
297
- outputs=[train_status, train_download],
298
- ).then(
299
- fn=refresh_models,
300
- outputs=[clonacion_models_table],
301
- ).then(
302
- fn=refresh_models,
303
- outputs=[models_table],
304
- ).then(
305
- fn=lambda: gr.Dropdown(choices=get_model_choices()),
306
- outputs=[convert_model],
307
- ).then(
308
- fn=lambda: gr.Dropdown(choices=get_model_choices()),
309
- outputs=[models_delete_name],
310
- )
311
-
312
- clonacion_refresh_btn.click(
313
- fn=refresh_models,
314
- outputs=[clonacion_models_table],
315
- )
316
-
317
- # Pestaña 2: Conversión
318
- with gr.TabItem("Convertir una canción"):
319
- gr.Markdown("### Reemplazar la voz de una canción por la tuya")
320
-
321
- with gr.Row():
322
- with gr.Column(scale=2):
323
- convert_model = gr.Dropdown(
324
- choices=get_model_choices(),
325
- label="Perfil de voz",
326
- interactive=True,
327
- )
328
- refresh_btn = gr.Button("Actualizar lista", size="sm")
329
- convert_audio = gr.Audio(
330
- label="Canción a convertir (WAV o MP3)",
331
- type="filepath",
332
- sources=["upload"],
333
- )
334
-
335
- with gr.Accordion("Parámetros avanzados", open=False):
336
- convert_pitch = gr.Slider(
337
- minimum=-24,
338
- maximum=24,
339
- value=0,
340
- step=1,
341
- label="Transposición (semitonos)",
342
- )
343
- convert_similarity = gr.Slider(
344
- minimum=0.0,
345
- maximum=1.0,
346
- value=0.7,
347
- step=0.05,
348
- label="Similitud de voz (0.5=natural, 0.7=equilibrado, 0.9=más fiel)",
349
- )
350
- convert_diffusion = gr.Slider(
351
- minimum=5,
352
- maximum=100,
353
- value=25,
354
- step=5,
355
- label="Calidad (10=rápido, 25=equilibrado, 50=alta calidad)",
356
- )
357
- convert_vocal_vol = gr.Slider(
358
- minimum=0.0,
359
- maximum=2.0,
360
- value=1.0,
361
- step=0.1,
362
- label="Volumen de la voz",
363
- )
364
- convert_inst_vol = gr.Slider(
365
- minimum=0.0,
366
- maximum=2.0,
367
- value=1.0,
368
- step=0.1,
369
- label="Volumen de los instrumentos",
370
- )
371
-
372
- convert_btn = gr.Button(
373
- "Convertir y mezclar",
374
- variant="primary",
375
- size="lg",
376
- )
377
-
378
- with gr.Column(scale=1):
379
- convert_status = gr.Textbox(
380
- label="Estado",
381
- interactive=False,
382
- lines=3,
383
- )
384
- gr.Markdown("**Vista previa de las pistas:**")
385
- preview_vocals = gr.Audio(
386
- label="Voz original (separada)",
387
- interactive=False,
388
- )
389
- preview_converted = gr.Audio(
390
- label="Voz convertida",
391
- interactive=False,
392
- )
393
- gr.Markdown("**Resultado final:**")
394
- final_output = gr.Audio(
395
- label="Canción final (voz + instrumentos)",
396
- interactive=False,
397
- )
398
-
399
- refresh_btn.click(
400
- fn=lambda: gr.Dropdown(choices=get_model_choices()),
401
- outputs=[convert_model],
402
- )
403
-
404
- convert_btn.click(
405
- fn=convert_song,
406
- inputs=[
407
- convert_model,
408
- convert_audio,
409
- convert_pitch,
410
- convert_similarity,
411
- convert_diffusion,
412
- convert_vocal_vol,
413
- convert_inst_vol,
414
- ],
415
- outputs=[convert_status, preview_vocals, preview_converted, final_output],
416
- )
417
-
418
- # Pestaña 3: Modelos
419
- with gr.TabItem("Mis modelos"):
420
- gr.Markdown("### Gestionar tus perfiles de voz")
421
-
422
- models_table = gr.HTML(
423
- value=refresh_models(),
424
- label="Modelos guardados",
425
- )
426
-
427
- with gr.Row():
428
- models_refresh_btn = gr.Button("Actualizar", size="sm")
429
- models_delete_name = gr.Dropdown(
430
- choices=get_model_choices(),
431
- label="Modelo a eliminar",
432
- interactive=True,
433
- )
434
- models_delete_btn = gr.Button("Eliminar", variant="stop", size="sm")
435
-
436
- models_delete_status = gr.Textbox(label="Estado", interactive=False)
437
-
438
- models_refresh_btn.click(
439
- fn=refresh_models,
440
- outputs=[models_table],
441
- )
442
- models_refresh_btn.click(
443
- fn=lambda: gr.Dropdown(choices=get_model_choices()),
444
- outputs=[models_delete_name],
445
- )
446
-
447
- models_delete_btn.click(
448
- fn=delete_selected_model,
449
- inputs=[models_delete_name],
450
- outputs=[models_delete_status, models_table],
451
- )
452
-
453
- # Pestaña 4: Debug (temporal)
454
- with gr.TabItem("Depuración GPU"):
455
- gr.Markdown("### Logs del Trabajador GPU (para diagnóstico)")
456
- debug_output = gr.Textbox(
457
- label="Últimos logs de GPU",
458
- interactive=False,
459
- lines=20,
460
- )
461
- debug_btn = gr.Button("Leer los logs", size="sm")
462
-
463
- def read_debug_log():
464
- log_path = "/home/user/app/debug_gpu.log"
465
- if os.path.exists(log_path):
466
- with open(log_path, "r") as f:
467
- return f.read()
468
- return "Ningún log disponible. Ejecuta una conversión primero."
469
-
470
- debug_btn.click(fn=read_debug_log, outputs=[debug_output])
471
-
472
-
473
- if __name__ == "__main__":
474
- os.makedirs("./results", exist_ok=True)
475
- os.makedirs("./checkpoints/models", exist_ok=True)
476
- app.launch(
477
- allowed_paths=[
478
- os.path.abspath("./results"),
479
- os.path.abspath("./checkpoints"),
480
- ]
481
- )
 
1
+ import os
2
+ import sys
3
+ import logging
4
+ import tempfile
5
+ import shutil
6
+ import gradio as gr
7
+ import gc
8
+ import time
9
+ import numpy as np
10
+ import torch
11
+
12
+ # Patches para Gradio
13
+ try:
14
+ import gradio_client.utils as _gc_utils
15
+ _orig_get_type = _gc_utils.get_type
16
+ def _patched_get_type(schema, *args, **kwargs):
17
+ if not isinstance(schema, dict): return "Any"
18
+ return _orig_get_type(schema, *args, **kwargs)
19
+ _gc_utils.get_type = _patched_get_type
20
+ _orig_json_schema = _gc_utils._json_schema_to_python_type
21
+ def _patched_json_schema(schema, *args, **kwargs):
22
+ if not isinstance(schema, dict): return "Any"
23
+ return _orig_json_schema(schema, *args, **kwargs)
24
+ _gc_utils._json_schema_to_python_type = _patched_json_schema
25
+ _gc_utils.json_schema_to_python_type = lambda schema, defs=None: _patched_json_schema(schema, defs)
26
+ except Exception:
27
+ pass
28
+
29
+ # Configuración de logs
30
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
31
+ logger = logging.getLogger(__name__)
32
+
33
+ from pipeline.setup import setup_seed_vc
34
+ from pipeline.storage import init_storage, list_models, download_model, delete_model, get_reference_path
35
+ from pipeline.training import save_voice_reference
36
+ from pipeline.separation import _separate_audio_impl
37
+ from pipeline.inference import _convert_voice_impl
38
+ from pipeline.mixing import mix_audio
39
+
40
+ try:
41
+ import spaces
42
+ except ImportError:
43
+ class spaces:
44
+ @staticmethod
45
+ def GPU(duration=60, **kwargs):
46
+ def decorator(fn): return fn
47
+ return decorator
48
+
49
+ def check_file(path, label, logs):
50
+ if os.path.exists(path):
51
+ size = os.path.getsize(path)
52
+ logs.append(f" {label} generado: {os.path.basename(path)} ({size} bytes)")
53
+ return size > 44
54
+ else:
55
+ logs.append(f"❌ ERROR: {label} NO se encontró en {path}")
56
+ return False
57
+
58
+ @spaces.GPU(duration=600)
59
+ def _full_pipeline_gpu(song_file, reference_path, pitch, diffusion_steps, similarity,
60
+ vocal_volume, instrumental_volume):
61
+ import torch
62
+ import librosa
63
+ import soundfile as sf
64
+
65
+ logs = []
66
+ logs.append(f"🚀 Iniciando pipeline en GPU...")
67
+
68
+ # Asegurar directorio de trabajo
69
+ app_dir = os.path.dirname(os.path.abspath(__file__))
70
+ os.chdir(app_dir)
71
+
72
+ try:
73
+ # 1. Separación
74
+ logs.append("⏳ Paso 1/3: Separando voces (Demucs)...")
75
+ vocals_path, instruments_path = _separate_audio_impl(song_file)
76
+ if not check_file(vocals_path, "Vocales", logs): return None, None, None, "\n".join(logs)
77
+
78
+ torch.cuda.empty_cache()
79
+ gc.collect()
80
+
81
+ # 2. Conversión
82
+ logs.append("⏳ Paso 2/3: Convirtiendo voz (Seed-VC)...")
83
+ converted_path = _convert_voice_impl(vocals_path, reference_path, int(pitch), int(diffusion_steps), float(similarity))
84
+ if not check_file(converted_path, "Voz convertida", logs): return None, None, None, "\n".join(logs)
85
+
86
+ torch.cuda.empty_cache()
87
+ gc.collect()
88
+
89
+ # 3. Mezcla
90
+ logs.append("⏳ Paso 3/3: Mezclando pistas...")
91
+ final_path = mix_audio(converted_path, instruments_path, float(vocal_volume), float(instrumental_volume))
92
+ if not check_file(final_path, "Resultado final", logs): return None, None, None, "\n".join(logs)
93
+
94
+ # 4. Retornar DATOS (para evitar problemas de sincronización de archivos en ZeroGPU)
95
+ logs.append("📦 Preparando audios para el reproductor...")
96
+
97
+ def load_audio_to_numpy(p):
98
+ data, sr = librosa.load(p, sr=None)
99
+ data = np.nan_to_num(data)
100
+ return (sr, data.astype(np.float32))
101
+
102
+ v_out = load_audio_to_numpy(vocals_path)
103
+ c_out = load_audio_to_numpy(converted_path)
104
+ f_out = load_audio_to_numpy(final_path)
105
+
106
+ logs.append("✨ Proceso completado. Enviando al navegador...")
107
+ return v_out, c_out, f_out, "\n".join(logs)
108
+
109
+ except Exception as e:
110
+ import traceback
111
+ logs.append(f"💥 ERROR: {str(e)}\n{traceback.format_exc()}")
112
+ return None, None, None, "\n".join(logs)
113
+
114
+ def train_voice_model(audio_file, model_name, progress=gr.Progress()):
115
+ if not audio_file or not model_name: return "Error: Datos incompletos.", None
116
+ model_name = model_name.strip().replace(" ", "_")
117
+ try:
118
+ pth_path, ref_path = save_voice_reference(audio_path=audio_file, model_name=model_name)
119
+ return f"¡Perfil '{model_name}' guardado!", ref_path
120
+ except Exception as e:
121
+ return f"Error: {str(e)}", None
122
+
123
+ def get_model_choices():
124
+ models = list_models()
125
+ if not models:
126
+ return ["(ningún modelo)"]
127
+ return models
128
+
129
+ def refresh_models():
130
+ models = list_models()
131
+ if not models:
132
+ return "<p style='color:gray;'>Ningún modelo guardado</p>"
133
+ rows = "".join(
134
+ "<tr><td>{}</td><td>Disponible</td></tr>".format(m) for m in models
135
+ )
136
+ return (
137
+ "<table style='width:100%;border-collapse:collapse;'>"
138
+ "<tr><th style='text-align:left;border-bottom:1px solid #555;padding:8px;'>Nombre</th>"
139
+ "<th style='text-align:left;border-bottom:1px solid #555;padding:8px;'>Estado</th></tr>"
140
+ "{}</table>".format(rows)
141
+ )
142
+
143
+ def delete_selected_model(model_name_to_delete):
144
+ if not model_name_to_delete or model_name_to_delete == "(ningún modelo)":
145
+ return "Por favor, selecciona un modelo para eliminar.", refresh_models()
146
+ try:
147
+ delete_model(model_name_to_delete)
148
+ return "Modelo '{}' eliminado.".format(model_name_to_delete), refresh_models()
149
+ except Exception as e:
150
+ return "Error : {}".format(e), refresh_models()
151
+
152
+ def convert_song(model_choice, song_file, pitch, similarity, diffusion_steps, vocal_volume, instrumental_volume, progress=gr.Progress()):
153
+ if not song_file or not model_choice or model_choice == "(ningún modelo)":
154
+ return "Error: Faltan datos.", None, None, None, "Esperando..."
155
+
156
+ try:
157
+ progress(0.1, desc="Iniciando...")
158
+ reference_path = get_reference_path(model_choice)
159
+
160
+ v_out, c_out, f_out, logs = _full_pipeline_gpu(
161
+ song_file, reference_path, pitch, diffusion_steps, similarity, vocal_volume, instrumental_volume
162
+ )
163
+
164
+ status = "✅ Completado" if f_out is not None else "❌ Error (revisa logs)"
165
+ return status, v_out, c_out, f_out, logs
166
+
167
+ except Exception as e:
168
+ import traceback
169
+ return f"Error: {str(e)}", None, None, None, traceback.format_exc()
170
+
171
+ # --- UI Layout ---
172
+ with gr.Blocks(title="Voice Clone RVC", theme=gr.themes.Soft()) as app:
173
+ gr.Markdown("# 🎤 Aplicación de Clonación de Voz (Seed-VC)\n> Powered by Seed-VC + Demucs · ZeroGPU")
174
+
175
+ with gr.Tabs():
176
+ # Pestaña 1: Perfil
177
+ with gr.TabItem("1. Perfil"):
178
+ gr.Markdown("### Guardar tu referencia de voz")
179
+ with gr.Row():
180
+ with gr.Column():
181
+ train_audio = gr.Audio(label="Sube tu voz (3-30 seg)", type="filepath")
182
+ train_name = gr.Textbox(label="Nombre del perfil", placeholder="ej: mi_voz")
183
+ train_btn = gr.Button("Guardar Perfil", variant="primary")
184
+ with gr.Column():
185
+ train_status = gr.Textbox(label="Estado")
186
+ train_file = gr.File(label="Archivo de Referencia")
187
+
188
+ with gr.Accordion("📋 Perfiles guardados", open=False):
189
+ clonacion_refresh_btn = gr.Button("🔄 Actualizar lista", size="sm")
190
+ clonacion_models_table = gr.HTML(value=refresh_models())
191
+
192
+ train_btn.click(
193
+ fn=train_voice_model,
194
+ inputs=[train_audio, train_name],
195
+ outputs=[train_status, train_file]
196
+ ).then(
197
+ fn=refresh_models, outputs=[clonacion_models_table]
198
+ ).then(
199
+ fn=lambda: gr.Dropdown(choices=get_model_choices()), outputs=[clonacion_refresh_btn] # Dummy to trigger something if needed
200
+ )
201
+
202
+ clonacion_refresh_btn.click(fn=refresh_models, outputs=[clonacion_models_table])
203
+
204
+ # Pestaña 2: Conversión
205
+ with gr.TabItem("2. Conversión"):
206
+ gr.Markdown("### Reemplazar la voz de una canción")
207
+ with gr.Row():
208
+ with gr.Column(scale=2):
209
+ model_sel = gr.Dropdown(choices=get_model_choices(), label="Selecciona Perfil")
210
+ refresh_btn_conv = gr.Button("🔄 Actualizar lista", size="sm")
211
+ song_input = gr.Audio(label="Canción a convertir", type="filepath")
212
+ with gr.Accordion("Ajustes Avanzados", open=False):
213
+ pitch_shift = gr.Slider(-12, 12, 0, step=1, label="Tono (Pitch)")
214
+ sim_slider = gr.Slider(0, 1, 0.7, step=0.1, label="Fidelidad/Similitud")
215
+ diff_steps = gr.Slider(5, 50, 25, step=5, label="Calidad (Pasos de difusión)")
216
+ v_vol = gr.Slider(0, 2, 1, step=0.1, label="Volumen Voz")
217
+ i_vol = gr.Slider(0, 2, 1, step=0.1, label="Volumen Música")
218
+ convert_btn = gr.Button("🚀 Iniciar Conversión", variant="primary", size="lg")
219
+
220
+ with gr.Column(scale=3):
221
+ conv_status = gr.Textbox(label="Estado")
222
+ out_vocals = gr.Audio(label="Voz Original (Separada)")
223
+ out_conv = gr.Audio(label="Voz Clonada")
224
+ out_final = gr.Audio(label="Resultado Final (Mezclado)")
225
+ debug_logs = gr.Textbox(label="🔍 Logs de Procesamiento", lines=10)
226
+
227
+ refresh_btn_conv.click(fn=lambda: gr.Dropdown(choices=get_model_choices()), outputs=[model_sel])
228
+
229
+ convert_btn.click(convert_song,
230
+ [model_sel, song_input, pitch_shift, sim_slider, diff_steps, v_vol, i_vol],
231
+ [conv_status, out_vocals, out_conv, out_final, debug_logs])
232
+
233
+ # Pestaña 3: Gestión de Modelos
234
+ with gr.TabItem("3. Mis Modelos"):
235
+ gr.Markdown("### Gestionar perfiles guardados")
236
+ models_table_mg = gr.HTML(value=refresh_models())
237
+ with gr.Row():
238
+ models_refresh_btn = gr.Button("Actualizar", size="sm")
239
+ models_delete_name = gr.Dropdown(choices=get_model_choices(), label="Eliminar perfil")
240
+ models_delete_btn = gr.Button("Eliminar", variant="stop", size="sm")
241
+ models_delete_status = gr.Textbox(label="Resultado")
242
+
243
+ models_refresh_btn.click(fn=refresh_models, outputs=[models_table_mg])
244
+ models_refresh_btn.click(fn=lambda: gr.Dropdown(choices=get_model_choices()), outputs=[models_delete_name])
245
+ models_delete_btn.click(fn=delete_selected_model, inputs=[models_delete_name], outputs=[models_delete_status, models_table_mg])
246
+
247
+ # Pestaña 4: Debug
248
+ with gr.TabItem("Depuración"):
249
+ gr.Markdown("### Diagnóstico del sistema")
250
+ debug_view = gr.Textbox(label="Logs de sistema", lines=20, interactive=False)
251
+ debug_btn = gr.Button("Ver Logs")
252
+
253
+ def read_logs():
254
+ log_path = "debug_gpu.log" # Or wherever it's saved
255
+ if os.path.exists(log_path):
256
+ with open(log_path, "r") as f: return f.read()
257
+ return "No hay logs disponibles."
258
+
259
+ debug_btn.click(read_logs, outputs=[debug_view])
260
+
261
+ if __name__ == "__main__":
262
+ setup_seed_vc()
263
+ os.makedirs("./results", exist_ok=True)
264
+ app.launch(allowed_paths=[os.path.abspath("./results"), os.path.abspath("./pipeline/results")])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pipeline/inference.py CHANGED
@@ -434,7 +434,15 @@ def _convert_voice_core(audio_path, reference_path, pitch, diffusion_steps, simi
434
  processed_frames += vc_target.size(2) - overlap_frame_len
435
 
436
  # Concatenate and normalize to -18 dBFS RMS (standard vocal level before mixing)
437
- audio_out = np.concatenate(generated_wave_chunks)
 
 
 
 
 
 
 
 
438
  rms = np.sqrt(np.mean(audio_out ** 2))
439
  target_rms = 10 ** (-18.0 / 20.0) # -18 dBFS
440
  if rms > 1e-6:
@@ -444,5 +452,5 @@ def _convert_voice_core(audio_path, reference_path, pitch, diffusion_steps, simi
444
 
445
  # Save
446
  sf.write(output_path, audio_out, sr, subtype="PCM_16")
447
- logger.info("Conversion complete: {} ({:.1f}s)".format(output_path, len(audio_out) / sr))
448
  return output_path
 
434
  processed_frames += vc_target.size(2) - overlap_frame_len
435
 
436
  # Concatenate and normalize to -18 dBFS RMS (standard vocal level before mixing)
437
+ if not generated_wave_chunks:
438
+ logger.error("No audio chunks were generated by Seed-VC!")
439
+ # Create a tiny silence buffer to avoid crash but indicate failure
440
+ audio_out = np.zeros(sr)
441
+ else:
442
+ audio_out = np.concatenate(generated_wave_chunks)
443
+
444
+ logger.info(f"Concatenated {len(generated_wave_chunks)} chunks. Total samples: {len(audio_out)}")
445
+
446
  rms = np.sqrt(np.mean(audio_out ** 2))
447
  target_rms = 10 ** (-18.0 / 20.0) # -18 dBFS
448
  if rms > 1e-6:
 
452
 
453
  # Save
454
  sf.write(output_path, audio_out, sr, subtype="PCM_16")
455
+ logger.info("Conversion complete: {} ({:.1f}s, {} samples)".format(output_path, len(audio_out) / sr, len(audio_out)))
456
  return output_path
pipeline/separation.py CHANGED
@@ -91,9 +91,17 @@ def _separate_audio_impl(audio_path: str, model_name: str = "htdemucs_ft"):
91
  vocals_path = os.path.join(OUTPUT_DIR, f"{base_name}_vocals.wav")
92
  instruments_path = os.path.join(OUTPUT_DIR, f"{base_name}_instruments.wav")
93
 
 
 
 
 
94
  torchaudio.save(vocals_path, vocals, sr)
95
  torchaudio.save(instruments_path, instruments, sr)
96
 
 
 
 
 
97
  logger.info(f"Separation complete. Vocals: {vocals_path}, Instruments: {instruments_path}")
98
  return vocals_path, instruments_path
99
 
 
91
  vocals_path = os.path.join(OUTPUT_DIR, f"{base_name}_vocals.wav")
92
  instruments_path = os.path.join(OUTPUT_DIR, f"{base_name}_instruments.wav")
93
 
94
+ logger.info(f"Saving separated vocals to {vocals_path} (shape: {vocals.shape})")
95
+ if vocals.numel() == 0:
96
+ logger.error("Vocals tensor is EMPTY!")
97
+
98
  torchaudio.save(vocals_path, vocals, sr)
99
  torchaudio.save(instruments_path, instruments, sr)
100
 
101
+ # Cleanup GPU memory
102
+ del sources, model
103
+ torch.cuda.empty_cache()
104
+
105
  logger.info(f"Separation complete. Vocals: {vocals_path}, Instruments: {instruments_path}")
106
  return vocals_path, instruments_path
107