Spaces:

dimensionalpulsar
/

voice-clone-rvc

Sleeping

App Files Community

dimensionalpulsar committed 24 days ago

Commit

662c1d3

2 Parent(s): 73e9040 915c9a5

Resolve merge conflict: combine GPU optimization with Spanish UI and refresh button

Browse files

Files changed (3) hide show

app.py +264 -481
pipeline/inference.py +10 -2
pipeline/separation.py +8 -0

app.py CHANGED Viewed

@@ -1,481 +1,264 @@
-import os
-import sys
-import logging
-import tempfile
-import shutil
-import gradio as gr
-try:
-    import gradio_client.utils as _gc_utils
-    _orig_get_type = _gc_utils.get_type
-    def _patched_get_type(schema, *args, **kwargs):
-        if not isinstance(schema, dict):
-            return "Any"
-        return _orig_get_type(schema, *args, **kwargs)
-    _gc_utils.get_type = _patched_get_type
-    _orig_json_schema = _gc_utils._json_schema_to_python_type
-    def _patched_json_schema(schema, *args, **kwargs):
-        if not isinstance(schema, dict):
-            return "Any"
-        return _orig_json_schema(schema, *args, **kwargs)
-    _gc_utils._json_schema_to_python_type = _patched_json_schema
-    _gc_utils.json_schema_to_python_type = lambda schema, defs=None: _patched_json_schema(
-        schema, defs
-    )
-except Exception:
-    pass
-# Configuración de logs
-logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
-logger = logging.getLogger(__name__)
-# Inicio: clonar Seed-VC
-logger.info("Inicializando la aplicación...")
-from pipeline.setup import setup_seed_vc
-from pipeline.storage import init_storage, list_models, download_model, delete_model, get_reference_path
-try:
-    setup_seed_vc()
-except Exception as e:
-    logger.error("Error durante la configuración: {}".format(e))
-HF_MODELS_REPO = os.environ.get("HF_MODELS_REPO", "")
-if HF_MODELS_REPO:
-    init_storage(HF_MODELS_REPO)
-    logger.info("Almacenamiento de HuggingFace configurado: {}".format(HF_MODELS_REPO))
-from pipeline.training import save_voice_reference, _gpu_warmup
-from pipeline.separation import _separate_audio_impl
-from pipeline.inference import _convert_voice_impl
-from pipeline.mixing import mix_audio
-try:
-    import spaces
-except ImportError:
-    class spaces:
-        @staticmethod
-        def GPU(duration=60, **kwargs):
-            def decorator(fn):
-                return fn
-            return decorator
-@spaces.GPU(duration=600)
-def _full_pipeline_gpu(song_file, reference_path, pitch, diffusion_steps, similarity,
-                        vocal_volume, instrumental_volume):
-    """
-    Master ZeroGPU function: runs Demucs + Seed-VC + Mix in a single GPU session.
-    ZeroGPU does NOT allow calling one @spaces.GPU function from inside another,
-    so we consolidate the entire pipeline here.
-    """
-    import torch
-    import os
-    import sys
-    # Ensure app dir is in path for the ZeroGPU worker
-    app_dir = os.path.dirname(os.path.abspath(__file__))
-    if app_dir not in sys.path:
-        sys.path.insert(0, app_dir)
-    os.chdir(app_dir)
-    # 1. Separate vocals / instruments (Demucs)
-    vocals_path, instruments_path = _separate_audio_impl(song_file)
-    # 2. Convert voice (Seed-VC)
-    converted_path = _convert_voice_impl(
-        audio_path=vocals_path,
-        reference_path=reference_path,
-        pitch=int(pitch),
-        diffusion_steps=int(diffusion_steps),
-        similarity=float(similarity),
-    )
-    # 3. Mix
-    final_path = mix_audio(
-        vocals_path=converted_path,
-        instruments_path=instruments_path,
-        vocal_volume=float(vocal_volume),
-        instrumental_volume=float(instrumental_volume),
-    )
-    import librosa
-    # Load back the audio data to return it directly.
-    # This bypasses ZeroGPU filesystem sync issues.
-    v_data, v_sr = librosa.load(vocals_path, sr=None)
-    c_data, c_sr = librosa.load(converted_path, sr=None)
-    f_data, f_sr = librosa.load(final_path, sr=None)
-    return (v_sr, v_data), (c_sr, c_data), (f_sr, f_data)
-def train_voice_model(audio_file, model_name, progress=gr.Progress()):
-    """Controlador: guardar referencia de voz."""
-    if audio_file is None:
-        return "Error: Por favor, sube un archivo de audio.", None
-    if not model_name or not model_name.strip():
-        return "Error: Por favor, ingresa un nombre para el modelo.", None
-    model_name = model_name.strip().replace(" ", "_")
-    def progress_callback(value, desc):
-        progress(value, desc=desc)
-    try:
-        progress(0.0, desc="Iniciando...")
-        pth_path, ref_path = save_voice_reference(
-            audio_path=audio_file,
-            model_name=model_name,
-            progress_callback=progress_callback,
-        )
-        return "¡Referencia de voz '{}' guardada con éxito!".format(model_name), ref_path
-    except Exception as e:
-        import traceback
-        tb = traceback.format_exc()
-        logger.error("Error en el entrenamiento: {}".format(tb))
-        return "Error : {}: {}\n\nDetalles:\n{}".format(
-            type(e).__name__, str(e), tb[-500:]
-        ), None
-def get_model_choices():
-    """Obtener lista de nombres de modelos entrenados para el menú desplegable."""
-    models = list_models()
-    if not models:
-        return ["(ningún modelo)"]
-    return models
-def convert_song(
-    model_choice,
-    song_file,
-    pitch,
-    similarity,
-    diffusion_steps,
-    vocal_volume,
-    instrumental_volume,
-    progress=gr.Progress(),
-):
-    """Pipeline completo: separar + convertir + mezclar (single GPU session)."""
-    if song_file is None:
-        return "Error: Por favor, sube un archivo de audio.", None, None, None
-    if model_choice == "(ningún modelo)" or not model_choice:
-        return "Error: Por favor, guarda una referencia de voz primero.", None, None, None
-    try:
-        progress(0.05, desc="Cargando modelo...")
-        pth_path, ref_or_index = download_model(model_choice)
-        if not pth_path:
-            return "Error: Modelo '{}' no encontrado.".format(model_choice), None, None, None
-        reference_path = get_reference_path(model_choice)
-        if not reference_path:
-            return "Error: Audio de referencia no encontrado para '{}'.".format(model_choice), None, None, None
-        progress(0.10, desc="Iniciando pipeline GPU (Demucs + Seed-VC + Mezcla)...")
-        vocals_path, converted_path, final_path = _full_pipeline_gpu(
-            song_file=song_file,
-            reference_path=reference_path,
-            pitch=pitch,
-            diffusion_steps=diffusion_steps,
-            similarity=similarity,
-            vocal_volume=vocal_volume,
-            instrumental_volume=instrumental_volume,
-        )
-        progress(1.0, desc="¡Terminado!")
-        return (
-            "¡Conversión completada con éxito!",
-            vocals_path,
-            converted_path,
-            final_path,
-        )
-    except Exception as e:
-        import traceback
-        tb = traceback.format_exc()
-        logger.error("Error en la conversión: {}".format(tb))
-        return "Error : {}: {}\n\nDetalles:\n{}".format(
-            type(e).__name__, str(e), tb[-800:]
-        ), None, None, None
-def refresh_models():
-    """Actualizar la lista de modelos como HTML."""
-    models = list_models()
-    if not models:
-        return "<p style='color:gray;'>Ningún modelo guardado</p>"
-    rows = "".join(
-        "<tr><td>{}</td><td>Disponible</td></tr>".format(m) for m in models
-    )
-    return (
-        "<table style='width:100%;border-collapse:collapse;'>"
-        "<tr><th style='text-align:left;border-bottom:1px solid #555;padding:8px;'>Nombre</th>"
-        "<th style='text-align:left;border-bottom:1px solid #555;padding:8px;'>Estado</th></tr>"
-        "{}</table>".format(rows)
-    )
-def delete_selected_model(model_name_to_delete):
-    """Eliminar un modelo."""
-    if not model_name_to_delete or model_name_to_delete == "(ningún modelo)":
-        return "Por favor, selecciona un modelo para eliminar.", refresh_models()
-    try:
-        delete_model(model_name_to_delete)
-        return "Modelo '{}' eliminado.".format(model_name_to_delete), refresh_models()
-    except Exception as e:
-        return "Error : {}".format(e), refresh_models()
-with gr.Blocks(
-    title="Clon de Voz",
-    theme=gr.themes.Soft(),
-) as app:
-    gr.Markdown(
-        "# 🎤 Aplicación de Clonación de Voz (Seed-VC)\n"
-        "> Powered by [Seed-VC](https://github.com/Plachta/seed-vc) + [Demucs](https://github.com/facebookresearch/demucs) · ZeroGPU · Zero-shot"
-    )
-    with gr.Tabs():
-        # Pestaña 1: Referencia de voz
-        with gr.TabItem("Mi voz"):
-            gr.Markdown("### Guardar tu referencia de voz")
-            with gr.Row():
-                with gr.Column(scale=2):
-                    train_audio = gr.Audio(
-                        label="Extracto de tu voz (WAV o MP3, 3-30 segundos)",
-                        type="filepath",
-                        sources=["upload"],
-                    )
-                    train_model_name = gr.Textbox(
-                        label="Nombre del perfil",
-                        placeholder="ej: mi_voz",
-                        max_lines=1,
-                    )
-                    train_btn = gr.Button(
-                        "Guardar",
-                        variant="primary",
-                        size="lg",
-                    )
-                with gr.Column(scale=1):
-                    train_status = gr.Textbox(
-                        label="Estado",
-                        interactive=False,
-                        lines=3,
-                    )
-                    train_download = gr.File(
-                        label="Archivo de referencia",
-                        interactive=False,
-                    )
-            gr.Markdown(
-                "**Consejos:**\n"
-                "- Usa una grabación limpia (sin ruido de fondo, sin música)\n"
-                "- Habla o canta naturalmente durante 3 a 30 segundos\n"
-                "- Mientras más largo y variado sea el extracto, mejor será el resultado\n"
-                "- Se aceptan formatos WAV o MP3"
-            )
-            with gr.Accordion("Perfiles guardados", open=False):
-                clonacion_refresh_btn = gr.Button("🔄 Actualizar lista de perfiles", size="sm")
-                clonacion_models_table = gr.HTML(value=refresh_models())
-            train_btn.click(
-                fn=train_voice_model,
-                inputs=[train_audio, train_model_name],
-                outputs=[train_status, train_download],
-            ).then(
-                fn=refresh_models,
-                outputs=[clonacion_models_table],
-            ).then(
-                fn=refresh_models,
-                outputs=[models_table],
-            ).then(
-                fn=lambda: gr.Dropdown(choices=get_model_choices()),
-                outputs=[convert_model],
-            ).then(
-                fn=lambda: gr.Dropdown(choices=get_model_choices()),
-                outputs=[models_delete_name],
-            )
-            clonacion_refresh_btn.click(
-                fn=refresh_models,
-                outputs=[clonacion_models_table],
-            )
-        # Pestaña 2: Conversión
-        with gr.TabItem("Convertir una canción"):
-            gr.Markdown("### Reemplazar la voz de una canción por la tuya")
-            with gr.Row():
-                with gr.Column(scale=2):
-                    convert_model = gr.Dropdown(
-                        choices=get_model_choices(),
-                        label="Perfil de voz",
-                        interactive=True,
-                    )
-                    refresh_btn = gr.Button("Actualizar lista", size="sm")
-                    convert_audio = gr.Audio(
-                        label="Canción a convertir (WAV o MP3)",
-                        type="filepath",
-                        sources=["upload"],
-                    )
-                    with gr.Accordion("Parámetros avanzados", open=False):
-                        convert_pitch = gr.Slider(
-                            minimum=-24,
-                            maximum=24,
-                            value=0,
-                            step=1,
-                            label="Transposición (semitonos)",
-                        )
-                        convert_similarity = gr.Slider(
-                            minimum=0.0,
-                            maximum=1.0,
-                            value=0.7,
-                            step=0.05,
-                            label="Similitud de voz (0.5=natural, 0.7=equilibrado, 0.9=más fiel)",
-                        )
-                        convert_diffusion = gr.Slider(
-                            minimum=5,
-                            maximum=100,
-                            value=25,
-                            step=5,
-                            label="Calidad (10=rápido, 25=equilibrado, 50=alta calidad)",
-                        )
-                        convert_vocal_vol = gr.Slider(
-                            minimum=0.0,
-                            maximum=2.0,
-                            value=1.0,
-                            step=0.1,
-                            label="Volumen de la voz",
-                        )
-                        convert_inst_vol = gr.Slider(
-                            minimum=0.0,
-                            maximum=2.0,
-                            value=1.0,
-                            step=0.1,
-                            label="Volumen de los instrumentos",
-                        )
-                    convert_btn = gr.Button(
-                        "Convertir y mezclar",
-                        variant="primary",
-                        size="lg",
-                    )
-                with gr.Column(scale=1):
-                    convert_status = gr.Textbox(
-                        label="Estado",
-                        interactive=False,
-                        lines=3,
-                    )
-                    gr.Markdown("**Vista previa de las pistas:**")
-                    preview_vocals = gr.Audio(
-                        label="Voz original (separada)",
-                        interactive=False,
-                    )
-                    preview_converted = gr.Audio(
-                        label="Voz convertida",
-                        interactive=False,
-                    )
-                    gr.Markdown("**Resultado final:**")
-                    final_output = gr.Audio(
-                        label="Canción final (voz + instrumentos)",
-                        interactive=False,
-                    )
-            refresh_btn.click(
-                fn=lambda: gr.Dropdown(choices=get_model_choices()),
-                outputs=[convert_model],
-            )
-            convert_btn.click(
-                fn=convert_song,
-                inputs=[
-                    convert_model,
-                    convert_audio,
-                    convert_pitch,
-                    convert_similarity,
-                    convert_diffusion,
-                    convert_vocal_vol,
-                    convert_inst_vol,
-                ],
-                outputs=[convert_status, preview_vocals, preview_converted, final_output],
-            )
-        # Pestaña 3: Modelos
-        with gr.TabItem("Mis modelos"):
-            gr.Markdown("### Gestionar tus perfiles de voz")
-            models_table = gr.HTML(
-                value=refresh_models(),
-                label="Modelos guardados",
-            )
-            with gr.Row():
-                models_refresh_btn = gr.Button("Actualizar", size="sm")
-                models_delete_name = gr.Dropdown(
-                    choices=get_model_choices(),
-                    label="Modelo a eliminar",
-                    interactive=True,
-                )
-                models_delete_btn = gr.Button("Eliminar", variant="stop", size="sm")
-            models_delete_status = gr.Textbox(label="Estado", interactive=False)
-            models_refresh_btn.click(
-                fn=refresh_models,
-                outputs=[models_table],
-            )
-            models_refresh_btn.click(
-                fn=lambda: gr.Dropdown(choices=get_model_choices()),
-                outputs=[models_delete_name],
-            )
-            models_delete_btn.click(
-                fn=delete_selected_model,
-                inputs=[models_delete_name],
-                outputs=[models_delete_status, models_table],
-            )
-        # Pestaña 4: Debug (temporal)
-        with gr.TabItem("Depuración GPU"):
-            gr.Markdown("### Logs del Trabajador GPU (para diagnóstico)")
-            debug_output = gr.Textbox(
-                label="Últimos logs de GPU",
-                interactive=False,
-                lines=20,
-            )
-            debug_btn = gr.Button("Leer los logs", size="sm")
-            def read_debug_log():
-                log_path = "/home/user/app/debug_gpu.log"
-                if os.path.exists(log_path):
-                    with open(log_path, "r") as f:
-                        return f.read()
-                return "Ningún log disponible. Ejecuta una conversión primero."
-            debug_btn.click(fn=read_debug_log, outputs=[debug_output])
-if __name__ == "__main__":
-    os.makedirs("./results", exist_ok=True)
-    os.makedirs("./checkpoints/models", exist_ok=True)
-    app.launch(
-        allowed_paths=[
-            os.path.abspath("./results"),
-            os.path.abspath("./checkpoints"),
-        ]
-    )

+import os
+import sys
+import logging
+import tempfile
+import shutil
+import gradio as gr
+import gc
+import time
+import numpy as np
+import torch
+# Patches para Gradio
+try:
+    import gradio_client.utils as _gc_utils
+    _orig_get_type = _gc_utils.get_type
+    def _patched_get_type(schema, *args, **kwargs):
+        if not isinstance(schema, dict): return "Any"
+        return _orig_get_type(schema, *args, **kwargs)
+    _gc_utils.get_type = _patched_get_type
+    _orig_json_schema = _gc_utils._json_schema_to_python_type
+    def _patched_json_schema(schema, *args, **kwargs):
+        if not isinstance(schema, dict): return "Any"
+        return _orig_json_schema(schema, *args, **kwargs)
+    _gc_utils._json_schema_to_python_type = _patched_json_schema
+    _gc_utils.json_schema_to_python_type = lambda schema, defs=None: _patched_json_schema(schema, defs)
+except Exception:
+    pass
+# Configuración de logs
+logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
+logger = logging.getLogger(__name__)
+from pipeline.setup import setup_seed_vc
+from pipeline.storage import init_storage, list_models, download_model, delete_model, get_reference_path
+from pipeline.training import save_voice_reference
+from pipeline.separation import _separate_audio_impl
+from pipeline.inference import _convert_voice_impl
+from pipeline.mixing import mix_audio
+try:
+    import spaces
+except ImportError:
+    class spaces:
+        @staticmethod
+        def GPU(duration=60, **kwargs):
+            def decorator(fn): return fn
+            return decorator
+def check_file(path, label, logs):
+    if os.path.exists(path):
+        size = os.path.getsize(path)
+        logs.append(f"✅ {label} generado: {os.path.basename(path)} ({size} bytes)")
+        return size > 44
+    else:
+        logs.append(f"❌ ERROR: {label} NO se encontró en {path}")
+        return False
+@spaces.GPU(duration=600)
+def _full_pipeline_gpu(song_file, reference_path, pitch, diffusion_steps, similarity,
+                        vocal_volume, instrumental_volume):
+    import torch
+    import librosa
+    import soundfile as sf
+    logs = []
+    logs.append(f"🚀 Iniciando pipeline en GPU...")
+    # Asegurar directorio de trabajo
+    app_dir = os.path.dirname(os.path.abspath(__file__))
+    os.chdir(app_dir)
+    try:
+        # 1. Separación
+        logs.append("⏳ Paso 1/3: Separando voces (Demucs)...")
+        vocals_path, instruments_path = _separate_audio_impl(song_file)
+        if not check_file(vocals_path, "Vocales", logs): return None, None, None, "\n".join(logs)
+        torch.cuda.empty_cache()
+        gc.collect()
+        # 2. Conversión
+        logs.append("⏳ Paso 2/3: Convirtiendo voz (Seed-VC)...")
+        converted_path = _convert_voice_impl(vocals_path, reference_path, int(pitch), int(diffusion_steps), float(similarity))
+        if not check_file(converted_path, "Voz convertida", logs): return None, None, None, "\n".join(logs)
+        torch.cuda.empty_cache()
+        gc.collect()
+        # 3. Mezcla
+        logs.append("⏳ Paso 3/3: Mezclando pistas...")
+        final_path = mix_audio(converted_path, instruments_path, float(vocal_volume), float(instrumental_volume))
+        if not check_file(final_path, "Resultado final", logs): return None, None, None, "\n".join(logs)
+        # 4. Retornar DATOS (para evitar problemas de sincronización de archivos en ZeroGPU)
+        logs.append("📦 Preparando audios para el reproductor...")
+        def load_audio_to_numpy(p):
+            data, sr = librosa.load(p, sr=None)
+            data = np.nan_to_num(data)
+            return (sr, data.astype(np.float32))
+        v_out = load_audio_to_numpy(vocals_path)
+        c_out = load_audio_to_numpy(converted_path)
+        f_out = load_audio_to_numpy(final_path)
+        logs.append("✨ Proceso completado. Enviando al navegador...")
+        return v_out, c_out, f_out, "\n".join(logs)
+    except Exception as e:
+        import traceback
+        logs.append(f"💥 ERROR: {str(e)}\n{traceback.format_exc()}")
+        return None, None, None, "\n".join(logs)
+def train_voice_model(audio_file, model_name, progress=gr.Progress()):
+    if not audio_file or not model_name: return "Error: Datos incompletos.", None
+    model_name = model_name.strip().replace(" ", "_")
+    try:
+        pth_path, ref_path = save_voice_reference(audio_path=audio_file, model_name=model_name)
+        return f"¡Perfil '{model_name}' guardado!", ref_path
+    except Exception as e:
+        return f"Error: {str(e)}", None
+def get_model_choices():
+    models = list_models()
+    if not models:
+        return ["(ningún modelo)"]
+    return models
+def refresh_models():
+    models = list_models()
+    if not models:
+        return "<p style='color:gray;'>Ningún modelo guardado</p>"
+    rows = "".join(
+        "<tr><td>{}</td><td>Disponible</td></tr>".format(m) for m in models
+    )
+    return (
+        "<table style='width:100%;border-collapse:collapse;'>"
+        "<tr><th style='text-align:left;border-bottom:1px solid #555;padding:8px;'>Nombre</th>"
+        "<th style='text-align:left;border-bottom:1px solid #555;padding:8px;'>Estado</th></tr>"
+        "{}</table>".format(rows)
+    )
+def delete_selected_model(model_name_to_delete):
+    if not model_name_to_delete or model_name_to_delete == "(ningún modelo)":
+        return "Por favor, selecciona un modelo para eliminar.", refresh_models()
+    try:
+        delete_model(model_name_to_delete)
+        return "Modelo '{}' eliminado.".format(model_name_to_delete), refresh_models()
+    except Exception as e:
+        return "Error : {}".format(e), refresh_models()
+def convert_song(model_choice, song_file, pitch, similarity, diffusion_steps, vocal_volume, instrumental_volume, progress=gr.Progress()):
+    if not song_file or not model_choice or model_choice == "(ningún modelo)":
+        return "Error: Faltan datos.", None, None, None, "Esperando..."
+    try:
+        progress(0.1, desc="Iniciando...")
+        reference_path = get_reference_path(model_choice)
+        v_out, c_out, f_out, logs = _full_pipeline_gpu(
+            song_file, reference_path, pitch, diffusion_steps, similarity, vocal_volume, instrumental_volume
+        )
+        status = "✅ Completado" if f_out is not None else "❌ Error (revisa logs)"
+        return status, v_out, c_out, f_out, logs
+    except Exception as e:
+        import traceback
+        return f"Error: {str(e)}", None, None, None, traceback.format_exc()
+# --- UI Layout ---
+with gr.Blocks(title="Voice Clone RVC", theme=gr.themes.Soft()) as app:
+    gr.Markdown("# 🎤 Aplicación de Clonación de Voz (Seed-VC)\n> Powered by Seed-VC + Demucs · ZeroGPU")
+    with gr.Tabs():
+        # Pestaña 1: Perfil
+        with gr.TabItem("1. Perfil"):
+            gr.Markdown("### Guardar tu referencia de voz")
+            with gr.Row():
+                with gr.Column():
+                    train_audio = gr.Audio(label="Sube tu voz (3-30 seg)", type="filepath")
+                    train_name = gr.Textbox(label="Nombre del perfil", placeholder="ej: mi_voz")
+                    train_btn = gr.Button("Guardar Perfil", variant="primary")
+                with gr.Column():
+                    train_status = gr.Textbox(label="Estado")
+                    train_file = gr.File(label="Archivo de Referencia")
+            with gr.Accordion("📋 Perfiles guardados", open=False):
+                clonacion_refresh_btn = gr.Button("🔄 Actualizar lista", size="sm")
+                clonacion_models_table = gr.HTML(value=refresh_models())
+            train_btn.click(
+                fn=train_voice_model,
+                inputs=[train_audio, train_name],
+                outputs=[train_status, train_file]
+            ).then(
+                fn=refresh_models, outputs=[clonacion_models_table]
+            ).then(
+                fn=lambda: gr.Dropdown(choices=get_model_choices()), outputs=[clonacion_refresh_btn] # Dummy to trigger something if needed
+            )
+            clonacion_refresh_btn.click(fn=refresh_models, outputs=[clonacion_models_table])
+        # Pestaña 2: Conversión
+        with gr.TabItem("2. Conversión"):
+            gr.Markdown("### Reemplazar la voz de una canción")
+            with gr.Row():
+                with gr.Column(scale=2):
+                    model_sel = gr.Dropdown(choices=get_model_choices(), label="Selecciona Perfil")
+                    refresh_btn_conv = gr.Button("🔄 Actualizar lista", size="sm")
+                    song_input = gr.Audio(label="Canción a convertir", type="filepath")
+                    with gr.Accordion("Ajustes Avanzados", open=False):
+                        pitch_shift = gr.Slider(-12, 12, 0, step=1, label="Tono (Pitch)")
+                        sim_slider = gr.Slider(0, 1, 0.7, step=0.1, label="Fidelidad/Similitud")
+                        diff_steps = gr.Slider(5, 50, 25, step=5, label="Calidad (Pasos de difusión)")
+                        v_vol = gr.Slider(0, 2, 1, step=0.1, label="Volumen Voz")
+                        i_vol = gr.Slider(0, 2, 1, step=0.1, label="Volumen Música")
+                    convert_btn = gr.Button("🚀 Iniciar Conversión", variant="primary", size="lg")
+                with gr.Column(scale=3):
+                    conv_status = gr.Textbox(label="Estado")
+                    out_vocals = gr.Audio(label="Voz Original (Separada)")
+                    out_conv = gr.Audio(label="Voz Clonada")
+                    out_final = gr.Audio(label="Resultado Final (Mezclado)")
+                    debug_logs = gr.Textbox(label="🔍 Logs de Procesamiento", lines=10)
+            refresh_btn_conv.click(fn=lambda: gr.Dropdown(choices=get_model_choices()), outputs=[model_sel])
+            convert_btn.click(convert_song,
+                             [model_sel, song_input, pitch_shift, sim_slider, diff_steps, v_vol, i_vol],
+                             [conv_status, out_vocals, out_conv, out_final, debug_logs])
+        # Pestaña 3: Gestión de Modelos
+        with gr.TabItem("3. Mis Modelos"):
+            gr.Markdown("### Gestionar perfiles guardados")
+            models_table_mg = gr.HTML(value=refresh_models())
+            with gr.Row():
+                models_refresh_btn = gr.Button("Actualizar", size="sm")
+                models_delete_name = gr.Dropdown(choices=get_model_choices(), label="Eliminar perfil")
+                models_delete_btn = gr.Button("Eliminar", variant="stop", size="sm")
+            models_delete_status = gr.Textbox(label="Resultado")
+            models_refresh_btn.click(fn=refresh_models, outputs=[models_table_mg])
+            models_refresh_btn.click(fn=lambda: gr.Dropdown(choices=get_model_choices()), outputs=[models_delete_name])
+            models_delete_btn.click(fn=delete_selected_model, inputs=[models_delete_name], outputs=[models_delete_status, models_table_mg])
+        # Pestaña 4: Debug
+        with gr.TabItem("Depuración"):
+            gr.Markdown("### Diagnóstico del sistema")
+            debug_view = gr.Textbox(label="Logs de sistema", lines=20, interactive=False)
+            debug_btn = gr.Button("Ver Logs")
+            def read_logs():
+                log_path = "debug_gpu.log" # Or wherever it's saved
+                if os.path.exists(log_path):
+                    with open(log_path, "r") as f: return f.read()
+                return "No hay logs disponibles."
+            debug_btn.click(read_logs, outputs=[debug_view])
+if __name__ == "__main__":
+    setup_seed_vc()
+    os.makedirs("./results", exist_ok=True)
+    app.launch(allowed_paths=[os.path.abspath("./results"), os.path.abspath("./pipeline/results")])

pipeline/inference.py CHANGED Viewed

@@ -434,7 +434,15 @@ def _convert_voice_core(audio_path, reference_path, pitch, diffusion_steps, simi
             processed_frames += vc_target.size(2) - overlap_frame_len
     # Concatenate and normalize to -18 dBFS RMS (standard vocal level before mixing)
-    audio_out = np.concatenate(generated_wave_chunks)
     rms = np.sqrt(np.mean(audio_out ** 2))
     target_rms = 10 ** (-18.0 / 20.0)  # -18 dBFS
     if rms > 1e-6:
@@ -444,5 +452,5 @@ def _convert_voice_core(audio_path, reference_path, pitch, diffusion_steps, simi
     # Save
     sf.write(output_path, audio_out, sr, subtype="PCM_16")
-    logger.info("Conversion complete: {} ({:.1f}s)".format(output_path, len(audio_out) / sr))
     return output_path

             processed_frames += vc_target.size(2) - overlap_frame_len
     # Concatenate and normalize to -18 dBFS RMS (standard vocal level before mixing)
+    if not generated_wave_chunks:
+        logger.error("No audio chunks were generated by Seed-VC!")
+        # Create a tiny silence buffer to avoid crash but indicate failure
+        audio_out = np.zeros(sr)
+    else:
+        audio_out = np.concatenate(generated_wave_chunks)
+    logger.info(f"Concatenated {len(generated_wave_chunks)} chunks. Total samples: {len(audio_out)}")
     rms = np.sqrt(np.mean(audio_out ** 2))
     target_rms = 10 ** (-18.0 / 20.0)  # -18 dBFS
     if rms > 1e-6:
     # Save
     sf.write(output_path, audio_out, sr, subtype="PCM_16")
+    logger.info("Conversion complete: {} ({:.1f}s, {} samples)".format(output_path, len(audio_out) / sr, len(audio_out)))
     return output_path

pipeline/separation.py CHANGED Viewed

@@ -91,9 +91,17 @@ def _separate_audio_impl(audio_path: str, model_name: str = "htdemucs_ft"):
     vocals_path = os.path.join(OUTPUT_DIR, f"{base_name}_vocals.wav")
     instruments_path = os.path.join(OUTPUT_DIR, f"{base_name}_instruments.wav")
     torchaudio.save(vocals_path, vocals, sr)
     torchaudio.save(instruments_path, instruments, sr)
     logger.info(f"Separation complete. Vocals: {vocals_path}, Instruments: {instruments_path}")
     return vocals_path, instruments_path

     vocals_path = os.path.join(OUTPUT_DIR, f"{base_name}_vocals.wav")
     instruments_path = os.path.join(OUTPUT_DIR, f"{base_name}_instruments.wav")
+    logger.info(f"Saving separated vocals to {vocals_path} (shape: {vocals.shape})")
+    if vocals.numel() == 0:
+        logger.error("Vocals tensor is EMPTY!")
     torchaudio.save(vocals_path, vocals, sr)
     torchaudio.save(instruments_path, instruments, sr)
+    # Cleanup GPU memory
+    del sources, model
+    torch.cuda.empty_cache()
     logger.info(f"Separation complete. Vocals: {vocals_path}, Instruments: {instruments_path}")
     return vocals_path, instruments_path