Spaces:

dimensionalpulsar
/

voice-clone-rvc

Sleeping

App Files Files Community

dimensionalpulsar committed on 24 days ago

Commit

067e4a7

1 Parent(s): 7d30adc

feat: add robust logging and diagnostics for ZeroGPU troubleshooting; fix '0 seconds' issue with explicit checks

Browse files

Files changed (3) hide show

app.py +216 -460
pipeline/inference.py +10 -2
pipeline/separation.py +8 -0

app.py CHANGED Viewed

@@ -1,460 +1,216 @@
-import os
-import sys
-import logging
-import tempfile
-import shutil
-import gradio as gr
-try:
-    import gradio_client.utils as _gc_utils
-    _orig_get_type = _gc_utils.get_type
-    def _patched_get_type(schema, *args, **kwargs):
-        if not isinstance(schema, dict):
-            return "Any"
-        return _orig_get_type(schema, *args, **kwargs)
-    _gc_utils.get_type = _patched_get_type
-    _orig_json_schema = _gc_utils._json_schema_to_python_type
-    def _patched_json_schema(schema, *args, **kwargs):
-        if not isinstance(schema, dict):
-            return "Any"
-        return _orig_json_schema(schema, *args, **kwargs)
-    _gc_utils._json_schema_to_python_type = _patched_json_schema
-    _gc_utils.json_schema_to_python_type = lambda schema, defs=None: _patched_json_schema(
-        schema, defs
-    )
-except Exception:
-    pass
-# Configuración de logs
-logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
-logger = logging.getLogger(__name__)
-# Inicio: clonar Seed-VC
-logger.info("Inicializando la aplicación...")
-from pipeline.setup import setup_seed_vc
-from pipeline.storage import init_storage, list_models, download_model, delete_model, get_reference_path
-try:
-    setup_seed_vc()
-except Exception as e:
-    logger.error("Error durante la configuración: {}".format(e))
-HF_MODELS_REPO = os.environ.get("HF_MODELS_REPO", "")
-if HF_MODELS_REPO:
-    init_storage(HF_MODELS_REPO)
-    logger.info("Almacenamiento de HuggingFace configurado: {}".format(HF_MODELS_REPO))
-from pipeline.training import save_voice_reference, _gpu_warmup
-from pipeline.separation import _separate_audio_impl
-from pipeline.inference import _convert_voice_impl
-from pipeline.mixing import mix_audio
-try:
-    import spaces
-except ImportError:
-    class spaces:
-        @staticmethod
-        def GPU(duration=60, **kwargs):
-            def decorator(fn):
-                return fn
-            return decorator
-@spaces.GPU(duration=600)
-def _full_pipeline_gpu(song_file, reference_path, pitch, diffusion_steps, similarity,
-                        vocal_volume, instrumental_volume):
-    """
-    Master ZeroGPU function: runs Demucs + Seed-VC + Mix in a single GPU session.
-    ZeroGPU does NOT allow calling one @spaces.GPU function from inside another,
-    so we consolidate the entire pipeline here.
-    """
-    import torch
-    import os
-    import sys
-    # Ensure app dir is in path for the ZeroGPU worker
-    app_dir = os.path.dirname(os.path.abspath(__file__))
-    if app_dir not in sys.path:
-        sys.path.insert(0, app_dir)
-    os.chdir(app_dir)
-    # 1. Separate vocals / instruments (Demucs)
-    vocals_path, instruments_path = _separate_audio_impl(song_file)
-    # 2. Convert voice (Seed-VC)
-    converted_path = _convert_voice_impl(
-        audio_path=vocals_path,
-        reference_path=reference_path,
-        pitch=int(pitch),
-        diffusion_steps=int(diffusion_steps),
-        similarity=float(similarity),
-    )
-    # 3. Mix
-    final_path = mix_audio(
-        vocals_path=converted_path,
-        instruments_path=instruments_path,
-        vocal_volume=float(vocal_volume),
-        instrumental_volume=float(instrumental_volume),
-    )
-    import librosa
-    # Load back the audio data to return it directly.
-    # This bypasses ZeroGPU filesystem sync issues.
-    v_data, v_sr = librosa.load(vocals_path, sr=None)
-    c_data, c_sr = librosa.load(converted_path, sr=None)
-    f_data, f_sr = librosa.load(final_path, sr=None)
-    return (v_sr, v_data), (c_sr, c_data), (f_sr, f_data)
-def train_voice_model(audio_file, model_name, progress=gr.Progress()):
-    """Controlador: guardar referencia de voz."""
-    if audio_file is None:
-        return "Error: Por favor, sube un archivo de audio.", None
-    if not model_name or not model_name.strip():
-        return "Error: Por favor, ingresa un nombre para el modelo.", None
-    model_name = model_name.strip().replace(" ", "_")
-    def progress_callback(value, desc):
-        progress(value, desc=desc)
-    try:
-        progress(0.0, desc="Iniciando...")
-        pth_path, ref_path = save_voice_reference(
-            audio_path=audio_file,
-            model_name=model_name,
-            progress_callback=progress_callback,
-        )
-        return "¡Referencia de voz '{}' guardada con éxito!".format(model_name), ref_path
-    except Exception as e:
-        import traceback
-        tb = traceback.format_exc()
-        logger.error("Error en el entrenamiento: {}".format(tb))
-        return "Error : {}: {}\n\nDetalles:\n{}".format(
-            type(e).__name__, str(e), tb[-500:]
-        ), None
-def get_model_choices():
-    """Obtener lista de nombres de modelos entrenados para el menú desplegable."""
-    models = list_models()
-    if not models:
-        return ["(ningún modelo)"]
-    return models
-def convert_song(
-    model_choice,
-    song_file,
-    pitch,
-    similarity,
-    diffusion_steps,
-    vocal_volume,
-    instrumental_volume,
-    progress=gr.Progress(),
-):
-    """Pipeline completo: separar + convertir + mezclar (single GPU session)."""
-    if song_file is None:
-        return "Error: Por favor, sube un archivo de audio.", None, None, None
-    if model_choice == "(ningún modelo)" or not model_choice:
-        return "Error: Por favor, guarda una referencia de voz primero.", None, None, None
-    try:
-        progress(0.05, desc="Cargando modelo...")
-        pth_path, ref_or_index = download_model(model_choice)
-        if not pth_path:
-            return "Error: Modelo '{}' no encontrado.".format(model_choice), None, None, None
-        reference_path = get_reference_path(model_choice)
-        if not reference_path:
-            return "Error: Audio de referencia no encontrado para '{}'.".format(model_choice), None, None, None
-        progress(0.10, desc="Iniciando pipeline GPU (Demucs + Seed-VC + Mezcla)...")
-        vocals_path, converted_path, final_path = _full_pipeline_gpu(
-            song_file=song_file,
-            reference_path=reference_path,
-            pitch=pitch,
-            diffusion_steps=diffusion_steps,
-            similarity=similarity,
-            vocal_volume=vocal_volume,
-            instrumental_volume=instrumental_volume,
-        )
-        progress(1.0, desc="¡Terminado!")
-        return (
-            "¡Conversión completada con éxito!",
-            vocals_path,
-            converted_path,
-            final_path,
-        )
-    except Exception as e:
-        import traceback
-        tb = traceback.format_exc()
-        logger.error("Error en la conversión: {}".format(tb))
-        return "Error : {}: {}\n\nDetalles:\n{}".format(
-            type(e).__name__, str(e), tb[-800:]
-        ), None, None, None
-def refresh_models():
-    """Actualizar la lista de modelos como HTML."""
-    models = list_models()
-    if not models:
-        return "<p style='color:gray;'>Ningún modelo guardado</p>"
-    rows = "".join(
-        "<tr><td>{}</td><td>Disponible</td></tr>".format(m) for m in models
-    )
-    return (
-        "<table style='width:100%;border-collapse:collapse;'>"
-        "<tr><th style='text-align:left;border-bottom:1px solid #555;padding:8px;'>Nombre</th>"
-        "<th style='text-align:left;border-bottom:1px solid #555;padding:8px;'>Estado</th></tr>"
-        "{}</table>".format(rows)
-    )
-def delete_selected_model(model_name_to_delete):
-    """Eliminar un modelo."""
-    if not model_name_to_delete or model_name_to_delete == "(ningún modelo)":
-        return "Por favor, selecciona un modelo para eliminar.", refresh_models()
-    try:
-        delete_model(model_name_to_delete)
-        return "Modelo '{}' eliminado.".format(model_name_to_delete), refresh_models()
-    except Exception as e:
-        return "Error : {}".format(e), refresh_models()
-with gr.Blocks(
-    title="Clon de Voz",
-    theme=gr.themes.Soft(),
-) as app:
-    gr.Markdown(
-        "# 🎤 Aplicación de Clonación de Voz (Seed-VC)\n"
-        "> Powered by [Seed-VC](https://github.com/Plachta/seed-vc) + [Demucs](https://github.com/facebookresearch/demucs) · ZeroGPU · Zero-shot"
-    )
-    with gr.Tabs():
-        # Pestaña 1: Referencia de voz
-        with gr.TabItem("Mi voz"):
-            gr.Markdown("### Guardar tu referencia de voz")
-            with gr.Row():
-                with gr.Column(scale=2):
-                    train_audio = gr.Audio(
-                        label="Extracto de tu voz (WAV o MP3, 3-30 segundos)",
-                        type="filepath",
-                        sources=["upload"],
-                    )
-                    train_model_name = gr.Textbox(
-                        label="Nombre del perfil",
-                        placeholder="ej: mi_voz",
-                        max_lines=1,
-                    )
-                    train_btn = gr.Button(
-                        "Guardar",
-                        variant="primary",
-                        size="lg",
-                    )
-                with gr.Column(scale=1):
-                    train_status = gr.Textbox(
-                        label="Estado",
-                        interactive=False,
-                        lines=3,
-                    )
-                    train_download = gr.File(
-                        label="Archivo de referencia",
-                        interactive=False,
-                    )
-            gr.Markdown(
-                "**Consejos:**\n"
-                "- Usa una grabación limpia (sin ruido de fondo, sin música)\n"
-                "- Habla o canta naturalmente durante 3 a 30 segundos\n"
-                "- Mientras más largo y variado sea el extracto, mejor será el resultado\n"
-                "- Se aceptan formatos WAV o MP3"
-            )
-            train_btn.click(
-                fn=train_voice_model,
-                inputs=[train_audio, train_model_name],
-                outputs=[train_status, train_download],
-            )
-        # Pestaña 2: Conversión
-        with gr.TabItem("Convertir una canción"):
-            gr.Markdown("### Reemplazar la voz de una canción por la tuya")
-            with gr.Row():
-                with gr.Column(scale=2):
-                    convert_model = gr.Dropdown(
-                        choices=get_model_choices(),
-                        label="Perfil de voz",
-                        interactive=True,
-                    )
-                    refresh_btn = gr.Button("Actualizar lista", size="sm")
-                    convert_audio = gr.Audio(
-                        label="Canción a convertir (WAV o MP3)",
-                        type="filepath",
-                        sources=["upload"],
-                    )
-                    with gr.Accordion("Parámetros avanzados", open=False):
-                        convert_pitch = gr.Slider(
-                            minimum=-24,
-                            maximum=24,
-                            value=0,
-                            step=1,
-                            label="Transposición (semitonos)",
-                        )
-                        convert_similarity = gr.Slider(
-                            minimum=0.0,
-                            maximum=1.0,
-                            value=0.7,
-                            step=0.05,
-                            label="Similitud de voz (0.5=natural, 0.7=equilibrado, 0.9=más fiel)",
-                        )
-                        convert_diffusion = gr.Slider(
-                            minimum=5,
-                            maximum=100,
-                            value=25,
-                            step=5,
-                            label="Calidad (10=rápido, 25=equilibrado, 50=alta calidad)",
-                        )
-                        convert_vocal_vol = gr.Slider(
-                            minimum=0.0,
-                            maximum=2.0,
-                            value=1.0,
-                            step=0.1,
-                            label="Volumen de la voz",
-                        )
-                        convert_inst_vol = gr.Slider(
-                            minimum=0.0,
-                            maximum=2.0,
-                            value=1.0,
-                            step=0.1,
-                            label="Volumen de los instrumentos",
-                        )
-                    convert_btn = gr.Button(
-                        "Convertir y mezclar",
-                        variant="primary",
-                        size="lg",
-                    )
-                with gr.Column(scale=1):
-                    convert_status = gr.Textbox(
-                        label="Estado",
-                        interactive=False,
-                        lines=3,
-                    )
-                    gr.Markdown("**Vista previa de las pistas:**")
-                    preview_vocals = gr.Audio(
-                        label="Voz original (separada)",
-                        interactive=False,
-                    )
-                    preview_converted = gr.Audio(
-                        label="Voz convertida",
-                        interactive=False,
-                    )
-                    gr.Markdown("**Resultado final:**")
-                    final_output = gr.Audio(
-                        label="Canción final (voz + instrumentos)",
-                        interactive=False,
-                    )
-            refresh_btn.click(
-                fn=lambda: gr.Dropdown(choices=get_model_choices()),
-                outputs=[convert_model],
-            )
-            convert_btn.click(
-                fn=convert_song,
-                inputs=[
-                    convert_model,
-                    convert_audio,
-                    convert_pitch,
-                    convert_similarity,
-                    convert_diffusion,
-                    convert_vocal_vol,
-                    convert_inst_vol,
-                ],
-                outputs=[convert_status, preview_vocals, preview_converted, final_output],
-            )
-        # Pestaña 3: Modelos
-        with gr.TabItem("Mis modelos"):
-            gr.Markdown("### Gestionar tus perfiles de voz")
-            models_table = gr.HTML(
-                value=refresh_models(),
-                label="Modelos guardados",
-            )
-            with gr.Row():
-                models_refresh_btn = gr.Button("Actualizar", size="sm")
-                models_delete_name = gr.Dropdown(
-                    choices=get_model_choices(),
-                    label="Modelo a eliminar",
-                    interactive=True,
-                )
-                models_delete_btn = gr.Button("Eliminar", variant="stop", size="sm")
-            models_delete_status = gr.Textbox(label="Estado", interactive=False)
-            models_refresh_btn.click(
-                fn=refresh_models,
-                outputs=[models_table],
-            )
-            models_refresh_btn.click(
-                fn=lambda: gr.Dropdown(choices=get_model_choices()),
-                outputs=[models_delete_name],
-            )
-            models_delete_btn.click(
-                fn=delete_selected_model,
-                inputs=[models_delete_name],
-                outputs=[models_delete_status, models_table],
-            )
-        # Pestaña 4: Debug (temporal)
-        with gr.TabItem("Depuración GPU"):
-            gr.Markdown("### Logs del Trabajador GPU (para diagnóstico)")
-            debug_output = gr.Textbox(
-                label="Últimos logs de GPU",
-                interactive=False,
-                lines=20,
-            )
-            debug_btn = gr.Button("Leer los logs", size="sm")
-            def read_debug_log():
-                log_path = "/home/user/app/debug_gpu.log"
-                if os.path.exists(log_path):
-                    with open(log_path, "r") as f:
-                        return f.read()
-                return "Ningún log disponible. Ejecuta una conversión primero."
-            debug_btn.click(fn=read_debug_log, outputs=[debug_output])
-if __name__ == "__main__":
-    os.makedirs("./results", exist_ok=True)
-    os.makedirs("./checkpoints/models", exist_ok=True)
-    app.launch(
-        allowed_paths=[
-            os.path.abspath("./results"),
-            os.path.abspath("./checkpoints"),
-        ]
-    )

+import os
+import sys
+import logging
+import tempfile
+import shutil
+import gradio as gr
+import gc
+import time
+# Logging configuration (set up first so a failed patch below can be reported).
+logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
+logger = logging.getLogger(__name__)
+# Gradio patches: make gradio_client's schema helpers answer "Any" for a
+# non-dict schema instead of handing it to the original implementation.
+try:
+    import gradio_client.utils as _gc_utils
+    _orig_get_type = _gc_utils.get_type
+    def _patched_get_type(schema, *args, **kwargs):
+        """Return "Any" for a non-dict schema, else defer to the original get_type."""
+        if not isinstance(schema, dict):
+            return "Any"
+        return _orig_get_type(schema, *args, **kwargs)
+    _gc_utils.get_type = _patched_get_type
+    _orig_json_schema = _gc_utils._json_schema_to_python_type
+    def _patched_json_schema(schema, *args, **kwargs):
+        """Return "Any" for a non-dict schema, else defer to the original converter."""
+        if not isinstance(schema, dict):
+            return "Any"
+        return _orig_json_schema(schema, *args, **kwargs)
+    _gc_utils._json_schema_to_python_type = _patched_json_schema
+    _gc_utils.json_schema_to_python_type = lambda schema, defs=None: _patched_json_schema(schema, defs)
+except Exception:
+    # Best effort: the app still starts without the patch, but the reason it
+    # was skipped is recorded instead of being silently discarded.
+    logger.warning("No se pudo aplicar el parche de gradio_client", exc_info=True)
+from pipeline.setup import setup_seed_vc
+from pipeline.storage import init_storage, list_models, download_model, delete_model, get_reference_path
+from pipeline.training import save_voice_reference
+from pipeline.separation import _separate_audio_impl
+from pipeline.inference import _convert_voice_impl
+from pipeline.mixing import mix_audio
+try:
+    import spaces
+except ImportError:
+    class spaces:
+        """No-op stand-in used when the `spaces` package cannot be imported."""
+        @staticmethod
+        def GPU(*args, duration=60, **kwargs):
+            """Return the decorated function unchanged.
+
+            Works both as a bare decorator (`@spaces.GPU`) and as a decorator
+            factory (`@spaces.GPU(duration=600)`); `duration` and any other
+            options are accepted and ignored.
+            """
+            def decorator(fn):
+                return fn
+            # Bare usage: the function itself arrives as the only positional argument.
+            if len(args) == 1 and callable(args[0]) and not kwargs:
+                return args[0]
+            return decorator
+def check_file(path, label, logs, min_size=44):
+    """Append a diagnostic line about `path` to `logs` and report whether it is usable.
+
+    Args:
+        path: File path to inspect; a falsy value is treated as a missing file.
+        label: Human-readable name of the artifact, used in the log line.
+        logs: List of log lines; exactly one line is appended per call.
+        min_size: Size in bytes the file must exceed. The default of 44 is the
+            size of a WAV header, so a header-only file is rejected.
+
+    Returns:
+        True only when the file exists and is larger than `min_size` bytes.
+    """
+    if not path or not os.path.exists(path):
+        logs.append(f"❌ ERROR: {label} NO se encontró en {path}")
+        return False
+    size = os.path.getsize(path)
+    if size <= min_size:
+        # Do not log a success mark for a file that is then reported as unusable.
+        logs.append(f"❌ ERROR: {label} está vacío o incompleto: {os.path.basename(path)} ({size} bytes)")
+        return False
+    logs.append(f"✅ {label} generado: {os.path.basename(path)} ({size} bytes)")
+    return True
+@spaces.GPU(duration=600)
+def _full_pipeline_gpu(song_file, reference_path, pitch, diffusion_steps, similarity,
+                        vocal_volume, instrumental_volume):
+    """Run separation, voice conversion and mixing in one GPU-decorated call.
+
+    Args:
+        song_file: Path of the song; passed to _separate_audio_impl.
+        reference_path: Path of the voice reference; passed to _convert_voice_impl.
+        pitch: Pitch setting, cast to int.
+        diffusion_steps: Diffusion step count, cast to int.
+        similarity: Similarity setting, cast to float.
+        vocal_volume: Vocal gain for mix_audio, cast to float.
+        instrumental_volume: Instrumental gain for mix_audio, cast to float.
+
+    Returns:
+        A 4-tuple (vocals, converted, final, log_text). Each audio entry is a
+        (sample_rate, samples) pair loaded with librosa; when a step fails the
+        three audio entries are None. log_text is the step log joined by
+        newlines. Exceptions raised by the pipeline steps are captured into
+        log_text rather than propagated.
+    """
+    import torch
+    import librosa
+    logs = ["🚀 Iniciando pipeline en GPU..."]
+    def failure():
+        # Every failure path returns no audio plus whatever was logged so far.
+        return None, None, None, "\n".join(logs)
+    def release_gpu_memory():
+        # Collect garbage first so tensors kept alive only by unreachable
+        # reference cycles are freed before the CUDA cache is emptied.
+        gc.collect()
+        torch.cuda.empty_cache()
+    def safe_load(p):
+        # Load at the file's native sample rate and flag an empty result.
+        data, sr = librosa.load(p, sr=None)
+        if data.size == 0:
+            logs.append(f"⚠️ Advertencia: El archivo {os.path.basename(p)} se cargó como un array VACÍO.")
+        return (sr, data)
+    # Make the app directory the working directory.
+    app_dir = os.path.dirname(os.path.abspath(__file__))
+    os.chdir(app_dir)
+    try:
+        # 1. Separation (Demucs)
+        logs.append("⏳ Paso 1/3: Separando voces e instrumentos (Demucs)...")
+        vocals_path, instruments_path = _separate_audio_impl(song_file)
+        if not check_file(vocals_path, "Vocales", logs):
+            return failure()
+        # The instrumental stem feeds the mix step, so validate it here as well.
+        if not check_file(instruments_path, "Instrumental", logs):
+            return failure()
+        release_gpu_memory()
+        # 2. Conversion (Seed-VC)
+        logs.append("⏳ Paso 2/3: Convirtiendo voz (Seed-VC)...")
+        converted_path = _convert_voice_impl(
+            audio_path=vocals_path,
+            reference_path=reference_path,
+            pitch=int(pitch),
+            diffusion_steps=int(diffusion_steps),
+            similarity=float(similarity),
+        )
+        if not check_file(converted_path, "Voz convertida", logs):
+            return failure()
+        release_gpu_memory()
+        # 3. Mix
+        logs.append("⏳ Paso 3/3: Mezclando pistas finales...")
+        final_path = mix_audio(
+            vocals_path=converted_path,
+            instruments_path=instruments_path,
+            vocal_volume=float(vocal_volume),
+            instrumental_volume=float(instrumental_volume),
+        )
+        if not check_file(final_path, "Resultado final", logs):
+            return failure()
+        # 4. Return audio data instead of file paths (the original comment
+        #    describes this as a bypass for ZeroGPU filesystem sync).
+        logs.append("📦 Cargando audios para salida...")
+        v_out = safe_load(vocals_path)
+        c_out = safe_load(converted_path)
+        f_out = safe_load(final_path)
+        logs.append("✨ Pipeline completado con éxito.")
+        return v_out, c_out, f_out, "\n".join(logs)
+    except Exception as e:
+        import traceback
+        error_msg = f"💥 Error en Pipeline GPU: {str(e)}\n{traceback.format_exc()}"
+        logs.append(error_msg)
+        return failure()
+def train_voice_model(audio_file, model_name, progress=gr.Progress()):
+    """Save a voice reference under `model_name` and report the outcome.
+
+    Args:
+        audio_file: Path of the uploaded audio, or None when nothing was uploaded.
+        model_name: Profile name typed by the user; surrounding whitespace is
+            stripped and inner spaces become underscores.
+        progress: Gradio progress tracker.
+
+    Returns:
+        A (status_message, reference_path) pair; reference_path is None on error.
+    """
+    if audio_file is None:
+        return "Error: Sube un audio.", None
+    # Normalize before validating so a whitespace-only name is rejected
+    # instead of becoming an empty profile name.
+    model_name = (model_name or "").strip().replace(" ", "_")
+    if not model_name:
+        return "Error: Ponle un nombre.", None
+    try:
+        progress(0.1, desc="Guardando referencia...")
+        _pth_path, ref_path = save_voice_reference(audio_path=audio_file, model_name=model_name)
+        return f"¡Perfil '{model_name}' guardado!", ref_path
+    except Exception as e:
+        # Keep the full traceback in the server log; the UI only gets the message.
+        logger.exception("Error al guardar la referencia de voz")
+        return f"Error: {str(e)}", None
+def convert_song(model_choice, song_file, pitch, similarity, diffusion_steps, vocal_volume, instrumental_volume, progress=gr.Progress()):
+    """Validate the inputs, run the GPU pipeline and shape its result for the UI.
+
+    Args:
+        model_choice: Selected profile name; the "(ningún modelo)" placeholder is rejected.
+        song_file: Path of the song to convert.
+        pitch, similarity, diffusion_steps, vocal_volume, instrumental_volume:
+            Slider values forwarded to _full_pipeline_gpu.
+        progress: Gradio progress tracker.
+
+    Returns:
+        A 5-tuple (status, vocals, converted, final, log_text); the three audio
+        entries are None whenever validation or the pipeline fails.
+    """
+    if not song_file:
+        return "Error: Sube una canción.", None, None, None, "Esperando..."
+    if not model_choice or model_choice == "(ningún modelo)":
+        return "Error: Elige un perfil.", None, None, None, "Esperando..."
+    try:
+        progress(0.1, desc="Preparando archivos...")
+        reference_path = get_reference_path(model_choice)
+        if not reference_path:
+            return f"Error: No hay referencia para {model_choice}", None, None, None, "Error de modelo"
+        # Keyword arguments guard against the differing parameter order
+        # between this function and the pipeline (similarity/diffusion_steps).
+        v_out, c_out, f_out, logs = _full_pipeline_gpu(
+            song_file=song_file,
+            reference_path=reference_path,
+            pitch=pitch,
+            diffusion_steps=diffusion_steps,
+            similarity=similarity,
+            vocal_volume=vocal_volume,
+            instrumental_volume=instrumental_volume,
+        )
+        status = "✅ Completado" if f_out is not None else "❌ Falló"
+        return status, v_out, c_out, f_out, logs
+    except Exception as e:
+        import traceback
+        # Record the failure server-side as well as returning it to the UI.
+        logger.exception("Error en la conversión")
+        return f"Error: {str(e)}", None, None, None, traceback.format_exc()
+# --- UI ---
+def _model_choices():
+    """Return the stored profile names, or the placeholder entry when there are none."""
+    return list_models() or ["(ningún modelo)"]
+def _models_summary():
+    """Return the text shown in the management tab, with profile names HTML-escaped."""
+    import html
+    # Profile names originate from user input, so escape them before they reach gr.HTML.
+    names = [html.escape(str(name)) for name in list_models()]
+    return f"Modelos: {', '.join(names)}"
+def _delete_profile(name):
+    """Delete the selected profile and return (status, summary, dropdown update, dropdown update)."""
+    if not name or name == "(ningún modelo)":
+        status = "Selecciona un perfil para eliminar."
+    else:
+        try:
+            delete_model(name)
+            status = f"Perfil '{name}' eliminado."
+        except Exception as e:
+            logger.exception("Error al eliminar el perfil")
+            status = f"Error: {str(e)}"
+    # Both dropdowns are refreshed so neither keeps offering a deleted profile.
+    return status, _models_summary(), gr.Dropdown(choices=_model_choices()), gr.Dropdown(choices=_model_choices())
+with gr.Blocks(title="Voice Clone RVC/Seed-VC", theme=gr.themes.Soft()) as app:
+    gr.Markdown("# 🎤 Clonación de Voz Profesional (Seed-VC + ZeroGPU)")
+    with gr.Tabs():
+        # Tab 1: save a voice reference
+        with gr.TabItem("1. Crear Perfil"):
+            with gr.Row():
+                with gr.Column():
+                    train_audio = gr.Audio(label="Tu voz (3-30 seg)", type="filepath")
+                    train_name = gr.Textbox(label="Nombre del perfil", placeholder="ej: mi_voz")
+                    train_btn = gr.Button("Guardar Referencia", variant="primary")
+                with gr.Column():
+                    train_status = gr.Textbox(label="Estado", interactive=False)
+                    train_file = gr.File(label="Archivo .pth")
+            train_btn.click(train_voice_model, [train_audio, train_name], [train_status, train_file])
+        # Tab 2: convert a song
+        with gr.TabItem("2. Convertir Canción"):
+            with gr.Row():
+                with gr.Column(scale=2):
+                    model_sel = gr.Dropdown(choices=_model_choices(), label="Selecciona tu voz")
+                    refresh_btn = gr.Button("🔄 Actualizar lista", size="sm")
+                    song_input = gr.Audio(label="Canción original", type="filepath")
+                    with gr.Accordion("Ajustes", open=False):
+                        pitch_shift = gr.Slider(-12, 12, 0, step=1, label="Tono (Pitch)")
+                        sim_slider = gr.Slider(0, 1, 0.7, step=0.1, label="Fidelidad")
+                        diff_steps = gr.Slider(5, 50, 25, step=5, label="Pasos Difusión")
+                        v_vol = gr.Slider(0, 2, 1, step=0.1, label="Volumen Voz")
+                        i_vol = gr.Slider(0, 2, 1, step=0.1, label="Volumen Música")
+                    convert_btn = gr.Button("🚀 Iniciar Proceso", variant="primary", size="lg")
+                with gr.Column(scale=3):
+                    conv_status = gr.Textbox(label="Estado")
+                    with gr.Row():
+                        out_vocals = gr.Audio(label="Voz Original")
+                        out_conv = gr.Audio(label="Voz Clonada")
+                    out_final = gr.Audio(label="Resultado Final (Mix)")
+                    debug_logs = gr.Textbox(label="🔍 Logs Detallados", lines=15)
+            # Refresh uses the same placeholder fallback as the initial choices.
+            refresh_btn.click(lambda: gr.Dropdown(choices=_model_choices()), outputs=model_sel)
+            convert_btn.click(convert_song,
+                             [model_sel, song_input, pitch_shift, sim_slider, diff_steps, v_vol, i_vol],
+                             [conv_status, out_vocals, out_conv, out_final, debug_logs])
+        # Tab 3: list and delete profiles
+        with gr.TabItem("3. Gestión"):
+            models_list = gr.HTML(value="Cargando...")
+            del_sel = gr.Dropdown(choices=_model_choices(), label="Perfil a eliminar")
+            del_btn = gr.Button("Eliminar Seleccionado", variant="stop")
+            del_status = gr.Textbox(label="Estado", interactive=False)
+            # The delete button needs a selector and a handler to have any effect.
+            del_btn.click(_delete_profile, [del_sel], [del_status, models_list, del_sel, model_sel])
+            app.load(_models_summary, outputs=models_list)
+# Script entry point: call setup_seed_vc() and then launch the Gradio app.
+if __name__ == "__main__":
+    # NOTE(review): init_storage is imported in this version but never called,
+    # and setup_seed_vc() is no longer wrapped in a try/except — confirm both
+    # are intentional.
+    setup_seed_vc()
+    app.launch()

pipeline/inference.py CHANGED Viewed

@@ -434,7 +434,15 @@ def _convert_voice_core(audio_path, reference_path, pitch, diffusion_steps, simi
             processed_frames += vc_target.size(2) - overlap_frame_len
     # Concatenate and normalize to -18 dBFS RMS (standard vocal level before mixing)
-    audio_out = np.concatenate(generated_wave_chunks)
     rms = np.sqrt(np.mean(audio_out ** 2))
     target_rms = 10 ** (-18.0 / 20.0)  # -18 dBFS
     if rms > 1e-6:
@@ -444,5 +452,5 @@ def _convert_voice_core(audio_path, reference_path, pitch, diffusion_steps, simi
     # Save
     sf.write(output_path, audio_out, sr, subtype="PCM_16")
-    logger.info("Conversion complete: {} ({:.1f}s)".format(output_path, len(audio_out) / sr))
     return output_path

             processed_frames += vc_target.size(2) - overlap_frame_len
     # Concatenate and normalize to -18 dBFS RMS (standard vocal level before mixing)
+    if not generated_wave_chunks:
+        logger.error("No audio chunks were generated by Seed-VC!")
+        # Create a tiny silence buffer to avoid crash but indicate failure
+        audio_out = np.zeros(sr)
+    else:
+        audio_out = np.concatenate(generated_wave_chunks)
+    logger.info(f"Concatenated {len(generated_wave_chunks)} chunks. Total samples: {len(audio_out)}")
     rms = np.sqrt(np.mean(audio_out ** 2))
     target_rms = 10 ** (-18.0 / 20.0)  # -18 dBFS
     if rms > 1e-6:
     # Save
     sf.write(output_path, audio_out, sr, subtype="PCM_16")
+    logger.info("Conversion complete: {} ({:.1f}s, {} samples)".format(output_path, len(audio_out) / sr, len(audio_out)))
     return output_path

pipeline/separation.py CHANGED Viewed

@@ -91,9 +91,17 @@ def _separate_audio_impl(audio_path: str, model_name: str = "htdemucs_ft"):
     vocals_path = os.path.join(OUTPUT_DIR, f"{base_name}_vocals.wav")
     instruments_path = os.path.join(OUTPUT_DIR, f"{base_name}_instruments.wav")
     torchaudio.save(vocals_path, vocals, sr)
     torchaudio.save(instruments_path, instruments, sr)
     logger.info(f"Separation complete. Vocals: {vocals_path}, Instruments: {instruments_path}")
     return vocals_path, instruments_path

     vocals_path = os.path.join(OUTPUT_DIR, f"{base_name}_vocals.wav")
     instruments_path = os.path.join(OUTPUT_DIR, f"{base_name}_instruments.wav")
+    logger.info(f"Saving separated vocals to {vocals_path} (shape: {vocals.shape})")
+    if vocals.numel() == 0:
+        logger.error("Vocals tensor is EMPTY!")
     torchaudio.save(vocals_path, vocals, sr)
     torchaudio.save(instruments_path, instruments, sr)
+    # Cleanup GPU memory
+    del sources, model
+    torch.cuda.empty_cache()
     logger.info(f"Separation complete. Vocals: {vocals_path}, Instruments: {instruments_path}")
     return vocals_path, instruments_path