"""
Voice model creation: save a reference audio clip for Seed-VC zero-shot conversion.
No neural network training needed - Seed-VC uses in-context learning from
reference audio at inference time.
"""

import os
import logging
import shutil

logger = logging.getLogger(__name__)

try:
    import spaces
except ImportError:
    class spaces:
        """No-op stand-in used when the ``spaces`` package cannot be imported."""

        @staticmethod
        def GPU(task=None, duration=60, **kwargs):
            """
            Pass-through replacement for the ``spaces.GPU`` decorator.

            Works both bare (``@spaces.GPU``) and called
            (``@spaces.GPU(duration=10)``); in either form the decorated
            function is returned unchanged. ``duration`` and any extra keyword
            arguments are accepted and ignored.
            """
            # Bare usage: the decorated function arrives as the first argument.
            if callable(task):
                return task

            def decorator(fn):
                return fn
            return decorator


# Dummy GPU-decorated function so ZeroGPU detects a GPU function at startup
@spaces.GPU(duration=10)
def _gpu_warmup():
    """Report CUDA availability; exists so a GPU-decorated function is defined at startup."""
    import torch

    cuda = torch.cuda
    # Guard against a torch.cuda namespace that lacks the availability probe.
    if not hasattr(cuda, "is_available"):
        return False
    return cuda.is_available()


def save_voice_reference(
    audio_path,
    model_name,
    progress_callback=None,
):
    """
    Save a voice reference audio clip as the user's 'voice model'.

    With Seed-VC, no training is needed. The reference audio (3-30 seconds)
    is used directly at inference time for zero-shot voice conversion.

    Processing steps, in order: load as mono 44.1 kHz, trim leading/trailing
    silence, truncate to 25 seconds, apply a high-pass/compressor/gain chain
    (skipped with a warning if pedalboard fails), RMS-normalize to -16 dBFS,
    hard-clip to [-0.99, 0.99], then write a 16-bit PCM WAV plus a ``.pth``
    marker file into ``LOCAL_MODELS_DIR/<model_name>/``. An upload through
    ``upload_model`` is attempted afterwards; a failure there is logged and
    does not abort the call.

    Args:
        audio_path: Path to the uploaded voice recording
        model_name: Name for the voice model; used as the directory name and
            as the file-name prefix of the saved files
        progress_callback: Optional callable invoked as
            ``progress_callback(fraction, message)`` for progress updates

    Returns:
        (marker_path, reference_path) - path to the ``.pth`` marker file and
        path to the saved reference WAV

    Raises:
        RuntimeError: If the loaded audio is shorter than 2 seconds.
    """
    import librosa
    import soundfile as sf
    import numpy as np

    from pipeline.storage import LOCAL_MODELS_DIR, upload_model

    # One rate for loading, writing and the marker metadata.
    sample_rate = 44100

    def report(fraction, message):
        # Progress reporting is optional; do nothing without a callback.
        if progress_callback:
            progress_callback(fraction, message)

    report(0.1, "Chargement de l'audio...")

    # Load and preprocess the reference audio
    audio, sr = librosa.load(audio_path, sr=sample_rate, mono=True)

    duration = len(audio) / sr
    logger.info("Reference audio: %.1fs at %sHz", duration, sr)

    if duration < 2.0:
        raise RuntimeError(
            "Audio trop court ({:.1f}s). Minimum 3 secondes recommande.".format(duration)
        )

    report(0.3, "Optimisation de la reference vocale...")

    # 1. Trim silence from start and end (aggressive: top_db=20).
    # The trimmed version is only kept if more than 2 seconds survive,
    # otherwise the untrimmed clip is used.
    audio_trimmed, _ = librosa.effects.trim(audio, top_db=20)
    if len(audio_trimmed) > sr * 2:
        audio = audio_trimmed

    # 2. Limit to 25 seconds (Seed-VC clips reference to 25s internally)
    max_samples = 25 * sr
    if len(audio) > max_samples:
        audio = audio[:max_samples]
        logger.info("Trimmed reference to 25s (Seed-VC effective max).")

    # 3. Remove low-frequency noise (high-pass filter at 80Hz).
    # Best-effort: any failure (including a missing pedalboard install)
    # leaves the audio as it was before this step.
    try:
        from pedalboard import Pedalboard, HighpassFilter, Compressor, Gain
        ref_board = Pedalboard([
            HighpassFilter(cutoff_frequency_hz=80.0),
            # Light compression to even out the reference voice level
            Compressor(threshold_db=-20.0, ratio=2.0, attack_ms=10.0, release_ms=150.0),
            Gain(gain_db=1.0),
        ])
        # The board is fed a (1, n_samples) float32 array.
        audio_2d = audio.reshape(1, -1).astype(np.float32)
        audio_2d = ref_board(audio_2d, sr)
        audio = audio_2d.squeeze()
    except Exception as e:
        logger.warning("Pedalboard processing skipped: %s", e)

    # 4. RMS normalize to -16 dBFS (slightly louder than converted vocals
    # to give the speaker embedding model a strong signal)
    rms = np.sqrt(np.mean(audio ** 2))
    target_rms = 10 ** (-16.0 / 20.0)
    if rms > 1e-6:
        audio = audio * (target_rms / rms)
    else:
        # Scaling near-silence would only amplify noise; keep the clip as-is
        # but leave a trace so an unusable reference can be diagnosed.
        logger.warning(
            "Reference audio is near-silent (RMS %.2e); level normalization skipped.",
            rms,
        )
    audio = np.clip(audio, -0.99, 0.99)

    final_duration = len(audio) / sr

    report(0.6, "Sauvegarde de la reference vocale...")

    # Save to local models directory
    # NOTE(review): model_name is joined into a filesystem path unchecked;
    # confirm callers reject empty names and path separators.
    local_model_dir = os.path.join(LOCAL_MODELS_DIR, model_name)
    os.makedirs(local_model_dir, exist_ok=True)

    reference_filename = "{}_ref.wav".format(model_name)
    reference_path = os.path.join(local_model_dir, reference_filename)
    sf.write(reference_path, audio, sample_rate, subtype="PCM_16")

    # Also save a .pth marker for compatibility with storage/listing
    import torch
    marker_path = os.path.join(local_model_dir, "{}.pth".format(model_name))
    torch.save({
        "type": "seed_vc_reference",
        "reference_audio": reference_filename,
        "duration": final_duration,
        "sample_rate": sample_rate,
    }, marker_path)

    report(0.8, "Upload vers HuggingFace...")

    # Upload to HF (non-critical: the local copy is already saved)
    try:
        upload_model(model_name, marker_path, reference_path=reference_path)
    except Exception as e:
        logger.warning("Failed to upload to HF (non-critical): %s", e)

    report(1.0, "Reference vocale sauvegardee !")

    logger.info("Voice reference saved: %s (%.1fs)", reference_path, final_duration)

    return marker_path, reference_path