rvc

Sleeping

File size: 4,776 Bytes

2376414
fea49f2
 
 
2376414
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fea49f2
 
 
 
2376414
fea49f2
2376414
 
fea49f2
 
 
 
2376414
 
fea49f2
2376414
fea49f2
 
2376414
fea49f2
 
 
 
2376414
fea49f2
 
27bc094
fea49f2
 
 
27bc094
fea49f2
27bc094
fea49f2
 
27bc094
fea49f2
 
27bc094
fea49f2
 
27bc094
fea49f2
 
 
 
27bc094
2376414
969158e
2376414
969158e
 
fea49f2
 
55b9bab
969158e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d2806ea
fea49f2
2376414
259efa9
2376414
 
 
fea49f2
 
55b9bab
fea49f2
 
 
 
 
 
 
 
 
2376414
 
fea49f2
2376414
fea49f2
2376414
fea49f2
2376414
fea49f2
2376414
 
fea49f2
 
 
 
2376414
fea49f2

"""
Voice model creation: save a reference audio clip for Seed-VC zero-shot conversion.
No neural network training needed - Seed-VC uses in-context learning from
reference audio at inference time.
"""

import os
import logging
import shutil

logger = logging.getLogger(__name__)

try:
    import spaces
except ImportError:
    # The `spaces` package could not be imported: provide a stand-in whose
    # GPU decorator leaves the decorated function untouched.
    class spaces:
        """No-op stand-in for the `spaces` module exposing only `GPU`."""

        @staticmethod
        def GPU(func=None, duration=60, **kwargs):
            """Return `func` unchanged, or a pass-through decorator.

            Supports both the bare form (``@spaces.GPU``) and the
            parameterized form (``@spaces.GPU(duration=...)``). `duration`
            and any extra keyword options are accepted and ignored.
            """
            if callable(func):
                # Bare usage: the decorated function arrives positionally.
                return func

            def decorator(fn):
                return fn

            return decorator


# Dummy GPU-decorated function so ZeroGPU detects a GPU function at startup
@spaces.GPU(duration=10)
def _gpu_warmup():
    """Return whether CUDA is available; a minimal GPU-decorated function."""
    import torch

    cuda = torch.cuda
    # Defensive: report False if this torch build lacks the availability probe.
    if not hasattr(cuda, "is_available"):
        return False
    return cuda.is_available()


def save_voice_reference(
    audio_path,
    model_name,
    progress_callback=None,
):
    """
    Save a voice reference audio clip as the user's 'voice model'.

    With Seed-VC, no training is needed. The reference audio (3-30 seconds)
    is used directly at inference time for zero-shot voice conversion.

    Processing steps, in order:
      1. Load the recording as mono at 44.1 kHz.
      2. Reject recordings shorter than 2 seconds.
      3. Trim leading/trailing silence (kept only if more than 2 s remain).
      4. Cap the clip at 25 seconds.
      5. High-pass filter, light compression and +1 dB gain via pedalboard
         (skipped with a warning if that step fails for any reason).
      6. RMS-normalize to -16 dBFS and clip samples to [-0.99, 0.99].
      7. Write `<model_name>_ref.wav` (16-bit PCM) and a `<model_name>.pth`
         marker into `LOCAL_MODELS_DIR/<model_name>/`.
      8. Upload via `upload_model`; upload failures are logged, not raised.

    Args:
        audio_path: Path to the uploaded voice recording
        model_name: Name for the voice model; also used as the directory
            and file-name stem of the saved files
        progress_callback: Optional callable invoked as
            `progress_callback(fraction, message)` at each stage

    Returns:
        (marker_path, reference_path) - path to the `.pth` marker file and
        path to the saved reference WAV

    Raises:
        RuntimeError: If the loaded audio is shorter than 2 seconds.
    """
    import librosa
    import soundfile as sf
    import numpy as np

    from pipeline.storage import LOCAL_MODELS_DIR, upload_model

    # Single source of truth for the rate used to load, save and describe
    # the reference clip.
    sample_rate = 44100

    def _report(fraction, message):
        # Progress reporting is optional; do nothing when no callback is set.
        if progress_callback:
            progress_callback(fraction, message)

    _report(0.1, "Chargement de l'audio...")

    # Load and preprocess the reference audio
    audio, sr = librosa.load(audio_path, sr=sample_rate, mono=True)

    duration = len(audio) / sr
    logger.info("Reference audio: {:.1f}s at {}Hz".format(duration, sr))

    # Hard floor is 2 s; the user-facing message recommends 3 s.
    if duration < 2.0:
        raise RuntimeError(
            "Audio trop court ({:.1f}s). Minimum 3 secondes recommande.".format(duration)
        )

    _report(0.3, "Optimisation de la reference vocale...")

    # 1. Trim silence from start and end (aggressive: top_db=20).
    # The trimmed version is only adopted if more than 2 s of audio remain.
    audio_trimmed, _ = librosa.effects.trim(audio, top_db=20)
    if len(audio_trimmed) > sr * 2:
        audio = audio_trimmed

    # 2. Limit to 25 seconds (Seed-VC clips reference to 25s internally)
    max_samples = 25 * sr
    if len(audio) > max_samples:
        audio = audio[:max_samples]
        logger.info("Trimmed reference to 25s (Seed-VC effective max).")

    # 3. Remove low-frequency noise (high-pass filter at 80Hz)
    # Best-effort: any failure here (including a missing pedalboard package)
    # leaves the audio unprocessed and only logs a warning.
    try:
        from pedalboard import Pedalboard, HighpassFilter, Compressor, Gain
        ref_board = Pedalboard([
            HighpassFilter(cutoff_frequency_hz=80.0),
            # Light compression to even out the reference voice level
            Compressor(threshold_db=-20.0, ratio=2.0, attack_ms=10.0, release_ms=150.0),
            Gain(gain_db=1.0),
        ])
        # The board is fed a (1, n_samples) float32 array.
        audio_2d = audio.reshape(1, -1).astype(np.float32)
        audio_2d = ref_board(audio_2d, sr)
        audio = audio_2d.squeeze()
    except Exception as e:
        logger.warning("Pedalboard processing skipped: {}".format(e))

    # 4. RMS normalize to -16 dBFS (slightly louder than converted vocals
    # to give the speaker embedding model a strong signal)
    rms = np.sqrt(np.mean(audio ** 2))
    target_rms = 10 ** (-16.0 / 20.0)
    if rms > 1e-6:  # leave (near-)silent audio unscaled
        audio = audio * (target_rms / rms)
    audio = np.clip(audio, -0.99, 0.99)

    _report(0.6, "Sauvegarde de la reference vocale...")

    # Save to local models directory
    # NOTE(review): model_name is joined into the path verbatim; confirm
    # callers restrict it to a safe file name.
    reference_filename = "{}_ref.wav".format(model_name)
    local_model_dir = os.path.join(LOCAL_MODELS_DIR, model_name)
    os.makedirs(local_model_dir, exist_ok=True)

    reference_path = os.path.join(local_model_dir, reference_filename)
    sf.write(reference_path, audio, sample_rate, subtype="PCM_16")

    # Duration of the processed clip, reused for the marker and the final log.
    final_duration = len(audio) / sr

    # Also save a .pth marker for compatibility with storage/listing
    import torch
    marker_path = os.path.join(local_model_dir, "{}.pth".format(model_name))
    torch.save({
        "type": "seed_vc_reference",
        "reference_audio": reference_filename,
        "duration": final_duration,
        "sample_rate": sample_rate,
    }, marker_path)

    _report(0.8, "Upload vers HuggingFace...")

    # Upload to HF (non-critical: a failure must not lose the local files)
    try:
        upload_model(model_name, marker_path, reference_path=reference_path)
    except Exception as e:
        logger.warning("Failed to upload to HF (non-critical): {}".format(e))

    _report(1.0, "Reference vocale sauvegardee !")

    logger.info("Voice reference saved: {} ({:.1f}s)".format(reference_path, final_duration))

    return marker_path, reference_path