"""
Voice model creation: save a reference audio clip for Seed-VC zero-shot conversion.

No neural network training needed - Seed-VC uses in-context learning
from reference audio at inference time.
"""

import os
import logging
import shutil

logger = logging.getLogger(__name__)

try:
    import spaces
except ImportError:
    # Fallback when the `spaces` package is not installed: `spaces.GPU(...)`
    # becomes a decorator factory that returns the function unchanged.
    class spaces:
        @staticmethod
        def GPU(duration=60, **kwargs):
            def decorator(fn):
                return fn
            return decorator


# Sample rate used for loading, saving and describing the reference clip.
_REFERENCE_SAMPLE_RATE = 44100
# Hard lower bound on the reference duration, in seconds.
_MIN_DURATION_S = 2.0
# Upper bound on the saved reference, in seconds
# (Seed-VC clips the reference to 25s internally).
_MAX_REFERENCE_S = 25
# RMS level the saved reference is normalized to.
_TARGET_RMS_DBFS = -16.0


# Dummy GPU-decorated function so ZeroGPU detects a GPU function at startup
@spaces.GPU(duration=10)
def _gpu_warmup():
    """Minimal GPU function for ZeroGPU detection.

    Returns:
        The result of ``torch.cuda.is_available()``, or False when
        ``torch.cuda`` does not expose that function.
    """
    import torch
    return torch.cuda.is_available() if hasattr(torch.cuda, "is_available") else False


def _validate_model_name(model_name):
    """Reject model names that cannot be used as a single path component.

    The name is used both as a directory name and as a file-name prefix, so
    an empty name, "." / "..", or a name containing a path separator would
    place the files outside the model's own directory (or fail later with an
    obscure write error).

    Raises:
        RuntimeError: if the name is empty, "." / "..", or contains a path
            separator. RuntimeError matches the other input-validation error
            raised by ``save_voice_reference``.
    """
    separators = [sep for sep in (os.sep, os.altsep) if sep]
    if (
        not model_name
        or model_name in (".", "..")
        or any(sep in model_name for sep in separators)
    ):
        raise RuntimeError(
            "Nom de modele invalide: {!r}".format(model_name)
        )


def save_voice_reference(
    audio_path,
    model_name,
    progress_callback=None,
):
    """
    Save a voice reference audio clip as the user's 'voice model'.

    With Seed-VC, no training is needed. The reference audio (3-30 seconds)
    is used directly at inference time for zero-shot voice conversion.

    The clip is loaded as mono at 44.1 kHz, silence-trimmed, limited to
    25 seconds, optionally filtered/compressed with pedalboard, RMS-normalized
    to -16 dBFS and written as 16-bit PCM WAV. A small ``.pth`` marker file
    describing the reference is written next to it, and both are uploaded
    (best effort) through ``pipeline.storage.upload_model``.

    Args:
        audio_path: Path to the uploaded voice recording
        model_name: Name for the voice model; must be usable as a single
            path component (non-empty, no path separators, not "." / "..")
        progress_callback: Optional callback ``(fraction, message)`` for
            progress updates

    Returns:
        (marker_path, reference_path) - path to the ``.pth`` marker file and
        path to the saved reference audio

    Raises:
        RuntimeError: if ``model_name`` is not a valid single path component,
            or if the recording is shorter than 2 seconds.
    """
    # Validate before the heavy imports and audio decoding so that a bad
    # name fails fast and never touches the filesystem.
    _validate_model_name(model_name)

    import librosa
    import soundfile as sf
    import numpy as np
    from pipeline.storage import LOCAL_MODELS_DIR, upload_model

    def report(fraction, message):
        if progress_callback:
            progress_callback(fraction, message)

    report(0.1, "Chargement de l'audio...")

    # Load and preprocess the reference audio
    audio, sr = librosa.load(audio_path, sr=_REFERENCE_SAMPLE_RATE, mono=True)
    duration = len(audio) / sr
    logger.info("Reference audio: {:.1f}s at {}Hz".format(duration, sr))

    # The hard floor is 2 seconds; the message recommends 3 seconds.
    if duration < _MIN_DURATION_S:
        raise RuntimeError(
            "Audio trop court ({:.1f}s). Minimum 3 secondes recommande.".format(duration)
        )

    report(0.3, "Optimisation de la reference vocale...")

    # 1. Trim silence from start and end (aggressive: top_db=20).
    #    The trimmed version is only kept if more than 2 seconds remain.
    audio_trimmed, _ = librosa.effects.trim(audio, top_db=20)
    if len(audio_trimmed) > sr * 2:
        audio = audio_trimmed

    # 2. Limit to 25 seconds (Seed-VC clips reference to 25s internally)
    max_samples = _MAX_REFERENCE_S * sr
    if len(audio) > max_samples:
        audio = audio[:max_samples]
        logger.info("Trimmed reference to 25s (Seed-VC effective max).")

    # 3. Remove low-frequency noise (high-pass filter at 80Hz).
    #    Best effort: any failure (including pedalboard not being installed)
    #    is logged and the unprocessed audio is used instead.
    try:
        from pedalboard import Pedalboard, HighpassFilter, Compressor, Gain
        ref_board = Pedalboard([
            HighpassFilter(cutoff_frequency_hz=80.0),
            # Light compression to even out the reference voice level
            Compressor(threshold_db=-20.0, ratio=2.0, attack_ms=10.0, release_ms=150.0),
            Gain(gain_db=1.0),
        ])
        audio_2d = audio.reshape(1, -1).astype(np.float32)
        audio_2d = ref_board(audio_2d, sr)
        audio = audio_2d.squeeze()
    except Exception as e:
        logger.warning("Pedalboard processing skipped: {}".format(e))

    # 4. RMS normalize to -16 dBFS (slightly louder than converted vocals
    #    to give the speaker embedding model a strong signal)
    rms = np.sqrt(np.mean(audio ** 2))
    target_rms = 10 ** (_TARGET_RMS_DBFS / 20.0)
    if rms > 1e-6:  # skip (near-)silent audio to avoid a huge gain
        audio = audio * (target_rms / rms)
    audio = np.clip(audio, -0.99, 0.99)

    report(0.6, "Sauvegarde de la reference vocale...")

    # Save to local models directory
    local_model_dir = os.path.join(LOCAL_MODELS_DIR, model_name)
    os.makedirs(local_model_dir, exist_ok=True)

    reference_name = "{}_ref.wav".format(model_name)
    reference_path = os.path.join(local_model_dir, reference_name)
    sf.write(reference_path, audio, _REFERENCE_SAMPLE_RATE, subtype="PCM_16")

    final_duration = len(audio) / sr

    # Also save a .pth marker for compatibility with storage/listing
    import torch
    marker_path = os.path.join(local_model_dir, "{}.pth".format(model_name))
    torch.save({
        "type": "seed_vc_reference",
        "reference_audio": reference_name,
        "duration": final_duration,
        "sample_rate": _REFERENCE_SAMPLE_RATE,
    }, marker_path)

    report(0.8, "Upload vers HuggingFace...")

    # Upload to HF; a failure is logged but does not fail the save.
    try:
        upload_model(model_name, marker_path, reference_path=reference_path)
    except Exception as e:
        logger.warning("Failed to upload to HF (non-critical): {}".format(e))

    report(1.0, "Reference vocale sauvegardee !")

    logger.info("Voice reference saved: {} ({:.1f}s)".format(reference_path, final_duration))

    return marker_path, reference_path