"""
Voice model creation: save a reference audio clip for Seed-VC zero-shot conversion.
No neural network training needed - Seed-VC uses in-context learning from
reference audio at inference time.
"""
import os
import logging
import shutil
logger = logging.getLogger(__name__)
try:
    import spaces
except ImportError:
    class spaces:
        """No-op stand-in used when the ``spaces`` package is not installed.

        It lets ``@spaces.GPU`` decorators in this module keep working by
        returning the decorated function unchanged.
        """

        @staticmethod
        def GPU(func=None, duration=60, **kwargs):
            """Return *func* unchanged, or a pass-through decorator.

            Both decorator forms are supported:

            * bare: ``@spaces.GPU`` (the function arrives as ``func``)
            * parameterized: ``@spaces.GPU(duration=10)``

            ``duration`` and any extra keyword arguments are accepted and
            ignored. A non-callable first positional argument (for example
            an integer duration) is ignored as well.
            """
            if callable(func):
                # Bare usage: the decorated function was passed directly.
                return func

            def decorator(fn):
                return fn

            return decorator
# Placeholder GPU-decorated function: ZeroGPU needs to detect at least one
# GPU function at startup.
@spaces.GPU(duration=10)
def _gpu_warmup():
    """Return whether CUDA is available; exists only for ZeroGPU detection."""
    import torch

    cuda = torch.cuda
    if not hasattr(cuda, "is_available"):
        return False
    return cuda.is_available()
def save_voice_reference(
    audio_path,
    model_name,
    progress_callback=None,
):
    """
    Save a voice reference audio clip as the user's 'voice model'.

    With Seed-VC, no training is needed. The reference audio (3-30 seconds)
    is used directly at inference time for zero-shot voice conversion.

    Processing steps: load as 44.1 kHz mono, trim leading/trailing silence,
    cap at 25 seconds, apply an optional pedalboard chain (high-pass,
    light compression, gain), RMS-normalize to -16 dBFS, then write a
    16-bit PCM WAV plus a small ``.pth`` marker file. An upload to
    HuggingFace is attempted afterwards; a failure there is only logged.

    Args:
        audio_path: Path to the uploaded voice recording.
        model_name: Name for the voice model. Used as the sub-directory name
            under ``LOCAL_MODELS_DIR`` and as the file-name prefix.
        progress_callback: Optional callable ``(fraction, message)`` invoked
            at each stage with a progress value between 0 and 1.

    Returns:
        (marker_path, reference_path): path to the ``.pth`` marker file and
        path to the saved reference WAV.

    Raises:
        RuntimeError: If the loaded audio is shorter than 2 seconds.
    """
    import librosa
    import soundfile as sf
    import numpy as np
    from pipeline.storage import LOCAL_MODELS_DIR, upload_model

    # Single sample-rate value shared by loading, saving and the metadata.
    target_sr = 44100

    def report(fraction, message):
        """Forward a progress update if a callback was supplied."""
        if progress_callback:
            progress_callback(fraction, message)

    report(0.1, "Chargement de l'audio...")

    # Load and preprocess the reference audio
    audio, sr = librosa.load(audio_path, sr=target_sr, mono=True)
    duration = len(audio) / sr
    logger.info("Reference audio: %.1fs at %sHz", duration, sr)

    # Hard floor is 2 seconds; the message advertises the recommended 3.
    if duration < 2.0:
        raise RuntimeError(
            "Audio trop court ({:.1f}s). Minimum 3 secondes recommande.".format(duration)
        )

    report(0.3, "Optimisation de la reference vocale...")

    # 1. Trim silence from start and end (aggressive: top_db=20).
    #    Keep the untrimmed clip if trimming would leave 2 seconds or less.
    audio_trimmed, _ = librosa.effects.trim(audio, top_db=20)
    if len(audio_trimmed) > sr * 2:
        audio = audio_trimmed

    # 2. Limit to 25 seconds (Seed-VC clips reference to 25s internally)
    max_samples = 25 * sr
    if len(audio) > max_samples:
        audio = audio[:max_samples]
        logger.info("Trimmed reference to 25s (Seed-VC effective max).")

    # 3. Remove low-frequency noise (high-pass filter at 80Hz).
    #    pedalboard is optional: any failure here leaves the audio untouched.
    try:
        from pedalboard import Pedalboard, HighpassFilter, Compressor, Gain

        ref_board = Pedalboard([
            HighpassFilter(cutoff_frequency_hz=80.0),
            # Light compression to even out the reference voice level
            Compressor(threshold_db=-20.0, ratio=2.0, attack_ms=10.0, release_ms=150.0),
            Gain(gain_db=1.0),
        ])
        # The board is fed a (1, n_samples) float32 array.
        audio_2d = audio.reshape(1, -1).astype(np.float32)
        audio_2d = ref_board(audio_2d, sr)
        audio = audio_2d.squeeze()
    except Exception as e:
        logger.warning("Pedalboard processing skipped: %s", e)

    # 4. RMS normalize to -16 dBFS (slightly louder than converted vocals
    #    to give the speaker embedding model a strong signal)
    rms = np.sqrt(np.mean(audio ** 2))
    target_rms = 10 ** (-16.0 / 20.0)
    if rms > 1e-6:  # skip near-silent input to avoid a huge gain factor
        audio = audio * (target_rms / rms)
    audio = np.clip(audio, -0.99, 0.99)

    report(0.6, "Sauvegarde de la reference vocale...")

    # Save to local models directory.
    # NOTE(review): model_name is used verbatim as a path component; confirm
    # that callers pass a sanitized name (no path separators or "..").
    local_model_dir = os.path.join(LOCAL_MODELS_DIR, model_name)
    os.makedirs(local_model_dir, exist_ok=True)

    reference_filename = "{}_ref.wav".format(model_name)
    reference_path = os.path.join(local_model_dir, reference_filename)
    sf.write(reference_path, audio, target_sr, subtype="PCM_16")

    final_duration = len(audio) / sr

    # Also save a .pth marker for compatibility with storage/listing
    import torch

    marker_path = os.path.join(local_model_dir, "{}.pth".format(model_name))
    torch.save({
        "type": "seed_vc_reference",
        "reference_audio": reference_filename,
        "duration": final_duration,
        "sample_rate": target_sr,
    }, marker_path)

    report(0.8, "Upload vers HuggingFace...")

    # Upload to HF (best effort: a failure must not lose the local files)
    try:
        upload_model(model_name, marker_path, reference_path=reference_path)
    except Exception as e:
        logger.warning("Failed to upload to HF (non-critical): %s", e)

    report(1.0, "Reference vocale sauvegardee !")

    logger.info("Voice reference saved: %s (%.1fs)", reference_path, final_duration)
    return marker_path, reference_path
|