File size: 4,776 Bytes
2376414
fea49f2
 
 
2376414
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fea49f2
 
 
 
2376414
fea49f2
2376414
 
fea49f2
 
 
 
2376414
 
fea49f2
2376414
fea49f2
 
2376414
fea49f2
 
 
 
2376414
fea49f2
 
27bc094
fea49f2
 
 
27bc094
fea49f2
27bc094
fea49f2
 
27bc094
fea49f2
 
27bc094
fea49f2
 
27bc094
fea49f2
 
 
 
27bc094
2376414
969158e
2376414
969158e
 
fea49f2
 
55b9bab
969158e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d2806ea
fea49f2
2376414
259efa9
2376414
 
 
fea49f2
 
55b9bab
fea49f2
 
 
 
 
 
 
 
 
2376414
 
fea49f2
2376414
fea49f2
2376414
fea49f2
2376414
fea49f2
2376414
 
fea49f2
 
 
 
2376414
fea49f2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
"""
Voice model creation: save a reference audio clip for Seed-VC zero-shot conversion.
No neural network training needed - Seed-VC uses in-context learning from
reference audio at inference time.
"""

import os
import logging
import shutil

logger = logging.getLogger(__name__)

try:
    import spaces
except ImportError:
    class spaces:
        """No-op stand-in used when the ``spaces`` package is not installed.

        It lets ``@spaces.GPU`` decorations in this module work unchanged in
        environments without the package: decorated functions are returned
        as-is and no GPU scheduling happens.
        """

        @staticmethod
        def GPU(func=None, duration=60, **kwargs):
            """Return the decorated function unchanged.

            Both decorator spellings are accepted:

            * ``@spaces.GPU`` - the function itself arrives as ``func``.
            * ``@spaces.GPU(duration=10)`` / ``@spaces.GPU()`` - a decorator
              is returned and then applied to the function.

            Args:
                func: The function being decorated when used without
                    parentheses; otherwise ignored.
                duration: Accepted for signature compatibility; ignored.
                **kwargs: Accepted for signature compatibility; ignored.

            Returns:
                ``func`` itself in the bare form, otherwise an identity
                decorator.
            """
            if callable(func):
                # Bare form: without this branch the function would be
                # swallowed as an argument and replaced by the inner closure.
                return func

            def decorator(fn):
                return fn

            return decorator


# Placeholder GPU-decorated function: gives ZeroGPU a GPU function to detect at startup
@spaces.GPU(duration=10)
def _gpu_warmup():
    """Return whether CUDA is available; exists mainly for ZeroGPU detection."""
    import torch

    cuda = torch.cuda
    if not hasattr(cuda, "is_available"):
        return False
    return cuda.is_available()


# Sample rate (Hz) the reference clip is resampled to and stored at.
_REFERENCE_SAMPLE_RATE = 44100
# Clips shorter than this many seconds are rejected.
_MIN_REFERENCE_SECONDS = 2.0
# Seed-VC clips the reference to 25 s internally, so longer audio is cut here.
_MAX_REFERENCE_SECONDS = 25
# RMS loudness target for the stored reference, in dBFS.
_TARGET_RMS_DBFS = -16.0


def _validate_model_name(model_name):
    """Reject model names that are not a single, plain path component.

    The name is interpolated into a directory name and two file names under
    LOCAL_MODELS_DIR, so an empty name, "." / "..", or a name containing a
    path separator would write outside the per-model directory (or fail late,
    after the audio has already been processed).

    Raises:
        RuntimeError: if the name is empty, ".", "..", contains a NUL byte,
            or contains a path separator. RuntimeError matches the error type
            this module already raises for unusable input.
    """
    name = "" if model_name is None else str(model_name)
    is_plain = os.path.basename(name) == name
    if name in ("", ".", "..") or "\x00" in name or not is_plain:
        raise RuntimeError("Nom de modele invalide: {!r}".format(model_name))


def _preprocess_reference(audio, sr):
    """Trim, filter and loudness-normalize a mono reference clip.

    Args:
        audio: Sample array as returned by ``librosa.load(..., mono=True)``.
        sr: Sample rate of ``audio`` in Hz.

    Returns:
        The processed sample array, hard-clipped to [-0.99, 0.99].
    """
    import librosa
    import numpy as np

    # 1. Trim silence from start and end (aggressive: top_db=20). The trimmed
    # version is only kept when more than 2 s of signal survives.
    trimmed, _ = librosa.effects.trim(audio, top_db=20)
    if len(trimmed) > sr * 2:
        audio = trimmed

    # 2. Limit to 25 seconds (Seed-VC clips reference to 25s internally)
    max_samples = _MAX_REFERENCE_SECONDS * sr
    if len(audio) > max_samples:
        audio = audio[:max_samples]
        logger.info("Trimmed reference to 25s (Seed-VC effective max).")

    # 3. Clean-up chain: 80 Hz high-pass to remove low-frequency noise, light
    # compression to even out the voice level, and +1 dB of gain. This step is
    # best-effort: if pedalboard is missing or fails, the audio is left as-is.
    try:
        from pedalboard import Pedalboard, HighpassFilter, Compressor, Gain
        ref_board = Pedalboard([
            HighpassFilter(cutoff_frequency_hz=80.0),
            Compressor(threshold_db=-20.0, ratio=2.0, attack_ms=10.0, release_ms=150.0),
            Gain(gain_db=1.0),
        ])
        # The board is fed a (1, n_samples) float32 array.
        audio_2d = audio.reshape(1, -1).astype(np.float32)
        audio_2d = ref_board(audio_2d, sr)
        audio = audio_2d.squeeze()
    except Exception as e:
        logger.warning("Pedalboard processing skipped: {}".format(e))

    # 4. RMS normalize to -16 dBFS (slightly louder than converted vocals
    # to give the speaker embedding model a strong signal)
    rms = np.sqrt(np.mean(audio ** 2))
    target_rms = 10 ** (_TARGET_RMS_DBFS / 20.0)
    if rms > 1e-6:
        audio = audio * (target_rms / rms)
    else:
        # Scaling near-silence would only amplify noise; keep it but say so.
        logger.warning("Reference audio is near-silent; RMS normalization skipped.")

    # Normalization can push peaks past full scale; clip just below it.
    return np.clip(audio, -0.99, 0.99)


def save_voice_reference(
    audio_path,
    model_name,
    progress_callback=None,
):
    """
    Save a voice reference audio clip as the user's 'voice model'.

    With Seed-VC, no training is needed. The reference audio is used directly
    at inference time for zero-shot voice conversion. The clip is loaded as
    44.1 kHz mono, preprocessed (silence trim, 25 s cap, filtering, loudness
    normalization) and written as 16-bit PCM WAV to
    ``LOCAL_MODELS_DIR/<model_name>/<model_name>_ref.wav``, next to a
    ``<model_name>.pth`` marker holding metadata about the reference. An
    upload via ``upload_model`` is then attempted; upload failures are logged
    and do not fail the call.

    Args:
        audio_path: Path to the uploaded voice recording
        model_name: Name for the voice model; must be a plain file name
            (no path separators, not empty, not "." or "..")
        progress_callback: Optional callable ``(fraction, message)`` invoked
            with progress updates between 0.1 and 1.0

    Returns:
        (marker_path, reference_path) - path to the ``.pth`` marker file and
        path to the saved reference WAV

    Raises:
        RuntimeError: if ``model_name`` is not a plain file name, or if the
            audio is shorter than 2 seconds (the error message recommends at
            least 3 seconds).
    """
    # Validate first so a bad name fails before any audio decoding happens.
    _validate_model_name(model_name)

    import librosa
    import soundfile as sf

    from pipeline.storage import LOCAL_MODELS_DIR, upload_model

    def report(fraction, message):
        if progress_callback:
            progress_callback(fraction, message)

    report(0.1, "Chargement de l'audio...")

    # Load and preprocess the reference audio
    audio, sr = librosa.load(audio_path, sr=_REFERENCE_SAMPLE_RATE, mono=True)

    duration = len(audio) / sr
    logger.info("Reference audio: {:.1f}s at {}Hz".format(duration, sr))

    if duration < _MIN_REFERENCE_SECONDS:
        raise RuntimeError(
            "Audio trop court ({:.1f}s). Minimum 3 secondes recommande.".format(duration)
        )

    report(0.3, "Optimisation de la reference vocale...")
    audio = _preprocess_reference(audio, sr)
    final_duration = len(audio) / sr

    report(0.6, "Sauvegarde de la reference vocale...")

    # Save to local models directory
    local_model_dir = os.path.join(LOCAL_MODELS_DIR, model_name)
    os.makedirs(local_model_dir, exist_ok=True)

    reference_name = "{}_ref.wav".format(model_name)
    reference_path = os.path.join(local_model_dir, reference_name)
    sf.write(reference_path, audio, _REFERENCE_SAMPLE_RATE, subtype="PCM_16")

    # Also save a .pth marker for compatibility with storage/listing
    import torch
    marker_path = os.path.join(local_model_dir, "{}.pth".format(model_name))
    torch.save({
        "type": "seed_vc_reference",
        "reference_audio": reference_name,
        "duration": final_duration,
        "sample_rate": _REFERENCE_SAMPLE_RATE,
    }, marker_path)

    report(0.8, "Upload vers HuggingFace...")

    # Upload to HF; the local copy is already saved, so a failure here is
    # deliberately non-fatal.
    try:
        upload_model(model_name, marker_path, reference_path=reference_path)
    except Exception as e:
        logger.warning("Failed to upload to HF (non-critical): {}".format(e))

    report(1.0, "Reference vocale sauvegardee !")

    logger.info("Voice reference saved: {} ({:.1f}s)".format(reference_path, final_duration))

    return marker_path, reference_path