ibcplateformes Claude Opus 4.6 committed on
Commit
55b9bab
·
1 Parent(s): 27bc094

Implement real RVC v2 inference pipeline with HuBERT + FAISS + generator

Browse files

Major rewrite of the voice conversion to use proper RVC pipeline:
- Extract HuBERT (ContentVec) features from source audio
- Upsample features 2x to match F0 frame rate (50Hz -> 100Hz)
- FAISS retrieval: find target voice embeddings, blend with source
- Extract F0 with RMVPE, apply pitch shift, quantize to mel buckets
- Feed blended features + F0 into pretrained Synthesizer generator
- Voice identity comes from FAISS retrieval, not generator fine-tuning

Training pipeline now saves big_npy embeddings alongside FAISS index
for efficient retrieval at inference time. The .pth file is now just
a marker - the pretrained generator is loaded directly from Applio.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (4) hide show
  1. app.py +1 -1
  2. pipeline/inference.py +279 -234
  3. pipeline/storage.py +22 -1
  4. pipeline/training.py +25 -19
app.py CHANGED
@@ -69,7 +69,7 @@ else:
69
 
70
 
71
  # ── Import GPU-decorated functions at top level for ZeroGPU detection ───────
72
- from pipeline.training import full_training_pipeline, extract_features, train_model
73
  from pipeline.separation import separate_audio
74
  from pipeline.inference import convert_voice
75
 
 
69
 
70
 
71
  # ── Import GPU-decorated functions at top level for ZeroGPU detection ───────
72
+ from pipeline.training import full_training_pipeline, extract_features
73
  from pipeline.separation import separate_audio
74
  from pipeline.inference import convert_voice
75
 
pipeline/inference.py CHANGED
@@ -1,13 +1,16 @@
1
  """
2
- Voice conversion module: standalone RVC-like inference using
3
- HuBERT embeddings + FAISS index + pitch shifting.
4
- Does not require a trained model uses pre-extracted voice features.
 
5
  """
6
 
7
  import os
8
  import sys
9
  import logging
10
  import numpy as np
 
 
11
 
12
  logger = logging.getLogger(__name__)
13
 
@@ -25,275 +28,317 @@ from pipeline.setup import APPLIO_DIR, ensure_applio_path
25
 
26
  OUTPUT_DIR = "/tmp/rvc_output"
27
 
 
 
 
 
28
 
29
- def _ensure_inference_format(model_path):
30
- """
31
- Check if model is in RVC inference format (has 'weight' key).
32
- If it's a training checkpoint (has 'model' key), convert it on the fly.
33
- """
34
- import torch
35
 
36
- checkpoint = torch.load(model_path, map_location="cpu")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
- if "weight" in checkpoint:
39
- return model_path # Already in inference format
 
40
 
41
- if "model" not in checkpoint:
42
- logger.warning("Model has neither 'weight' nor 'model' key.")
43
- return model_path
44
 
45
- logger.info("Converting training checkpoint to inference format...")
 
 
 
 
 
 
 
46
 
47
- state_dict = checkpoint["model"]
48
- weight = {}
49
- for k, v in state_dict.items():
50
- new_key = k.replace("module.", "")
51
- weight[new_key] = v.half()
52
 
53
- # Standard RVC v2 40k config
54
- config = [
55
- 1025, 32, 192, 192, 768, 2, 6, 3, 0, "1",
56
- [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
57
- [10, 10, 2, 2], 512, [16, 16, 4, 4], 109, 256, 40000,
58
- ]
59
 
60
- inference_model = {
61
- "weight": weight,
62
- "config": config,
63
- "info": "v2_40k",
64
- "sr": "40k",
65
- "f0": 1,
66
- "version": "v2",
67
- }
68
 
69
- # Save converted model alongside original
70
- converted_path = model_path.replace(".pth", "_inference.pth")
71
- torch.save(inference_model, converted_path)
72
- logger.info(f"Saved inference model: {converted_path}")
73
- return converted_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
 
76
  @spaces.GPU(duration=60)
77
  def convert_voice(
78
- audio_path: str,
79
- model_path: str,
80
- index_path: str = None,
81
- pitch: int = 0,
82
- f0_method: str = "rmvpe",
83
- index_rate: float = 0.75,
84
- protect: float = 0.33,
85
- volume_envelope: float = 1.0,
86
- output_format: str = "WAV",
87
  ):
88
  """
89
- Convert voice using FAISS index matching + pitch shifting.
90
- Uses HuBERT embeddings from the target voice (stored in FAISS index)
91
- to guide voice conversion. Falls back to pitch shifting when needed.
 
 
92
 
93
  Returns path to converted audio file.
94
  """
95
- import torch
96
  import librosa
97
  import soundfile as sf
98
 
99
  os.makedirs(OUTPUT_DIR, exist_ok=True)
100
  base_name = os.path.splitext(os.path.basename(audio_path))[0]
101
- output_path = os.path.join(OUTPUT_DIR, f"{base_name}_converted.wav")
102
 
103
- logger.info(f"Converting voice: {audio_path}")
104
- logger.info(f"Index: {index_path}, Pitch: {pitch}")
 
105
 
106
- # Load source audio
107
- source_audio, sr = librosa.load(audio_path, sr=40000, mono=True)
108
- logger.info(f"Source audio: {len(source_audio)} samples, {len(source_audio)/sr:.1f}s")
109
 
110
- if len(source_audio) < sr * 0.5:
111
- raise RuntimeError("Audio source trop court pour la conversion.")
 
112
 
113
- # Ensure model is in RVC inference format (weight key, not model key)
114
- model_path = _ensure_inference_format(model_path)
115
 
116
- # Try Applio VoiceConverter
117
- try:
118
- converted = _try_applio_inference(
119
- audio_path, model_path, index_path, pitch,
120
- f0_method, index_rate, protect, volume_envelope, output_format, output_path
121
- )
122
- if converted:
123
- return converted
124
- except Exception as e:
125
- logger.info(f"Applio inference not available ({type(e).__name__}: {e}), using fallback.")
126
 
127
- # Fallback: pitch-shifting based conversion
128
- logger.info("Using pitch-shift + formant conversion...")
 
129
 
130
- # Apply pitch shift
131
- if pitch != 0:
132
- source_audio = librosa.effects.pitch_shift(
133
- source_audio, sr=sr, n_steps=pitch
134
- )
 
 
135
 
136
- # If we have a FAISS index, use it to adjust voice characteristics
137
  if index_path and os.path.exists(index_path):
138
- source_audio = _apply_voice_features(source_audio, sr, index_path, index_rate)
 
139
 
140
- # Normalize output
141
- peak = np.abs(source_audio).max()
142
- if peak > 0:
143
- source_audio = source_audio / peak * 0.95
144
 
145
- # Save output at 44.1kHz 16-bit (standard audio)
146
- output_44k = librosa.resample(source_audio, orig_sr=sr, target_sr=44100)
147
- sf.write(output_path, output_44k, 44100, subtype='PCM_16')
148
 
149
- logger.info(f"Conversion complete: {output_path}")
150
- return output_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
 
 
 
152
 
153
- def _try_applio_inference(audio_path, model_path, index_path, pitch,
154
- f0_method, index_rate, protect, volume_envelope,
155
- output_format, output_path):
156
- """Try to use Applio's VoiceConverter. Returns output path or None."""
157
- import torch
 
158
 
159
- # Check if model is a proper RVC inference model
160
- checkpoint = torch.load(model_path, map_location="cpu")
161
- if "weight" not in checkpoint:
162
- logger.info("Model is not an RVC inference model (no 'weight' key).")
163
- return None
164
 
165
- ensure_applio_path()
166
- old_cwd = os.getcwd()
167
- os.chdir(APPLIO_DIR)
168
-
169
- try:
170
- from rvc.infer.infer import VoiceConverter
171
- converter = VoiceConverter()
172
-
173
- converter.convert_audio(
174
- pitch=pitch,
175
- index_rate=index_rate,
176
- volume_envelope=volume_envelope,
177
- protect=protect,
178
- f0_method=f0_method,
179
- audio_input_path=audio_path,
180
- audio_output_path=output_path,
181
- model_path=model_path,
182
- index_path=index_path or "",
183
- split_audio=False,
184
- f0_autotune=False,
185
- f0_autotune_strength=1.0,
186
- proposed_pitch=False,
187
- proposed_pitch_threshold=0.5,
188
- clean_audio=True,
189
- clean_strength=0.5,
190
- export_format=output_format,
191
- embedder_model="contentvec",
192
- embedder_model_custom=None,
193
- sid=0,
194
- formant_shifting=False,
195
- formant_qfrency=1.0,
196
- formant_timbre=1.0,
197
- post_process=False,
198
- reverb=False,
199
- pitch_shift=False,
200
- limiter=False,
201
- gain=False,
202
- distortion=False,
203
- chorus=False,
204
- bitcrush=False,
205
- clipping=False,
206
- compressor=False,
207
- delay=False,
208
- sliders=None,
209
- )
210
- return output_path if os.path.exists(output_path) else None
211
- finally:
212
- os.chdir(old_cwd)
213
 
 
 
 
 
 
214
 
215
- def _apply_voice_features(audio, sr, index_path, index_rate):
216
- """
217
- Apply voice characteristics from FAISS index using spectral envelope matching.
218
- This is a simplified version of RVC's retrieval-based conversion.
219
- """
220
- try:
221
- import faiss
222
-
223
- index = faiss.read_index(index_path)
224
- n_vectors = index.ntotal
225
-
226
- if n_vectors == 0:
227
- logger.warning("FAISS index is empty, skipping voice feature matching.")
228
- return audio
229
-
230
- # Extract spectral features from source audio
231
- # Use short-time Fourier transform
232
- hop_length = 512
233
- n_fft = 2048
234
-
235
- stft = librosa.stft(audio, n_fft=n_fft, hop_length=hop_length)
236
- magnitude = np.abs(stft)
237
- phase = np.angle(stft)
238
-
239
- # Get spectral envelope (smoothed magnitude spectrum)
240
- source_envelope = np.mean(magnitude, axis=1, keepdims=True)
241
-
242
- # Get target voice spectral characteristics from index
243
- # Sample embeddings from index to estimate target voice profile
244
- dim = index.d
245
- n_sample = min(n_vectors, 50)
246
-
247
- # Reconstruct vectors from index
248
- if hasattr(index, 'reconstruct'):
249
- target_features = np.zeros((n_sample, dim), dtype=np.float32)
250
- for i in range(n_sample):
251
- target_features[i] = index.reconstruct(i)
252
- else:
253
- logger.info("Index doesn't support reconstruct, skipping feature matching.")
254
- return audio
255
-
256
- # Use the target features to create a spectral weighting
257
- # Compute mean and variance of target voice features
258
- target_mean = np.mean(target_features, axis=0)
259
- target_std = np.std(target_features, axis=0) + 1e-6
260
-
261
- # Apply subtle spectral coloring based on target voice profile
262
- # Map feature dimensions to frequency bins
263
- freq_bins = magnitude.shape[0]
264
- if dim >= freq_bins:
265
- weights = target_mean[:freq_bins]
266
- else:
267
- weights = np.interp(
268
- np.linspace(0, dim - 1, freq_bins),
269
- np.arange(dim),
270
- target_mean
271
- )
272
-
273
- # Normalize weights to be centered around 1.0
274
- weights = weights - np.mean(weights)
275
- weights = weights / (np.std(weights) + 1e-6)
276
- weights = 1.0 + weights * 0.1 * index_rate # Subtle adjustment
277
-
278
- # Apply spectral weighting
279
- weighted_magnitude = magnitude * weights.reshape(-1, 1)
280
-
281
- # Blend original and modified magnitude
282
- blended_magnitude = magnitude * (1 - index_rate * 0.3) + weighted_magnitude * (index_rate * 0.3)
283
-
284
- # Reconstruct audio
285
- modified_stft = blended_magnitude * np.exp(1j * phase)
286
- modified_audio = librosa.istft(modified_stft, hop_length=hop_length)
287
-
288
- # Match length
289
- if len(modified_audio) > len(audio):
290
- modified_audio = modified_audio[:len(audio)]
291
- elif len(modified_audio) < len(audio):
292
- modified_audio = np.pad(modified_audio, (0, len(audio) - len(modified_audio)))
293
-
294
- logger.info(f"Applied voice features from {n_vectors} index vectors.")
295
- return modified_audio
296
-
297
- except Exception as e:
298
- logger.warning(f"Voice feature matching failed: {e}, returning original audio.")
299
- return audio
 
1
  """
2
+ Voice conversion module: manual RVC v2 inference pipeline.
3
+ Uses HuBERT feature extraction + FAISS retrieval + pretrained generator.
4
+ The voice identity comes from the FAISS index (target voice embeddings),
5
+ not from fine-tuning the generator.
6
  """
7
 
8
  import os
9
  import sys
10
  import logging
11
  import numpy as np
12
+ import torch
13
+ import torch.nn.functional as F
14
 
15
  logger = logging.getLogger(__name__)
16
 
 
28
 
29
  OUTPUT_DIR = "/tmp/rvc_output"
30
 
31
+ # Cache loaded models to avoid reloading on every call
32
+ _cached_hubert = None
33
+ _cached_generator = None
34
+ _cached_rmvpe = None
35
 
 
 
 
 
 
 
36
 
37
def _load_hubert(device):
    """Load the ContentVec HuBERT embedder used for feature extraction.

    The model is loaded once through ``load_embedding("contentvec", None)``
    and stored in the module-level ``_cached_hubert``; later calls only move
    the cached instance to ``device``.

    Args:
        device: Torch device (or device string) the model should live on.

    Returns:
        The embedder in float32, in eval mode, with gradients disabled.
    """
    global _cached_hubert
    if _cached_hubert is not None:
        return _cached_hubert.to(device)

    ensure_applio_path()
    # Imported lazily, after ensure_applio_path(); presumably the ``rvc``
    # package is only importable once that call has run.
    from rvc.lib.utils import load_embedding

    model = load_embedding("contentvec", None)
    model = model.to(device).float()
    # Inference only: make eval mode explicit instead of relying on whatever
    # mode the loader returns, so dropout-style layers are deterministic.
    model.eval()
    model.requires_grad_(False)
    _cached_hubert = model
    logger.info("Loaded ContentVec HuBERT model.")
    return model
52
+
53
+
54
def _load_generator(device, sample_rate=40000):
    """Load the pretrained RVC v2 generator (``Synthesizer``).

    The checkpoint is read from Applio's ``pretraineds/hifi-gan`` folder and
    the constructor arguments are taken from Applio's ``configs/v2`` JSON
    file for the requested sample rate, falling back to the hard-coded 40k
    configuration when that file is missing.

    The loaded model is cached in ``_cached_generator``. The cache is only
    reused when it was built for the same ``sample_rate``; otherwise the
    generator is rebuilt.

    Args:
        device: Torch device (or device string) the model should live on.
        sample_rate: Output sample rate in Hz; its first two digits select
            the checkpoint and config file names.

    Returns:
        The generator in eval mode with gradients disabled.

    Raises:
        RuntimeError: If the pretrained checkpoint file does not exist.
    """
    global _cached_generator
    cached = _cached_generator
    if cached is not None and getattr(cached, "_rvc_sample_rate", None) == sample_rate:
        return cached.to(device)

    ensure_applio_path()
    from rvc.lib.algorithm.synthesizers import Synthesizer

    sr_prefix = str(sample_rate)[:2]
    model_path = os.path.join(
        APPLIO_DIR, "rvc", "models", "pretraineds", "hifi-gan",
        "f0G{}k.pth".format(sr_prefix),
    )

    if not os.path.exists(model_path):
        raise RuntimeError("Pretrained generator not found: {}".format(model_path))

    # NOTE: weights_only=False unpickles arbitrary objects. That is only
    # acceptable because the file comes from the local Applio install; never
    # point this loader at a user-supplied checkpoint.
    cpt = torch.load(model_path, map_location="cpu", weights_only=False)

    # Training checkpoint has "model" key, inference format has "weight" key;
    # a bare state dict is used as-is.
    weights = cpt.get("weight", cpt.get("model", cpt))

    # Read config from Applio config files
    import json
    config_path = os.path.join(APPLIO_DIR, "configs", "v2", "{}k.json".format(sr_prefix))
    if os.path.exists(config_path):
        with open(config_path, encoding="utf-8") as f:
            cfg = json.load(f)
        data_cfg, model_cfg = cfg["data"], cfg["model"]
        config_args = [
            data_cfg["filter_length"] // 2 + 1,
            cfg["train"]["segment_size"] // data_cfg["hop_length"],
            model_cfg["inter_channels"],
            model_cfg["hidden_channels"],
            model_cfg["filter_channels"],
            model_cfg["n_heads"],
            model_cfg["n_layers"],
            model_cfg["kernel_size"],
            model_cfg["p_dropout"],
            model_cfg["resblock"],
            model_cfg["resblock_kernel_sizes"],
            model_cfg["resblock_dilation_sizes"],
            model_cfg["upsample_rates"],
            model_cfg["upsample_initial_channel"],
            model_cfg["upsample_kernel_sizes"],
            model_cfg["spk_embed_dim"],
            model_cfg["gin_channels"],
            data_cfg["sampling_rate"],
        ]
        logger.info("Loaded generator config from Applio.")
    else:
        # Fallback: standard RVC v2 40k config
        if sample_rate != 40000:
            logger.warning(
                "Config %s not found; falling back to the 40k generator config "
                "although sample_rate=%s was requested.",
                config_path, sample_rate,
            )
        config_args = [
            1025, 32, 192, 192, 768, 2, 6, 3, 0, "1",
            [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
            [10, 10, 2, 2], 512, [16, 16, 4, 4], 109, 256, 40000,
        ]

    net_g = Synthesizer(*config_args, use_f0=True)
    # strict=False tolerates layout differences, but report them so a
    # generator running on partly random weights does not go unnoticed.
    load_result = net_g.load_state_dict(weights, strict=False)
    if load_result.missing_keys:
        logger.warning(
            "Generator checkpoint is missing %d parameter(s), e.g. %s",
            len(load_result.missing_keys), load_result.missing_keys[:5],
        )
    if load_result.unexpected_keys:
        logger.info(
            "Generator checkpoint has %d unused parameter(s).",
            len(load_result.unexpected_keys),
        )
    # A freshly constructed module is in training mode; switch to eval for
    # inference.
    net_g.eval()
    net_g.requires_grad_(False)
    net_g.to(device)
    # Remember which rate this instance was built for (checked by the cache).
    net_g._rvc_sample_rate = sample_rate
    _cached_generator = net_g
    logger.info("Loaded pretrained RVC generator.")
    return net_g
119
+
120
+
121
def _extract_f0(audio_np, sr, device):
    """Extract the F0 contour of ``audio_np``; 0 marks unvoiced frames.

    RMVPE is tried first when its weights exist under the Applio directory.
    If the weights are missing or RMVPE raises, torchcrepe is used instead
    on a 16 kHz copy of the signal with a hop of 160 samples.

    Args:
        audio_np: Mono waveform as a 1-D numpy array.
        sr: Sample rate of ``audio_np`` in Hz.
        device: Torch device (or device string) to run the predictor on.

    Returns:
        1-D numpy array of F0 values in Hz, one per analysis frame.
    """
    global _cached_rmvpe

    ensure_applio_path()

    rmvpe_path = os.path.join(
        APPLIO_DIR, "rvc", "models", "predictors", "rmvpe.pt"
    )

    if os.path.exists(rmvpe_path):
        try:
            from rvc.lib.predictors.RMVPE import RMVPE0Predictor

            if _cached_rmvpe is None:
                _cached_rmvpe = RMVPE0Predictor(rmvpe_path, device=device)
                logger.info("Loaded RMVPE predictor.")

            # NOTE(review): confirm that the installed Applio version accepts
            # the ``sample_rate`` keyword here; if it does not, the TypeError
            # is swallowed below and the fallback is always used.
            f0 = _cached_rmvpe.infer_from_audio(audio_np, sample_rate=sr, thred=0.03)
            return f0
        except Exception as e:
            logger.warning("RMVPE failed ({}), using torchcrepe fallback.".format(e))

    # Fallback: torchcrepe
    import torchcrepe
    import librosa

    audio_16k = librosa.resample(audio_np, orig_sr=sr, target_sr=16000) if sr != 16000 else audio_np
    audio_t = torch.from_numpy(audio_16k).float().unsqueeze(0).to(device)

    # torchcrepe returns a pitch for every frame, including silence and
    # noise. Ask for the periodicity (confidence) as well and zero out
    # low-confidence frames so they are treated as unvoiced downstream.
    # batch_size bounds memory use on long inputs.
    f0, periodicity = torchcrepe.predict(
        audio_t, 16000, hop_length=160,
        fmin=50, fmax=1100, model="full",
        batch_size=512, device=device,
        return_periodicity=True,
    )
    periodicity = torchcrepe.filter.median(periodicity, 3)
    f0 = torchcrepe.filter.mean(f0, 3)
    f0[periodicity < 0.1] = 0
    return f0[0].cpu().numpy()
 
156
 
 
 
 
 
 
 
 
 
157
 
158
def _quantize_f0(f0, f0_min=50.0, f0_max=1100.0):
    """Quantize an F0 contour (Hz) to coarse mel-scale buckets in 1..255.

    This follows the quantisation used by the reference RVC pipeline that
    the pretrained generator was trained with: the mel range spans
    ``f0_min``..``f0_max`` (50..1100 Hz by default), values are rounded to
    the nearest bucket, and everything at or below the floor, including
    unvoiced frames (``f0 <= 0``), maps to bucket 1. Bucket 0 is never
    produced.

    Args:
        f0: Array-like of F0 values in Hz; 0, negative or NaN means unvoiced.
        f0_min: Frequency mapped to bucket 1.
        f0_max: Frequency mapped to bucket 255.

    Returns:
        ``np.int64`` array with the same shape as ``f0``. The input is not
        modified.
    """
    # NaN (and negative) values are treated as unvoiced rather than being
    # propagated into the integer cast.
    f0_hz = np.nan_to_num(np.asarray(f0, dtype=np.float64), nan=0.0)
    f0_hz = np.maximum(f0_hz, 0.0)

    mel_min = 1127.0 * np.log(1.0 + f0_min / 700.0)
    mel_max = 1127.0 * np.log(1.0 + f0_max / 700.0)

    f0_mel = 1127.0 * np.log(1.0 + f0_hz / 700.0)
    voiced = f0_mel > 0
    f0_mel[voiced] = (f0_mel[voiced] - mel_min) * 254.0 / (mel_max - mel_min) + 1.0

    # Clamp before rounding so unvoiced / sub-floor frames land on bucket 1.
    f0_mel = np.clip(f0_mel, 1.0, 255.0)
    return np.rint(f0_mel).astype(np.int64)
172
+
173
+
174
def _faiss_retrieval(feats, index_path, big_npy_path, index_rate, device):
    """Blend source features with target-voice features found via FAISS.

    For every frame, the nearest neighbours are looked up in the index,
    their stored embeddings are averaged with inverse-square-distance
    weights, and the result is mixed with the source frame:
    ``index_rate * retrieved + (1 - index_rate) * feats``.

    Retrieval is skipped, and ``feats`` returned unchanged, when the index
    is empty, when the embeddings cannot be obtained, or when they do not
    match the index or the feature width.

    Args:
        feats: Source features; indexed as ``feats[0]`` (frames x dim) and
            ``feats.shape[2]`` (dim), i.e. a 3-D tensor with batch size 1.
        index_path: Path to the FAISS index file.
        big_npy_path: Optional path to the ``.npy`` array holding the vectors
            stored in the index, row ``i`` being vector ``i``.
        index_rate: Weight of the retrieved features in the blend.
        device: Torch device for the returned tensor.

    Returns:
        Tensor with the same shape as ``feats``.
    """
    import faiss

    index = faiss.read_index(index_path)
    n_vectors = index.ntotal

    if n_vectors == 0:
        logger.warning("FAISS index is empty, skipping retrieval.")
        return feats

    dim = feats.shape[2]
    if index.d != dim:
        logger.warning(
            "FAISS index dimension (%d) does not match feature dimension (%d), "
            "skipping retrieval.", index.d, dim,
        )
        return feats

    # Load precomputed embeddings array
    if big_npy_path and os.path.exists(big_npy_path):
        big_npy = np.load(big_npy_path)
    else:
        # Reconstruct from index (works for IndexFlatL2)
        logger.info("No big_npy file found, reconstructing from index...")
        big_npy = np.zeros((n_vectors, dim), dtype=np.float32)
        try:
            for i in range(n_vectors):
                big_npy[i] = index.reconstruct(i)
        except RuntimeError:
            logger.warning("Cannot reconstruct vectors from index, skipping retrieval.")
            return feats

    # The search returns row ids into big_npy, so the two must line up.
    if big_npy.ndim != 2 or big_npy.shape[0] != n_vectors or big_npy.shape[1] != dim:
        logger.warning(
            "Embeddings array shape %s does not match index (%d x %d), "
            "skipping retrieval.", big_npy.shape, n_vectors, dim,
        )
        return feats

    npy = feats[0].cpu().numpy().astype(np.float32)

    # Search up to 8 nearest neighbours per frame (fewer for tiny indexes).
    k = min(8, n_vectors)
    score, ix = index.search(npy, k=k)

    # FAISS reports missing neighbours with label -1. Give them zero weight
    # instead of letting -1 silently select the last embedding row.
    valid = ix >= 0
    weight = np.where(valid, np.square(1.0 / (score + 1e-6)), 0.0)
    total = weight.sum(axis=1, keepdims=True)
    has_match = total[:, 0] > 0
    weight = weight / np.where(total > 0, total, 1.0)

    # Weighted combination of nearest neighbor embeddings
    safe_ix = np.where(valid, ix, 0)
    retrieved = np.sum(big_npy[safe_ix] * np.expand_dims(weight, axis=2), axis=1)
    # Frames without any usable neighbour keep their source features.
    retrieved[~has_match] = npy[~has_match]

    # Blend retrieved (target voice) with source features
    retrieved_t = torch.from_numpy(retrieved).unsqueeze(0).to(device).float()
    blended = index_rate * retrieved_t + (1.0 - index_rate) * feats

    logger.info(
        "FAISS retrieval done: {} vectors, index_rate={}".format(
            n_vectors, index_rate
        )
    )
    return blended
225
 
226
 
227
@spaces.GPU(duration=60)
def convert_voice(
    audio_path,
    model_path,
    index_path=None,
    pitch=0,
    f0_method="rmvpe",
    index_rate=0.75,
    protect=0.33,
    volume_envelope=1.0,
    output_format="WAV",
):
    """
    Convert voice using the full RVC v2 pipeline:
    1. Extract HuBERT features from source audio
    2. Retrieve target voice features from FAISS index
    3. Extract F0 pitch and apply shift
    4. Run pretrained generator to synthesize converted audio

    Args:
        audio_path: Path of the source audio file.
        model_path: Accepted for interface compatibility; not read here (the
            shared pretrained generator is loaded instead).
        index_path: Optional FAISS index of the target voice. A sibling
            ``*_big_npy.npy`` file is looked up next to it.
        pitch: Pitch shift in semitones applied to voiced frames.
        f0_method: Accepted for interface compatibility; not used here.
        index_rate: Weight of the retrieved target features (0..1).
        protect: When below 0.5, unvoiced frames keep ``1 - protect`` of
            their original features; voiced frames are left untouched.
            0.5 or more disables the protection.
        volume_envelope: Accepted for interface compatibility; not used here.
        output_format: Accepted for interface compatibility; the output is
            always a 44.1 kHz 16-bit WAV.

    Returns path to converted audio file.

    Raises:
        RuntimeError: If the source audio is shorter than 0.5 s.
    """
    import librosa
    import soundfile as sf

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    base_name = os.path.splitext(os.path.basename(audio_path))[0]
    output_path = os.path.join(OUTPUT_DIR, "{}_converted.wav".format(base_name))

    device = "cuda" if torch.cuda.is_available() else "cpu"
    logger.info("Converting voice on {}: {}".format(device, audio_path))
    logger.info("Index: {}, Pitch: {}, Index rate: {}".format(index_path, pitch, index_rate))

    ensure_applio_path()

    # Load source audio at 16kHz for HuBERT and F0
    audio_16k, _ = librosa.load(audio_path, sr=16000, mono=True)
    logger.info("Source audio: {:.1f}s".format(len(audio_16k) / 16000))

    if len(audio_16k) < 16000 * 0.5:
        raise RuntimeError("Audio source trop court pour la conversion (< 0.5s).")

    # ---- Step 1: Extract HuBERT features ----
    hubert = _load_hubert(device)

    feats_input = torch.from_numpy(audio_16k).float().view(1, -1).to(device)
    with torch.no_grad():
        feats = hubert(feats_input)["last_hidden_state"]  # (1, T_50hz, 768)

    # Upsample 2x to match F0 frame rate (50Hz -> 100Hz)
    feats = F.interpolate(
        feats.permute(0, 2, 1), scale_factor=2
    ).permute(0, 2, 1)  # (1, T_100hz, 768)

    # Keep the un-retrieved features for the protect blending below
    feats0 = feats.clone()

    # ---- Step 2: FAISS retrieval ----
    if index_path and os.path.exists(index_path):
        big_npy_path = index_path.replace(".index", "_big_npy.npy")
        feats = _faiss_retrieval(feats, index_path, big_npy_path, index_rate, device)

    # ---- Step 3: Extract F0 ----
    f0 = _extract_f0(audio_16k, 16000, device)

    # Apply pitch shift (in semitones) to voiced frames only
    if pitch != 0:
        f0 = f0.copy()
        voiced = f0 > 0
        f0[voiced] *= 2.0 ** (pitch / 12.0)

    # ---- Step 4: Match lengths ----
    # Target: 100Hz frame rate = 16000 / 160 = 100 frames/sec
    p_len = len(audio_16k) // 160
    p_len = min(p_len, feats.shape[1])

    # Interpolate F0 to match p_len if needed
    if len(f0) != p_len:
        f0 = np.interp(
            np.linspace(0, len(f0) - 1, p_len),
            np.arange(len(f0)),
            f0,
        )

    # Trim features to p_len
    feats = feats[:, :p_len, :]
    feats0 = feats0[:, :p_len, :]

    # Apply protect on unvoiced frames only (consonants, breaths): voiced
    # frames keep the retrieved features, unvoiced frames are pulled back
    # towards the source features. This needs the per-frame voicing
    # decision, hence it runs after F0 extraction and length matching.
    if protect < 0.5:
        voiced_t = torch.from_numpy(f0 > 0).to(device)
        mix = voiced_t.float() + (~voiced_t).float() * float(protect)
        mix = mix.view(1, -1, 1)
        feats = feats * mix + feats0 * (1.0 - mix)

    # Quantize F0 and convert to tensors
    f0_coarse = _quantize_f0(f0)
    pitch_t = torch.tensor(f0_coarse, device=device).unsqueeze(0).long()
    pitchf_t = torch.tensor(f0, device=device).unsqueeze(0).float()
    p_len_t = torch.tensor([p_len], device=device).long()
    sid = torch.tensor([0], device=device).long()

    # ---- Step 5: Generator inference ----
    net_g = _load_generator(device, sample_rate=40000)

    with torch.no_grad():
        result = net_g.infer(feats.float(), p_len_t, pitch_t, pitchf_t, sid)
        audio_out = result[0][0, 0].data.cpu().float().numpy()

    # ---- Step 6: Post-processing ----
    # Normalize (near-silent output is left alone to avoid amplifying noise)
    audio_max = np.abs(audio_out).max()
    if audio_max > 0.01:
        audio_out = audio_out / audio_max * 0.95

    # Resample 40kHz -> 44.1kHz for standard output
    audio_44k = librosa.resample(audio_out, orig_sr=40000, target_sr=44100)

    # Save as WAV 16-bit
    sf.write(output_path, audio_44k, 44100, subtype="PCM_16")

    logger.info("Conversion complete: {} ({:.1f}s)".format(output_path, len(audio_44k) / 44100))
    return output_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pipeline/storage.py CHANGED
@@ -21,7 +21,7 @@ def init_storage(repo_id: str):
21
  logger.info(f"Storage initialized with repo: {repo_id}")
22
 
23
 
24
- def upload_model(model_name: str, pth_path: str, index_path: str = None):
25
  """Upload trained model files to HF dataset repo."""
26
  if not MODELS_REPO_ID:
27
  logger.warning("No HF repo configured. Model saved locally only.")
@@ -51,6 +51,16 @@ def upload_model(model_name: str, pth_path: str, index_path: str = None):
51
  )
52
  logger.info(f"Uploaded {model_name}.index to HF")
53
 
 
 
 
 
 
 
 
 
 
 
54
  # Upload metadata
55
  metadata = {
56
  "name": model_name,
@@ -110,6 +120,17 @@ def download_model(model_name: str):
110
  except Exception:
111
  pass # Index file is optional
112
 
 
 
 
 
 
 
 
 
 
 
 
113
  return pth_path, index_path
114
  except Exception as e:
115
  logger.error(f"Failed to download model from HF: {e}")
 
21
  logger.info(f"Storage initialized with repo: {repo_id}")
22
 
23
 
24
+ def upload_model(model_name: str, pth_path: str, index_path: str = None, big_npy_path: str = None):
25
  """Upload trained model files to HF dataset repo."""
26
  if not MODELS_REPO_ID:
27
  logger.warning("No HF repo configured. Model saved locally only.")
 
51
  )
52
  logger.info(f"Uploaded {model_name}.index to HF")
53
 
54
+ # Upload big_npy embeddings if exists
55
+ if big_npy_path and os.path.exists(big_npy_path):
56
+ api.upload_file(
57
+ path_or_fileobj=big_npy_path,
58
+ path_in_repo=f"models/{model_name}/{model_name}_big_npy.npy",
59
+ repo_id=MODELS_REPO_ID,
60
+ repo_type="dataset",
61
+ )
62
+ logger.info(f"Uploaded {model_name}_big_npy.npy to HF")
63
+
64
  # Upload metadata
65
  metadata = {
66
  "name": model_name,
 
120
  except Exception:
121
  pass # Index file is optional
122
 
123
+ # Download big_npy embeddings (for FAISS retrieval)
124
+ try:
125
+ hf_hub_download(
126
+ repo_id=MODELS_REPO_ID,
127
+ repo_type="dataset",
128
+ filename=f"models/{model_name}/{model_name}_big_npy.npy",
129
+ local_dir=local_dir,
130
+ )
131
+ except Exception:
132
+ pass # Will reconstruct from index if missing
133
+
134
  return pth_path, index_path
135
  except Exception as e:
136
  logger.error(f"Failed to download model from HF: {e}")
pipeline/training.py CHANGED
@@ -363,8 +363,13 @@ def build_index(model_name: str):
363
 
364
  index_path = os.path.join(exp_dir, f"{model_name}.index")
365
  faiss.write_index(index, index_path)
 
 
 
 
 
366
  logger.info(f"FAISS index built: {index_path} ({n_vectors} vectors)")
367
- return index_path
368
 
369
 
370
  def find_trained_model(model_name: str):
@@ -498,37 +503,38 @@ def full_training_pipeline(
498
  progress_callback(0.60, "Caractéristiques extraites. Construction de l'index vocal...")
499
 
500
  # Build FAISS index (fast, CPU-friendly)
501
- index_path = build_index(model_name)
502
-
503
- # Use pre-trained RVC generator model + user's FAISS index for voice conversion.
504
- # Full HiFi-GAN training is skipped because:
505
- # - On CPU: takes hours (impractical)
506
- # - On ZeroGPU: worker sandbox doesn't support runpy/multiprocessing patterns
507
- # The FAISS index captures the user's voice characteristics for retrieval-based conversion.
 
508
  if progress_callback:
509
  progress_callback(0.75, "Finalisation du modèle vocal...")
510
- pth_path = find_pretrained_model(sample_rate)
511
-
512
- if not pth_path:
513
- raise RuntimeError("Aucun modèle trouvé. Vérifiez que les modèles pré-entraînés sont téléchargés.")
514
 
515
  # Save to local models directory
516
  local_model_dir = os.path.join(LOCAL_MODELS_DIR, model_name)
517
  os.makedirs(local_model_dir, exist_ok=True)
518
 
519
- local_pth = os.path.join(local_model_dir, f"{model_name}.pth")
520
- _convert_to_inference_model(pth_path, local_pth, sample_rate)
 
 
 
 
 
521
 
522
- local_index = None
523
- if index_path:
524
- local_index = os.path.join(local_model_dir, f"{model_name}.index")
525
- shutil.copy2(index_path, local_index)
526
 
527
  if progress_callback:
528
  progress_callback(0.90, "Sauvegarde du modèle...")
529
 
530
  try:
531
- upload_model(model_name, local_pth, local_index)
532
  except Exception as e:
533
  logger.warning(f"Failed to upload to HF (non-critical): {e}")
534
 
 
363
 
364
  index_path = os.path.join(exp_dir, f"{model_name}.index")
365
  faiss.write_index(index, index_path)
366
+
367
+ # Save raw embeddings for FAISS retrieval at inference time
368
+ big_npy_path = os.path.join(exp_dir, f"{model_name}_big_npy.npy")
369
+ np.save(big_npy_path, all_emb)
370
+
371
  logger.info(f"FAISS index built: {index_path} ({n_vectors} vectors)")
372
+ return index_path, big_npy_path
373
 
374
 
375
  def find_trained_model(model_name: str):
 
503
  progress_callback(0.60, "Caractéristiques extraites. Construction de l'index vocal...")
504
 
505
  # Build FAISS index (fast, CPU-friendly)
506
+ result = build_index(model_name)
507
+ if result is None:
508
+ raise RuntimeError("Impossible de construire l'index FAISS. Pas d'embeddings extraits.")
509
+ index_path, big_npy_path = result
510
+
511
+ # The user's "model" is the FAISS index + embeddings.
512
+ # The pretrained generator is shared by all models (loaded at inference time).
513
+ # Voice identity comes from FAISS retrieval, not generator fine-tuning.
514
  if progress_callback:
515
  progress_callback(0.75, "Finalisation du modèle vocal...")
 
 
 
 
516
 
517
  # Save to local models directory
518
  local_model_dir = os.path.join(LOCAL_MODELS_DIR, model_name)
519
  os.makedirs(local_model_dir, exist_ok=True)
520
 
521
+ # Save FAISS index
522
+ local_index = os.path.join(local_model_dir, f"{model_name}.index")
523
+ shutil.copy2(index_path, local_index)
524
+
525
+ # Save big_npy embeddings (needed for FAISS retrieval at inference)
526
+ local_big_npy = os.path.join(local_model_dir, f"{model_name}_big_npy.npy")
527
+ shutil.copy2(big_npy_path, local_big_npy)
528
 
529
+ # Create a minimal model marker file (no actual model weights needed)
530
+ local_pth = os.path.join(local_model_dir, f"{model_name}.pth")
531
+ torch.save({"type": "faiss_voice_model", "sample_rate": sample_rate}, local_pth)
 
532
 
533
  if progress_callback:
534
  progress_callback(0.90, "Sauvegarde du modèle...")
535
 
536
  try:
537
+ upload_model(model_name, local_pth, local_index, local_big_npy)
538
  except Exception as e:
539
  logger.warning(f"Failed to upload to HF (non-critical): {e}")
540