rvc

Sleeping

ibcplateformes Claude Opus 4.6 committed on Mar 31

Commit

55b9bab

1 Parent(s): 27bc094

Implement real RVC v2 inference pipeline with HuBERT + FAISS + generator

Major rewrite of the voice conversion to use proper RVC pipeline:
- Extract HuBERT (ContentVec) features from source audio
- Upsample features 2x to match F0 frame rate (50Hz -> 100Hz)
- FAISS retrieval: find target voice embeddings, blend with source
- Extract F0 with RMVPE, apply pitch shift, quantize to mel buckets
- Feed blended features + F0 into pretrained Synthesizer generator
- Voice identity comes from FAISS retrieval, not generator fine-tuning

Training pipeline now saves big_npy embeddings alongside FAISS index
for efficient retrieval at inference time. The .pth file is now just
a marker - the pretrained generator is loaded directly from Applio.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (4) hide show

app.py +1 -1
pipeline/inference.py +279 -234
pipeline/storage.py +22 -1
pipeline/training.py +25 -19

app.py CHANGED Viewed

@@ -69,7 +69,7 @@ else:
 # ── Import GPU-decorated functions at top level for ZeroGPU detection ───────
-from pipeline.training import full_training_pipeline, extract_features, train_model
 from pipeline.separation import separate_audio
 from pipeline.inference import convert_voice

 # ── Import GPU-decorated functions at top level for ZeroGPU detection ───────
+from pipeline.training import full_training_pipeline, extract_features
 from pipeline.separation import separate_audio
 from pipeline.inference import convert_voice

pipeline/inference.py CHANGED Viewed

@@ -1,13 +1,16 @@
 """
-Voice conversion module: standalone RVC-like inference using
-HuBERT embeddings + FAISS index + pitch shifting.
-Does not require a trained model — uses pre-extracted voice features.
 """
 import os
 import sys
 import logging
 import numpy as np
 logger = logging.getLogger(__name__)
@@ -25,275 +28,317 @@ from pipeline.setup import APPLIO_DIR, ensure_applio_path
 OUTPUT_DIR = "/tmp/rvc_output"
-def _ensure_inference_format(model_path):
-    """
-    Check if model is in RVC inference format (has 'weight' key).
-    If it's a training checkpoint (has 'model' key), convert it on the fly.
-    """
-    import torch
-    checkpoint = torch.load(model_path, map_location="cpu")
-    if "weight" in checkpoint:
-        return model_path  # Already in inference format
-    if "model" not in checkpoint:
-        logger.warning("Model has neither 'weight' nor 'model' key.")
-        return model_path
-    logger.info("Converting training checkpoint to inference format...")
-    state_dict = checkpoint["model"]
-    weight = {}
-    for k, v in state_dict.items():
-        new_key = k.replace("module.", "")
-        weight[new_key] = v.half()
-    # Standard RVC v2 40k config
-    config = [
-        1025, 32, 192, 192, 768, 2, 6, 3, 0, "1",
-        [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
-        [10, 10, 2, 2], 512, [16, 16, 4, 4], 109, 256, 40000,
-    ]
-    inference_model = {
-        "weight": weight,
-        "config": config,
-        "info": "v2_40k",
-        "sr": "40k",
-        "f0": 1,
-        "version": "v2",
-    }
-    # Save converted model alongside original
-    converted_path = model_path.replace(".pth", "_inference.pth")
-    torch.save(inference_model, converted_path)
-    logger.info(f"Saved inference model: {converted_path}")
-    return converted_path
 @spaces.GPU(duration=60)
 def convert_voice(
-    audio_path: str,
-    model_path: str,
-    index_path: str = None,
-    pitch: int = 0,
-    f0_method: str = "rmvpe",
-    index_rate: float = 0.75,
-    protect: float = 0.33,
-    volume_envelope: float = 1.0,
-    output_format: str = "WAV",
 ):
     """
-    Convert voice using FAISS index matching + pitch shifting.
-    Uses HuBERT embeddings from the target voice (stored in FAISS index)
-    to guide voice conversion. Falls back to pitch shifting when needed.
     Returns path to converted audio file.
     """
-    import torch
     import librosa
     import soundfile as sf
     os.makedirs(OUTPUT_DIR, exist_ok=True)
     base_name = os.path.splitext(os.path.basename(audio_path))[0]
-    output_path = os.path.join(OUTPUT_DIR, f"{base_name}_converted.wav")
-    logger.info(f"Converting voice: {audio_path}")
-    logger.info(f"Index: {index_path}, Pitch: {pitch}")
-    # Load source audio
-    source_audio, sr = librosa.load(audio_path, sr=40000, mono=True)
-    logger.info(f"Source audio: {len(source_audio)} samples, {len(source_audio)/sr:.1f}s")
-    if len(source_audio) < sr * 0.5:
-        raise RuntimeError("Audio source trop court pour la conversion.")
-    # Ensure model is in RVC inference format (weight key, not model key)
-    model_path = _ensure_inference_format(model_path)
-    # Try Applio VoiceConverter
-    try:
-        converted = _try_applio_inference(
-            audio_path, model_path, index_path, pitch,
-            f0_method, index_rate, protect, volume_envelope, output_format, output_path
-        )
-        if converted:
-            return converted
-    except Exception as e:
-        logger.info(f"Applio inference not available ({type(e).__name__}: {e}), using fallback.")
-    # Fallback: pitch-shifting based conversion
-    logger.info("Using pitch-shift + formant conversion...")
-    # Apply pitch shift
-    if pitch != 0:
-        source_audio = librosa.effects.pitch_shift(
-            source_audio, sr=sr, n_steps=pitch
-        )
-    # If we have a FAISS index, use it to adjust voice characteristics
     if index_path and os.path.exists(index_path):
-        source_audio = _apply_voice_features(source_audio, sr, index_path, index_rate)
-    # Normalize output
-    peak = np.abs(source_audio).max()
-    if peak > 0:
-        source_audio = source_audio / peak * 0.95
-    # Save output at 44.1kHz 16-bit (standard audio)
-    output_44k = librosa.resample(source_audio, orig_sr=sr, target_sr=44100)
-    sf.write(output_path, output_44k, 44100, subtype='PCM_16')
-    logger.info(f"Conversion complete: {output_path}")
-    return output_path
-def _try_applio_inference(audio_path, model_path, index_path, pitch,
-                          f0_method, index_rate, protect, volume_envelope,
-                          output_format, output_path):
-    """Try to use Applio's VoiceConverter. Returns output path or None."""
-    import torch
-    # Check if model is a proper RVC inference model
-    checkpoint = torch.load(model_path, map_location="cpu")
-    if "weight" not in checkpoint:
-        logger.info("Model is not an RVC inference model (no 'weight' key).")
-        return None
-    ensure_applio_path()
-    old_cwd = os.getcwd()
-    os.chdir(APPLIO_DIR)
-    try:
-        from rvc.infer.infer import VoiceConverter
-        converter = VoiceConverter()
-        converter.convert_audio(
-            pitch=pitch,
-            index_rate=index_rate,
-            volume_envelope=volume_envelope,
-            protect=protect,
-            f0_method=f0_method,
-            audio_input_path=audio_path,
-            audio_output_path=output_path,
-            model_path=model_path,
-            index_path=index_path or "",
-            split_audio=False,
-            f0_autotune=False,
-            f0_autotune_strength=1.0,
-            proposed_pitch=False,
-            proposed_pitch_threshold=0.5,
-            clean_audio=True,
-            clean_strength=0.5,
-            export_format=output_format,
-            embedder_model="contentvec",
-            embedder_model_custom=None,
-            sid=0,
-            formant_shifting=False,
-            formant_qfrency=1.0,
-            formant_timbre=1.0,
-            post_process=False,
-            reverb=False,
-            pitch_shift=False,
-            limiter=False,
-            gain=False,
-            distortion=False,
-            chorus=False,
-            bitcrush=False,
-            clipping=False,
-            compressor=False,
-            delay=False,
-            sliders=None,
-        )
-        return output_path if os.path.exists(output_path) else None
-    finally:
-        os.chdir(old_cwd)
-def _apply_voice_features(audio, sr, index_path, index_rate):
-    """
-    Apply voice characteristics from FAISS index using spectral envelope matching.
-    This is a simplified version of RVC's retrieval-based conversion.
-    """
-    try:
-        import faiss
-        index = faiss.read_index(index_path)
-        n_vectors = index.ntotal
-        if n_vectors == 0:
-            logger.warning("FAISS index is empty, skipping voice feature matching.")
-            return audio
-        # Extract spectral features from source audio
-        # Use short-time Fourier transform
-        hop_length = 512
-        n_fft = 2048
-        stft = librosa.stft(audio, n_fft=n_fft, hop_length=hop_length)
-        magnitude = np.abs(stft)
-        phase = np.angle(stft)
-        # Get spectral envelope (smoothed magnitude spectrum)
-        source_envelope = np.mean(magnitude, axis=1, keepdims=True)
-        # Get target voice spectral characteristics from index
-        # Sample embeddings from index to estimate target voice profile
-        dim = index.d
-        n_sample = min(n_vectors, 50)
-        # Reconstruct vectors from index
-        if hasattr(index, 'reconstruct'):
-            target_features = np.zeros((n_sample, dim), dtype=np.float32)
-            for i in range(n_sample):
-                target_features[i] = index.reconstruct(i)
-        else:
-            logger.info("Index doesn't support reconstruct, skipping feature matching.")
-            return audio
-        # Use the target features to create a spectral weighting
-        # Compute mean and variance of target voice features
-        target_mean = np.mean(target_features, axis=0)
-        target_std = np.std(target_features, axis=0) + 1e-6
-        # Apply subtle spectral coloring based on target voice profile
-        # Map feature dimensions to frequency bins
-        freq_bins = magnitude.shape[0]
-        if dim >= freq_bins:
-            weights = target_mean[:freq_bins]
-        else:
-            weights = np.interp(
-                np.linspace(0, dim - 1, freq_bins),
-                np.arange(dim),
-                target_mean
-            )
-        # Normalize weights to be centered around 1.0
-        weights = weights - np.mean(weights)
-        weights = weights / (np.std(weights) + 1e-6)
-        weights = 1.0 + weights * 0.1 * index_rate  # Subtle adjustment
-        # Apply spectral weighting
-        weighted_magnitude = magnitude * weights.reshape(-1, 1)
-        # Blend original and modified magnitude
-        blended_magnitude = magnitude * (1 - index_rate * 0.3) + weighted_magnitude * (index_rate * 0.3)
-        # Reconstruct audio
-        modified_stft = blended_magnitude * np.exp(1j * phase)
-        modified_audio = librosa.istft(modified_stft, hop_length=hop_length)
-        # Match length
-        if len(modified_audio) > len(audio):
-            modified_audio = modified_audio[:len(audio)]
-        elif len(modified_audio) < len(audio):
-            modified_audio = np.pad(modified_audio, (0, len(audio) - len(modified_audio)))
-        logger.info(f"Applied voice features from {n_vectors} index vectors.")
-        return modified_audio
-    except Exception as e:
-        logger.warning(f"Voice feature matching failed: {e}, returning original audio.")
-        return audio

 """
+Voice conversion module: manual RVC v2 inference pipeline.
+Uses HuBERT feature extraction + FAISS retrieval + pretrained generator.
+The voice identity comes from the FAISS index (target voice embeddings),
+not from fine-tuning the generator.
 """
 import os
 import sys
 import logging
 import numpy as np
+import torch
+import torch.nn.functional as F
 logger = logging.getLogger(__name__)
 OUTPUT_DIR = "/tmp/rvc_output"
+# Module-level caches so each model is loaded at most once per process.
+# Each starts as None and is filled in lazily by the function that uses it
+# (_load_hubert, _load_generator and _extract_f0 respectively).
+_cached_hubert = None
+_cached_generator = None
+_cached_rmvpe = None
+def _load_hubert(device):
+    """Return the ContentVec HuBERT feature extractor, loading it on first use.
+    The model is cached at module level, so later calls only move the cached
+    instance to ``device``.
+    Args:
+        device: torch device (or device string) to place the model on.
+    Returns:
+        The embedding model in float32, with gradients disabled and switched
+        to evaluation mode.
+    """
+    global _cached_hubert
+    if _cached_hubert is not None:
+        return _cached_hubert.to(device)
+    ensure_applio_path()
+    from rvc.lib.utils import load_embedding
+    model = load_embedding("contentvec", None)
+    model = model.to(device).float()
+    model.requires_grad_(False)
+    # Inference only: make sure dropout-style layers are disabled regardless
+    # of the mode the loader returned the model in.
+    model.eval()
+    _cached_hubert = model
+    logger.info("Loaded ContentVec HuBERT model.")
+    return model
+def _load_generator(device, sample_rate=40000):
+    """Return the pretrained RVC v2 generator (Synthesizer) for ``sample_rate``.
+    Weights are read from the pretrained "f0G<sr>k.pth" file under APPLIO_DIR.
+    Constructor arguments come from the matching v2 JSON config when it
+    exists; otherwise a hard-coded 40k configuration is used. Loaded
+    generators are cached per sample rate.
+    Args:
+        device: torch device (or device string) to place the model on.
+        sample_rate: generator sample rate in Hz; its first two digits select
+            the pretrained file and the config (40000 -> "40").
+    Returns:
+        The generator on ``device``, in evaluation mode, gradients disabled.
+    Raises:
+        RuntimeError: if the pretrained generator file does not exist.
+    """
+    global _cached_generator
+    # Key the cache by sample rate: a generator built for one rate must never
+    # be handed out for a request at another rate.
+    if not isinstance(_cached_generator, dict):
+        _cached_generator = {}
+    cached = _cached_generator.get(sample_rate)
+    if cached is not None:
+        return cached.to(device)
+    ensure_applio_path()
+    from rvc.lib.algorithm.synthesizers import Synthesizer
+    sr_prefix = str(sample_rate)[:2]
+    model_path = os.path.join(
+        APPLIO_DIR, "rvc", "models", "pretraineds", "hifi-gan",
+        "f0G{}k.pth".format(sr_prefix),
+    )
+    if not os.path.exists(model_path):
+        raise RuntimeError("Pretrained generator not found: {}".format(model_path))
+    # NOTE(security): weights_only=False unpickles arbitrary objects. This is
+    # tolerated only because the path points inside APPLIO_DIR rather than at
+    # user-supplied data; never reuse this call for uploaded files.
+    cpt = torch.load(model_path, map_location="cpu", weights_only=False)
+    # Training checkpoint has "model" key, inference format has "weight" key
+    weights = cpt.get("weight", cpt.get("model", cpt))
+    # Read config from Applio config files
+    import json
+    config_path = os.path.join(APPLIO_DIR, "configs", "v2", "{}k.json".format(sr_prefix))
+    if os.path.exists(config_path):
+        with open(config_path, encoding="utf-8") as f:
+            cfg = json.load(f)
+        data_cfg = cfg["data"]
+        model_cfg = cfg["model"]
+        config_args = [
+            data_cfg["filter_length"] // 2 + 1,
+            cfg["train"]["segment_size"] // data_cfg["hop_length"],
+            model_cfg["inter_channels"],
+            model_cfg["hidden_channels"],
+            model_cfg["filter_channels"],
+            model_cfg["n_heads"],
+            model_cfg["n_layers"],
+            model_cfg["kernel_size"],
+            model_cfg["p_dropout"],
+            model_cfg["resblock"],
+            model_cfg["resblock_kernel_sizes"],
+            model_cfg["resblock_dilation_sizes"],
+            model_cfg["upsample_rates"],
+            model_cfg["upsample_initial_channel"],
+            model_cfg["upsample_kernel_sizes"],
+            model_cfg["spk_embed_dim"],
+            model_cfg["gin_channels"],
+            data_cfg["sampling_rate"],
+        ]
+        logger.info("Loaded generator config from Applio.")
+    else:
+        if sample_rate != 40000:
+            # The fallback below describes the 40k model only; make the
+            # mismatch visible instead of failing obscurely later.
+            logger.warning(
+                "No Applio config found for {} Hz; falling back to the 40k config.".format(
+                    sample_rate
+                )
+            )
+        # Fallback: standard RVC v2 40k config
+        config_args = [
+            1025, 32, 192, 192, 768, 2, 6, 3, 0, "1",
+            [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+            [10, 10, 2, 2], 512, [16, 16, 4, 4], 109, 256, 40000,
+        ]
+    net_g = Synthesizer(*config_args, use_f0=True)
+    # strict=False tolerates extra training-only tensors, but missing weights
+    # would leave layers randomly initialised, so report them.
+    load_result = net_g.load_state_dict(weights, strict=False)
+    if load_result.missing_keys:
+        logger.warning(
+            "Generator checkpoint is missing {} parameter(s), e.g. {}".format(
+                len(load_result.missing_keys), load_result.missing_keys[:5]
+            )
+        )
+    net_g.requires_grad_(False)
+    net_g.eval()
+    net_g.to(device)
+    _cached_generator[sample_rate] = net_g
+    logger.info("Loaded pretrained RVC generator.")
+    return net_g
+def _extract_f0(audio_np, sr, device):
+    """Estimate the F0 contour of ``audio_np``.
+    RMVPE is tried first when its weights file exists under APPLIO_DIR; if it
+    is missing or raises, torchcrepe (run on 16 kHz audio, hop 160) is used.
+    Args:
+        audio_np: mono waveform as a numpy array.
+        sr: sample rate of ``audio_np`` in Hz.
+        device: device passed to the predictor.
+    Returns:
+        The F0 contour (numpy array on the torchcrepe path; whatever the
+        RMVPE predictor returns otherwise).
+    """
+    global _cached_rmvpe
+    ensure_applio_path()
+    weights_file = os.path.join(APPLIO_DIR, "rvc", "models", "predictors", "rmvpe.pt")
+    if os.path.exists(weights_file):
+        try:
+            from rvc.lib.predictors.RMVPE import RMVPE0Predictor
+            # Build the predictor once and keep it for later calls
+            if _cached_rmvpe is None:
+                _cached_rmvpe = RMVPE0Predictor(weights_file, device=device)
+                logger.info("Loaded RMVPE predictor.")
+            return _cached_rmvpe.infer_from_audio(audio_np, sample_rate=sr, thred=0.03)
+        except Exception as err:
+            logger.warning("RMVPE failed ({}), using torchcrepe fallback.".format(err))
+    # Reached when RMVPE weights are absent or RMVPE raised: use torchcrepe
+    import torchcrepe
+    import librosa
+    if sr == 16000:
+        mono_16k = audio_np
+    else:
+        mono_16k = librosa.resample(audio_np, orig_sr=sr, target_sr=16000)
+    batch = torch.from_numpy(mono_16k).float().unsqueeze(0).to(device)
+    contour = torchcrepe.predict(
+        batch, 16000, hop_length=160,
+        fmin=50, fmax=1100, model="full", device=device,
+    )
+    return contour[0].cpu().numpy()
+def _quantize_f0(f0, f0_min=50.0, f0_max=1100.0):
+    """Quantize an F0 contour in Hz to coarse mel-scale pitch buckets (1-255).
+    Follows the reference RVC quantization: the mel range between ``f0_min``
+    and ``f0_max`` is mapped linearly onto 1..255, values are rounded to the
+    nearest bucket, and unvoiced frames fall into bucket 1 (bucket 0 is never
+    produced, matching what the pretrained generator saw during training).
+    Args:
+        f0: array-like of F0 values in Hz; 0, negative or NaN means unvoiced.
+        f0_min: frequency in Hz mapped to bucket 1.
+        f0_max: frequency in Hz mapped to bucket 255.
+    Returns:
+        numpy int64 array of the same shape as ``f0`` with values in [1, 255].
+    """
+    f0 = np.asarray(f0, dtype=np.float64)
+    # NaN / negative estimates carry no pitch information: treat as unvoiced
+    f0 = np.where(np.isfinite(f0) & (f0 > 0), f0, 0.0)
+    f0_mel_min = 1127.0 * np.log(1.0 + f0_min / 700.0)
+    f0_mel_max = 1127.0 * np.log(1.0 + f0_max / 700.0)
+    f0_mel = 1127.0 * np.log(1.0 + f0 / 700.0)
+    voiced = f0_mel > 0
+    f0_mel[voiced] = (
+        (f0_mel[voiced] - f0_mel_min) * 254.0 / (f0_mel_max - f0_mel_min) + 1.0
+    )
+    # Unvoiced frames (still 0 here) and anything below f0_min clamp to 1
+    f0_mel = np.clip(f0_mel, 1.0, 255.0)
+    return np.rint(f0_mel).astype(np.int64)
+def _faiss_retrieval(feats, index_path, big_npy_path, index_rate, device):
+    """
+    Retrieve target voice features from FAISS index and blend with source.
+    This is the core of retrieval-based voice conversion: the voice identity
+    comes from replacing source embeddings with target voice embeddings.
+    Args:
+        feats: source features, indexed here as (1, frames, dim).
+        index_path: path of the FAISS index holding target-voice embeddings.
+        big_npy_path: optional path of the raw embeddings array saved next to
+            the index; when missing or inconsistent with the index, vectors
+            are reconstructed from the index instead.
+        index_rate: weight of the retrieved features in the blend (the source
+            features get ``1 - index_rate``).
+        device: device for the returned tensor.
+    Returns:
+        Blended features with the same shape as ``feats``; ``feats`` itself
+        when the index is empty or its vectors cannot be recovered.
+    """
+    import faiss
+    index = faiss.read_index(index_path)
+    if index.ntotal == 0:
+        logger.warning("FAISS index is empty, skipping retrieval.")
+        return feats
+    # Load precomputed embeddings array
+    big_npy = None
+    if big_npy_path and os.path.exists(big_npy_path):
+        big_npy = np.load(big_npy_path)
+        # Search results index into big_npy, so it must line up with the index
+        if big_npy.ndim != 2 or big_npy.shape[0] != index.ntotal:
+            logger.warning(
+                "big_npy shape {} does not match index ({} vectors), ignoring it.".format(
+                    big_npy.shape, index.ntotal
+                )
+            )
+            big_npy = None
+    else:
+        logger.info("No big_npy file found, reconstructing from index...")
+    if big_npy is None:
+        # Reconstruct from index (works for IndexFlatL2); size the buffer from
+        # the index itself rather than from the query features.
+        big_npy = np.zeros((index.ntotal, index.d), dtype=np.float32)
+        try:
+            for i in range(index.ntotal):
+                big_npy[i] = index.reconstruct(i)
+        except RuntimeError:
+            logger.warning("Cannot reconstruct vectors from index, skipping retrieval.")
+            return feats
+    # feats may be a permuted (non-contiguous) tensor; FAISS needs C-contiguous float32
+    npy = np.ascontiguousarray(feats[0].cpu().numpy(), dtype=np.float32)
+    # Search up to 8 nearest neighbors per frame, never more than the index holds
+    k = min(8, index.ntotal)
+    score, ix = index.search(npy, k=k)
+    # FAISS marks missing neighbors with label -1; they must not be used as
+    # (negative) row indices into big_npy.
+    found = ix >= 0
+    # Weight by inverse square distance
+    weight = np.where(found, np.square(1.0 / (score + 1e-6)), 0.0)
+    total = weight.sum(axis=1, keepdims=True)
+    has_hit = total[:, 0] > 0
+    weight = weight / np.where(total > 0, total, 1.0)
+    # Weighted combination of nearest neighbor embeddings
+    safe_ix = np.where(found, ix, 0)
+    retrieved = np.sum(big_npy[safe_ix] * np.expand_dims(weight, axis=2), axis=1)
+    # Frames without any usable neighbor keep their source embedding
+    retrieved[~has_hit] = npy[~has_hit]
+    # Blend retrieved (target voice) with source features
+    retrieved_t = torch.from_numpy(retrieved.astype(np.float32)).unsqueeze(0).to(device).float()
+    blended = index_rate * retrieved_t + (1.0 - index_rate) * feats
+    logger.info(
+        "FAISS retrieval done: {} vectors, index_rate={}".format(
+            index.ntotal, index_rate
+        )
+    )
+    return blended
 @spaces.GPU(duration=60)
 def convert_voice(
+    audio_path,
+    model_path,
+    index_path=None,
+    pitch=0,
+    f0_method="rmvpe",
+    index_rate=0.75,
+    protect=0.33,
+    volume_envelope=1.0,
+    output_format="WAV",
 ):
     """
+    Convert voice using the full RVC v2 pipeline:
+    1. Extract HuBERT features from source audio
+    2. Retrieve target voice features from FAISS index
+    3. Extract F0 pitch and apply shift
+    4. Protect unvoiced frames, then run the pretrained generator
+    Args:
+        audio_path: path of the source audio file.
+        model_path: accepted for interface compatibility; not read here (the
+            shared pretrained generator is loaded instead).
+        index_path: optional FAISS index of the target voice; a sibling
+            "<name>_big_npy.npy" file is used when present.
+        pitch: pitch shift in semitones, applied to voiced frames.
+        f0_method: accepted for interface compatibility; currently unused.
+        index_rate: blend weight of the retrieved (target) features.
+        protect: share of the retrieved features kept on unvoiced frames (the
+            rest comes from the source features); values >= 0.5 disable it.
+        volume_envelope: accepted for interface compatibility; currently unused.
+        output_format: accepted for interface compatibility; the output is
+            always a 44.1 kHz 16-bit WAV file.
+    Raises:
+        RuntimeError: if the source audio is shorter than 0.5 s.
     Returns path to converted audio file.
     """
     import librosa
     import soundfile as sf
     os.makedirs(OUTPUT_DIR, exist_ok=True)
     base_name = os.path.splitext(os.path.basename(audio_path))[0]
+    output_path = os.path.join(OUTPUT_DIR, "{}_converted.wav".format(base_name))
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    logger.info("Converting voice on {}: {}".format(device, audio_path))
+    logger.info("Index: {}, Pitch: {}, Index rate: {}".format(index_path, pitch, index_rate))
+    ensure_applio_path()
+    # Load source audio at 16kHz for HuBERT and F0
+    audio_16k, _ = librosa.load(audio_path, sr=16000, mono=True)
+    logger.info("Source audio: {:.1f}s".format(len(audio_16k) / 16000))
+    if len(audio_16k) < 16000 * 0.5:
+        raise RuntimeError("Audio source trop court pour la conversion (< 0.5s).")
+    # ---- Step 1: Extract HuBERT features ----
+    hubert = _load_hubert(device)
+    feats_input = torch.from_numpy(audio_16k).float().view(1, -1).to(device)
+    with torch.no_grad():
+        feats = hubert(feats_input)["last_hidden_state"]  # (1, T_50hz, 768)
+    # Upsample 2x to match F0 frame rate (50Hz -> 100Hz)
+    feats = F.interpolate(
+        feats.permute(0, 2, 1), scale_factor=2
+    ).permute(0, 2, 1)  # (1, T_100hz, 768)
+    # Keep a copy of the source features for protect blending
+    feats0 = feats.clone()
+    # ---- Step 2: FAISS retrieval ----
     if index_path and os.path.exists(index_path):
+        # Derive the sibling embeddings file from the extension only, so a
+        # ".index" elsewhere in the path is left alone.
+        big_npy_path = os.path.splitext(index_path)[0] + "_big_npy.npy"
+        feats = _faiss_retrieval(feats, index_path, big_npy_path, index_rate, device)
+    # ---- Step 3: Extract F0 ----
+    f0 = _extract_f0(audio_16k, 16000, device)
+    # Apply pitch shift (in semitones)
+    if pitch != 0:
+        f0 = f0.copy()
+        voiced = f0 > 0
+        f0[voiced] *= 2.0 ** (pitch / 12.0)
+    # ---- Step 4: Match lengths ----
+    # Target: 100Hz frame rate = 16000 / 160 = 100 frames/sec
+    p_len = len(audio_16k) // 160
+    p_len = min(p_len, feats.shape[1])
+    if len(f0) == 0:
+        # No pitch estimate at all: treat every frame as unvoiced
+        f0 = np.zeros(p_len, dtype=np.float32)
+    elif len(f0) != p_len:
+        # Interpolate F0 to match p_len
+        f0 = np.interp(
+            np.linspace(0, len(f0) - 1, p_len),
+            np.arange(len(f0)),
+            f0,
+        )
+    # Trim features to p_len
+    feats = feats[:, :p_len, :]
+    feats0 = feats0[:, :p_len, :]
+    # Quantize F0 and convert to tensors
+    f0_coarse = _quantize_f0(f0)
+    pitch_t = torch.tensor(f0_coarse, device=device).unsqueeze(0).long()
+    pitchf_t = torch.tensor(f0, device=device).unsqueeze(0).float()
+    p_len_t = torch.tensor([p_len], device=device).long()
+    sid = torch.tensor([0], device=device).long()
+    # Apply protect on unvoiced frames only (consonants, breaths): there the
+    # retrieved features are weighted by `protect` and the source features
+    # fill the rest; voiced frames keep the retrieved features untouched.
+    if protect < 0.5:
+        keep = torch.ones_like(pitchf_t)
+        keep[pitchf_t < 1] = protect
+        keep = keep.unsqueeze(-1)
+        feats = feats * keep + feats0 * (1.0 - keep)
+    # ---- Step 5: Generator inference ----
+    net_g = _load_generator(device, sample_rate=40000)
+    with torch.no_grad():
+        result = net_g.infer(feats.float(), p_len_t, pitch_t, pitchf_t, sid)
+        audio_out = result[0][0, 0].data.cpu().float().numpy()
+    # ---- Step 6: Post-processing ----
+    # Normalize (skipped for near-silent output to avoid amplifying noise)
+    audio_max = np.abs(audio_out).max()
+    if audio_max > 0.01:
+        audio_out = audio_out / audio_max * 0.95
+    # Resample 40kHz -> 44.1kHz for standard output
+    audio_44k = librosa.resample(audio_out, orig_sr=40000, target_sr=44100)
+    # Save as WAV 16-bit
+    sf.write(output_path, audio_44k, 44100, subtype="PCM_16")
+    logger.info("Conversion complete: {} ({:.1f}s)".format(output_path, len(audio_44k) / 44100))
+    return output_path

pipeline/storage.py CHANGED Viewed

@@ -21,7 +21,7 @@ def init_storage(repo_id: str):
     logger.info(f"Storage initialized with repo: {repo_id}")
-def upload_model(model_name: str, pth_path: str, index_path: str = None):
     """Upload trained model files to HF dataset repo."""
     if not MODELS_REPO_ID:
         logger.warning("No HF repo configured. Model saved locally only.")
@@ -51,6 +51,16 @@ def upload_model(model_name: str, pth_path: str, index_path: str = None):
             )
             logger.info(f"Uploaded {model_name}.index to HF")
         # Upload metadata
         metadata = {
             "name": model_name,
@@ -110,6 +120,17 @@ def download_model(model_name: str):
         except Exception:
             pass  # Index file is optional
         return pth_path, index_path
     except Exception as e:
         logger.error(f"Failed to download model from HF: {e}")

     logger.info(f"Storage initialized with repo: {repo_id}")
+def upload_model(model_name: str, pth_path: str, index_path: str = None, big_npy_path: str = None):
     """Upload trained model files to HF dataset repo."""
     if not MODELS_REPO_ID:
         logger.warning("No HF repo configured. Model saved locally only.")
             )
             logger.info(f"Uploaded {model_name}.index to HF")
+        # Upload big_npy embeddings if exists
+        if big_npy_path and os.path.exists(big_npy_path):
+            api.upload_file(
+                path_or_fileobj=big_npy_path,
+                path_in_repo=f"models/{model_name}/{model_name}_big_npy.npy",
+                repo_id=MODELS_REPO_ID,
+                repo_type="dataset",
+            )
+            logger.info(f"Uploaded {model_name}_big_npy.npy to HF")
         # Upload metadata
         metadata = {
             "name": model_name,
         except Exception:
             pass  # Index file is optional
+        # Download big_npy embeddings (for FAISS retrieval)
+        try:
+            hf_hub_download(
+                repo_id=MODELS_REPO_ID,
+                repo_type="dataset",
+                filename=f"models/{model_name}/{model_name}_big_npy.npy",
+                local_dir=local_dir,
+            )
+        except Exception:
+            pass  # Will reconstruct from index if missing
         return pth_path, index_path
     except Exception as e:
         logger.error(f"Failed to download model from HF: {e}")

pipeline/training.py CHANGED Viewed

@@ -363,8 +363,13 @@ def build_index(model_name: str):
     index_path = os.path.join(exp_dir, f"{model_name}.index")
     faiss.write_index(index, index_path)
     logger.info(f"FAISS index built: {index_path} ({n_vectors} vectors)")
-    return index_path
 def find_trained_model(model_name: str):
@@ -498,37 +503,38 @@ def full_training_pipeline(
         progress_callback(0.60, "Caractéristiques extraites. Construction de l'index vocal...")
     # Build FAISS index (fast, CPU-friendly)
-    index_path = build_index(model_name)
-    # Use pre-trained RVC generator model + user's FAISS index for voice conversion.
-    # Full HiFi-GAN training is skipped because:
-    # - On CPU: takes hours (impractical)
-    # - On ZeroGPU: worker sandbox doesn't support runpy/multiprocessing patterns
-    # The FAISS index captures the user's voice characteristics for retrieval-based conversion.
     if progress_callback:
         progress_callback(0.75, "Finalisation du modèle vocal...")
-    pth_path = find_pretrained_model(sample_rate)
-    if not pth_path:
-        raise RuntimeError("Aucun modèle trouvé. Vérifiez que les modèles pré-entraînés sont téléchargés.")
     # Save to local models directory
     local_model_dir = os.path.join(LOCAL_MODELS_DIR, model_name)
     os.makedirs(local_model_dir, exist_ok=True)
-    local_pth = os.path.join(local_model_dir, f"{model_name}.pth")
-    _convert_to_inference_model(pth_path, local_pth, sample_rate)
-    local_index = None
-    if index_path:
-        local_index = os.path.join(local_model_dir, f"{model_name}.index")
-        shutil.copy2(index_path, local_index)
     if progress_callback:
         progress_callback(0.90, "Sauvegarde du modèle...")
     try:
-        upload_model(model_name, local_pth, local_index)
     except Exception as e:
         logger.warning(f"Failed to upload to HF (non-critical): {e}")

     index_path = os.path.join(exp_dir, f"{model_name}.index")
     faiss.write_index(index, index_path)
+    # Save raw embeddings for FAISS retrieval at inference time
+    big_npy_path = os.path.join(exp_dir, f"{model_name}_big_npy.npy")
+    np.save(big_npy_path, all_emb)
     logger.info(f"FAISS index built: {index_path} ({n_vectors} vectors)")
+    return index_path, big_npy_path
 def find_trained_model(model_name: str):
         progress_callback(0.60, "Caractéristiques extraites. Construction de l'index vocal...")
     # Build FAISS index (fast, CPU-friendly)
+    result = build_index(model_name)
+    if result is None:
+        raise RuntimeError("Impossible de construire l'index FAISS. Pas d'embeddings extraits.")
+    index_path, big_npy_path = result
+    # The user's "model" is the FAISS index + embeddings.
+    # The pretrained generator is shared by all models (loaded at inference time).
+    # Voice identity comes from FAISS retrieval, not generator fine-tuning.
     if progress_callback:
         progress_callback(0.75, "Finalisation du modèle vocal...")
     # Save to local models directory
     local_model_dir = os.path.join(LOCAL_MODELS_DIR, model_name)
     os.makedirs(local_model_dir, exist_ok=True)
+    # Save FAISS index
+    local_index = os.path.join(local_model_dir, f"{model_name}.index")
+    shutil.copy2(index_path, local_index)
+    # Save big_npy embeddings (needed for FAISS retrieval at inference)
+    local_big_npy = os.path.join(local_model_dir, f"{model_name}_big_npy.npy")
+    shutil.copy2(big_npy_path, local_big_npy)
+    # Create a minimal model marker file (no actual model weights needed)
+    local_pth = os.path.join(local_model_dir, f"{model_name}.pth")
+    torch.save({"type": "faiss_voice_model", "sample_rate": sample_rate}, local_pth)
     if progress_callback:
         progress_callback(0.90, "Sauvegarde du modèle...")
     try:
+        upload_model(model_name, local_pth, local_index, local_big_npy)
     except Exception as e:
         logger.warning(f"Failed to upload to HF (non-critical): {e}")