rvc

Sleeping

ibcplateformes Claude Opus 4.6 committed on Mar 31

Commit

27bc094

1 Parent(s): dbae9aa

Convert pretrained checkpoint to RVC inference format for proper voice conversion

The pretrained f0G40k.pth is a training checkpoint (keys: model, optimizer),
but Applio's VoiceConverter requires the inference format (keys: weight, config, sr, version).
Without this conversion, VoiceConverter fails silently and the fallback produces
output audio identical to the input (pitch=0 + negligible spectral adjustment).

- Add _convert_to_inference_model() to training.py
- Add _ensure_inference_format() to inference.py for backward compatibility with existing models
- Training pipeline now saves models in correct inference format

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (2) hide show

pipeline/inference.py +51 -1
pipeline/training.py +75 -1

pipeline/inference.py CHANGED Viewed

@@ -26,6 +26,53 @@ from pipeline.setup import APPLIO_DIR, ensure_applio_path
 OUTPUT_DIR = "/tmp/rvc_output"
 @spaces.GPU(duration=60)
 def convert_voice(
     audio_path: str,
@@ -63,7 +110,10 @@ def convert_voice(
     if len(source_audio) < sr * 0.5:
         raise RuntimeError("Audio source trop court pour la conversion.")
-    # Try Applio VoiceConverter first if model is a proper RVC model
     try:
         converted = _try_applio_inference(
             audio_path, model_path, index_path, pitch,

 OUTPUT_DIR = "/tmp/rvc_output"
+def _ensure_inference_format(model_path):
+    """
+    Check if model is in RVC inference format (has 'weight' key).
+    If it's a training checkpoint (has 'model' key), convert it on the fly.
+    """
+    import torch
+    checkpoint = torch.load(model_path, map_location="cpu")
+    if "weight" in checkpoint:
+        return model_path  # Already in inference format
+    if "model" not in checkpoint:
+        logger.warning("Model has neither 'weight' nor 'model' key.")
+        return model_path
+    logger.info("Converting training checkpoint to inference format...")
+    state_dict = checkpoint["model"]
+    weight = {}
+    for k, v in state_dict.items():
+        new_key = k.replace("module.", "")
+        weight[new_key] = v.half()
+    # Standard RVC v2 40k config
+    config = [
+        1025, 32, 192, 192, 768, 2, 6, 3, 0, "1",
+        [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+        [10, 10, 2, 2], 512, [16, 16, 4, 4], 109, 256, 40000,
+    ]
+    inference_model = {
+        "weight": weight,
+        "config": config,
+        "info": "v2_40k",
+        "sr": "40k",
+        "f0": 1,
+        "version": "v2",
+    }
+    # Save converted model alongside original
+    converted_path = model_path.replace(".pth", "_inference.pth")
+    torch.save(inference_model, converted_path)
+    logger.info(f"Saved inference model: {converted_path}")
+    return converted_path
 @spaces.GPU(duration=60)
 def convert_voice(
     audio_path: str,
     if len(source_audio) < sr * 0.5:
         raise RuntimeError("Audio source trop court pour la conversion.")
+    # Ensure model is in RVC inference format (weight key, not model key)
+    model_path = _ensure_inference_format(model_path)
+    # Try Applio VoiceConverter
     try:
         converted = _try_applio_inference(
             audio_path, model_path, index_path, pitch,

pipeline/training.py CHANGED Viewed

@@ -392,6 +392,80 @@ def find_pretrained_model(sample_rate: int = 40000):
     return None
 def full_training_pipeline(
     audio_path: str,
     model_name: str,
@@ -443,7 +517,7 @@ def full_training_pipeline(
     os.makedirs(local_model_dir, exist_ok=True)
     local_pth = os.path.join(local_model_dir, f"{model_name}.pth")
-    shutil.copy2(pth_path, local_pth)
     local_index = None
     if index_path:

     return None
+def _convert_to_inference_model(checkpoint_path, output_path, sample_rate=40000):
+    """
+    Convert a pretrained training checkpoint to RVC inference format.
+    Training checkpoints have keys: model, optimizer, iteration, learning_rate
+    Inference models need keys: weight, config, info, sr, f0, version
+    """
+    import torch
+    import json
+    checkpoint = torch.load(checkpoint_path, map_location="cpu")
+    # Extract generator weights
+    if "model" in checkpoint:
+        state_dict = checkpoint["model"]
+    elif "state_dict" in checkpoint:
+        state_dict = checkpoint["state_dict"]
+    else:
+        state_dict = checkpoint
+    # Remove "module." prefix if present (from DataParallel)
+    weight = {}
+    for k, v in state_dict.items():
+        new_key = k.replace("module.", "")
+        weight[new_key] = v.half()
+    # Read config from Applio config file
+    sr_label = "40k" if sample_rate == 40000 else "48k"
+    config_path = os.path.join(APPLIO_DIR, "configs", "v2", f"{sr_label}.json")
+    if os.path.exists(config_path):
+        with open(config_path) as f:
+            cfg = json.load(f)
+        config = [
+            cfg["data"]["filter_length"] // 2 + 1,
+            cfg["train"]["segment_size"] // cfg["data"]["hop_length"],
+            cfg["model"]["inter_channels"],
+            cfg["model"]["hidden_channels"],
+            cfg["model"]["filter_channels"],
+            cfg["model"]["n_heads"],
+            cfg["model"]["n_layers"],
+            cfg["model"]["kernel_size"],
+            cfg["model"]["p_dropout"],
+            cfg["model"]["resblock"],
+            cfg["model"]["resblock_kernel_sizes"],
+            cfg["model"]["resblock_dilation_sizes"],
+            cfg["model"]["upsample_rates"],
+            cfg["model"]["upsample_initial_channel"],
+            cfg["model"]["upsample_kernel_sizes"],
+            cfg["model"]["spk_embed_dim"],
+            cfg["model"]["gin_channels"],
+            cfg["data"]["sampling_rate"],
+        ]
+    else:
+        # Fallback: standard RVC v2 40k config
+        config = [
+            1025, 32, 192, 192, 768, 2, 6, 3, 0, "1",
+            [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+            [10, 10, 2, 2], 512, [16, 16, 4, 4], 109, 256, 40000,
+        ]
+    inference_model = {
+        "weight": weight,
+        "config": config,
+        "info": f"v2_{sr_label}",
+        "sr": sr_label,
+        "f0": 1,
+        "version": "v2",
+    }
+    torch.save(inference_model, output_path)
+    logger.info(f"Converted checkpoint to inference format: {output_path}")
+    return output_path
 def full_training_pipeline(
     audio_path: str,
     model_name: str,
     os.makedirs(local_model_dir, exist_ok=True)
     local_pth = os.path.join(local_model_dir, f"{model_name}.pth")
+    _convert_to_inference_model(pth_path, local_pth, sample_rate)
     local_index = None
     if index_path: