ibcplateformes Claude Opus 4.6 committed on
Commit
27bc094
·
1 Parent(s): dbae9aa

Convert pretrained checkpoint to RVC inference format for proper voice conversion

Browse files

The pretrained f0G40k.pth is a training checkpoint (keys: model, optimizer),
but Applio's VoiceConverter requires the inference format (keys: weight, config, sr, version).
Without this conversion, VoiceConverter fails silently and the fallback produces
audio identical to the input (pitch=0 plus a negligible spectral adjustment).

- Add _convert_to_inference_model() to training.py
- Add _ensure_inference_format() to inference.py for backward compat with existing models
- Training pipeline now saves models in correct inference format

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (2) hide show
  1. pipeline/inference.py +51 -1
  2. pipeline/training.py +75 -1
pipeline/inference.py CHANGED
@@ -26,6 +26,53 @@ from pipeline.setup import APPLIO_DIR, ensure_applio_path
26
  OUTPUT_DIR = "/tmp/rvc_output"
27
 
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  @spaces.GPU(duration=60)
30
  def convert_voice(
31
  audio_path: str,
@@ -63,7 +110,10 @@ def convert_voice(
63
  if len(source_audio) < sr * 0.5:
64
  raise RuntimeError("Audio source trop court pour la conversion.")
65
 
66
- # Try Applio VoiceConverter first if model is a proper RVC model
 
 
 
67
  try:
68
  converted = _try_applio_inference(
69
  audio_path, model_path, index_path, pitch,
 
26
  OUTPUT_DIR = "/tmp/rvc_output"
27
 
28
 
29
+ def _ensure_inference_format(model_path):
30
+ """
31
+ Check if model is in RVC inference format (has 'weight' key).
32
+ If it's a training checkpoint (has 'model' key), convert it on the fly.
33
+ """
34
+ import torch
35
+
36
+ checkpoint = torch.load(model_path, map_location="cpu")
37
+
38
+ if "weight" in checkpoint:
39
+ return model_path # Already in inference format
40
+
41
+ if "model" not in checkpoint:
42
+ logger.warning("Model has neither 'weight' nor 'model' key.")
43
+ return model_path
44
+
45
+ logger.info("Converting training checkpoint to inference format...")
46
+
47
+ state_dict = checkpoint["model"]
48
+ weight = {}
49
+ for k, v in state_dict.items():
50
+ new_key = k.replace("module.", "")
51
+ weight[new_key] = v.half()
52
+
53
+ # Standard RVC v2 40k config
54
+ config = [
55
+ 1025, 32, 192, 192, 768, 2, 6, 3, 0, "1",
56
+ [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
57
+ [10, 10, 2, 2], 512, [16, 16, 4, 4], 109, 256, 40000,
58
+ ]
59
+
60
+ inference_model = {
61
+ "weight": weight,
62
+ "config": config,
63
+ "info": "v2_40k",
64
+ "sr": "40k",
65
+ "f0": 1,
66
+ "version": "v2",
67
+ }
68
+
69
+ # Save converted model alongside original
70
+ converted_path = model_path.replace(".pth", "_inference.pth")
71
+ torch.save(inference_model, converted_path)
72
+ logger.info(f"Saved inference model: {converted_path}")
73
+ return converted_path
74
+
75
+
76
  @spaces.GPU(duration=60)
77
  def convert_voice(
78
  audio_path: str,
 
110
  if len(source_audio) < sr * 0.5:
111
  raise RuntimeError("Audio source trop court pour la conversion.")
112
 
113
+ # Ensure model is in RVC inference format (weight key, not model key)
114
+ model_path = _ensure_inference_format(model_path)
115
+
116
+ # Try Applio VoiceConverter
117
  try:
118
  converted = _try_applio_inference(
119
  audio_path, model_path, index_path, pitch,
pipeline/training.py CHANGED
@@ -392,6 +392,80 @@ def find_pretrained_model(sample_rate: int = 40000):
392
  return None
393
 
394
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
395
  def full_training_pipeline(
396
  audio_path: str,
397
  model_name: str,
@@ -443,7 +517,7 @@ def full_training_pipeline(
443
  os.makedirs(local_model_dir, exist_ok=True)
444
 
445
  local_pth = os.path.join(local_model_dir, f"{model_name}.pth")
446
- shutil.copy2(pth_path, local_pth)
447
 
448
  local_index = None
449
  if index_path:
 
392
  return None
393
 
394
 
395
+ def _convert_to_inference_model(checkpoint_path, output_path, sample_rate=40000):
396
+ """
397
+ Convert a pretrained training checkpoint to RVC inference format.
398
+ Training checkpoints have keys: model, optimizer, iteration, learning_rate
399
+ Inference models need keys: weight, config, info, sr, f0, version
400
+ """
401
+ import torch
402
+ import json
403
+
404
+ checkpoint = torch.load(checkpoint_path, map_location="cpu")
405
+
406
+ # Extract generator weights
407
+ if "model" in checkpoint:
408
+ state_dict = checkpoint["model"]
409
+ elif "state_dict" in checkpoint:
410
+ state_dict = checkpoint["state_dict"]
411
+ else:
412
+ state_dict = checkpoint
413
+
414
+ # Remove "module." prefix if present (from DataParallel)
415
+ weight = {}
416
+ for k, v in state_dict.items():
417
+ new_key = k.replace("module.", "")
418
+ weight[new_key] = v.half()
419
+
420
+ # Read config from Applio config file
421
+ sr_label = "40k" if sample_rate == 40000 else "48k"
422
+ config_path = os.path.join(APPLIO_DIR, "configs", "v2", f"{sr_label}.json")
423
+
424
+ if os.path.exists(config_path):
425
+ with open(config_path) as f:
426
+ cfg = json.load(f)
427
+ config = [
428
+ cfg["data"]["filter_length"] // 2 + 1,
429
+ cfg["train"]["segment_size"] // cfg["data"]["hop_length"],
430
+ cfg["model"]["inter_channels"],
431
+ cfg["model"]["hidden_channels"],
432
+ cfg["model"]["filter_channels"],
433
+ cfg["model"]["n_heads"],
434
+ cfg["model"]["n_layers"],
435
+ cfg["model"]["kernel_size"],
436
+ cfg["model"]["p_dropout"],
437
+ cfg["model"]["resblock"],
438
+ cfg["model"]["resblock_kernel_sizes"],
439
+ cfg["model"]["resblock_dilation_sizes"],
440
+ cfg["model"]["upsample_rates"],
441
+ cfg["model"]["upsample_initial_channel"],
442
+ cfg["model"]["upsample_kernel_sizes"],
443
+ cfg["model"]["spk_embed_dim"],
444
+ cfg["model"]["gin_channels"],
445
+ cfg["data"]["sampling_rate"],
446
+ ]
447
+ else:
448
+ # Fallback: standard RVC v2 40k config
449
+ config = [
450
+ 1025, 32, 192, 192, 768, 2, 6, 3, 0, "1",
451
+ [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
452
+ [10, 10, 2, 2], 512, [16, 16, 4, 4], 109, 256, 40000,
453
+ ]
454
+
455
+ inference_model = {
456
+ "weight": weight,
457
+ "config": config,
458
+ "info": f"v2_{sr_label}",
459
+ "sr": sr_label,
460
+ "f0": 1,
461
+ "version": "v2",
462
+ }
463
+
464
+ torch.save(inference_model, output_path)
465
+ logger.info(f"Converted checkpoint to inference format: {output_path}")
466
+ return output_path
467
+
468
+
469
  def full_training_pipeline(
470
  audio_path: str,
471
  model_name: str,
 
517
  os.makedirs(local_model_dir, exist_ok=True)
518
 
519
  local_pth = os.path.join(local_model_dir, f"{model_name}.pth")
520
+ _convert_to_inference_model(pth_path, local_pth, sample_rate)
521
 
522
  local_index = None
523
  if index_path: