Spaces:
Sleeping
Sleeping
ibcplateformes Claude Opus 4.6 committed on
Commit ·
27bc094
1
Parent(s): dbae9aa
Convert pretrained checkpoint to RVC inference format for proper voice conversion
Browse files
The pretrained f0G40k.pth is a training checkpoint (keys: model, optimizer),
but Applio's VoiceConverter requires the inference format (keys: weight, config, sr, version).
Without this conversion, VoiceConverter fails silently and the fallback produces
audio effectively identical to the input (pitch=0 + negligible spectral adjustment).
- Add _convert_to_inference_model() to training.py
- Add _ensure_inference_format() to inference.py for backward compat with existing models
- Training pipeline now saves models in correct inference format
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- pipeline/inference.py +51 -1
- pipeline/training.py +75 -1
pipeline/inference.py
CHANGED
|
@@ -26,6 +26,53 @@ from pipeline.setup import APPLIO_DIR, ensure_applio_path
|
|
| 26 |
OUTPUT_DIR = "/tmp/rvc_output"
|
| 27 |
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
@spaces.GPU(duration=60)
|
| 30 |
def convert_voice(
|
| 31 |
audio_path: str,
|
|
@@ -63,7 +110,10 @@ def convert_voice(
|
|
| 63 |
if len(source_audio) < sr * 0.5:
|
| 64 |
raise RuntimeError("Audio source trop court pour la conversion.")
|
| 65 |
|
| 66 |
-
#
|
|
|
|
|
|
|
|
|
|
| 67 |
try:
|
| 68 |
converted = _try_applio_inference(
|
| 69 |
audio_path, model_path, index_path, pitch,
|
|
|
|
| 26 |
OUTPUT_DIR = "/tmp/rvc_output"
|
| 27 |
|
| 28 |
|
| 29 |
+
def _ensure_inference_format(model_path):
|
| 30 |
+
"""
|
| 31 |
+
Check if model is in RVC inference format (has 'weight' key).
|
| 32 |
+
If it's a training checkpoint (has 'model' key), convert it on the fly.
|
| 33 |
+
"""
|
| 34 |
+
import torch
|
| 35 |
+
|
| 36 |
+
checkpoint = torch.load(model_path, map_location="cpu")
|
| 37 |
+
|
| 38 |
+
if "weight" in checkpoint:
|
| 39 |
+
return model_path # Already in inference format
|
| 40 |
+
|
| 41 |
+
if "model" not in checkpoint:
|
| 42 |
+
logger.warning("Model has neither 'weight' nor 'model' key.")
|
| 43 |
+
return model_path
|
| 44 |
+
|
| 45 |
+
logger.info("Converting training checkpoint to inference format...")
|
| 46 |
+
|
| 47 |
+
state_dict = checkpoint["model"]
|
| 48 |
+
weight = {}
|
| 49 |
+
for k, v in state_dict.items():
|
| 50 |
+
new_key = k.replace("module.", "")
|
| 51 |
+
weight[new_key] = v.half()
|
| 52 |
+
|
| 53 |
+
# Standard RVC v2 40k config
|
| 54 |
+
config = [
|
| 55 |
+
1025, 32, 192, 192, 768, 2, 6, 3, 0, "1",
|
| 56 |
+
[3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
|
| 57 |
+
[10, 10, 2, 2], 512, [16, 16, 4, 4], 109, 256, 40000,
|
| 58 |
+
]
|
| 59 |
+
|
| 60 |
+
inference_model = {
|
| 61 |
+
"weight": weight,
|
| 62 |
+
"config": config,
|
| 63 |
+
"info": "v2_40k",
|
| 64 |
+
"sr": "40k",
|
| 65 |
+
"f0": 1,
|
| 66 |
+
"version": "v2",
|
| 67 |
+
}
|
| 68 |
+
|
| 69 |
+
# Save converted model alongside original
|
| 70 |
+
converted_path = model_path.replace(".pth", "_inference.pth")
|
| 71 |
+
torch.save(inference_model, converted_path)
|
| 72 |
+
logger.info(f"Saved inference model: {converted_path}")
|
| 73 |
+
return converted_path
|
| 74 |
+
|
| 75 |
+
|
| 76 |
@spaces.GPU(duration=60)
|
| 77 |
def convert_voice(
|
| 78 |
audio_path: str,
|
|
|
|
| 110 |
if len(source_audio) < sr * 0.5:
|
| 111 |
raise RuntimeError("Audio source trop court pour la conversion.")
|
| 112 |
|
| 113 |
+
# Ensure model is in RVC inference format (weight key, not model key)
|
| 114 |
+
model_path = _ensure_inference_format(model_path)
|
| 115 |
+
|
| 116 |
+
# Try Applio VoiceConverter
|
| 117 |
try:
|
| 118 |
converted = _try_applio_inference(
|
| 119 |
audio_path, model_path, index_path, pitch,
|
pipeline/training.py
CHANGED
|
@@ -392,6 +392,80 @@ def find_pretrained_model(sample_rate: int = 40000):
|
|
| 392 |
return None
|
| 393 |
|
| 394 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 395 |
def full_training_pipeline(
|
| 396 |
audio_path: str,
|
| 397 |
model_name: str,
|
|
@@ -443,7 +517,7 @@ def full_training_pipeline(
|
|
| 443 |
os.makedirs(local_model_dir, exist_ok=True)
|
| 444 |
|
| 445 |
local_pth = os.path.join(local_model_dir, f"{model_name}.pth")
|
| 446 |
-
|
| 447 |
|
| 448 |
local_index = None
|
| 449 |
if index_path:
|
|
|
|
| 392 |
return None
|
| 393 |
|
| 394 |
|
| 395 |
+
def _convert_to_inference_model(checkpoint_path, output_path, sample_rate=40000):
|
| 396 |
+
"""
|
| 397 |
+
Convert a pretrained training checkpoint to RVC inference format.
|
| 398 |
+
Training checkpoints have keys: model, optimizer, iteration, learning_rate
|
| 399 |
+
Inference models need keys: weight, config, info, sr, f0, version
|
| 400 |
+
"""
|
| 401 |
+
import torch
|
| 402 |
+
import json
|
| 403 |
+
|
| 404 |
+
checkpoint = torch.load(checkpoint_path, map_location="cpu")
|
| 405 |
+
|
| 406 |
+
# Extract generator weights
|
| 407 |
+
if "model" in checkpoint:
|
| 408 |
+
state_dict = checkpoint["model"]
|
| 409 |
+
elif "state_dict" in checkpoint:
|
| 410 |
+
state_dict = checkpoint["state_dict"]
|
| 411 |
+
else:
|
| 412 |
+
state_dict = checkpoint
|
| 413 |
+
|
| 414 |
+
# Remove "module." prefix if present (from DataParallel)
|
| 415 |
+
weight = {}
|
| 416 |
+
for k, v in state_dict.items():
|
| 417 |
+
new_key = k.replace("module.", "")
|
| 418 |
+
weight[new_key] = v.half()
|
| 419 |
+
|
| 420 |
+
# Read config from Applio config file
|
| 421 |
+
sr_label = "40k" if sample_rate == 40000 else "48k"
|
| 422 |
+
config_path = os.path.join(APPLIO_DIR, "configs", "v2", f"{sr_label}.json")
|
| 423 |
+
|
| 424 |
+
if os.path.exists(config_path):
|
| 425 |
+
with open(config_path) as f:
|
| 426 |
+
cfg = json.load(f)
|
| 427 |
+
config = [
|
| 428 |
+
cfg["data"]["filter_length"] // 2 + 1,
|
| 429 |
+
cfg["train"]["segment_size"] // cfg["data"]["hop_length"],
|
| 430 |
+
cfg["model"]["inter_channels"],
|
| 431 |
+
cfg["model"]["hidden_channels"],
|
| 432 |
+
cfg["model"]["filter_channels"],
|
| 433 |
+
cfg["model"]["n_heads"],
|
| 434 |
+
cfg["model"]["n_layers"],
|
| 435 |
+
cfg["model"]["kernel_size"],
|
| 436 |
+
cfg["model"]["p_dropout"],
|
| 437 |
+
cfg["model"]["resblock"],
|
| 438 |
+
cfg["model"]["resblock_kernel_sizes"],
|
| 439 |
+
cfg["model"]["resblock_dilation_sizes"],
|
| 440 |
+
cfg["model"]["upsample_rates"],
|
| 441 |
+
cfg["model"]["upsample_initial_channel"],
|
| 442 |
+
cfg["model"]["upsample_kernel_sizes"],
|
| 443 |
+
cfg["model"]["spk_embed_dim"],
|
| 444 |
+
cfg["model"]["gin_channels"],
|
| 445 |
+
cfg["data"]["sampling_rate"],
|
| 446 |
+
]
|
| 447 |
+
else:
|
| 448 |
+
# Fallback: standard RVC v2 40k config
|
| 449 |
+
config = [
|
| 450 |
+
1025, 32, 192, 192, 768, 2, 6, 3, 0, "1",
|
| 451 |
+
[3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
|
| 452 |
+
[10, 10, 2, 2], 512, [16, 16, 4, 4], 109, 256, 40000,
|
| 453 |
+
]
|
| 454 |
+
|
| 455 |
+
inference_model = {
|
| 456 |
+
"weight": weight,
|
| 457 |
+
"config": config,
|
| 458 |
+
"info": f"v2_{sr_label}",
|
| 459 |
+
"sr": sr_label,
|
| 460 |
+
"f0": 1,
|
| 461 |
+
"version": "v2",
|
| 462 |
+
}
|
| 463 |
+
|
| 464 |
+
torch.save(inference_model, output_path)
|
| 465 |
+
logger.info(f"Converted checkpoint to inference format: {output_path}")
|
| 466 |
+
return output_path
|
| 467 |
+
|
| 468 |
+
|
| 469 |
def full_training_pipeline(
|
| 470 |
audio_path: str,
|
| 471 |
model_name: str,
|
|
|
|
| 517 |
os.makedirs(local_model_dir, exist_ok=True)
|
| 518 |
|
| 519 |
local_pth = os.path.join(local_model_dir, f"{model_name}.pth")
|
| 520 |
+
_convert_to_inference_model(pth_path, local_pth, sample_rate)
|
| 521 |
|
| 522 |
local_index = None
|
| 523 |
if index_path:
|