Spaces:
Sleeping
Sleeping
Commit ·
9ec07f2
1
Parent(s): 462fd1f
Migrate RVC training to ultimate-rvc library (no fairseq needed)
Browse files
- README.md +2 -2
- pipeline/rvc_training.py +55 -134
- requirements.txt +5 -5
README.md
CHANGED
|
@@ -5,7 +5,7 @@ colorFrom: purple
|
|
| 5 |
colorTo: blue
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 5.12.0
|
| 8 |
-
python_version: "3.
|
| 9 |
app_file: app.py
|
| 10 |
pinned: false
|
| 11 |
license: mit
|
|
@@ -43,7 +43,7 @@ Outil web de **clonage vocal zero-shot** basé sur **Seed-VC** (Diffusion Transf
|
|
| 43 |
1. Onglet **"Convertir un morceau"**
|
| 44 |
2. Sélectionnez votre profil vocal
|
| 45 |
3. Uploadez le morceau à convertir
|
| 46 |
-
4. Ajustez les
|
| 47 |
5. Cliquez **"Convertir et mixer"**
|
| 48 |
|
| 49 |
## Architecture technique
|
|
|
|
| 5 |
colorTo: blue
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 5.12.0
|
| 8 |
+
python_version: "3.12"
|
| 9 |
app_file: app.py
|
| 10 |
pinned: false
|
| 11 |
license: mit
|
|
|
|
| 43 |
1. Onglet **"Convertir un morceau"**
|
| 44 |
2. Sélectionnez votre profil vocal
|
| 45 |
3. Uploadez le morceau à convertir
|
| 46 |
+
4. Ajustez les paramètres si besoin (transposition, qualité, volumes)
|
| 47 |
5. Cliquez **"Convertir et mixer"**
|
| 48 |
|
| 49 |
## Architecture technique
|
pipeline/rvc_training.py
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
import os
|
| 2 |
-
import subprocess
|
| 3 |
import shutil
|
| 4 |
import logging
|
| 5 |
import traceback
|
| 6 |
-
from huggingface_hub import
|
| 7 |
|
|
|
|
| 8 |
logger = logging.getLogger(__name__)
|
| 9 |
|
| 10 |
try:
|
|
@@ -17,27 +17,26 @@ except ImportError:
|
|
| 17 |
return fn
|
| 18 |
return decorator
|
| 19 |
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
("lj1995/VoiceConversionWebUI", "pretrained_v2/f0G40k.pth", "assets/pretrained_v2"),
|
| 26 |
-
("lj1995/VoiceConversionWebUI", "pretrained_v2/f0D40k.pth", "assets/pretrained_v2"),
|
| 27 |
-
]
|
| 28 |
-
for repo_id, filename, local_dir in models_to_download:
|
| 29 |
-
dest_path = os.path.join(base_dir, local_dir, os.path.basename(filename))
|
| 30 |
-
if not os.path.exists(dest_path):
|
| 31 |
-
logger.info(f"Downloading {filename}...")
|
| 32 |
-
os.makedirs(os.path.dirname(dest_path), exist_ok=True)
|
| 33 |
-
try:
|
| 34 |
-
dl_path = hf_hub_download(repo_id=repo_id, filename=filename)
|
| 35 |
-
shutil.copy(dl_path, dest_path)
|
| 36 |
-
except Exception as e:
|
| 37 |
-
logger.warning(f"Failed to download {filename}: {e}")
|
| 38 |
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
def train_rvc_model(audio_path, model_name, epochs=100, progress=None):
|
|
|
|
|
|
|
|
|
|
| 41 |
if not audio_path:
|
| 42 |
return "Error: Please upload an audio file.", None
|
| 43 |
if not model_name:
|
|
@@ -49,124 +48,45 @@ def train_rvc_model(audio_path, model_name, epochs=100, progress=None):
|
|
| 49 |
logger.info(desc)
|
| 50 |
|
| 51 |
model_name = model_name.strip().replace(" ", "_")
|
| 52 |
-
base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "RVC"))
|
| 53 |
-
logs_dir = os.path.join(base_dir, "logs", model_name)
|
| 54 |
-
weights_dir = os.path.join(base_dir, "assets", "weights")
|
| 55 |
-
|
| 56 |
-
# Create directories
|
| 57 |
-
os.makedirs(logs_dir, exist_ok=True)
|
| 58 |
-
os.makedirs(weights_dir, exist_ok=True)
|
| 59 |
-
|
| 60 |
-
# 1. Download Base Models
|
| 61 |
-
p(0.05, "Downloading base models...")
|
| 62 |
-
download_rvc_models(base_dir)
|
| 63 |
-
|
| 64 |
-
# 2. Prepare Dataset
|
| 65 |
-
dataset_dir = os.path.join(base_dir, "dataset", model_name)
|
| 66 |
-
os.makedirs(dataset_dir, exist_ok=True)
|
| 67 |
-
p(0.1, "Preparing dataset...")
|
| 68 |
|
| 69 |
try:
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
sf.write(os.path.join(dataset_dir, "audio.wav"), y, 40000)
|
| 74 |
-
except Exception as e:
|
| 75 |
-
return f"Failed to process audio: {e}", None
|
| 76 |
-
|
| 77 |
-
def run_cmd(cmd, desc):
|
| 78 |
-
p(None, desc)
|
| 79 |
-
logger.info(f"Running: {cmd}")
|
| 80 |
-
env = os.environ.copy()
|
| 81 |
-
env["PYTHONPATH"] = base_dir
|
| 82 |
-
# Ensure it runs from RVC dir
|
| 83 |
-
result = subprocess.run(cmd, shell=True, env=env, cwd=base_dir, capture_output=True, text=True)
|
| 84 |
-
if result.returncode != 0:
|
| 85 |
-
logger.error(f"Error in {desc}:\n{result.stderr}")
|
| 86 |
-
raise RuntimeError(f"{desc} failed: {result.stderr}")
|
| 87 |
-
return result.stdout
|
| 88 |
-
|
| 89 |
-
try:
|
| 90 |
-
# Preprocess
|
| 91 |
-
run_cmd(f"python infer/modules/train/preprocess.py {dataset_dir} 40000 2 {logs_dir} False", "Preprocessing audio...")
|
| 92 |
-
|
| 93 |
-
# Extract F0
|
| 94 |
-
run_cmd(f"python infer/modules/train/extract/extract_f0_print.py {logs_dir} 2 rmvpe", "Extracting F0...")
|
| 95 |
-
|
| 96 |
-
# Extract Features
|
| 97 |
-
device = "cuda:0" if __import__("torch").cuda.is_available() else "cpu"
|
| 98 |
-
run_cmd(f"python infer/modules/train/extract_feature_print.py {device} 1 0 0 {logs_dir} v2", "Extracting Features...")
|
| 99 |
-
|
| 100 |
-
# Train
|
| 101 |
-
p(0.4, f"Training for {epochs} epochs (this will take a while)...")
|
| 102 |
-
# Generate config file dynamically
|
| 103 |
-
config_path = os.path.join(base_dir, "configs", "v2", "40k.json")
|
| 104 |
-
if not os.path.exists(os.path.dirname(config_path)):
|
| 105 |
-
os.makedirs(os.path.dirname(config_path), exist_ok=True)
|
| 106 |
-
import json
|
| 107 |
-
basic_config = {
|
| 108 |
-
"train": {
|
| 109 |
-
"log_interval": 200, "seed": 1234, "epochs": 10000, "learning_rate": 0.0001,
|
| 110 |
-
"betas": [0.8, 0.99], "eps": 1e-09, "batch_size": 4, "fp16_run": True,
|
| 111 |
-
"lr_decay": 0.999875, "segment_size": 12800, "init_lr_ratio": 1,
|
| 112 |
-
"warmup_epochs": 0, "c_mel": 45, "c_kl": 1.0
|
| 113 |
-
},
|
| 114 |
-
"data": {
|
| 115 |
-
"max_wav_value": 32768.0, "sampling_rate": 40000, "filter_length": 2048,
|
| 116 |
-
"hop_length": 400, "win_length": 2048, "n_mel_channels": 125, "mel_fmin": 0.0, "mel_fmax": None
|
| 117 |
-
},
|
| 118 |
-
"model": {
|
| 119 |
-
"inter_channels": 192, "hidden_channels": 192, "filter_channels": 768,
|
| 120 |
-
"n_heads": 2, "n_layers": 6, "kernel_size": 3, "p_dropout": 0, "resblock": "1",
|
| 121 |
-
"resblock_kernel_sizes": [3, 7, 11], "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
|
| 122 |
-
"upsample_rates": [10, 10, 2, 2], "upsample_initial_channel": 512,
|
| 123 |
-
"upsample_kernel_sizes": [16, 16, 4, 4], "n_layers_q": 3, "use_spectral_norm": False, "gin_channels": 256
|
| 124 |
-
}
|
| 125 |
-
}
|
| 126 |
-
with open(config_path, "w") as f:
|
| 127 |
-
json.dump(basic_config, f)
|
| 128 |
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
f"-l 1 -c 0 -sw 1 -v v2"
|
| 136 |
)
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
index.add(big_npy)
|
| 159 |
-
faiss.write_index(index, os.path.join(logs_dir, f"added_{model_name}_v2.index"))
|
| 160 |
-
except Exception as e:
|
| 161 |
-
logger.warning(f"Failed to build index: {e}")
|
| 162 |
-
|
| 163 |
-
pth_path = os.path.join(weights_dir, f"{model_name}.pth")
|
| 164 |
-
index_path = os.path.join(logs_dir, f"added_{model_name}_v2.index")
|
| 165 |
|
| 166 |
-
|
| 167 |
-
raise FileNotFoundError(f"Model .pth not found at {pth_path}")
|
| 168 |
|
| 169 |
-
|
|
|
|
| 170 |
api = HfApi(token=os.environ.get("HF_TOKEN", ""))
|
| 171 |
repo_id = os.environ.get("HF_DATASET_REPO", "dimensionalpulsar/rvc-models")
|
| 172 |
|
|
@@ -187,10 +107,11 @@ def train_rvc_model(audio_path, model_name, epochs=100, progress=None):
|
|
| 187 |
)
|
| 188 |
except Exception as e:
|
| 189 |
logger.error(f"Upload to dataset failed: {e}")
|
| 190 |
-
return f"Model trained but failed to upload to HF: {e}", pth_path
|
| 191 |
|
| 192 |
return f"Successfully trained and uploaded to dataset {repo_id}!", pth_path
|
| 193 |
|
| 194 |
except Exception as e:
|
| 195 |
tb = traceback.format_exc()
|
|
|
|
| 196 |
return f"Error: {str(e)}\n\nDetails:\n{tb}", None
|
|
|
|
| 1 |
import os
|
|
|
|
| 2 |
import shutil
|
| 3 |
import logging
|
| 4 |
import traceback
|
| 5 |
+
from huggingface_hub import HfApi
|
| 6 |
|
| 7 |
+
# Set up logging
|
| 8 |
logger = logging.getLogger(__name__)
|
| 9 |
|
| 10 |
try:
|
|
|
|
| 17 |
return fn
|
| 18 |
return decorator
|
| 19 |
|
| 20 |
+
# Configuration for Ultimate-RVC paths
|
| 21 |
+
# We set these environment variables BEFORE importing ultimate_rvc to ensure it uses our paths
|
| 22 |
+
os.environ["URVC_MODELS_DIR"] = os.path.abspath("rvc_models")
|
| 23 |
+
os.environ["URVC_AUDIO_DIR"] = os.path.abspath("rvc_audio")
|
| 24 |
+
os.environ["URVC_TEMP_DIR"] = os.path.abspath("rvc_temp")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
+
# Now we can import the core functions from ultimate_rvc
|
| 27 |
+
try:
|
| 28 |
+
from ultimate_rvc.core.train import prepare, extract, train
|
| 29 |
+
from ultimate_rvc.typing_extra import TrainingSampleRate, F0Method, EmbedderModel
|
| 30 |
+
ULTIMATE_RVC_AVAILABLE = True
|
| 31 |
+
except ImportError as e:
|
| 32 |
+
logger.error(f"Failed to import ultimate_rvc: {e}")
|
| 33 |
+
ULTIMATE_RVC_AVAILABLE = False
|
| 34 |
+
|
| 35 |
+
@spaces.GPU(duration=1000) # Training takes time, let's request more
|
| 36 |
def train_rvc_model(audio_path, model_name, epochs=100, progress=None):
|
| 37 |
+
if not ULTIMATE_RVC_AVAILABLE:
|
| 38 |
+
return "Error: ultimate-rvc library not installed correctly.", None
|
| 39 |
+
|
| 40 |
if not audio_path:
|
| 41 |
return "Error: Please upload an audio file.", None
|
| 42 |
if not model_name:
|
|
|
|
| 48 |
logger.info(desc)
|
| 49 |
|
| 50 |
model_name = model_name.strip().replace(" ", "_")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
try:
|
| 53 |
+
# 1. Populate Dataset
|
| 54 |
+
p(0.1, "Step 1/4: Preparing dataset...")
|
| 55 |
+
dataset_path = prepare.populate_dataset(model_name, [audio_path])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
+
# 2. Preprocess
|
| 58 |
+
p(0.2, "Step 2/4: Preprocessing audio...")
|
| 59 |
+
prepare.preprocess_dataset(
|
| 60 |
+
model_name=model_name,
|
| 61 |
+
dataset=dataset_path,
|
| 62 |
+
sample_rate=TrainingSampleRate.HZ_40K
|
|
|
|
| 63 |
)
|
| 64 |
+
|
| 65 |
+
# 3. Extract Features
|
| 66 |
+
p(0.4, "Step 3/4: Extracting features (F0 & Content)...")
|
| 67 |
+
extract.extract_features(
|
| 68 |
+
model_name=model_name,
|
| 69 |
+
f0_method=F0Method.RMVPE,
|
| 70 |
+
embedder_model=EmbedderModel.CONTENTVEC
|
| 71 |
+
)
|
| 72 |
+
|
| 73 |
+
# 4. Train
|
| 74 |
+
p(0.6, f"Step 4/4: Training for {epochs} epochs (this may take several minutes)...")
|
| 75 |
+
# ultimate-rvc's run_training returns [pth_path, index_path]
|
| 76 |
+
result_paths = train.run_training(
|
| 77 |
+
model_name=model_name,
|
| 78 |
+
num_epochs=epochs,
|
| 79 |
+
batch_size=4, # Safe for ZeroGPU
|
| 80 |
+
save_interval=epochs # Only save at the end
|
| 81 |
+
)
|
| 82 |
+
|
| 83 |
+
if not result_paths or len(result_paths) < 2:
|
| 84 |
+
return "Training completed but could not find the output files.", None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
|
| 86 |
+
pth_path, index_path = result_paths[0], result_paths[1]
|
|
|
|
| 87 |
|
| 88 |
+
# 5. Upload to Hugging Face
|
| 89 |
+
p(0.9, "Final Step: Uploading to Hugging Face Dataset...")
|
| 90 |
api = HfApi(token=os.environ.get("HF_TOKEN", ""))
|
| 91 |
repo_id = os.environ.get("HF_DATASET_REPO", "dimensionalpulsar/rvc-models")
|
| 92 |
|
|
|
|
| 107 |
)
|
| 108 |
except Exception as e:
|
| 109 |
logger.error(f"Upload to dataset failed: {e}")
|
| 110 |
+
return f"Model trained but failed to upload to HF: {e}. Files are at {pth_path}", pth_path
|
| 111 |
|
| 112 |
return f"Successfully trained and uploaded to dataset {repo_id}!", pth_path
|
| 113 |
|
| 114 |
except Exception as e:
|
| 115 |
tb = traceback.format_exc()
|
| 116 |
+
logger.error(f"Training error: {e}\n{tb}")
|
| 117 |
return f"Error: {str(e)}\n\nDetails:\n{tb}", None
|
requirements.txt
CHANGED
|
@@ -33,9 +33,9 @@ bigvgan
|
|
| 33 |
descript-audio-codec
|
| 34 |
vocos
|
| 35 |
|
| 36 |
-
# RVC Training
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
praat-parselmouth
|
| 41 |
tensorboardX
|
|
|
|
|
|
| 33 |
descript-audio-codec
|
| 34 |
vocos
|
| 35 |
|
| 36 |
+
# RVC Training via Ultimate-RVC
|
| 37 |
+
ultimate-rvc==0.6.0
|
| 38 |
+
torchcrepe
|
| 39 |
+
torchfcpe
|
|
|
|
| 40 |
tensorboardX
|
| 41 |
+
wget
|