assistanttttttt committed on
Commit
9ec07f2
·
1 Parent(s): 462fd1f

Migrate RVC training to ultimate-rvc library (no fairseq needed)

Browse files
Files changed (3) hide show
  1. README.md +2 -2
  2. pipeline/rvc_training.py +55 -134
  3. requirements.txt +5 -5
README.md CHANGED
@@ -5,7 +5,7 @@ colorFrom: purple
5
  colorTo: blue
6
  sdk: gradio
7
  sdk_version: 5.12.0
8
- python_version: "3.10"
9
  app_file: app.py
10
  pinned: false
11
  license: mit
@@ -43,7 +43,7 @@ Outil web de **clonage vocal zero-shot** basé sur **Seed-VC** (Diffusion Transf
43
  1. Onglet **"Convertir un morceau"**
44
  2. Sélectionnez votre profil vocal
45
  3. Uploadez le morceau à convertir
46
- 4. Ajustez les paramètres si besoin (transposition, qualité, volumes)
47
  5. Cliquez **"Convertir et mixer"**
48
 
49
  ## Architecture technique
 
5
  colorTo: blue
6
  sdk: gradio
7
  sdk_version: 5.12.0
8
+ python_version: "3.12"
9
  app_file: app.py
10
  pinned: false
11
  license: mit
 
43
  1. Onglet **"Convertir un morceau"**
44
  2. Sélectionnez votre profil vocal
45
  3. Uploadez le morceau à convertir
46
+ 4. Ajustez les paramètres si besoin (transposition, qualité, volumes)
47
  5. Cliquez **"Convertir et mixer"**
48
 
49
  ## Architecture technique
pipeline/rvc_training.py CHANGED
@@ -1,10 +1,10 @@
1
  import os
2
- import subprocess
3
  import shutil
4
  import logging
5
  import traceback
6
- from huggingface_hub import hf_hub_download, HfApi
7
 
 
8
  logger = logging.getLogger(__name__)
9
 
10
  try:
@@ -17,27 +17,26 @@ except ImportError:
17
  return fn
18
  return decorator
19
 
20
- def download_rvc_models(base_dir):
21
- """Download required base models for RVC if they don't exist."""
22
- models_to_download = [
23
- ("lj1995/VoiceConversionWebUI", "hubert_base.pt", "assets/hubert"),
24
- ("lj1995/VoiceConversionWebUI", "rmvpe.pt", "assets/rmvpe"),
25
- ("lj1995/VoiceConversionWebUI", "pretrained_v2/f0G40k.pth", "assets/pretrained_v2"),
26
- ("lj1995/VoiceConversionWebUI", "pretrained_v2/f0D40k.pth", "assets/pretrained_v2"),
27
- ]
28
- for repo_id, filename, local_dir in models_to_download:
29
- dest_path = os.path.join(base_dir, local_dir, os.path.basename(filename))
30
- if not os.path.exists(dest_path):
31
- logger.info(f"Downloading {filename}...")
32
- os.makedirs(os.path.dirname(dest_path), exist_ok=True)
33
- try:
34
- dl_path = hf_hub_download(repo_id=repo_id, filename=filename)
35
- shutil.copy(dl_path, dest_path)
36
- except Exception as e:
37
- logger.warning(f"Failed to download {filename}: {e}")
38
 
39
- @spaces.GPU(duration=600)
 
 
 
 
 
 
 
 
 
40
  def train_rvc_model(audio_path, model_name, epochs=100, progress=None):
 
 
 
41
  if not audio_path:
42
  return "Error: Please upload an audio file.", None
43
  if not model_name:
@@ -49,124 +48,45 @@ def train_rvc_model(audio_path, model_name, epochs=100, progress=None):
49
  logger.info(desc)
50
 
51
  model_name = model_name.strip().replace(" ", "_")
52
- base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "RVC"))
53
- logs_dir = os.path.join(base_dir, "logs", model_name)
54
- weights_dir = os.path.join(base_dir, "assets", "weights")
55
-
56
- # Create directories
57
- os.makedirs(logs_dir, exist_ok=True)
58
- os.makedirs(weights_dir, exist_ok=True)
59
-
60
- # 1. Download Base Models
61
- p(0.05, "Downloading base models...")
62
- download_rvc_models(base_dir)
63
-
64
- # 2. Prepare Dataset
65
- dataset_dir = os.path.join(base_dir, "dataset", model_name)
66
- os.makedirs(dataset_dir, exist_ok=True)
67
- p(0.1, "Preparing dataset...")
68
 
69
  try:
70
- import soundfile as sf
71
- import librosa
72
- y, sr = librosa.load(audio_path, sr=40000)
73
- sf.write(os.path.join(dataset_dir, "audio.wav"), y, 40000)
74
- except Exception as e:
75
- return f"Failed to process audio: {e}", None
76
-
77
- def run_cmd(cmd, desc):
78
- p(None, desc)
79
- logger.info(f"Running: {cmd}")
80
- env = os.environ.copy()
81
- env["PYTHONPATH"] = base_dir
82
- # Ensure it runs from RVC dir
83
- result = subprocess.run(cmd, shell=True, env=env, cwd=base_dir, capture_output=True, text=True)
84
- if result.returncode != 0:
85
- logger.error(f"Error in {desc}:\n{result.stderr}")
86
- raise RuntimeError(f"{desc} failed: {result.stderr}")
87
- return result.stdout
88
-
89
- try:
90
- # Preprocess
91
- run_cmd(f"python infer/modules/train/preprocess.py {dataset_dir} 40000 2 {logs_dir} False", "Preprocessing audio...")
92
-
93
- # Extract F0
94
- run_cmd(f"python infer/modules/train/extract/extract_f0_print.py {logs_dir} 2 rmvpe", "Extracting F0...")
95
-
96
- # Extract Features
97
- device = "cuda:0" if __import__("torch").cuda.is_available() else "cpu"
98
- run_cmd(f"python infer/modules/train/extract_feature_print.py {device} 1 0 0 {logs_dir} v2", "Extracting Features...")
99
-
100
- # Train
101
- p(0.4, f"Training for {epochs} epochs (this will take a while)...")
102
- # Generate config file dynamically
103
- config_path = os.path.join(base_dir, "configs", "v2", "40k.json")
104
- if not os.path.exists(os.path.dirname(config_path)):
105
- os.makedirs(os.path.dirname(config_path), exist_ok=True)
106
- import json
107
- basic_config = {
108
- "train": {
109
- "log_interval": 200, "seed": 1234, "epochs": 10000, "learning_rate": 0.0001,
110
- "betas": [0.8, 0.99], "eps": 1e-09, "batch_size": 4, "fp16_run": True,
111
- "lr_decay": 0.999875, "segment_size": 12800, "init_lr_ratio": 1,
112
- "warmup_epochs": 0, "c_mel": 45, "c_kl": 1.0
113
- },
114
- "data": {
115
- "max_wav_value": 32768.0, "sampling_rate": 40000, "filter_length": 2048,
116
- "hop_length": 400, "win_length": 2048, "n_mel_channels": 125, "mel_fmin": 0.0, "mel_fmax": None
117
- },
118
- "model": {
119
- "inter_channels": 192, "hidden_channels": 192, "filter_channels": 768,
120
- "n_heads": 2, "n_layers": 6, "kernel_size": 3, "p_dropout": 0, "resblock": "1",
121
- "resblock_kernel_sizes": [3, 7, 11], "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
122
- "upsample_rates": [10, 10, 2, 2], "upsample_initial_channel": 512,
123
- "upsample_kernel_sizes": [16, 16, 4, 4], "n_layers_q": 3, "use_spectral_norm": False, "gin_channels": 256
124
- }
125
- }
126
- with open(config_path, "w") as f:
127
- json.dump(basic_config, f)
128
 
129
- shutil.copy(config_path, os.path.join(logs_dir, "config.json"))
130
-
131
- gpus_arg = "-g 0" if device.startswith("cuda") else ""
132
- train_cmd = (
133
- f"python infer/modules/train/train.py -e {model_name} -sr 40k -f0 1 -bs 4 {gpus_arg} "
134
- f"-te {epochs} -se 25 -pg assets/pretrained_v2/f0G40k.pth -pd assets/pretrained_v2/f0D40k.pth "
135
- f"-l 1 -c 0 -sw 1 -v v2"
136
  )
137
- run_cmd(train_cmd, "Training RVC Model...")
138
-
139
- # Build Index
140
- p(0.8, "Building Index...")
141
- try:
142
- import numpy as np
143
- import faiss
144
- feature_dir = os.path.join(logs_dir, "3_feature768")
145
- if os.path.exists(feature_dir):
146
- npys = []
147
- for name in sorted(os.listdir(feature_dir)):
148
- phone = np.load(os.path.join(feature_dir, name))
149
- npys.append(phone)
150
- if npys:
151
- big_npy = np.concatenate(npys, 0)
152
- n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39)
153
- if n_ivf > 0:
154
- index = faiss.index_factory(768, f"IVF{n_ivf},Flat")
155
- index_ivf = faiss.extract_index_ivf(index)
156
- index_ivf.nprobe = 1
157
- index.train(big_npy)
158
- index.add(big_npy)
159
- faiss.write_index(index, os.path.join(logs_dir, f"added_{model_name}_v2.index"))
160
- except Exception as e:
161
- logger.warning(f"Failed to build index: {e}")
162
-
163
- pth_path = os.path.join(weights_dir, f"{model_name}.pth")
164
- index_path = os.path.join(logs_dir, f"added_{model_name}_v2.index")
165
 
166
- if not os.path.exists(pth_path):
167
- raise FileNotFoundError(f"Model .pth not found at {pth_path}")
168
 
169
- p(0.9, "Uploading to Hugging Face Dataset...")
 
170
  api = HfApi(token=os.environ.get("HF_TOKEN", ""))
171
  repo_id = os.environ.get("HF_DATASET_REPO", "dimensionalpulsar/rvc-models")
172
 
@@ -187,10 +107,11 @@ def train_rvc_model(audio_path, model_name, epochs=100, progress=None):
187
  )
188
  except Exception as e:
189
  logger.error(f"Upload to dataset failed: {e}")
190
- return f"Model trained but failed to upload to HF: {e}", pth_path
191
 
192
  return f"Successfully trained and uploaded to dataset {repo_id}!", pth_path
193
 
194
  except Exception as e:
195
  tb = traceback.format_exc()
 
196
  return f"Error: {str(e)}\n\nDetails:\n{tb}", None
 
1
  import os
 
2
  import shutil
3
  import logging
4
  import traceback
5
+ from huggingface_hub import HfApi
6
 
7
+ # Set up logging
8
  logger = logging.getLogger(__name__)
9
 
10
  try:
 
17
  return fn
18
  return decorator
19
 
20
+ # Configuration for Ultimate-RVC paths
21
+ # We set these environment variables BEFORE importing ultimate_rvc to ensure it uses our paths
22
+ os.environ["URVC_MODELS_DIR"] = os.path.abspath("rvc_models")
23
+ os.environ["URVC_AUDIO_DIR"] = os.path.abspath("rvc_audio")
24
+ os.environ["URVC_TEMP_DIR"] = os.path.abspath("rvc_temp")
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
+ # Now we can import the core functions from ultimate_rvc
27
+ try:
28
+ from ultimate_rvc.core.train import prepare, extract, train
29
+ from ultimate_rvc.typing_extra import TrainingSampleRate, F0Method, EmbedderModel
30
+ ULTIMATE_RVC_AVAILABLE = True
31
+ except ImportError as e:
32
+ logger.error(f"Failed to import ultimate_rvc: {e}")
33
+ ULTIMATE_RVC_AVAILABLE = False
34
+
35
+ @spaces.GPU(duration=1000) # Training takes time, let's request more
36
  def train_rvc_model(audio_path, model_name, epochs=100, progress=None):
37
+ if not ULTIMATE_RVC_AVAILABLE:
38
+ return "Error: ultimate-rvc library not installed correctly.", None
39
+
40
  if not audio_path:
41
  return "Error: Please upload an audio file.", None
42
  if not model_name:
 
48
  logger.info(desc)
49
 
50
  model_name = model_name.strip().replace(" ", "_")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
  try:
53
+ # 1. Populate Dataset
54
+ p(0.1, "Step 1/4: Preparing dataset...")
55
+ dataset_path = prepare.populate_dataset(model_name, [audio_path])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
+ # 2. Preprocess
58
+ p(0.2, "Step 2/4: Preprocessing audio...")
59
+ prepare.preprocess_dataset(
60
+ model_name=model_name,
61
+ dataset=dataset_path,
62
+ sample_rate=TrainingSampleRate.HZ_40K
 
63
  )
64
+
65
+ # 3. Extract Features
66
+ p(0.4, "Step 3/4: Extracting features (F0 & Content)...")
67
+ extract.extract_features(
68
+ model_name=model_name,
69
+ f0_method=F0Method.RMVPE,
70
+ embedder_model=EmbedderModel.CONTENTVEC
71
+ )
72
+
73
+ # 4. Train
74
+ p(0.6, f"Step 4/4: Training for {epochs} epochs (this may take several minutes)...")
75
+ # ultimate-rvc's run_training returns [pth_path, index_path]
76
+ result_paths = train.run_training(
77
+ model_name=model_name,
78
+ num_epochs=epochs,
79
+ batch_size=4, # Safe for ZeroGPU
80
+ save_interval=epochs # Only save at the end
81
+ )
82
+
83
+ if not result_paths or len(result_paths) < 2:
84
+ return "Training completed but could not find the output files.", None
 
 
 
 
 
 
 
85
 
86
+ pth_path, index_path = result_paths[0], result_paths[1]
 
87
 
88
+ # 5. Upload to Hugging Face
89
+ p(0.9, "Final Step: Uploading to Hugging Face Dataset...")
90
  api = HfApi(token=os.environ.get("HF_TOKEN", ""))
91
  repo_id = os.environ.get("HF_DATASET_REPO", "dimensionalpulsar/rvc-models")
92
 
 
107
  )
108
  except Exception as e:
109
  logger.error(f"Upload to dataset failed: {e}")
110
+ return f"Model trained but failed to upload to HF: {e}. Files are at {pth_path}", pth_path
111
 
112
  return f"Successfully trained and uploaded to dataset {repo_id}!", pth_path
113
 
114
  except Exception as e:
115
  tb = traceback.format_exc()
116
+ logger.error(f"Training error: {e}\n{tb}")
117
  return f"Error: {str(e)}\n\nDetails:\n{tb}", None
requirements.txt CHANGED
@@ -33,9 +33,9 @@ bigvgan
33
  descript-audio-codec
34
  vocos
35
 
36
- # RVC Training dependencies
37
- Cython
38
- fairseq @ git+https://github.com/facebookresearch/fairseq.git
39
- faiss-cpu
40
- praat-parselmouth
41
  tensorboardX
 
 
33
  descript-audio-codec
34
  vocos
35
 
36
+ # RVC Training via Ultimate-RVC
37
+ ultimate-rvc==0.6.0
38
+ torchcrepe
39
+ torchfcpe
 
40
  tensorboardX
41
+ wget