Spaces:

dimensionalpulsar
/

voice-clone-rvc

Sleeping

App Files Files Community

assistanttttttt committed 22 days ago

Commit

9ec07f2

1 Parent(s): 462fd1f

Migrate RVC training to ultimate-rvc library (no fairseq needed)

Browse files

Files changed (3) hide show

README.md +2 -2
pipeline/rvc_training.py +55 -134
requirements.txt +5 -5

README.md CHANGED Viewed

@@ -5,7 +5,7 @@ colorFrom: purple
 colorTo: blue
 sdk: gradio
 sdk_version: 5.12.0
-python_version: "3.10"
 app_file: app.py
 pinned: false
 license: mit
@@ -43,7 +43,7 @@ Outil web de **clonage vocal zero-shot** basé sur **Seed-VC** (Diffusion Transf
 1. Onglet **"Convertir un morceau"**
 2. Sélectionnez votre profil vocal
 3. Uploadez le morceau à convertir
-4. Ajustez les paramètres si besoin (transposition, qualité, volumes)
 5. Cliquez **"Convertir et mixer"**
 ## Architecture technique

 colorTo: blue
 sdk: gradio
 sdk_version: 5.12.0
+python_version: "3.12"
 app_file: app.py
 pinned: false
 license: mit
 1. Onglet **"Convertir un morceau"**
 2. Sélectionnez votre profil vocal
 3. Uploadez le morceau à convertir
+4. Ajustez les paramètres si besoin (transposition, qualité, volumes)
 5. Cliquez **"Convertir et mixer"**
 ## Architecture technique

pipeline/rvc_training.py CHANGED Viewed

@@ -1,10 +1,10 @@
 import os
-import subprocess
 import shutil
 import logging
 import traceback
-from huggingface_hub import hf_hub_download, HfApi
 logger = logging.getLogger(__name__)
 try:
@@ -17,27 +17,26 @@ except ImportError:
                 return fn
             return decorator
-def download_rvc_models(base_dir):
-    """Download required base models for RVC if they don't exist."""
-    models_to_download = [
-        ("lj1995/VoiceConversionWebUI", "hubert_base.pt", "assets/hubert"),
-        ("lj1995/VoiceConversionWebUI", "rmvpe.pt", "assets/rmvpe"),
-        ("lj1995/VoiceConversionWebUI", "pretrained_v2/f0G40k.pth", "assets/pretrained_v2"),
-        ("lj1995/VoiceConversionWebUI", "pretrained_v2/f0D40k.pth", "assets/pretrained_v2"),
-    ]
-    for repo_id, filename, local_dir in models_to_download:
-        dest_path = os.path.join(base_dir, local_dir, os.path.basename(filename))
-        if not os.path.exists(dest_path):
-            logger.info(f"Downloading {filename}...")
-            os.makedirs(os.path.dirname(dest_path), exist_ok=True)
-            try:
-                dl_path = hf_hub_download(repo_id=repo_id, filename=filename)
-                shutil.copy(dl_path, dest_path)
-            except Exception as e:
-                logger.warning(f"Failed to download {filename}: {e}")
-@spaces.GPU(duration=600)
 def train_rvc_model(audio_path, model_name, epochs=100, progress=None):
     if not audio_path:
         return "Error: Please upload an audio file.", None
     if not model_name:
@@ -49,124 +48,45 @@ def train_rvc_model(audio_path, model_name, epochs=100, progress=None):
         logger.info(desc)
     model_name = model_name.strip().replace(" ", "_")
-    base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "RVC"))
-    logs_dir = os.path.join(base_dir, "logs", model_name)
-    weights_dir = os.path.join(base_dir, "assets", "weights")
-    # Create directories
-    os.makedirs(logs_dir, exist_ok=True)
-    os.makedirs(weights_dir, exist_ok=True)
-    # 1. Download Base Models
-    p(0.05, "Downloading base models...")
-    download_rvc_models(base_dir)
-    # 2. Prepare Dataset
-    dataset_dir = os.path.join(base_dir, "dataset", model_name)
-    os.makedirs(dataset_dir, exist_ok=True)
-    p(0.1, "Preparing dataset...")
     try:
-        import soundfile as sf
-        import librosa
-        y, sr = librosa.load(audio_path, sr=40000)
-        sf.write(os.path.join(dataset_dir, "audio.wav"), y, 40000)
-    except Exception as e:
-        return f"Failed to process audio: {e}", None
-    def run_cmd(cmd, desc):
-        p(None, desc)
-        logger.info(f"Running: {cmd}")
-        env = os.environ.copy()
-        env["PYTHONPATH"] = base_dir
-        # Ensure it runs from RVC dir
-        result = subprocess.run(cmd, shell=True, env=env, cwd=base_dir, capture_output=True, text=True)
-        if result.returncode != 0:
-            logger.error(f"Error in {desc}:\n{result.stderr}")
-            raise RuntimeError(f"{desc} failed: {result.stderr}")
-        return result.stdout
-    try:
-        # Preprocess
-        run_cmd(f"python infer/modules/train/preprocess.py {dataset_dir} 40000 2 {logs_dir} False", "Preprocessing audio...")
-        # Extract F0
-        run_cmd(f"python infer/modules/train/extract/extract_f0_print.py {logs_dir} 2 rmvpe", "Extracting F0...")
-        # Extract Features
-        device = "cuda:0" if __import__("torch").cuda.is_available() else "cpu"
-        run_cmd(f"python infer/modules/train/extract_feature_print.py {device} 1 0 0 {logs_dir} v2", "Extracting Features...")
-        # Train
-        p(0.4, f"Training for {epochs} epochs (this will take a while)...")
-        # Generate config file dynamically
-        config_path = os.path.join(base_dir, "configs", "v2", "40k.json")
-        if not os.path.exists(os.path.dirname(config_path)):
-            os.makedirs(os.path.dirname(config_path), exist_ok=True)
-            import json
-            basic_config = {
-              "train": {
-                "log_interval": 200, "seed": 1234, "epochs": 10000, "learning_rate": 0.0001,
-                "betas": [0.8, 0.99], "eps": 1e-09, "batch_size": 4, "fp16_run": True,
-                "lr_decay": 0.999875, "segment_size": 12800, "init_lr_ratio": 1,
-                "warmup_epochs": 0, "c_mel": 45, "c_kl": 1.0
-              },
-              "data": {
-                "max_wav_value": 32768.0, "sampling_rate": 40000, "filter_length": 2048,
-                "hop_length": 400, "win_length": 2048, "n_mel_channels": 125, "mel_fmin": 0.0, "mel_fmax": None
-              },
-              "model": {
-                "inter_channels": 192, "hidden_channels": 192, "filter_channels": 768,
-                "n_heads": 2, "n_layers": 6, "kernel_size": 3, "p_dropout": 0, "resblock": "1",
-                "resblock_kernel_sizes": [3, 7, 11], "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
-                "upsample_rates": [10, 10, 2, 2], "upsample_initial_channel": 512,
-                "upsample_kernel_sizes": [16, 16, 4, 4], "n_layers_q": 3, "use_spectral_norm": False, "gin_channels": 256
-              }
-            }
-            with open(config_path, "w") as f:
-                json.dump(basic_config, f)
-        shutil.copy(config_path, os.path.join(logs_dir, "config.json"))
-        gpus_arg = "-g 0" if device.startswith("cuda") else ""
-        train_cmd = (
-            f"python infer/modules/train/train.py -e {model_name} -sr 40k -f0 1 -bs 4 {gpus_arg} "
-            f"-te {epochs} -se 25 -pg assets/pretrained_v2/f0G40k.pth -pd assets/pretrained_v2/f0D40k.pth "
-            f"-l 1 -c 0 -sw 1 -v v2"
         )
-        run_cmd(train_cmd, "Training RVC Model...")
-        # Build Index
-        p(0.8, "Building Index...")
-        try:
-            import numpy as np
-            import faiss
-            feature_dir = os.path.join(logs_dir, "3_feature768")
-            if os.path.exists(feature_dir):
-                npys = []
-                for name in sorted(os.listdir(feature_dir)):
-                    phone = np.load(os.path.join(feature_dir, name))
-                    npys.append(phone)
-                if npys:
-                    big_npy = np.concatenate(npys, 0)
-                    n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39)
-                    if n_ivf > 0:
-                        index = faiss.index_factory(768, f"IVF{n_ivf},Flat")
-                        index_ivf = faiss.extract_index_ivf(index)
-                        index_ivf.nprobe = 1
-                        index.train(big_npy)
-                        index.add(big_npy)
-                        faiss.write_index(index, os.path.join(logs_dir, f"added_{model_name}_v2.index"))
-        except Exception as e:
-            logger.warning(f"Failed to build index: {e}")
-        pth_path = os.path.join(weights_dir, f"{model_name}.pth")
-        index_path = os.path.join(logs_dir, f"added_{model_name}_v2.index")
-        if not os.path.exists(pth_path):
-            raise FileNotFoundError(f"Model .pth not found at {pth_path}")
-        p(0.9, "Uploading to Hugging Face Dataset...")
         api = HfApi(token=os.environ.get("HF_TOKEN", ""))
         repo_id = os.environ.get("HF_DATASET_REPO", "dimensionalpulsar/rvc-models")
@@ -187,10 +107,11 @@ def train_rvc_model(audio_path, model_name, epochs=100, progress=None):
                 )
         except Exception as e:
             logger.error(f"Upload to dataset failed: {e}")
-            return f"Model trained but failed to upload to HF: {e}", pth_path
         return f"Successfully trained and uploaded to dataset {repo_id}!", pth_path
     except Exception as e:
         tb = traceback.format_exc()
         return f"Error: {str(e)}\n\nDetails:\n{tb}", None

 import os
 import shutil
 import logging
 import traceback
+from huggingface_hub import HfApi
+# Set up logging
 logger = logging.getLogger(__name__)
 try:
                 return fn
             return decorator
+# Configuration for Ultimate-RVC paths
+# We set these environment variables BEFORE importing ultimate_rvc to ensure it uses our paths
+os.environ["URVC_MODELS_DIR"] = os.path.abspath("rvc_models")
+os.environ["URVC_AUDIO_DIR"] = os.path.abspath("rvc_audio")
+os.environ["URVC_TEMP_DIR"] = os.path.abspath("rvc_temp")
+# Now we can import the core functions from ultimate_rvc
+try:
+    from ultimate_rvc.core.train import prepare, extract, train
+    from ultimate_rvc.typing_extra import TrainingSampleRate, F0Method, EmbedderModel
+    ULTIMATE_RVC_AVAILABLE = True
+except ImportError as e:
+    logger.error(f"Failed to import ultimate_rvc: {e}")
+    ULTIMATE_RVC_AVAILABLE = False
+@spaces.GPU(duration=1000) # Training takes time, let's request more
 def train_rvc_model(audio_path, model_name, epochs=100, progress=None):
+    if not ULTIMATE_RVC_AVAILABLE:
+        return "Error: ultimate-rvc library not installed correctly.", None
     if not audio_path:
         return "Error: Please upload an audio file.", None
     if not model_name:
         logger.info(desc)
     model_name = model_name.strip().replace(" ", "_")
     try:
+        # 1. Populate Dataset
+        p(0.1, "Step 1/4: Preparing dataset...")
+        dataset_path = prepare.populate_dataset(model_name, [audio_path])
+        # 2. Preprocess
+        p(0.2, "Step 2/4: Preprocessing audio...")
+        prepare.preprocess_dataset(
+            model_name=model_name,
+            dataset=dataset_path,
+            sample_rate=TrainingSampleRate.HZ_40K
         )
+        # 3. Extract Features
+        p(0.4, "Step 3/4: Extracting features (F0 & Content)...")
+        extract.extract_features(
+            model_name=model_name,
+            f0_method=F0Method.RMVPE,
+            embedder_model=EmbedderModel.CONTENTVEC
+        )
+        # 4. Train
+        p(0.6, f"Step 4/4: Training for {epochs} epochs (this may take several minutes)...")
+        # ultimate-rvc's run_training returns [pth_path, index_path]
+        result_paths = train.run_training(
+            model_name=model_name,
+            num_epochs=epochs,
+            batch_size=4, # Safe for ZeroGPU
+            save_interval=epochs # Only save at the end
+        )
+        if not result_paths or len(result_paths) < 2:
+            return "Training completed but could not find the output files.", None
+        pth_path, index_path = result_paths[0], result_paths[1]
+        # 5. Upload to Hugging Face
+        p(0.9, "Final Step: Uploading to Hugging Face Dataset...")
         api = HfApi(token=os.environ.get("HF_TOKEN", ""))
         repo_id = os.environ.get("HF_DATASET_REPO", "dimensionalpulsar/rvc-models")
                 )
         except Exception as e:
             logger.error(f"Upload to dataset failed: {e}")
+            return f"Model trained but failed to upload to HF: {e}. Files are at {pth_path}", pth_path
         return f"Successfully trained and uploaded to dataset {repo_id}!", pth_path
     except Exception as e:
         tb = traceback.format_exc()
+        logger.error(f"Training error: {e}\n{tb}")
         return f"Error: {str(e)}\n\nDetails:\n{tb}", None

requirements.txt CHANGED Viewed

@@ -33,9 +33,9 @@ bigvgan
 descript-audio-codec
 vocos
-# RVC Training dependencies
-Cython
-fairseq @ git+https://github.com/facebookresearch/fairseq.git
-faiss-cpu
-praat-parselmouth
 tensorboardX

 descript-audio-codec
 vocos
+# RVC Training via Ultimate-RVC
+ultimate-rvc==0.6.0
+torchcrepe
+torchfcpe
 tensorboardX
+wget