rvc

Sleeping

ibcplateformes Claude Opus 4.6 committed on Mar 31

Commit

969158e

1 Parent(s): f729219

Add voice similarity control + improve reference audio processing

Voice similarity improvements:
- Add 'Similarite vocale' slider (0.0-1.0) in the UI; its value is passed to inference_cfg_rate
  (0.5=natural, 0.7=balanced (default), 0.9=more faithful to reference)
- Improve reference audio preprocessing in training.py:
  - HPF at 80 Hz to remove low-frequency noise before speaker embedding
  - Light compression (2:1) to even out voice levels
  - RMS normalize to -16 dBFS for strong speaker embedding signal
  - Trim to 25s (Seed-VC's effective max, was 30s)
  - More aggressive silence trimming (top_db=20, was 25)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (3) hide show

app.py +10 -0
pipeline/inference.py +4 -3
pipeline/training.py +32 -14

app.py CHANGED Viewed

@@ -113,6 +113,7 @@ def convert_song(
     model_choice,
     song_file,
     pitch,
     diffusion_steps,
     vocal_volume,
     instrumental_volume,
@@ -151,6 +152,7 @@ def convert_song(
             reference_path=reference_path,
             pitch=int(pitch),
             diffusion_steps=int(diffusion_steps),
         )
         progress(0.85, desc="Mixage final...")
@@ -306,6 +308,13 @@ with gr.Blocks(
                             step=1,
                             label="Transposition (demi-tons)",
                         )
                         convert_diffusion = gr.Slider(
                             minimum=5,
                             maximum=100,
@@ -366,6 +375,7 @@ with gr.Blocks(
                     convert_model,
                     convert_audio,
                     convert_pitch,
                     convert_diffusion,
                     convert_vocal_vol,
                     convert_inst_vol,

     model_choice,
     song_file,
     pitch,
+    similarity,
     diffusion_steps,
     vocal_volume,
     instrumental_volume,
             reference_path=reference_path,
             pitch=int(pitch),
             diffusion_steps=int(diffusion_steps),
+            similarity=float(similarity),
         )
         progress(0.85, desc="Mixage final...")
                             step=1,
                             label="Transposition (demi-tons)",
                         )
+                        convert_similarity = gr.Slider(
+                            minimum=0.0,
+                            maximum=1.0,
+                            value=0.7,
+                            step=0.05,
+                            label="Similarite vocale (0.5=naturel, 0.7=equilibre, 0.9=plus fidele)",
+                        )
                         convert_diffusion = gr.Slider(
                             minimum=5,
                             maximum=100,
                     convert_model,
                     convert_audio,
                     convert_pitch,
+                    convert_similarity,
                     convert_diffusion,
                     convert_vocal_vol,
                     convert_inst_vol,

pipeline/inference.py CHANGED Viewed

@@ -190,6 +190,7 @@ def convert_voice(
     reference_path,
     pitch=0,
     diffusion_steps=25,
 ):
     """
     Convert voice using Seed-VC zero-shot singing voice conversion.
@@ -235,7 +236,7 @@ def convert_voice(
     try:
         return _convert_voice_impl(
-            audio_path, reference_path, pitch, diffusion_steps
         )
     except Exception as e:
         import traceback
@@ -251,7 +252,7 @@ def convert_voice(
 @torch.no_grad()
 @torch.inference_mode()
-def _convert_voice_impl(audio_path, reference_path, pitch, diffusion_steps):
     """Actual conversion implementation (called from GPU-decorated wrapper)."""
     import soundfile as sf
@@ -385,7 +386,7 @@ def _convert_voice_impl(audio_path, reference_path, pitch, diffusion_steps):
                 cat_condition,
                 torch.LongTensor([cat_condition.size(1)]).to(mel2.device),
                 mel2, style2, None, diffusion_steps,
-                inference_cfg_rate=0.7,
             )
             vc_target = vc_target[:, :, mel2.size(-1):]

     reference_path,
     pitch=0,
     diffusion_steps=25,
+    similarity=0.7,
 ):
     """
     Convert voice using Seed-VC zero-shot singing voice conversion.
     try:
         return _convert_voice_impl(
+            audio_path, reference_path, pitch, diffusion_steps, similarity
         )
     except Exception as e:
         import traceback
 @torch.no_grad()
 @torch.inference_mode()
+def _convert_voice_impl(audio_path, reference_path, pitch, diffusion_steps, similarity=0.7):
     """Actual conversion implementation (called from GPU-decorated wrapper)."""
     import soundfile as sf
                 cat_condition,
                 torch.LongTensor([cat_condition.size(1)]).to(mel2.device),
                 mel2, style2, None, diffusion_steps,
+                inference_cfg_rate=similarity,
             )
             vc_target = vc_target[:, :, mel2.size(-1):]

pipeline/training.py CHANGED Viewed

@@ -68,25 +68,43 @@ def save_voice_reference(
             "Audio trop court ({:.1f}s). Minimum 3 secondes recommande.".format(duration)
         )
-    # Limit to 30 seconds (Seed-VC max reference length)
-    max_samples = 30 * sr
-    if len(audio) > max_samples:
-        audio = audio[:max_samples]
-        logger.info("Trimmed reference to 30s (Seed-VC max).")
     if progress_callback:
-        progress_callback(0.3, "Normalisation et nettoyage...")
-    # Normalize audio
-    peak = np.abs(audio).max()
-    if peak > 0:
-        audio = audio / peak * 0.95
-    # Trim silence from start and end
-    audio_trimmed, _ = librosa.effects.trim(audio, top_db=25)
     if len(audio_trimmed) > sr * 2:
         audio = audio_trimmed
     if progress_callback:
         progress_callback(0.6, "Sauvegarde de la reference vocale...")

             "Audio trop court ({:.1f}s). Minimum 3 secondes recommande.".format(duration)
         )
     if progress_callback:
+        progress_callback(0.3, "Optimisation de la reference vocale...")
+    # 1. Trim silence from start and end (aggressive: top_db=20)
+    audio_trimmed, _ = librosa.effects.trim(audio, top_db=20)
     if len(audio_trimmed) > sr * 2:
         audio = audio_trimmed
+    # 2. Limit to 25 seconds (Seed-VC clips reference to 25s internally)
+    max_samples = 25 * sr
+    if len(audio) > max_samples:
+        audio = audio[:max_samples]
+        logger.info("Trimmed reference to 25s (Seed-VC effective max).")
+    # 3. Remove low-frequency noise (high-pass filter at 80Hz)
+    try:
+        from pedalboard import Pedalboard, HighpassFilter, Compressor, Gain
+        ref_board = Pedalboard([
+            HighpassFilter(cutoff_frequency_hz=80.0),
+            # Light compression to even out the reference voice level
+            Compressor(threshold_db=-20.0, ratio=2.0, attack_ms=10.0, release_ms=150.0),
+            Gain(gain_db=1.0),
+        ])
+        audio_2d = audio.reshape(1, -1).astype(np.float32)
+        audio_2d = ref_board(audio_2d, sr)
+        audio = audio_2d.squeeze()
+    except Exception as e:
+        logger.warning("Pedalboard processing skipped: {}".format(e))
+    # 4. RMS normalize to -16 dBFS (slightly louder than converted vocals
+    # to give the speaker embedding model a strong signal)
+    rms = np.sqrt(np.mean(audio ** 2))
+    target_rms = 10 ** (-16.0 / 20.0)
+    if rms > 1e-6:
+        audio = audio * (target_rms / rms)
+    audio = np.clip(audio, -0.99, 0.99)
     if progress_callback:
         progress_callback(0.6, "Sauvegarde de la reference vocale...")