rvc

Sleeping

ibcplateformes Claude Opus 4.6 committed on Mar 31

Commit

f729219

1 Parent(s): 6098f78

Upgrade audio quality: pro mixing chain, better inference params, htdemucs_ft

Major quality improvements to match professional platforms:

1. mixing.py: Add Pedalboard DSP chain for vocals before mixing
- HighpassFilter (80Hz) removes rumble
- Compressor (4:1, -16dB threshold) for consistent dynamics
- PeakFilter (3kHz, +2.5dB) for vocal presence
- LowShelfFilter (6kHz, -2dB) as simplified de-esser
- Limiter (-1dB) on final mix replaces naive peak normalize

2. inference.py: Fix parameters and normalization
- Hardcode inference_cfg_rate=0.7 (was incorrectly using RVC index_rate)
- Remove unused RVC params (index_path, f0_method, protect, etc.)
- Replace peak normalization with RMS normalization (-18 dBFS)

3. separation.py: Switch to htdemucs_ft (fine-tuned, better SDR)

4. app.py: Default diffusion steps 10 -> 25 for better quality

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (4) hide show

app.py +2 -3
pipeline/inference.py +10 -13
pipeline/mixing.py +56 -11
pipeline/separation.py +1 -1

app.py CHANGED Viewed

@@ -151,7 +151,6 @@ def convert_song(
             reference_path=reference_path,
             pitch=int(pitch),
             diffusion_steps=int(diffusion_steps),
-            index_rate=0.7,
         )
         progress(0.85, desc="Mixage final...")
@@ -310,9 +309,9 @@ with gr.Blocks(
                         convert_diffusion = gr.Slider(
                             minimum=5,
                             maximum=100,
-                            value=10,
                             step=5,
-                            label="Qualite (10=rapide, 25=bon, 50-100=meilleure qualite)",
                         )
                         convert_vocal_vol = gr.Slider(
                             minimum=0.0,

             reference_path=reference_path,
             pitch=int(pitch),
             diffusion_steps=int(diffusion_steps),
         )
         progress(0.85, desc="Mixage final...")
                         convert_diffusion = gr.Slider(
                             minimum=5,
                             maximum=100,
+                            value=25,
                             step=5,
+                            label="Qualite (10=rapide, 25=equilibre, 50=haute qualite)",
                         )
                         convert_vocal_vol = gr.Slider(
                             minimum=0.0,

pipeline/inference.py CHANGED Viewed

@@ -188,13 +188,7 @@ def _test_import(name, module_path, subattr=None):
 def convert_voice(
     audio_path,
     reference_path,
-    index_path=None,
     pitch=0,
-    f0_method="rmvpe",
-    index_rate=0.7,
-    protect=0.33,
-    volume_envelope=1.0,
-    output_format="WAV",
     diffusion_steps=25,
 ):
     """
@@ -241,7 +235,7 @@ def convert_voice(
     try:
         return _convert_voice_impl(
-            audio_path, reference_path, pitch, index_rate, diffusion_steps
         )
     except Exception as e:
         import traceback
@@ -257,7 +251,7 @@ def convert_voice(
 @torch.no_grad()
 @torch.inference_mode()
-def _convert_voice_impl(audio_path, reference_path, pitch, index_rate, diffusion_steps):
     """Actual conversion implementation (called from GPU-decorated wrapper)."""
     import soundfile as sf
@@ -391,7 +385,7 @@ def _convert_voice_impl(audio_path, reference_path, pitch, index_rate, diffusion
                 cat_condition,
                 torch.LongTensor([cat_condition.size(1)]).to(mel2.device),
                 mel2, style2, None, diffusion_steps,
-                inference_cfg_rate=index_rate,
             )
             vc_target = vc_target[:, :, mel2.size(-1):]
@@ -424,11 +418,14 @@ def _convert_voice_impl(audio_path, reference_path, pitch, index_rate, diffusion
             previous_chunk = vc_wave[0, -overlap_wave_len:]
             processed_frames += vc_target.size(2) - overlap_frame_len
-    # Concatenate and normalize
     audio_out = np.concatenate(generated_wave_chunks)
-    audio_max = np.abs(audio_out).max()
-    if audio_max > 0.01:
-        audio_out = audio_out / audio_max * 0.95
     # Save
     sf.write(output_path, audio_out, sr, subtype="PCM_16")

 def convert_voice(
     audio_path,
     reference_path,
     pitch=0,
     diffusion_steps=25,
 ):
     """
     try:
         return _convert_voice_impl(
+            audio_path, reference_path, pitch, diffusion_steps
         )
     except Exception as e:
         import traceback
 @torch.no_grad()
 @torch.inference_mode()
+def _convert_voice_impl(audio_path, reference_path, pitch, diffusion_steps):
     """Actual conversion implementation (called from GPU-decorated wrapper)."""
     import soundfile as sf
                 cat_condition,
                 torch.LongTensor([cat_condition.size(1)]).to(mel2.device),
                 mel2, style2, None, diffusion_steps,
+                inference_cfg_rate=0.7,
             )
             vc_target = vc_target[:, :, mel2.size(-1):]
             previous_chunk = vc_wave[0, -overlap_wave_len:]
             processed_frames += vc_target.size(2) - overlap_frame_len
+    # Concatenate and normalize to -18 dBFS RMS (standard vocal level before mixing)
     audio_out = np.concatenate(generated_wave_chunks)
+    rms = np.sqrt(np.mean(audio_out ** 2))
+    target_rms = 10 ** (-18.0 / 20.0)  # -18 dBFS
+    if rms > 1e-6:
+        audio_out = audio_out * (target_rms / rms)
+    # Safety clip to prevent any overflow
+    audio_out = np.clip(audio_out, -0.99, 0.99)
     # Save
     sf.write(output_path, audio_out, sr, subtype="PCM_16")

pipeline/mixing.py CHANGED Viewed

@@ -1,5 +1,6 @@
 """
-Audio mixing module: combines converted vocals with instrumental track.
 """
 import os
@@ -7,12 +8,51 @@ import logging
 import numpy as np
 import librosa
 import soundfile as sf
 logger = logging.getLogger(__name__)
 OUTPUT_DIR = "/tmp/rvc_output"
 def mix_audio(
     vocals_path: str,
     instruments_path: str,
@@ -22,15 +62,16 @@ def mix_audio(
 ):
     """
     Mix converted vocals with instrumental track.
     Output: WAV 44.1kHz 16-bit.
     Returns path to mixed audio file.
     """
     os.makedirs(OUTPUT_DIR, exist_ok=True)
-    logger.info(f"Loading vocals: {vocals_path}")
     vocals, _ = librosa.load(vocals_path, sr=output_sr, mono=False)
-    logger.info(f"Loading instruments: {instruments_path}")
     instruments, _ = librosa.load(instruments_path, sr=output_sr, mono=False)
     # Ensure both are 2D (channels, samples)
@@ -42,26 +83,30 @@ def mix_audio(
     # Match lengths (pad shorter with silence)
     max_len = max(vocals.shape[-1], instruments.shape[-1])
     if vocals.shape[-1] < max_len:
-        pad_width = [(0, 0)] * (vocals.ndim - 1) + [(0, max_len - vocals.shape[-1])]
         vocals = np.pad(vocals, pad_width)
     if instruments.shape[-1] < max_len:
-        pad_width = [(0, 0)] * (instruments.ndim - 1) + [(0, max_len - instruments.shape[-1])]
         instruments = np.pad(instruments, pad_width)
     # Mix with volume controls
     mixed = vocals * vocal_volume + instruments * instrumental_volume
-    # Normalize to prevent clipping
-    peak = np.abs(mixed).max()
-    if peak > 0.95:
-        mixed = mixed * (0.95 / peak)
     # Generate output filename
     vocals_base = os.path.splitext(os.path.basename(vocals_path))[0]
-    output_path = os.path.join(OUTPUT_DIR, f"{vocals_base}_mix_final.wav")
     # Save as WAV 44.1kHz 16-bit (transposed: soundfile expects (samples, channels))
     sf.write(output_path, mixed.T, output_sr, subtype="PCM_16")
-    logger.info(f"Mix complete: {output_path}")
     return output_path

 """
+Audio mixing module: professional vocal processing + mix with instrumentals.
+Uses Pedalboard for studio-quality DSP chain.
 """
 import os
 import numpy as np
 import librosa
 import soundfile as sf
+from pedalboard import (
+    Pedalboard, Compressor, HighpassFilter,
+    PeakFilter, LowShelfFilter, Limiter, Gain,
+)
 logger = logging.getLogger(__name__)
 OUTPUT_DIR = "/tmp/rvc_output"
+def _process_vocals(vocals: np.ndarray, sr: int) -> np.ndarray:
+    """
+    Apply professional vocal processing chain before mixing.
+    Input/output shape: (channels, samples), float32.
+    """
+    board = Pedalboard([
+        # 1. Remove sub-bass rumble and proximity effect
+        HighpassFilter(cutoff_frequency_hz=80.0),
+        # 2. Compress dynamics for consistent vocal level (standard vocal settings)
+        Compressor(
+            threshold_db=-16.0,
+            ratio=4.0,
+            attack_ms=5.0,
+            release_ms=100.0,
+        ),
+        # 3. Presence boost — helps vocal cut through the mix
+        PeakFilter(
+            cutoff_frequency_hz=3000.0,
+            gain_db=2.5,
+            q=1.0,
+        ),
+        # 4. Simple de-esser — gentle high-freq reduction to tame sibilance
+        LowShelfFilter(
+            cutoff_frequency_hz=6000.0,
+            gain_db=-2.0,
+        ),
+        # 5. Makeup gain after compression
+        Gain(gain_db=1.0),
+    ])
+    processed = board(vocals.astype(np.float32), sr)
+    logger.info("Vocal processing chain applied (HPF+Comp+EQ+DeEss+Gain)")
+    return processed
 def mix_audio(
     vocals_path: str,
     instruments_path: str,
 ):
     """
     Mix converted vocals with instrumental track.
+    Applies professional vocal processing before mixing.
     Output: WAV 44.1kHz 16-bit.
     Returns path to mixed audio file.
     """
     os.makedirs(OUTPUT_DIR, exist_ok=True)
+    logger.info("Loading vocals: {}".format(vocals_path))
     vocals, _ = librosa.load(vocals_path, sr=output_sr, mono=False)
+    logger.info("Loading instruments: {}".format(instruments_path))
     instruments, _ = librosa.load(instruments_path, sr=output_sr, mono=False)
     # Ensure both are 2D (channels, samples)
     # Match lengths (pad shorter with silence)
     max_len = max(vocals.shape[-1], instruments.shape[-1])
     if vocals.shape[-1] < max_len:
+        pad_width = [(0, 0), (0, max_len - vocals.shape[-1])]
         vocals = np.pad(vocals, pad_width)
     if instruments.shape[-1] < max_len:
+        pad_width = [(0, 0), (0, max_len - instruments.shape[-1])]
         instruments = np.pad(instruments, pad_width)
+    # Apply professional vocal processing chain
+    vocals = _process_vocals(vocals, output_sr)
     # Mix with volume controls
     mixed = vocals * vocal_volume + instruments * instrumental_volume
+    # Apply limiter to final mix (replaces naive peak normalization)
+    limiter = Pedalboard([
+        Limiter(threshold_db=-1.0, release_ms=100.0),
+    ])
+    mixed = limiter(mixed.astype(np.float32), output_sr)
     # Generate output filename
     vocals_base = os.path.splitext(os.path.basename(vocals_path))[0]
+    output_path = os.path.join(OUTPUT_DIR, "{}_mix_final.wav".format(vocals_base))
     # Save as WAV 44.1kHz 16-bit (transposed: soundfile expects (samples, channels))
     sf.write(output_path, mixed.T, output_sr, subtype="PCM_16")
+    logger.info("Mix complete: {}".format(output_path))
     return output_path

pipeline/separation.py CHANGED Viewed

@@ -23,7 +23,7 @@ OUTPUT_DIR = "/tmp/demucs_output"
 @spaces.GPU(duration=60)
-def separate_audio(audio_path: str, model_name: str = "htdemucs"):
     """
     Separate audio into vocals and instruments using Demucs.
     Returns (vocals_path, instruments_path).

 @spaces.GPU(duration=60)
+def separate_audio(audio_path: str, model_name: str = "htdemucs_ft"):
     """
     Separate audio into vocals and instruments using Demucs.
     Returns (vocals_path, instruments_path).