ibcplateformes Claude Opus 4.6 committed on
Commit
f729219
·
1 Parent(s): 6098f78

Upgrade audio quality: pro mixing chain, better inference params, htdemucs_ft

Browse files

Major quality improvements to match professional platforms:

1. mixing.py: Add Pedalboard DSP chain for vocals before mixing
- HighpassFilter (80Hz) removes rumble
- Compressor (4:1, -16dB threshold) for consistent dynamics
- PeakFilter (3kHz, +2.5dB) for vocal presence
- LowShelfFilter (6kHz, -2dB) as simplified de-esser
- Limiter (-1dB) on final mix replaces naive peak normalization

2. inference.py: Fix parameters and normalization
- Hardcode inference_cfg_rate=0.7 (was incorrectly using RVC index_rate)
- Remove unused RVC params (index_path, f0_method, protect, etc.)
- Replace peak normalization with RMS normalization (-18 dBFS)

3. separation.py: Switch to htdemucs_ft (fine-tuned, better SDR)

4. app.py: Default diffusion steps 10 -> 25 for better quality

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (4) hide show
  1. app.py +2 -3
  2. pipeline/inference.py +10 -13
  3. pipeline/mixing.py +56 -11
  4. pipeline/separation.py +1 -1
app.py CHANGED
@@ -151,7 +151,6 @@ def convert_song(
151
  reference_path=reference_path,
152
  pitch=int(pitch),
153
  diffusion_steps=int(diffusion_steps),
154
- index_rate=0.7,
155
  )
156
 
157
  progress(0.85, desc="Mixage final...")
@@ -310,9 +309,9 @@ with gr.Blocks(
310
  convert_diffusion = gr.Slider(
311
  minimum=5,
312
  maximum=100,
313
- value=10,
314
  step=5,
315
- label="Qualite (10=rapide, 25=bon, 50-100=meilleure qualite)",
316
  )
317
  convert_vocal_vol = gr.Slider(
318
  minimum=0.0,
 
151
  reference_path=reference_path,
152
  pitch=int(pitch),
153
  diffusion_steps=int(diffusion_steps),
 
154
  )
155
 
156
  progress(0.85, desc="Mixage final...")
 
309
  convert_diffusion = gr.Slider(
310
  minimum=5,
311
  maximum=100,
312
+ value=25,
313
  step=5,
314
+ label="Qualite (10=rapide, 25=equilibre, 50=haute qualite)",
315
  )
316
  convert_vocal_vol = gr.Slider(
317
  minimum=0.0,
pipeline/inference.py CHANGED
@@ -188,13 +188,7 @@ def _test_import(name, module_path, subattr=None):
188
  def convert_voice(
189
  audio_path,
190
  reference_path,
191
- index_path=None,
192
  pitch=0,
193
- f0_method="rmvpe",
194
- index_rate=0.7,
195
- protect=0.33,
196
- volume_envelope=1.0,
197
- output_format="WAV",
198
  diffusion_steps=25,
199
  ):
200
  """
@@ -241,7 +235,7 @@ def convert_voice(
241
 
242
  try:
243
  return _convert_voice_impl(
244
- audio_path, reference_path, pitch, index_rate, diffusion_steps
245
  )
246
  except Exception as e:
247
  import traceback
@@ -257,7 +251,7 @@ def convert_voice(
257
 
258
  @torch.no_grad()
259
  @torch.inference_mode()
260
- def _convert_voice_impl(audio_path, reference_path, pitch, index_rate, diffusion_steps):
261
  """Actual conversion implementation (called from GPU-decorated wrapper)."""
262
  import soundfile as sf
263
 
@@ -391,7 +385,7 @@ def _convert_voice_impl(audio_path, reference_path, pitch, index_rate, diffusion
391
  cat_condition,
392
  torch.LongTensor([cat_condition.size(1)]).to(mel2.device),
393
  mel2, style2, None, diffusion_steps,
394
- inference_cfg_rate=index_rate,
395
  )
396
  vc_target = vc_target[:, :, mel2.size(-1):]
397
 
@@ -424,11 +418,14 @@ def _convert_voice_impl(audio_path, reference_path, pitch, index_rate, diffusion
424
  previous_chunk = vc_wave[0, -overlap_wave_len:]
425
  processed_frames += vc_target.size(2) - overlap_frame_len
426
 
427
- # Concatenate and normalize
428
  audio_out = np.concatenate(generated_wave_chunks)
429
- audio_max = np.abs(audio_out).max()
430
- if audio_max > 0.01:
431
- audio_out = audio_out / audio_max * 0.95
 
 
 
432
 
433
  # Save
434
  sf.write(output_path, audio_out, sr, subtype="PCM_16")
 
188
  def convert_voice(
189
  audio_path,
190
  reference_path,
 
191
  pitch=0,
 
 
 
 
 
192
  diffusion_steps=25,
193
  ):
194
  """
 
235
 
236
  try:
237
  return _convert_voice_impl(
238
+ audio_path, reference_path, pitch, diffusion_steps
239
  )
240
  except Exception as e:
241
  import traceback
 
251
 
252
  @torch.no_grad()
253
  @torch.inference_mode()
254
+ def _convert_voice_impl(audio_path, reference_path, pitch, diffusion_steps):
255
  """Actual conversion implementation (called from GPU-decorated wrapper)."""
256
  import soundfile as sf
257
 
 
385
  cat_condition,
386
  torch.LongTensor([cat_condition.size(1)]).to(mel2.device),
387
  mel2, style2, None, diffusion_steps,
388
+ inference_cfg_rate=0.7,
389
  )
390
  vc_target = vc_target[:, :, mel2.size(-1):]
391
 
 
418
  previous_chunk = vc_wave[0, -overlap_wave_len:]
419
  processed_frames += vc_target.size(2) - overlap_frame_len
420
 
421
+ # Concatenate and normalize to -18 dBFS RMS (standard vocal level before mixing)
422
  audio_out = np.concatenate(generated_wave_chunks)
423
+ rms = np.sqrt(np.mean(audio_out ** 2))
424
+ target_rms = 10 ** (-18.0 / 20.0) # -18 dBFS
425
+ if rms > 1e-6:
426
+ audio_out = audio_out * (target_rms / rms)
427
+ # Safety clip to prevent any overflow
428
+ audio_out = np.clip(audio_out, -0.99, 0.99)
429
 
430
  # Save
431
  sf.write(output_path, audio_out, sr, subtype="PCM_16")
pipeline/mixing.py CHANGED
@@ -1,5 +1,6 @@
1
  """
2
- Audio mixing module: combines converted vocals with instrumental track.
 
3
  """
4
 
5
  import os
@@ -7,12 +8,51 @@ import logging
7
  import numpy as np
8
  import librosa
9
  import soundfile as sf
 
 
 
 
10
 
11
  logger = logging.getLogger(__name__)
12
 
13
  OUTPUT_DIR = "/tmp/rvc_output"
14
 
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  def mix_audio(
17
  vocals_path: str,
18
  instruments_path: str,
@@ -22,15 +62,16 @@ def mix_audio(
22
  ):
23
  """
24
  Mix converted vocals with instrumental track.
 
25
  Output: WAV 44.1kHz 16-bit.
26
  Returns path to mixed audio file.
27
  """
28
  os.makedirs(OUTPUT_DIR, exist_ok=True)
29
 
30
- logger.info(f"Loading vocals: {vocals_path}")
31
  vocals, _ = librosa.load(vocals_path, sr=output_sr, mono=False)
32
 
33
- logger.info(f"Loading instruments: {instruments_path}")
34
  instruments, _ = librosa.load(instruments_path, sr=output_sr, mono=False)
35
 
36
  # Ensure both are 2D (channels, samples)
@@ -42,26 +83,30 @@ def mix_audio(
42
  # Match lengths (pad shorter with silence)
43
  max_len = max(vocals.shape[-1], instruments.shape[-1])
44
  if vocals.shape[-1] < max_len:
45
- pad_width = [(0, 0)] * (vocals.ndim - 1) + [(0, max_len - vocals.shape[-1])]
46
  vocals = np.pad(vocals, pad_width)
47
  if instruments.shape[-1] < max_len:
48
- pad_width = [(0, 0)] * (instruments.ndim - 1) + [(0, max_len - instruments.shape[-1])]
49
  instruments = np.pad(instruments, pad_width)
50
 
 
 
 
51
  # Mix with volume controls
52
  mixed = vocals * vocal_volume + instruments * instrumental_volume
53
 
54
- # Normalize to prevent clipping
55
- peak = np.abs(mixed).max()
56
- if peak > 0.95:
57
- mixed = mixed * (0.95 / peak)
 
58
 
59
  # Generate output filename
60
  vocals_base = os.path.splitext(os.path.basename(vocals_path))[0]
61
- output_path = os.path.join(OUTPUT_DIR, f"{vocals_base}_mix_final.wav")
62
 
63
  # Save as WAV 44.1kHz 16-bit (transposed: soundfile expects (samples, channels))
64
  sf.write(output_path, mixed.T, output_sr, subtype="PCM_16")
65
 
66
- logger.info(f"Mix complete: {output_path}")
67
  return output_path
 
1
  """
2
+ Audio mixing module: professional vocal processing + mix with instrumentals.
3
+ Uses Pedalboard for studio-quality DSP chain.
4
  """
5
 
6
  import os
 
8
  import numpy as np
9
  import librosa
10
  import soundfile as sf
11
+ from pedalboard import (
12
+ Pedalboard, Compressor, HighpassFilter,
13
+ PeakFilter, LowShelfFilter, Limiter, Gain,
14
+ )
15
 
16
  logger = logging.getLogger(__name__)
17
 
18
  OUTPUT_DIR = "/tmp/rvc_output"
19
 
20
 
21
def _process_vocals(vocals: np.ndarray, sr: int) -> np.ndarray:
    """
    Apply the vocal processing chain that runs before mixing.

    Chain order: 80 Hz high-pass -> 4:1 compressor (-16 dB threshold)
    -> +2.5 dB presence peak at 3 kHz -> -2 dB high shelf at 6 kHz
    -> +1 dB makeup gain.

    Args:
        vocals: Vocal samples, shape (channels, samples). Cast to float32
            before being handed to the Pedalboard.
        sr: Sample rate in Hz of ``vocals``.

    Returns:
        The processed audio, shape (channels, samples), float32.
    """
    # Local import: HighShelfFilter comes from the same `pedalboard` package
    # the module already depends on, but is not in the module-level import list.
    from pedalboard import HighShelfFilter

    board = Pedalboard([
        # 1. Remove sub-bass rumble and proximity effect
        HighpassFilter(cutoff_frequency_hz=80.0),
        # 2. Compress dynamics for a consistent vocal level
        Compressor(
            threshold_db=-16.0,
            ratio=4.0,
            attack_ms=5.0,
            release_ms=100.0,
        ),
        # 3. Presence boost: helps the vocal cut through the mix
        PeakFilter(
            cutoff_frequency_hz=3000.0,
            gain_db=2.5,
            q=1.0,
        ),
        # 4. Simple de-esser: gentle reduction ABOVE 6 kHz to tame sibilance.
        #    This must be a high shelf; a low shelf with negative gain would
        #    attenuate everything below 6 kHz (the body of the vocal) and
        #    leave the sibilance band relatively louder.
        HighShelfFilter(
            cutoff_frequency_hz=6000.0,
            gain_db=-2.0,
        ),
        # 5. Makeup gain after compression
        Gain(gain_db=1.0),
    ])

    processed = board(vocals.astype(np.float32), sr)
    logger.info("Vocal processing chain applied (HPF+Comp+EQ+DeEss+Gain)")
    return processed
54
+
55
+
56
  def mix_audio(
57
  vocals_path: str,
58
  instruments_path: str,
 
62
  ):
63
  """
64
  Mix converted vocals with instrumental track.
65
+ Applies professional vocal processing before mixing.
66
  Output: WAV 44.1kHz 16-bit.
67
  Returns path to mixed audio file.
68
  """
69
  os.makedirs(OUTPUT_DIR, exist_ok=True)
70
 
71
+ logger.info("Loading vocals: {}".format(vocals_path))
72
  vocals, _ = librosa.load(vocals_path, sr=output_sr, mono=False)
73
 
74
+ logger.info("Loading instruments: {}".format(instruments_path))
75
  instruments, _ = librosa.load(instruments_path, sr=output_sr, mono=False)
76
 
77
  # Ensure both are 2D (channels, samples)
 
83
  # Match lengths (pad shorter with silence)
84
  max_len = max(vocals.shape[-1], instruments.shape[-1])
85
  if vocals.shape[-1] < max_len:
86
+ pad_width = [(0, 0), (0, max_len - vocals.shape[-1])]
87
  vocals = np.pad(vocals, pad_width)
88
  if instruments.shape[-1] < max_len:
89
+ pad_width = [(0, 0), (0, max_len - instruments.shape[-1])]
90
  instruments = np.pad(instruments, pad_width)
91
 
92
+ # Apply professional vocal processing chain
93
+ vocals = _process_vocals(vocals, output_sr)
94
+
95
  # Mix with volume controls
96
  mixed = vocals * vocal_volume + instruments * instrumental_volume
97
 
98
+ # Apply limiter to final mix (replaces naive peak normalization)
99
+ limiter = Pedalboard([
100
+ Limiter(threshold_db=-1.0, release_ms=100.0),
101
+ ])
102
+ mixed = limiter(mixed.astype(np.float32), output_sr)
103
 
104
  # Generate output filename
105
  vocals_base = os.path.splitext(os.path.basename(vocals_path))[0]
106
+ output_path = os.path.join(OUTPUT_DIR, "{}_mix_final.wav".format(vocals_base))
107
 
108
  # Save as WAV 44.1kHz 16-bit (transposed: soundfile expects (samples, channels))
109
  sf.write(output_path, mixed.T, output_sr, subtype="PCM_16")
110
 
111
+ logger.info("Mix complete: {}".format(output_path))
112
  return output_path
pipeline/separation.py CHANGED
@@ -23,7 +23,7 @@ OUTPUT_DIR = "/tmp/demucs_output"
23
 
24
 
25
  @spaces.GPU(duration=60)
26
- def separate_audio(audio_path: str, model_name: str = "htdemucs"):
27
  """
28
  Separate audio into vocals and instruments using Demucs.
29
  Returns (vocals_path, instruments_path).
 
23
 
24
 
25
  @spaces.GPU(duration=60)
26
+ def separate_audio(audio_path: str, model_name: str = "htdemucs_ft"):
27
  """
28
  Separate audio into vocals and instruments using Demucs.
29
  Returns (vocals_path, instruments_path).