ibcplateformes Claude Opus 4.6 committed on
Commit
969158e
·
1 Parent(s): f729219

Add voice similarity control + improve reference audio processing

Browse files

Voice similarity improvements:
- Add 'Similarite vocale' slider (0.0-1.0) in the UI; it controls inference_cfg_rate:
  0.5=natural, 0.7=balanced (default), 0.9=more faithful to reference
- Improve reference audio preprocessing in training.py:
  - HPF at 80 Hz to remove low-frequency noise before speaker embedding
  - Light compression (2:1) to even out voice levels
  - RMS-normalize to -16 dBFS for a strong speaker-embedding signal
  - Trim to 25 s (Seed-VC's effective max; was 30 s)
  - More aggressive silence trimming (top_db=20, was 25)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (3) hide show
  1. app.py +10 -0
  2. pipeline/inference.py +4 -3
  3. pipeline/training.py +32 -14
app.py CHANGED
@@ -113,6 +113,7 @@ def convert_song(
113
  model_choice,
114
  song_file,
115
  pitch,
 
116
  diffusion_steps,
117
  vocal_volume,
118
  instrumental_volume,
@@ -151,6 +152,7 @@ def convert_song(
151
  reference_path=reference_path,
152
  pitch=int(pitch),
153
  diffusion_steps=int(diffusion_steps),
 
154
  )
155
 
156
  progress(0.85, desc="Mixage final...")
@@ -306,6 +308,13 @@ with gr.Blocks(
306
  step=1,
307
  label="Transposition (demi-tons)",
308
  )
 
 
 
 
 
 
 
309
  convert_diffusion = gr.Slider(
310
  minimum=5,
311
  maximum=100,
@@ -366,6 +375,7 @@ with gr.Blocks(
366
  convert_model,
367
  convert_audio,
368
  convert_pitch,
 
369
  convert_diffusion,
370
  convert_vocal_vol,
371
  convert_inst_vol,
 
113
  model_choice,
114
  song_file,
115
  pitch,
116
+ similarity,
117
  diffusion_steps,
118
  vocal_volume,
119
  instrumental_volume,
 
152
  reference_path=reference_path,
153
  pitch=int(pitch),
154
  diffusion_steps=int(diffusion_steps),
155
+ similarity=float(similarity),
156
  )
157
 
158
  progress(0.85, desc="Mixage final...")
 
308
  step=1,
309
  label="Transposition (demi-tons)",
310
  )
311
+ convert_similarity = gr.Slider(
312
+ minimum=0.0,
313
+ maximum=1.0,
314
+ value=0.7,
315
+ step=0.05,
316
+ label="Similarite vocale (0.5=naturel, 0.7=equilibre, 0.9=plus fidele)",
317
+ )
318
  convert_diffusion = gr.Slider(
319
  minimum=5,
320
  maximum=100,
 
375
  convert_model,
376
  convert_audio,
377
  convert_pitch,
378
+ convert_similarity,
379
  convert_diffusion,
380
  convert_vocal_vol,
381
  convert_inst_vol,
pipeline/inference.py CHANGED
@@ -190,6 +190,7 @@ def convert_voice(
190
  reference_path,
191
  pitch=0,
192
  diffusion_steps=25,
 
193
  ):
194
  """
195
  Convert voice using Seed-VC zero-shot singing voice conversion.
@@ -235,7 +236,7 @@ def convert_voice(
235
 
236
  try:
237
  return _convert_voice_impl(
238
- audio_path, reference_path, pitch, diffusion_steps
239
  )
240
  except Exception as e:
241
  import traceback
@@ -251,7 +252,7 @@ def convert_voice(
251
 
252
  @torch.no_grad()
253
  @torch.inference_mode()
254
- def _convert_voice_impl(audio_path, reference_path, pitch, diffusion_steps):
255
  """Actual conversion implementation (called from GPU-decorated wrapper)."""
256
  import soundfile as sf
257
 
@@ -385,7 +386,7 @@ def _convert_voice_impl(audio_path, reference_path, pitch, diffusion_steps):
385
  cat_condition,
386
  torch.LongTensor([cat_condition.size(1)]).to(mel2.device),
387
  mel2, style2, None, diffusion_steps,
388
- inference_cfg_rate=0.7,
389
  )
390
  vc_target = vc_target[:, :, mel2.size(-1):]
391
 
 
190
  reference_path,
191
  pitch=0,
192
  diffusion_steps=25,
193
+ similarity=0.7,
194
  ):
195
  """
196
  Convert voice using Seed-VC zero-shot singing voice conversion.
 
236
 
237
  try:
238
  return _convert_voice_impl(
239
+ audio_path, reference_path, pitch, diffusion_steps, similarity
240
  )
241
  except Exception as e:
242
  import traceback
 
252
 
253
  @torch.no_grad()
254
  @torch.inference_mode()
255
+ def _convert_voice_impl(audio_path, reference_path, pitch, diffusion_steps, similarity=0.7):
256
  """Actual conversion implementation (called from GPU-decorated wrapper)."""
257
  import soundfile as sf
258
 
 
386
  cat_condition,
387
  torch.LongTensor([cat_condition.size(1)]).to(mel2.device),
388
  mel2, style2, None, diffusion_steps,
389
+ inference_cfg_rate=similarity,
390
  )
391
  vc_target = vc_target[:, :, mel2.size(-1):]
392
 
pipeline/training.py CHANGED
@@ -68,25 +68,43 @@ def save_voice_reference(
68
  "Audio trop court ({:.1f}s). Minimum 3 secondes recommande.".format(duration)
69
  )
70
 
71
- # Limit to 30 seconds (Seed-VC max reference length)
72
- max_samples = 30 * sr
73
- if len(audio) > max_samples:
74
- audio = audio[:max_samples]
75
- logger.info("Trimmed reference to 30s (Seed-VC max).")
76
-
77
  if progress_callback:
78
- progress_callback(0.3, "Normalisation et nettoyage...")
79
 
80
- # Normalize audio
81
- peak = np.abs(audio).max()
82
- if peak > 0:
83
- audio = audio / peak * 0.95
84
-
85
- # Trim silence from start and end
86
- audio_trimmed, _ = librosa.effects.trim(audio, top_db=25)
87
  if len(audio_trimmed) > sr * 2:
88
  audio = audio_trimmed
89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  if progress_callback:
91
  progress_callback(0.6, "Sauvegarde de la reference vocale...")
92
 
 
68
  "Audio trop court ({:.1f}s). Minimum 3 secondes recommande.".format(duration)
69
  )
70
 
 
 
 
 
 
 
71
  if progress_callback:
72
+ progress_callback(0.3, "Optimisation de la reference vocale...")
73
 
74
+ # 1. Trim silence from start and end (aggressive: top_db=20)
75
+ audio_trimmed, _ = librosa.effects.trim(audio, top_db=20)
 
 
 
 
 
76
  if len(audio_trimmed) > sr * 2:
77
  audio = audio_trimmed
78
 
79
+ # 2. Limit to 25 seconds (Seed-VC clips reference to 25s internally)
80
+ max_samples = 25 * sr
81
+ if len(audio) > max_samples:
82
+ audio = audio[:max_samples]
83
+ logger.info("Trimmed reference to 25s (Seed-VC effective max).")
84
+
85
+ # 3. Remove low-frequency noise (high-pass filter at 80Hz)
86
+ try:
87
+ from pedalboard import Pedalboard, HighpassFilter, Compressor, Gain
88
+ ref_board = Pedalboard([
89
+ HighpassFilter(cutoff_frequency_hz=80.0),
90
+ # Light compression to even out the reference voice level
91
+ Compressor(threshold_db=-20.0, ratio=2.0, attack_ms=10.0, release_ms=150.0),
92
+ Gain(gain_db=1.0),
93
+ ])
94
+ audio_2d = audio.reshape(1, -1).astype(np.float32)
95
+ audio_2d = ref_board(audio_2d, sr)
96
+ audio = audio_2d.squeeze()
97
+ except Exception as e:
98
+ logger.warning("Pedalboard processing skipped: {}".format(e))
99
+
100
+ # 4. RMS normalize to -16 dBFS (slightly louder than converted vocals
101
+ # to give the speaker embedding model a strong signal)
102
+ rms = np.sqrt(np.mean(audio ** 2))
103
+ target_rms = 10 ** (-16.0 / 20.0)
104
+ if rms > 1e-6:
105
+ audio = audio * (target_rms / rms)
106
+ audio = np.clip(audio, -0.99, 0.99)
107
+
108
  if progress_callback:
109
  progress_callback(0.6, "Sauvegarde de la reference vocale...")
110