Spaces:
Sleeping
Sleeping
ibcplateformes
Upgrade audio quality: pro mixing chain, better inference params, htdemucs_ft
f729219 | """ | |
| Audio mixing module: professional vocal processing + mix with instrumentals. | |
| Uses Pedalboard for studio-quality DSP chain. | |
| """ | |
| import os | |
| import logging | |
| import numpy as np | |
| import librosa | |
| import soundfile as sf | |
| from pedalboard import ( | |
| Pedalboard, Compressor, HighpassFilter, | |
| PeakFilter, LowShelfFilter, Limiter, Gain, | |
| ) | |
# Directory that receives the rendered mix files.
OUTPUT_DIR = "/tmp/rvc_output"

# Module-scoped logger, named after this module.
logger = logging.getLogger(__name__)
def _process_vocals(vocals: np.ndarray, sr: int) -> np.ndarray:
    """
    Apply the vocal processing chain that runs before mixing.

    The chain is, in order: high-pass filter, compressor, presence peak
    boost, high-shelf cut (simple de-esser), and makeup gain.

    Args:
        vocals: Audio samples shaped (channels, samples). The array is cast
            to float32 before it is handed to the Pedalboard chain.
        sr: Sample rate of ``vocals`` in Hz.

    Returns:
        The processed audio produced by the Pedalboard chain.
    """
    # Function-scope import: the module-level pedalboard import list does not
    # include HighShelfFilter, and this keeps the fix self-contained.
    from pedalboard import HighShelfFilter

    board = Pedalboard([
        # 1. Remove sub-bass rumble and proximity effect
        HighpassFilter(cutoff_frequency_hz=80.0),
        # 2. Compress dynamics for consistent vocal level (standard vocal settings)
        Compressor(
            threshold_db=-16.0,
            ratio=4.0,
            attack_ms=5.0,
            release_ms=100.0,
        ),
        # 3. Presence boost — helps vocal cut through the mix
        PeakFilter(
            cutoff_frequency_hz=3000.0,
            gain_db=2.5,
            q=1.0,
        ),
        # 4. Simple de-esser — gentle high-freq reduction to tame sibilance.
        #    This must be a HIGH shelf: a low shelf at 6 kHz would attenuate
        #    everything below 6 kHz (the body of the vocal) and leave the
        #    sibilant band untouched, which is the opposite of de-essing.
        HighShelfFilter(
            cutoff_frequency_hz=6000.0,
            gain_db=-2.0,
        ),
        # 5. Makeup gain after compression
        Gain(gain_db=1.0),
    ])
    processed = board(vocals.astype(np.float32), sr)
    logger.info("Vocal processing chain applied (HPF+Comp+EQ+DeEss+Gain)")
    return processed
def mix_audio(
    vocals_path: str,
    instruments_path: str,
    vocal_volume: float = 1.0,
    instrumental_volume: float = 1.0,
    output_sr: int = 44100,
):
    """
    Blend a converted vocal track with its instrumental backing.

    Both files are loaded at ``output_sr`` and 1-D (mono) signals are
    duplicated into two channels. The shorter track is right-padded with
    zeros, the vocals are run through ``_process_vocals``, and the two
    tracks are summed using the given volume factors. A limiter is applied
    to the sum and the result is written as a 16-bit PCM WAV file inside
    ``OUTPUT_DIR``.

    Returns the path of the written file.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    logger.info("Loading vocals: {}".format(vocals_path))
    vocal_track, _ = librosa.load(vocals_path, sr=output_sr, mono=False)

    logger.info("Loading instruments: {}".format(instruments_path))
    backing_track, _ = librosa.load(instruments_path, sr=output_sr, mono=False)

    # Promote 1-D signals to (2, samples) by duplicating the single channel.
    if vocal_track.ndim == 1:
        vocal_track = np.stack((vocal_track, vocal_track))
    if backing_track.ndim == 1:
        backing_track = np.stack((backing_track, backing_track))

    # Bring both tracks to the same length by appending trailing zeros.
    target_len = max(vocal_track.shape[-1], backing_track.shape[-1])
    vocal_deficit = target_len - vocal_track.shape[-1]
    backing_deficit = target_len - backing_track.shape[-1]
    if vocal_deficit > 0:
        vocal_track = np.pad(vocal_track, ((0, 0), (0, vocal_deficit)))
    if backing_deficit > 0:
        backing_track = np.pad(backing_track, ((0, 0), (0, backing_deficit)))

    # The vocal chain runs on the already-padded vocal track.
    vocal_track = _process_vocals(vocal_track, output_sr)

    # Weighted sum of the two tracks.
    scaled_vocals = vocal_track * vocal_volume
    scaled_backing = backing_track * instrumental_volume
    blended = scaled_vocals + scaled_backing

    # Limit the summed signal instead of peak-normalizing it.
    final_limiter = Pedalboard([
        Limiter(threshold_db=-1.0, release_ms=100.0),
    ])
    blended = final_limiter(blended.astype(np.float32), output_sr)

    # The output name is derived from the vocal file's base name.
    stem, _ext = os.path.splitext(os.path.basename(vocals_path))
    output_path = os.path.join(OUTPUT_DIR, "{}_mix_final.wav".format(stem))

    # soundfile expects (samples, channels), hence the transpose.
    sf.write(output_path, blended.T, output_sr, subtype="PCM_16")
    logger.info("Mix complete: {}".format(output_path))
    return output_path