""" Audio mixing module: professional vocal processing + mix with instrumentals. Uses Pedalboard for studio-quality DSP chain. """ import os import logging import numpy as np import librosa import soundfile as sf from pedalboard import ( Pedalboard, Compressor, HighpassFilter, PeakFilter, LowShelfFilter, Limiter, Gain, ) logger = logging.getLogger(__name__) OUTPUT_DIR = "/tmp/rvc_output" def _process_vocals(vocals: np.ndarray, sr: int) -> np.ndarray: """ Apply professional vocal processing chain before mixing. Input/output shape: (channels, samples), float32. """ board = Pedalboard([ # 1. Remove sub-bass rumble and proximity effect HighpassFilter(cutoff_frequency_hz=80.0), # 2. Compress dynamics for consistent vocal level (standard vocal settings) Compressor( threshold_db=-16.0, ratio=4.0, attack_ms=5.0, release_ms=100.0, ), # 3. Presence boost — helps vocal cut through the mix PeakFilter( cutoff_frequency_hz=3000.0, gain_db=2.5, q=1.0, ), # 4. Simple de-esser — gentle high-freq reduction to tame sibilance LowShelfFilter( cutoff_frequency_hz=6000.0, gain_db=-2.0, ), # 5. Makeup gain after compression Gain(gain_db=1.0), ]) processed = board(vocals.astype(np.float32), sr) logger.info("Vocal processing chain applied (HPF+Comp+EQ+DeEss+Gain)") return processed def mix_audio( vocals_path: str, instruments_path: str, vocal_volume: float = 1.0, instrumental_volume: float = 1.0, output_sr: int = 44100, ): """ Mix converted vocals with instrumental track. Applies professional vocal processing before mixing. Output: WAV 44.1kHz 16-bit. Returns path to mixed audio file. """ os.makedirs(OUTPUT_DIR, exist_ok=True) logger.info("Loading vocals: {}".format(vocals_path)) vocals, _ = librosa.load(vocals_path, sr=output_sr, mono=False) logger.info("Loading instruments: {}".format(instruments_path)) instruments, _ = librosa.load(instruments_path, sr=output_sr, mono=False) # Ensure both are 2D (channels, samples) if vocals.ndim == 1: vocals = np.stack([vocals, vocals]) if instruments.ndim == 1: instruments = np.stack([instruments, instruments]) # Match lengths (pad shorter with silence) max_len = max(vocals.shape[-1], instruments.shape[-1]) if vocals.shape[-1] < max_len: pad_width = [(0, 0), (0, max_len - vocals.shape[-1])] vocals = np.pad(vocals, pad_width) if instruments.shape[-1] < max_len: pad_width = [(0, 0), (0, max_len - instruments.shape[-1])] instruments = np.pad(instruments, pad_width) # Apply professional vocal processing chain vocals = _process_vocals(vocals, output_sr) # Mix with volume controls mixed = vocals * vocal_volume + instruments * instrumental_volume # Apply limiter to final mix (replaces naive peak normalization) limiter = Pedalboard([ Limiter(threshold_db=-1.0, release_ms=100.0), ]) mixed = limiter(mixed.astype(np.float32), output_sr) # Generate output filename vocals_base = os.path.splitext(os.path.basename(vocals_path))[0] output_path = os.path.join(OUTPUT_DIR, "{}_mix_final.wav".format(vocals_base)) # Save as WAV 44.1kHz 16-bit (transposed: soundfile expects (samples, channels)) sf.write(output_path, mixed.T, output_sr, subtype="PCM_16") logger.info("Mix complete: {}".format(output_path)) return output_path