# rvc/pipeline/mixing.py
# ibcplateformes
# Upgrade audio quality: pro mixing chain, better inference params, htdemucs_ft
# f729219
"""
Audio mixing module: professional vocal processing + mix with instrumentals.
Uses Pedalboard for studio-quality DSP chain.
"""
import logging
import os

import librosa
import numpy as np
import soundfile as sf
from pedalboard import (
    Compressor,
    Gain,
    HighpassFilter,
    HighShelfFilter,
    Limiter,
    LowShelfFilter,
    PeakFilter,
    Pedalboard,
)
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# Directory that mix_audio() creates on demand and writes the final mix into.
OUTPUT_DIR = "/tmp/rvc_output"
def _process_vocals(vocals: np.ndarray, sr: int) -> np.ndarray:
    """
    Apply the vocal processing chain that runs before mixing.

    Chain order: high-pass -> compressor -> presence EQ -> high-shelf cut
    -> makeup gain.

    Args:
        vocals: Audio laid out as (channels, samples); cast to float32
            before processing.
        sr: Sample rate of ``vocals`` in Hz.

    Returns:
        The processed audio as returned by the Pedalboard chain.
    """
    board = Pedalboard([
        # 1. Remove sub-bass rumble and proximity effect
        HighpassFilter(cutoff_frequency_hz=80.0),
        # 2. Compress dynamics for consistent vocal level (standard vocal settings)
        Compressor(
            threshold_db=-16.0,
            ratio=4.0,
            attack_ms=5.0,
            release_ms=100.0,
        ),
        # 3. Presence boost — helps vocal cut through the mix
        PeakFilter(
            cutoff_frequency_hz=3000.0,
            gain_db=2.5,
            q=1.0,
        ),
        # 4. Simple static de-esser — gentle cut ABOVE 6 kHz to tame sibilance.
        #    This must be a high shelf: a low shelf with negative gain would
        #    instead attenuate everything below 6 kHz (the body of the voice)
        #    and leave the sibilant band untouched.
        HighShelfFilter(
            cutoff_frequency_hz=6000.0,
            gain_db=-2.0,
        ),
        # 5. Makeup gain after compression
        Gain(gain_db=1.0),
    ])
    processed = board(vocals.astype(np.float32), sr)
    logger.info("Vocal processing chain applied (HPF+Comp+EQ+DeEss+Gain)")
    return processed
def mix_audio(
    vocals_path: str,
    instruments_path: str,
    vocal_volume: float = 1.0,
    instrumental_volume: float = 1.0,
    output_sr: int = 44100,
) -> str:
    """
    Mix converted vocals with an instrumental track.

    Both files are loaded at ``output_sr``, mono inputs are duplicated to two
    channels, the shorter track is padded with trailing silence, the vocals
    go through ``_process_vocals``, the two stems are summed with their
    volume factors, and a limiter is applied to the sum.

    Args:
        vocals_path: Path to the vocal audio file.
        instruments_path: Path to the instrumental audio file.
        vocal_volume: Linear gain applied to the processed vocals.
        instrumental_volume: Linear gain applied to the instrumental.
        output_sr: Sample rate in Hz used for loading and for the output
            file (44.1 kHz by default).

    Returns:
        Path to the mixed 16-bit PCM WAV file inside ``OUTPUT_DIR``, named
        ``<vocals file stem>_mix_final.wav``.

    Raises:
        ValueError: If, after mono inputs are expanded, the two tracks have
            different channel counts.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    logger.info("Loading vocals: %s", vocals_path)
    vocals, _ = librosa.load(vocals_path, sr=output_sr, mono=False)
    logger.info("Loading instruments: %s", instruments_path)
    instruments, _ = librosa.load(instruments_path, sr=output_sr, mono=False)

    # Ensure both are 2D (channels, samples): a 1-D (mono) signal becomes
    # two identical channels.
    if vocals.ndim == 1:
        vocals = np.stack([vocals, vocals])
    if instruments.ndim == 1:
        instruments = np.stack([instruments, instruments])

    # Fail fast with a readable message: mismatched channel counts would
    # otherwise only surface as a NumPy broadcasting ValueError at the
    # summing step, after the whole vocal chain has already run.
    if vocals.shape[0] != instruments.shape[0]:
        raise ValueError(
            "Channel count mismatch: vocals have {} channel(s), "
            "instruments have {}".format(vocals.shape[0], instruments.shape[0])
        )

    # Match lengths (pad the shorter track with trailing silence)
    max_len = max(vocals.shape[-1], instruments.shape[-1])
    if vocals.shape[-1] < max_len:
        vocals = np.pad(vocals, [(0, 0), (0, max_len - vocals.shape[-1])])
    if instruments.shape[-1] < max_len:
        instruments = np.pad(
            instruments, [(0, 0), (0, max_len - instruments.shape[-1])]
        )

    # Apply the vocal processing chain
    vocals = _process_vocals(vocals, output_sr)

    # Mix with volume controls
    mixed = vocals * vocal_volume + instruments * instrumental_volume

    # Apply limiter to final mix (replaces naive peak normalization)
    limiter = Pedalboard([
        Limiter(threshold_db=-1.0, release_ms=100.0),
    ])
    mixed = limiter(mixed.astype(np.float32), output_sr)

    # Output filename is derived from the vocals file's base name
    vocals_base = os.path.splitext(os.path.basename(vocals_path))[0]
    output_path = os.path.join(OUTPUT_DIR, "{}_mix_final.wav".format(vocals_base))

    # Save as 16-bit PCM WAV at output_sr (transposed: soundfile expects
    # (samples, channels))
    sf.write(output_path, mixed.T, output_sr, subtype="PCM_16")
    logger.info("Mix complete: %s", output_path)
    return output_path