rvc

Sleeping

ibcplateformes

Upgrade audio quality: pro mixing chain, better inference params, htdemucs_ft

f729219 about 2 months ago

3.66 kB

	"""
	Audio mixing module: professional vocal processing + mix with instrumentals.
	Uses Pedalboard for studio-quality DSP chain.
	"""

	import logging
	import os

	import librosa
	import numpy as np
	import soundfile as sf
	from pedalboard import (
	    Compressor,
	    Gain,
	    HighpassFilter,
	    HighShelfFilter,
	    Limiter,
	    LowShelfFilter,
	    Pedalboard,
	    PeakFilter,
	)

	# Module-level logger, named after this module.
	logger = logging.getLogger(__name__)

	# Directory that receives the mixed WAV files; mix_audio() creates it on demand.
	OUTPUT_DIR = "/tmp/rvc_output"


	def _process_vocals(vocals: np.ndarray, sr: int) -> np.ndarray:
	    """
	    Apply the vocal processing chain before mixing.

	    The chain runs in this order: high-pass filter, compressor,
	    presence boost, de-essing high shelf, makeup gain.

	    Args:
	        vocals: Audio samples, cast to float32 before processing.
	            The caller in this module passes shape (channels, samples).
	        sr: Sample rate of ``vocals`` in Hz.

	    Returns:
	        The processed audio as returned by the Pedalboard chain.
	    """
	    board = Pedalboard([
	        # 1. Remove sub-bass rumble and proximity effect
	        HighpassFilter(cutoff_frequency_hz=80.0),
	        # 2. Compress dynamics for a consistent vocal level
	        Compressor(
	            threshold_db=-16.0,
	            ratio=4.0,
	            attack_ms=5.0,
	            release_ms=100.0,
	        ),
	        # 3. Presence boost — helps the vocal cut through the mix
	        PeakFilter(
	            cutoff_frequency_hz=3000.0,
	            gain_db=2.5,
	            q=1.0,
	        ),
	        # 4. Simple de-esser — a HIGH shelf attenuates content above the
	        #    cutoff, which is where sibilance lives. A low shelf at this
	        #    frequency would instead cut everything below 6 kHz and leave
	        #    the sibilance relatively louder.
	        HighShelfFilter(
	            cutoff_frequency_hz=6000.0,
	            gain_db=-2.0,
	        ),
	        # 5. Makeup gain after compression
	        Gain(gain_db=1.0),
	    ])

	    processed = board(vocals.astype(np.float32), sr)
	    logger.info("Vocal processing chain applied (HPF+Comp+EQ+DeEss+Gain)")
	    return processed


	def _load_as_multichannel(path: str, sr: int) -> np.ndarray:
	    """Load ``path`` resampled to ``sr`` as a 2D (channels, samples) array."""
	    audio, _ = librosa.load(path, sr=sr, mono=False)
	    if audio.ndim == 1:
	        # Mono file: duplicate the single channel so it can be summed
	        # sample-for-sample with a two-channel track.
	        audio = np.stack([audio, audio])
	    return audio


	def _pad_to_length(audio: np.ndarray, length: int) -> np.ndarray:
	    """Right-pad a 2D ``audio`` array with zeros on the last axis to ``length``."""
	    missing = length - audio.shape[-1]
	    if missing > 0:
	        audio = np.pad(audio, [(0, 0), (0, missing)])
	    return audio


	def mix_audio(
	    vocals_path: str,
	    instruments_path: str,
	    vocal_volume: float = 1.0,
	    instrumental_volume: float = 1.0,
	    output_sr: int = 44100,
	) -> str:
	    """
	    Mix converted vocals with an instrumental track.

	    Both files are loaded at ``output_sr``, mono inputs are duplicated to
	    two channels, and the shorter track is padded with silence. The
	    vocals go through ``_process_vocals`` before the two tracks are
	    summed, and a limiter is applied to the sum.

	    Args:
	        vocals_path: Path to the vocal audio file.
	        instruments_path: Path to the instrumental audio file.
	        vocal_volume: Linear gain multiplied into the processed vocals.
	        instrumental_volume: Linear gain multiplied into the instrumental.
	        output_sr: Sample rate in Hz used for loading, processing and
	            writing (44100 by default).

	    Returns:
	        Path of the written 16-bit PCM WAV file,
	        ``OUTPUT_DIR/<vocals file stem>_mix_final.wav``.
	    """
	    os.makedirs(OUTPUT_DIR, exist_ok=True)

	    # Lazy %-style arguments: the message is only formatted if the record
	    # is actually emitted.
	    logger.info("Loading vocals: %s", vocals_path)
	    vocals = _load_as_multichannel(vocals_path, output_sr)

	    logger.info("Loading instruments: %s", instruments_path)
	    instruments = _load_as_multichannel(instruments_path, output_sr)

	    # Match lengths (pad the shorter track with silence)
	    max_len = max(vocals.shape[-1], instruments.shape[-1])
	    vocals = _pad_to_length(vocals, max_len)
	    instruments = _pad_to_length(instruments, max_len)

	    # Apply the vocal processing chain
	    vocals = _process_vocals(vocals, output_sr)

	    # Mix with volume controls
	    mixed = vocals * vocal_volume + instruments * instrumental_volume

	    # Limit the final mix instead of peak-normalizing it
	    limiter = Pedalboard([
	        Limiter(threshold_db=-1.0, release_ms=100.0),
	    ])
	    mixed = limiter(mixed.astype(np.float32), output_sr)

	    # The output name depends only on the vocals file name
	    vocals_base = os.path.splitext(os.path.basename(vocals_path))[0]
	    output_path = os.path.join(OUTPUT_DIR, "{}_mix_final.wav".format(vocals_base))

	    # Save as 16-bit PCM WAV (transposed: soundfile expects (samples, channels))
	    sf.write(output_path, mixed.T, output_sr, subtype="PCM_16")

	    logger.info("Mix complete: %s", output_path)
	    return output_path