# scenema-audio/src/audio_core/enhancer.py
# multimodalart
# Initial Gradio ZeroGPU app for Scenema Audio
# cdc4405
# Copyright (c) 2026 Scenema AI
# https://scenema.ai
# SPDX-License-Identifier: MIT
"""VoiceFixer audio post-processing for Scenema Audio.
Applies neural speech restoration to improve clarity, remove artifacts,
and bring speech to studio quality. Runs on GPU after SeedVC as the
final processing step.
Model is downloaded on first use and cached to disk for subsequent runs.
"""
import logging
import os
import subprocess
import sys
import tempfile
import numpy as np
import soundfile as sf
import torchaudio
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Cached VoiceFixer instance; stays None until _get_voicefixer() creates it.
_voicefixer = None
def _ensure_installed():
    """Make sure the ``voicefixer`` package is importable, installing it if needed.

    First tries a plain import. If that fails, runs ``pip install voicefixer``
    with the current interpreter (``sys.executable``) so the package is
    installed into the environment this process is running in.

    Raises:
        ImportError: If ``voicefixer`` is missing and the pip invocation fails
            (non-zero exit status, or the pip process cannot be started).
    """
    try:
        import voicefixer  # noqa: F401
    except ImportError:
        pass
    else:
        return

    # Local import: only needed on the (rare) install path.
    import importlib

    logger.info("Installing voicefixer...")
    try:
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "voicefixer", "--quiet"],
        )
    except (subprocess.CalledProcessError, OSError) as err:
        # OSError covers the case where the interpreter itself cannot be
        # launched; both failures surface to callers as ImportError.
        logger.warning("Failed to install voicefixer, enhancement will be skipped")
        raise ImportError("voicefixer not available") from err

    # The package was created after this interpreter started, so the import
    # system's finder caches may be stale; refresh them so the import that
    # follows in the caller can see the newly installed files.
    importlib.invalidate_caches()
    logger.info("voicefixer installed")
def _get_voicefixer():
    """Return the shared VoiceFixer instance, building it on the first call.

    The instance is stored in the module-level ``_voicefixer`` variable and
    reused by every later call. On the first call the ``voicefixer`` package
    is installed if necessary (via ``_ensure_installed``) before the model
    object is constructed.

    Returns:
        The cached ``VoiceFixer`` object.

    Raises:
        ImportError: Propagated from ``_ensure_installed`` or from the
            ``voicefixer`` import when the package is unavailable.
    """
    global _voicefixer

    if _voicefixer is None:
        _ensure_installed()
        # Imported lazily because the package may only just have been installed.
        from voicefixer import VoiceFixer  # noqa: E402

        model = VoiceFixer()
        _voicefixer = model
        logger.info("VoiceFixer model loaded")

    return _voicefixer
def _match_input_format(enhanced, enhanced_sr, sr, is_stereo):
    """Resample ``enhanced`` to ``sr`` and duplicate mono to stereo if required.

    Args:
        enhanced: Array as returned by ``sf.read``: ``(frames,)`` for mono or
            ``(frames, channels)`` for multi-channel audio.
        enhanced_sr: Sample rate of ``enhanced``.
        sr: Target sample rate.
        is_stereo: Whether the caller's input was ``(frames, 2)`` stereo.

    Returns:
        Array at ``sr`` in frames-first layout: ``(frames,)`` or
        ``(frames, channels)``.
    """
    if enhanced_sr != sr:
        import torch

        multi_channel = enhanced.ndim == 2
        # torchaudio resamples along the last axis, so go channels-first here
        # and convert back to frames-first immediately afterwards.
        channels_first = enhanced.T if multi_channel else enhanced[np.newaxis, :]
        t = torch.from_numpy(channels_first).float()
        t = torchaudio.functional.resample(t, enhanced_sr, sr)
        resampled = t.numpy()
        enhanced = resampled.T if multi_channel else resampled[0]

    # Give stereo callers a stereo result even when the restored file is mono.
    if enhanced.ndim == 1 and is_stereo:
        enhanced = np.stack([enhanced, enhanced], axis=1)
    return enhanced


def enhance_audio(audio_np: np.ndarray, sr: int) -> np.ndarray:
    """Apply VoiceFixer to audio for studio-quality output.

    VoiceFixer works on WAV files, so the audio is written to a temporary
    directory, restored there, and read back. Enhancement is best-effort: if
    the model cannot be loaded or restoration fails, a warning is logged and
    the input is returned unchanged.

    Args:
        audio_np: Audio array, mono ``(frames,)`` or stereo ``(frames, 2)``.
        sr: Sample rate of ``audio_np``.

    Returns:
        Enhanced audio at the original sample rate in frames-first layout
        (a mono result is duplicated to two channels for stereo input), or
        ``audio_np`` itself when enhancement is skipped or fails.
    """
    # Nothing to restore; avoid loading the model for empty input.
    if audio_np.size == 0:
        return audio_np

    try:
        vf = _get_voicefixer()
    except Exception as e:
        logger.warning("VoiceFixer unavailable: %s, skipping", e)
        return audio_np

    import torch

    is_stereo = audio_np.ndim == 2 and audio_np.shape[1] == 2

    with tempfile.TemporaryDirectory() as tmp:
        input_path = os.path.join(tmp, "input.wav")
        output_path = os.path.join(tmp, "output.wav")
        sf.write(input_path, audio_np, sr)
        try:
            vf.restore(
                input=input_path,
                output=output_path,
                # Use the GPU only when one is present so CPU-only hosts still
                # get enhanced audio instead of always hitting the fallback.
                cuda=torch.cuda.is_available(),
                # NOTE(review): per the VoiceFixer README, mode 0 is the
                # original/default model, mode 1 adds a preprocessing step
                # that removes higher frequencies, mode 2 is "train mode".
                mode=0,
            )
            # Must be read before the temporary directory is removed.
            enhanced, enhanced_sr = sf.read(output_path)
            enhanced = _match_input_format(enhanced, enhanced_sr, sr, is_stereo)
        except Exception as e:
            logger.warning("VoiceFixer failed: %s, returning original", e)
            return audio_np

    logger.info("Enhanced audio: %.1fs", len(enhanced) / sr)
    return enhanced