Spaces:
Running on Zero
Running on Zero
| # Copyright (c) 2026 Scenema AI | |
| # https://scenema.ai | |
| # SPDX-License-Identifier: MIT | |
| """VoiceFixer audio post-processing for Scenema Audio. | |
| Applies neural speech restoration to improve clarity, remove artifacts, | |
| and bring speech to studio quality. Runs on GPU after SeedVC as the | |
| final processing step. | |
| Model is downloaded on first use and cached to disk for subsequent runs. | |
| """ | |
| import logging | |
| import os | |
| import subprocess | |
| import sys | |
| import tempfile | |
| import numpy as np | |
| import soundfile as sf | |
| import torchaudio | |
| logger = logging.getLogger(__name__) | |
| _voicefixer = None | |
| def _ensure_installed(): | |
| """Install voicefixer if not available.""" | |
| try: | |
| import voicefixer # noqa: F401 | |
| except ImportError: | |
| logger.info("Installing voicefixer...") | |
| try: | |
| subprocess.check_call( | |
| [sys.executable, "-m", "pip", "install", "voicefixer", "--quiet"], | |
| ) | |
| logger.info("voicefixer installed") | |
| except subprocess.CalledProcessError: | |
| logger.warning("Failed to install voicefixer, enhancement will be skipped") | |
| raise ImportError("voicefixer not available") | |
def _get_voicefixer():
    """Return the shared VoiceFixer instance, building it on the first call.

    The instance is kept in the module-level ``_voicefixer`` variable, so the
    package check and model construction happen only once per process.
    Model weights are presumably fetched and cached by the ``voicefixer``
    library itself when ``VoiceFixer()`` is constructed.
    """
    global _voicefixer
    if _voicefixer is None:
        _ensure_installed()
        # Imported lazily: the package may only just have been installed.
        from voicefixer import VoiceFixer  # noqa: E402

        _voicefixer = VoiceFixer()
        logger.info("VoiceFixer model loaded")
    return _voicefixer
def enhance_audio(audio_np: np.ndarray, sr: int) -> np.ndarray:
    """Apply VoiceFixer to audio for studio-quality output.

    VoiceFixer works on WAV files, so the audio is written to a temporary
    file, restored, and read back. Enhancement is best-effort: if the model
    cannot be loaded or restoration fails, a warning is logged and the input
    is returned unchanged.

    Args:
        audio_np: Audio array, mono ``(frames,)`` or stereo ``(frames, 2)``,
            at any sample rate.
        sr: Sample rate of ``audio_np`` in Hz.

    Returns:
        Enhanced audio at the original sample rate. Stereo input yields a
        stereo result (a mono restoration is duplicated to both channels).
        ``audio_np`` itself is returned when it is empty or when enhancement
        is unavailable or fails.
    """
    # Nothing to restore; avoid loading the model and touching the disk.
    if audio_np.size == 0:
        return audio_np

    try:
        vf = _get_voicefixer()
    except Exception as e:  # best-effort boundary: install/download/load errors
        logger.warning("VoiceFixer unavailable: %s, skipping", e)
        return audio_np

    is_stereo = audio_np.ndim == 2 and audio_np.shape[1] == 2

    with tempfile.TemporaryDirectory() as tmp:
        input_path = os.path.join(tmp, "input.wav")
        output_path = os.path.join(tmp, "output.wav")
        sf.write(input_path, audio_np, sr)
        try:
            vf.restore(
                input=input_path,
                output=output_path,
                cuda=True,
                # NOTE(review): per the VoiceFixer README, mode 0 is the
                # library's original/default model - confirm before changing.
                mode=0,
            )
            enhanced, enhanced_sr = sf.read(output_path)

            # Resample back to the caller's sample rate if needed.
            if enhanced_sr != sr:
                import torch

                multi_channel = enhanced.ndim == 2
                # soundfile yields (frames, channels); torchaudio resamples
                # along the last axis, so flip to (channels, frames) and back.
                wave = torch.from_numpy(
                    enhanced.T if multi_channel else enhanced
                ).float()
                wave = torchaudio.functional.resample(wave, enhanced_sr, sr)
                enhanced = wave.numpy()
                if multi_channel:
                    enhanced = enhanced.T

            # Restore the caller's channel layout independently of whether a
            # resample happened: a mono result for stereo input is duplicated.
            if is_stereo and enhanced.ndim == 1:
                enhanced = np.stack([enhanced, enhanced], axis=1)

            logger.info("Enhanced audio: %.1fs", len(enhanced) / sr)
            return enhanced
        except Exception as e:
            logger.warning("VoiceFixer failed: %s, returning original", e)
            return audio_np