Spaces:

multimodalart
/

scenema-audio

Running on Zero

scenema-audio / src /audio_core /enhancer.py

multimodalart

Initial Gradio ZeroGPU app for Scenema Audio

cdc4405 10 days ago

3.52 kB

	# Copyright (c) 2026 Scenema AI
	# https://scenema.ai
	# SPDX-License-Identifier: MIT

	"""VoiceFixer audio post-processing for Scenema Audio.

	Applies neural speech restoration to improve clarity, remove artifacts,
	and bring speech to studio quality. Runs on GPU after SeedVC as the
	final processing step.

	Model is downloaded on first use and cached to disk for subsequent runs.
	"""

	import logging
	import os
	import subprocess
	import sys
	import tempfile

	import numpy as np
	import soundfile as sf
	import torchaudio

	logger = logging.getLogger(__name__)

	_voicefixer = None


	def _ensure_installed():
	    """Make sure the ``voicefixer`` package can be imported.

	    If the import fails, ``pip install voicefixer`` is run with the current
	    interpreter (``sys.executable -m pip``) so the package lands in the
	    environment this process is running in.

	    Raises:
	        ImportError: If pip exits with a non-zero status. The underlying
	            ``subprocess.CalledProcessError`` is attached as ``__cause__``.
	    """
	    import importlib

	    try:
	        import voicefixer  # noqa: F401
	    except ImportError:
	        logger.info("Installing voicefixer...")
	        try:
	            subprocess.check_call(
	                [sys.executable, "-m", "pip", "install", "voicefixer", "--quiet"],
	            )
	        except subprocess.CalledProcessError as err:
	            logger.warning("Failed to install voicefixer, enhancement will be skipped")
	            raise ImportError("voicefixer not available") from err

	        # The package appeared on disk after the interpreter started, so the
	        # import system's finders may hold stale directory listings. Without
	        # this, the caller's follow-up import can still fail in this process.
	        importlib.invalidate_caches()
	        logger.info("voicefixer installed")


	def _get_voicefixer():
	    """Return the shared VoiceFixer instance, creating it on the first call.

	    The instance is stored in the module-level ``_voicefixer`` variable, so
	    every later call returns the same object without constructing it again.
	    On the first call the package is installed if necessary before it is
	    imported.
	    """
	    global _voicefixer

	    if _voicefixer is None:
	        _ensure_installed()

	        # Imported lazily: the package may only exist after the install step.
	        from voicefixer import VoiceFixer  # noqa: E402

	        _voicefixer = VoiceFixer()
	        logger.info("VoiceFixer model loaded")

	    return _voicefixer


	def enhance_audio(audio_np: np.ndarray, sr: int) -> np.ndarray:
	    """Apply VoiceFixer to audio for studio-quality output.

	    VoiceFixer works on WAV files, so the audio is written to a temporary
	    directory, restored there, and read back. Enhancement is best-effort:
	    if the model cannot be loaded or restoration fails, a warning is logged
	    and the input is returned unchanged.

	    Args:
	        audio_np: Audio samples, either mono with shape ``(samples,)`` or
	            stereo with shape ``(samples, 2)``.
	        sr: Sample rate of ``audio_np`` in Hz.

	    Returns:
	        Enhanced audio at sample rate ``sr``. If the input is stereo and the
	        restored audio is mono, the mono signal is duplicated onto both
	        channels so the ``(samples, 2)`` layout is preserved. If the input
	        is empty, the model cannot be loaded, or restoration fails,
	        ``audio_np`` itself is returned.
	    """
	    if audio_np.size == 0:
	        # Nothing to restore; skip loading the model for an empty clip.
	        return audio_np

	    try:
	        vf = _get_voicefixer()
	    except Exception as e:  # covers ImportError from the install step too
	        logger.warning("VoiceFixer unavailable: %s, skipping", e)
	        return audio_np

	    is_stereo = audio_np.ndim == 2 and audio_np.shape[1] == 2

	    with tempfile.TemporaryDirectory() as tmp:
	        input_path = os.path.join(tmp, "input.wav")
	        output_path = os.path.join(tmp, "output.wav")

	        sf.write(input_path, audio_np, sr)

	        try:
	            # NOTE(review): GPU inference is requested unconditionally; on a
	            # CPU-only host this is expected to raise and take the fallback
	            # below. Confirm the meaning of ``mode`` values against the
	            # VoiceFixer README.
	            vf.restore(
	                input=input_path,
	                output=output_path,
	                cuda=True,
	                mode=0,
	            )

	            enhanced, enhanced_sr = sf.read(output_path)

	            # Resample back to the caller's rate if VoiceFixer changed it.
	            if enhanced_sr != sr:
	                import torch

	                # soundfile yields (samples,) or (samples, channels), while
	                # torchaudio resamples along the last axis, so move to
	                # (channels, samples) first.
	                wave = torch.from_numpy(np.atleast_2d(enhanced.T)).float()
	                wave = torchaudio.functional.resample(wave, enhanced_sr, sr)
	                resampled = wave.numpy()
	                # Back to soundfile's layout: 1-D for a single channel,
	                # (samples, channels) otherwise.
	                enhanced = resampled[0] if resampled.shape[0] == 1 else resampled.T

	            # Restore the stereo layout whether or not resampling happened,
	            # so a stereo input never comes back as a mono array.
	            if is_stereo and enhanced.ndim == 1:
	                enhanced = np.stack([enhanced, enhanced], axis=1)

	            logger.info("Enhanced audio: %.1fs", len(enhanced) / sr)
	            return enhanced

	        except Exception as e:
	            logger.warning("VoiceFixer failed: %s, returning original", e)
	            return audio_np