Spaces:

multimodalart
/

scenema-audio

Running on Zero

File size: 3,521 Bytes

cdc4405

# Copyright (c) 2026 Scenema AI
# https://scenema.ai
# SPDX-License-Identifier: MIT

"""VoiceFixer audio post-processing for Scenema Audio.

Applies neural speech restoration to improve clarity, remove artifacts,
and bring speech to studio quality. Runs on GPU after SeedVC as the
final processing step.

Model is downloaded on first use and cached to disk for subsequent runs.
"""

import logging
import os
import subprocess
import sys
import tempfile

import numpy as np
import soundfile as sf
import torchaudio

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Lazily created VoiceFixer instance shared by all calls in this process;
# stays None until _get_voicefixer() builds it on first use.
_voicefixer = None


def _ensure_installed():
    """Make sure the ``voicefixer`` package can be imported.

    If the import fails, the package is installed with ``pip`` using the
    current interpreter (``sys.executable``), and the import caches are
    refreshed so the freshly installed package is visible to this process.

    Raises:
        ImportError: If ``voicefixer`` is missing and the pip invocation
            fails (non-zero exit status or the subprocess cannot be started).
    """
    try:
        import voicefixer  # noqa: F401
    except ImportError:
        logger.info("Installing voicefixer...")
    else:
        return

    try:
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "voicefixer", "--quiet"],
        )
    except (subprocess.CalledProcessError, OSError) as err:
        logger.warning("Failed to install voicefixer, enhancement will be skipped")
        # Chain the cause so the pip failure stays visible in tracebacks.
        raise ImportError("voicefixer not available") from err

    # Path finders cache directory listings; without this, a package installed
    # while the interpreter is running may not be found by the next import.
    import importlib

    importlib.invalidate_caches()
    logger.info("voicefixer installed")


def _get_voicefixer():
    """Return the shared VoiceFixer instance, creating it on the first call.

    The first call makes sure the ``voicefixer`` package is importable,
    constructs ``VoiceFixer()`` and stores it in the module-level
    ``_voicefixer`` variable; every later call returns that same object.
    Any exception raised while installing, importing or constructing the
    model propagates to the caller.
    """
    global _voicefixer

    instance = _voicefixer
    if instance is None:
        _ensure_installed()

        # Imported lazily: the package may only exist after the step above.
        from voicefixer import VoiceFixer  # noqa: E402

        instance = VoiceFixer()
        _voicefixer = instance
        logger.info("VoiceFixer model loaded")

    return instance


def enhance_audio(audio_np: np.ndarray, sr: int) -> np.ndarray:
    """Apply VoiceFixer to audio for studio-quality output.

    VoiceFixer works on WAV files, so the audio is written to a temporary
    directory, restored there, and read back. The whole operation is
    best-effort: if the model cannot be loaded, or any step of the round trip
    fails, a warning is logged and the input array is returned unchanged.

    Args:
        audio_np: Audio array, either 1-D (mono) or 2-D with shape
            ``(samples, channels)``.
        sr: Sample rate of ``audio_np`` in Hz.

    Returns:
        The enhanced audio, resampled back to ``sr`` when VoiceFixer's output
        uses a different rate. If the input had two channels and the restored
        audio comes back single-channel, that channel is duplicated so the
        result is again ``(samples, 2)``. On empty input or on any failure,
        ``audio_np`` itself is returned.
    """
    # Nothing to restore: skip the model load and the temp-file round trip.
    if audio_np.size == 0:
        return audio_np

    try:
        vf = _get_voicefixer()
    except Exception as e:
        logger.warning("VoiceFixer unavailable: %s, skipping", e)
        return audio_np

    is_stereo = audio_np.ndim == 2 and audio_np.shape[1] == 2

    with tempfile.TemporaryDirectory() as tmp:
        input_path = os.path.join(tmp, "input.wav")
        output_path = os.path.join(tmp, "output.wav")

        try:
            # Written inside the try so an unwritable array also falls back
            # to the original audio instead of raising.
            sf.write(input_path, audio_np, sr)

            vf.restore(
                input=input_path,
                output=output_path,
                cuda=True,
                # NOTE(review): upstream README appears to describe mode 0 as
                # the original/default model — confirm before changing.
                mode=0,
            )

            enhanced, enhanced_sr = sf.read(output_path)

            # Resample back to the caller's sample rate if needed.
            if enhanced_sr != sr:
                import torch

                # torchaudio expects (channels, time); soundfile returns
                # (time,) for mono and (time, channels) otherwise.
                t = torch.from_numpy(
                    enhanced.T if enhanced.ndim == 2 else enhanced
                ).float()
                if t.ndim == 1:
                    t = t.unsqueeze(0)
                t = torchaudio.functional.resample(t, enhanced_sr, sr)
                enhanced = t.squeeze(0).numpy()
                if enhanced.ndim == 2:
                    enhanced = enhanced.T

            # Restore the stereo layout whether or not resampling happened;
            # previously this only ran inside the resample branch.
            if is_stereo and enhanced.ndim == 1:
                enhanced = np.stack([enhanced, enhanced], axis=1)

            logger.info("Enhanced audio: %.1fs", len(enhanced) / sr)
            return enhanced

        except Exception as e:
            logger.warning("VoiceFixer failed: %s, returning original", e)
            return audio_np