# Copyright (c) 2026 Scenema AI
# https://scenema.ai
# SPDX-License-Identifier: MIT
"""SeedVC voice conversion for Scenema Audio.

Converts the voice identity of generated audio to match a reference speaker
while preserving prosody, rhythm, and emotion.

Uses the Seed-VC model with DiT backbone, CAMPPlus speaker encoder,
and BigVGAN vocoder. Expects 22050Hz mono WAV input for both source and target.
"""

import inspect
import logging
import os
import sys
import types
from argparse import Namespace
from pathlib import Path

import numpy as np
import torch

logger = logging.getLogger(__name__)

DEFAULT_SEEDVC_PATH = Path(os.environ.get("SEEDVC_PATH", "/app/seed-vc"))
DEFAULT_DIFFUSION_STEPS = 25
DEFAULT_CFG_RATE = 0.5

# Attribute set on the BigVGAN class once its loader has been wrapped, so a
# load() -> unload() -> load() cycle does not stack wrappers on top of each other.
_BIGVGAN_PATCH_FLAG = "_scenema_from_pretrained_patched"


class SeedVC:
    """Voice conversion engine using Seed-VC.

    Converts source audio voice identity to match a target speaker
    while preserving the source's delivery, emotion, and pacing.

    Typical lifecycle: ``load()`` once, ``convert()`` any number of times,
    then ``unload()`` to release the models and restore the working directory.
    """

    def __init__(self, seedvc_path: Path = DEFAULT_SEEDVC_PATH):
        """Remember where the Seed-VC checkout lives; nothing is loaded yet.

        Args:
            seedvc_path: Directory containing the Seed-VC sources
                (``app_vc.py``, ``modules/``, ``checkpoints/``).
        """
        self.seedvc_path = seedvc_path
        self._loaded = False
        # Working directory in effect before load() changed it; None when
        # there is nothing to restore.
        self._original_cwd: str | None = None
        # The imported ``app_vc`` module; model handles live as its attributes.
        self._app_vc = None

    def load(self) -> None:
        """Load SeedVC models to GPU.

        Changes working directory to seedvc_path (required by SeedVC
        internals), stubs gradio, and loads all models via
        app_vc.load_models(). Calling it again while loaded is a no-op.

        If anything fails part-way, the previous working directory is
        restored and the engine stays unloaded before the error propagates.

        Raises:
            Exception: Whatever the directory change, the Seed-VC imports or
                ``app_vc.load_models`` raise.
        """
        if self._loaded:
            return

        logger.info("Loading SeedVC from %s", self.seedvc_path)

        self._original_cwd = os.getcwd()
        try:
            os.chdir(self.seedvc_path)
            self._load_models()
        except BaseException:
            # Do not leave the process stranded in seedvc_path (or holding a
            # half-initialised module) when loading did not complete.
            os.chdir(self._original_cwd)
            self._original_cwd = None
            self._app_vc = None
            raise

        self._loaded = True
        logger.info(
            "SeedVC loaded: sr=%d, device=%s",
            self._app_vc.sr,
            self._app_vc.device,
        )

    def _load_models(self) -> None:
        """Prepare the import environment and populate ``app_vc`` with models."""
        # app_vc imports gradio at module level; an empty stand-in module
        # lets that import succeed when gradio is not already imported.
        if "gradio" not in sys.modules:
            sys.modules["gradio"] = types.ModuleType("gradio")

        seedvc_str = str(self.seedvc_path)
        if seedvc_str not in sys.path:
            sys.path.insert(0, seedvc_str)

        # Respect an HF_HUB_CACHE the caller already configured.
        os.environ.setdefault(
            "HF_HUB_CACHE",
            str(self.seedvc_path / "checkpoints" / "hf_cache"),
        )

        self._patch_bigvgan()

        # Load models (exact pattern from gpu_vc/seedvc_engine.py)
        import app_vc

        self._app_vc = app_vc
        app_vc.device = torch.device("cuda")

        args = Namespace(checkpoint=None, config=None, fp16=True, gpu=0)
        (
            app_vc.model,
            app_vc.semantic_fn,
            app_vc.vocoder_fn,
            app_vc.campplus_model,
            app_vc.to_mel,
            app_vc.mel_fn_args,
        ) = app_vc.load_models(args)

        # Derived settings that app_vc's own entry point would otherwise set.
        app_vc.max_context_window = app_vc.sr // app_vc.hop_length * 30
        app_vc.overlap_wave_len = app_vc.overlap_frame_len * app_vc.hop_length

    @staticmethod
    def _patch_bigvgan() -> None:
        """Wrap ``BigVGAN._from_pretrained`` for huggingface_hub compat, once."""
        # Patch BigVGAN for huggingface_hub compat (same as gpu_vc)
        import modules.bigvgan.bigvgan as _bigvgan_mod

        bigvgan_cls = _bigvgan_mod.BigVGAN
        if getattr(bigvgan_cls, _BIGVGAN_PATCH_FLAG, False):
            return

        _orig = bigvgan_cls._from_pretrained

        @classmethod
        def _patched(cls, **kwargs):
            # Supply defaults for keyword arguments the caller may omit.
            kwargs.setdefault("proxies", None)
            kwargs.setdefault("resume_download", False)
            return _orig.__func__(cls, **kwargs)

        bigvgan_cls._from_pretrained = _patched
        setattr(bigvgan_cls, _BIGVGAN_PATCH_FLAG, True)

    def unload(self) -> None:
        """Free SeedVC models from GPU.

        Drops the model attributes from ``app_vc``, empties the CUDA cache
        and restores the working directory saved by ``load()``. Does nothing
        when the engine is not loaded.
        """
        if not self._loaded:
            return

        if self._app_vc is not None:
            for attr in [
                "model",
                "semantic_fn",
                "vocoder_fn",
                "campplus_model",
                "to_mel",
            ]:
                if hasattr(self._app_vc, attr):
                    delattr(self._app_vc, attr)
            self._app_vc = None

        torch.cuda.empty_cache()

        if self._original_cwd:
            os.chdir(self._original_cwd)
            self._original_cwd = None

        self._loaded = False
        logger.info("SeedVC unloaded")

    def convert(
        self,
        source_wav_path: str,
        target_wav_path: str,
        diffusion_steps: int = DEFAULT_DIFFUSION_STEPS,
        cfg_rate: float = DEFAULT_CFG_RATE,
    ) -> np.ndarray:
        """Convert voice identity of source to match target.

        Both files must be 22050Hz mono WAV.

        Args:
            source_wav_path: Path to source audio (generated speech)
            target_wav_path: Path to target audio (reference voice)
            diffusion_steps: Number of diffusion steps (quality vs speed)
            cfg_rate: Classifier-free guidance rate

        Returns:
            Converted audio as float32 numpy array at 22050Hz mono.
            int16 output is rescaled by 1/32768; anything whose peak still
            exceeds 1.0 is divided by that peak. A zero-length result is
            returned as an empty array.

        Raises:
            RuntimeError: If load() has not been called, or if the
                conversion never yields an audio payload.
        """
        if not self._loaded:
            raise RuntimeError("SeedVC not loaded. Call load() first.")

        logger.info(
            "Converting voice: %s -> %s (%d steps, cfg_rate=%.2f)",
            source_wav_path,
            target_wav_path,
            diffusion_steps,
            cfg_rate,
        )

        vc_kwargs = {
            "source": source_wav_path,
            "target": target_wav_path,
            "diffusion_steps": diffusion_steps,
            "length_adjust": 1.0,
            "inference_cfg_rate": cfg_rate,
        }
        # n_quantizers removed in newer SeedVC versions
        sig = inspect.signature(self._app_vc.voice_conversion)
        if "n_quantizers" in sig.parameters:
            vc_kwargs["n_quantizers"] = 3

        # voice_conversion is a generator of 2-tuples; only the second element
        # is used here. Keep the most recent one that actually carries audio
        # so a None placeholder cannot erase a payload already received.
        audio_tuple = None
        for result in self._app_vc.voice_conversion(**vc_kwargs):
            if isinstance(result, tuple) and len(result) == 2:
                if result[1] is not None:
                    audio_tuple = result[1]

        if audio_tuple is None:
            raise RuntimeError("SeedVC produced no output")

        sample_rate, raw_samples = audio_tuple
        samples = self._to_float32(raw_samples)

        logger.info("Converted: %.1fs at %dHz", len(samples) / sample_rate, sample_rate)
        return samples

    @staticmethod
    def _to_float32(samples) -> np.ndarray:
        """Return *samples* as float32 with an absolute peak of at most 1.0."""
        samples = np.asarray(samples)
        if samples.dtype == np.int16:
            samples = samples.astype(np.float32) / 32768.0
        elif samples.dtype != np.float32:
            samples = samples.astype(np.float32)

        # max() raises on a zero-size array, so only look for a peak when
        # there is at least one sample.
        if samples.size:
            peak = np.abs(samples).max()
            if peak > 1.0:
                samples = samples / peak
        return samples