Spaces:
Running on Zero
| # Copyright (c) 2026 Scenema AI | |
| # https://scenema.ai | |
| # SPDX-License-Identifier: MIT | |
| """SeedVC voice conversion for Scenema Audio. | |
| Converts the voice identity of generated audio to match a reference speaker | |
| while preserving prosody, rhythm, and emotion. Uses the Seed-VC model with | |
| DiT backbone, CAMPPlus speaker encoder, and BigVGAN vocoder. | |
| Expects 22050Hz mono WAV input for both source and target. | |
| """ | |
| import inspect | |
| import logging | |
| import os | |
| import sys | |
| import types | |
| from argparse import Namespace | |
| from pathlib import Path | |
| import numpy as np | |
| import torch | |
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Root of the Seed-VC checkout. The SEEDVC_PATH environment variable, when
# set, overrides the built-in default location.
DEFAULT_SEEDVC_PATH = Path(os.getenv("SEEDVC_PATH", "/app/seed-vc"))

# Defaults used by SeedVC.convert() when the caller does not override them.
DEFAULT_DIFFUSION_STEPS = 25
DEFAULT_CFG_RATE = 0.5
class SeedVC:
    """Voice conversion engine using Seed-VC.

    Converts source audio voice identity to match a target speaker
    while preserving the source's delivery, emotion, and pacing.

    Lifecycle: call ``load()`` once, ``convert()`` as often as needed, then
    ``unload()`` to drop the model references and restore the working
    directory.
    """

    def __init__(self, seedvc_path: Path = DEFAULT_SEEDVC_PATH):
        """Remember where the Seed-VC checkout lives; nothing is loaded yet.

        Args:
            seedvc_path: Directory of the Seed-VC sources. ``load()`` changes
                into it and puts it on ``sys.path``.
        """
        self.seedvc_path = seedvc_path
        self._loaded = False
        # Working directory to return to in unload(); set by a successful load().
        self._original_cwd: str | None = None
        # The imported ``app_vc`` module once load() has succeeded.
        self._app_vc = None

    def load(self) -> None:
        """Load SeedVC models to GPU.

        Changes working directory to seedvc_path (required by SeedVC internals),
        stubs gradio, and loads all models via app_vc.load_models(). Calling
        this on an already loaded instance is a no-op.

        If anything fails after the directory change, the previous working
        directory is restored before the exception propagates, so a failed
        load does not leave the process stranded in ``seedvc_path``.

        Raises:
            OSError: If ``seedvc_path`` cannot be entered.
            Exception: Whatever importing or loading the SeedVC models raises.
        """
        if self._loaded:
            return

        logger.info("Loading SeedVC from %s", self.seedvc_path)
        previous_cwd = os.getcwd()
        os.chdir(self.seedvc_path)
        try:
            app_vc = self._import_and_load_models()
        except BaseException:
            # unload() is a no-op while _loaded is False, so the cwd has to
            # be put back here or it would never be restored.
            os.chdir(previous_cwd)
            raise

        self._original_cwd = previous_cwd
        self._app_vc = app_vc
        self._loaded = True
        logger.info("SeedVC loaded: sr=%d, device=%s", app_vc.sr, app_vc.device)

    def _import_and_load_models(self):
        """Prepare the import environment, import ``app_vc`` and load its models.

        Must be called with the working directory already set to
        ``seedvc_path``. Returns the configured ``app_vc`` module.
        """
        # Register an empty stand-in so that ``import gradio`` succeeds without
        # the real package (presumably app_vc imports it at module level).
        if "gradio" not in sys.modules:
            sys.modules["gradio"] = types.ModuleType("gradio")

        seedvc_str = str(self.seedvc_path)
        if seedvc_str not in sys.path:
            sys.path.insert(0, seedvc_str)

        # Only applies when the caller has not already chosen a hub cache.
        os.environ.setdefault(
            "HF_HUB_CACHE",
            str(self.seedvc_path / "checkpoints" / "hf_cache"),
        )

        self._patch_bigvgan_from_pretrained()

        # Load models (exact pattern from gpu_vc/seedvc_engine.py)
        import app_vc

        app_vc.device = torch.device("cuda")
        args = Namespace(checkpoint=None, config=None, fp16=True, gpu=0)
        (
            app_vc.model,
            app_vc.semantic_fn,
            app_vc.vocoder_fn,
            app_vc.campplus_model,
            app_vc.to_mel,
            app_vc.mel_fn_args,
        ) = app_vc.load_models(args)
        # Frames per second times 30 - presumably a 30 second context
        # window; confirm against app_vc.
        app_vc.max_context_window = app_vc.sr // app_vc.hop_length * 30
        app_vc.overlap_wave_len = app_vc.overlap_frame_len * app_vc.hop_length
        return app_vc

    @staticmethod
    def _patch_bigvgan_from_pretrained() -> None:
        """Patch BigVGAN for huggingface_hub compat (same as gpu_vc).

        Wraps ``BigVGAN._from_pretrained`` so that ``proxies`` and
        ``resume_download`` get default values when the caller omits them.
        The patch is applied at most once per process.
        """
        import modules.bigvgan.bigvgan as _bigvgan_mod

        bigvgan_cls = _bigvgan_mod.BigVGAN
        if getattr(bigvgan_cls, "_seedvc_hf_compat_patched", False):
            # Already wrapped by an earlier load(); wrapping again would
            # only stack wrappers.
            return

        original = bigvgan_cls._from_pretrained
        # Looked up on the class, a classmethod arrives as a bound method;
        # unwrap it so ``cls`` can be passed explicitly.
        original_func = getattr(original, "__func__", original)

        def _patched(cls, **kwargs):
            kwargs.setdefault("proxies", None)
            kwargs.setdefault("resume_download", False)
            return original_func(cls, **kwargs)

        # A plain function stored on a class is not bound when accessed via
        # the class, so ``cls`` would be missing; classmethod() restores the
        # binding the original had.
        bigvgan_cls._from_pretrained = classmethod(_patched)
        bigvgan_cls._seedvc_hf_compat_patched = True

    def unload(self) -> None:
        """Free SeedVC models from GPU.

        Removes the model attributes from the ``app_vc`` module, empties the
        CUDA cache and restores the working directory saved by ``load()``.
        Does nothing if the engine is not loaded.
        """
        if not self._loaded:
            return

        if self._app_vc is not None:
            for attr in (
                "model",
                "semantic_fn",
                "vocoder_fn",
                "campplus_model",
                "to_mel",
            ):
                if hasattr(self._app_vc, attr):
                    delattr(self._app_vc, attr)
            self._app_vc = None

        torch.cuda.empty_cache()

        if self._original_cwd:
            os.chdir(self._original_cwd)
            self._original_cwd = None

        self._loaded = False
        logger.info("SeedVC unloaded")

    def convert(
        self,
        source_wav_path: str,
        target_wav_path: str,
        diffusion_steps: int = DEFAULT_DIFFUSION_STEPS,
        cfg_rate: float = DEFAULT_CFG_RATE,
    ) -> np.ndarray:
        """Convert voice identity of source to match target.

        Both files must be 22050Hz mono WAV.

        Args:
            source_wav_path: Path to source audio (generated speech)
            target_wav_path: Path to target audio (reference voice)
            diffusion_steps: Number of diffusion steps (quality vs speed)
            cfg_rate: Classifier-free guidance rate

        Returns:
            Converted audio as float32 numpy array at 22050Hz mono. int16
            output is scaled by 1/32768, and the result is peak-normalised
            when its absolute peak exceeds 1.0.

        Raises:
            RuntimeError: If ``load()`` has not been called, or if SeedVC
                yields no audio or zero-length audio.
        """
        if not self._loaded:
            raise RuntimeError("SeedVC not loaded. Call load() first.")

        logger.info(
            "Converting voice: %s -> %s (%d steps, cfg_rate=%.2f)",
            source_wav_path,
            target_wav_path,
            diffusion_steps,
            cfg_rate,
        )

        vc_kwargs = {
            "source": source_wav_path,
            "target": target_wav_path,
            "diffusion_steps": diffusion_steps,
            "length_adjust": 1.0,
            "inference_cfg_rate": cfg_rate,
        }
        # n_quantizers removed in newer SeedVC versions
        sig = inspect.signature(self._app_vc.voice_conversion)
        if "n_quantizers" in sig.parameters:
            vc_kwargs["n_quantizers"] = 3

        audio_tuple = None
        for result in self._app_vc.voice_conversion(**vc_kwargs):
            if isinstance(result, tuple) and len(result) == 2:
                _, payload = result
                # Keep the most recent real payload; a yield that carries
                # no audio must not discard one that did.
                if payload is not None:
                    audio_tuple = payload

        if audio_tuple is None:
            raise RuntimeError("SeedVC produced no output")

        sample_rate, samples = audio_tuple
        samples = np.asarray(samples)
        if samples.size == 0:
            # ndarray.max() raises an opaque ValueError on empty input.
            raise RuntimeError("SeedVC produced empty audio")

        if samples.dtype == np.int16:
            samples = samples.astype(np.float32) / 32768.0
        elif samples.dtype != np.float32:
            samples = samples.astype(np.float32)

        peak = np.abs(samples).max()
        if peak > 1.0:
            samples = samples / peak

        logger.info("Converted: %.1fs at %dHz", len(samples) / sample_rate, sample_rate)
        return samples