# Copyright (c) 2026 Scenema AI
# https://scenema.ai
# SPDX-License-Identifier: MIT
"""SeedVC voice conversion for Scenema Audio.
Converts the voice identity of generated audio to match a reference speaker
while preserving prosody, rhythm, and emotion. Uses the Seed-VC model with
DiT backbone, CAMPPlus speaker encoder, and BigVGAN vocoder.
Expects 22050Hz mono WAV input for both source and target.
"""
import inspect
import logging
import os
import sys
import types
from argparse import Namespace
from pathlib import Path
import numpy as np
import torch
logger = logging.getLogger(__name__)

# Root of the Seed-VC checkout; overridable through the SEEDVC_PATH env var.
DEFAULT_SEEDVC_PATH = Path(os.environ.get("SEEDVC_PATH", "/app/seed-vc"))
# Defaults forwarded to app_vc.voice_conversion() by SeedVC.convert().
DEFAULT_DIFFUSION_STEPS = 25
DEFAULT_CFG_RATE = 0.5


class SeedVC:
    """Voice conversion engine using Seed-VC.

    Converts source audio voice identity to match a target speaker
    while preserving the source's delivery, emotion, and pacing.

    Lifecycle: ``load()`` -> any number of ``convert()`` calls -> ``unload()``.
    ``load()`` changes the process working directory and mutates ``sys.path``,
    ``sys.modules`` and ``os.environ``; ``unload()`` restores only the working
    directory.
    """

    def __init__(self, seedvc_path: Path = DEFAULT_SEEDVC_PATH):
        """Remember where the Seed-VC checkout lives; nothing is loaded yet.

        Args:
            seedvc_path: Directory containing the Seed-VC sources. A plain
                string is accepted and converted to ``Path``.
        """
        # Coerce so that load() can safely build sub-paths with the "/" operator.
        self.seedvc_path = Path(seedvc_path)
        self._loaded = False
        self._original_cwd: str | None = None
        self._app_vc = None

    def load(self) -> None:
        """Load SeedVC models to GPU.

        Changes working directory to seedvc_path (required by SeedVC internals),
        stubs gradio, and loads all models via app_vc.load_models().
        Calling it again while already loaded is a no-op.

        Raises:
            Exception: Whatever the Seed-VC import or model loading raises. The
                original working directory is restored before re-raising.
        """
        if self._loaded:
            return
        logger.info("Loading SeedVC from %s", self.seedvc_path)

        previous_cwd = os.getcwd()
        os.chdir(self.seedvc_path)
        try:
            app_vc = self._import_and_load_models()
        except BaseException:
            # unload() is a no-op while _loaded is False, so without this the
            # process would stay stranded inside seedvc_path after a failure.
            os.chdir(previous_cwd)
            raise

        self._original_cwd = previous_cwd
        self._app_vc = app_vc
        self._loaded = True
        logger.info("SeedVC loaded: sr=%d, device=%s", app_vc.sr, app_vc.device)

    def _import_and_load_models(self):
        """Import Seed-VC's ``app_vc`` module and populate its model globals."""
        # Register an empty stand-in so that importing gradio inside Seed-VC
        # does not require the real package.
        if "gradio" not in sys.modules:
            sys.modules["gradio"] = types.ModuleType("gradio")

        seedvc_str = str(self.seedvc_path)
        if seedvc_str not in sys.path:
            sys.path.insert(0, seedvc_str)

        # Respect a cache location the caller has already configured.
        os.environ.setdefault(
            "HF_HUB_CACHE",
            str(self.seedvc_path / "checkpoints" / "hf_cache"),
        )

        self._patch_bigvgan()

        # Load models (exact pattern from gpu_vc/seedvc_engine.py)
        import app_vc

        app_vc.device = torch.device("cuda")
        args = Namespace(checkpoint=None, config=None, fp16=True, gpu=0)
        (
            app_vc.model,
            app_vc.semantic_fn,
            app_vc.vocoder_fn,
            app_vc.campplus_model,
            app_vc.to_mel,
            app_vc.mel_fn_args,
        ) = app_vc.load_models(args)
        # NOTE(review): the factor 30 looks like a context length in seconds
        # expressed in frames -- confirm against app_vc.
        app_vc.max_context_window = app_vc.sr // app_vc.hop_length * 30
        app_vc.overlap_wave_len = app_vc.overlap_frame_len * app_vc.hop_length
        return app_vc

    @staticmethod
    def _patch_bigvgan() -> None:
        """Patch BigVGAN for huggingface_hub compat (same as gpu_vc).

        Wraps ``BigVGAN._from_pretrained`` so that ``proxies`` and
        ``resume_download`` are always supplied. The patch is applied at most
        once per process, so repeated load/unload cycles do not stack wrappers.
        """
        import modules.bigvgan.bigvgan as _bigvgan_mod

        bigvgan_cls = _bigvgan_mod.BigVGAN
        if getattr(bigvgan_cls, "_scenema_hub_compat", False):
            return
        _orig = bigvgan_cls._from_pretrained

        @classmethod
        def _patched(cls, **kwargs):
            kwargs.setdefault("proxies", None)
            kwargs.setdefault("resume_download", False)
            # _orig is bound to the class it was read from; go through __func__
            # so the call is dispatched on the actual cls.
            return _orig.__func__(cls, **kwargs)

        bigvgan_cls._from_pretrained = _patched
        bigvgan_cls._scenema_hub_compat = True

    def unload(self) -> None:
        """Free SeedVC models from GPU and restore the working directory.

        Does nothing when the engine is not loaded.
        """
        if not self._loaded:
            return

        if self._app_vc is not None:
            # Drop the module-level references created by load().
            for attr in (
                "model",
                "semantic_fn",
                "vocoder_fn",
                "campplus_model",
                "to_mel",
            ):
                if hasattr(self._app_vc, attr):
                    delattr(self._app_vc, attr)
            self._app_vc = None
        torch.cuda.empty_cache()

        # Mark as unloaded before touching the filesystem so that a failing
        # chdir cannot leave the object claiming to be loaded without models.
        self._loaded = False
        previous_cwd, self._original_cwd = self._original_cwd, None
        if previous_cwd:
            os.chdir(previous_cwd)
        logger.info("SeedVC unloaded")

    def convert(
        self,
        source_wav_path: str,
        target_wav_path: str,
        diffusion_steps: int = DEFAULT_DIFFUSION_STEPS,
        cfg_rate: float = DEFAULT_CFG_RATE,
    ) -> np.ndarray:
        """Convert voice identity of source to match target.

        Both files must be 22050Hz mono WAV.

        Args:
            source_wav_path: Path to source audio (generated speech)
            target_wav_path: Path to target audio (reference voice)
            diffusion_steps: Number of diffusion steps (quality vs speed)
            cfg_rate: Classifier-free guidance rate

        Returns:
            Converted audio as a float32 numpy array. int16 output is rescaled
            by 1/32768 and anything peaking above 1.0 is scaled down to a peak
            of 1.0. The sample rate is whatever Seed-VC reports (expected to be
            22050Hz mono); it is logged but not returned.

        Raises:
            RuntimeError: If load() has not been called, or if Seed-VC yields
                no audio (including a zero-length waveform).
        """
        if not self._loaded:
            raise RuntimeError("SeedVC not loaded. Call load() first.")

        logger.info(
            "Converting voice: %s -> %s (%d steps, cfg_rate=%.2f)",
            source_wav_path,
            target_wav_path,
            diffusion_steps,
            cfg_rate,
        )

        vc_kwargs = {
            "source": source_wav_path,
            "target": target_wav_path,
            "diffusion_steps": diffusion_steps,
            "length_adjust": 1.0,
            "inference_cfg_rate": cfg_rate,
        }
        # n_quantizers removed in newer SeedVC versions
        sig = inspect.signature(self._app_vc.voice_conversion)
        if "n_quantizers" in sig.parameters:
            vc_kwargs["n_quantizers"] = 3

        # voice_conversion is consumed as a generator of 2-tuples whose second
        # item is either None or (sample_rate, samples). Keep the last non-None
        # one so that a later empty progress update cannot discard the audio.
        audio_tuple = None
        for result in self._app_vc.voice_conversion(**vc_kwargs):
            if isinstance(result, tuple) and len(result) == 2:
                if result[1] is not None:
                    audio_tuple = result[1]
        if audio_tuple is None:
            raise RuntimeError("SeedVC produced no output")

        sample_rate, samples = audio_tuple
        samples = np.asarray(samples)
        if samples.size == 0:
            # .max() on a zero-size array raises an opaque ValueError; report
            # the real problem instead.
            raise RuntimeError("SeedVC produced no output")

        if samples.dtype == np.int16:
            samples = samples.astype(np.float32) / 32768.0
        elif samples.dtype != np.float32:
            samples = samples.astype(np.float32)

        # Only attenuate; audio that already fits in [-1, 1] is left untouched.
        peak = np.abs(samples).max()
        if peak > 1.0:
            samples = samples / peak

        logger.info("Converted: %.1fs at %dHz", len(samples) / sample_rate, sample_rate)
        return samples
|