File size: 5,968 Bytes
cdc4405
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
# Copyright (c) 2026 Scenema AI
# https://scenema.ai
# SPDX-License-Identifier: MIT

"""SeedVC voice conversion for Scenema Audio.

Converts the voice identity of generated audio to match a reference speaker
while preserving prosody, rhythm, and emotion. Uses the Seed-VC model with
DiT backbone, CAMPPlus speaker encoder, and BigVGAN vocoder.

Expects 22050Hz mono WAV input for both source and target.
"""

import inspect
import logging
import os
import sys
import types
from argparse import Namespace
from pathlib import Path

import numpy as np
import torch

# Module-level logger; handlers and levels are left to the application.
logger = logging.getLogger(__name__)

# Root of the Seed-VC checkout. The SEEDVC_PATH environment variable takes
# precedence over the built-in location.
DEFAULT_SEEDVC_PATH = Path(os.getenv("SEEDVC_PATH", "/app/seed-vc"))

# Defaults for SeedVC.convert(): number of diffusion steps and the
# classifier-free guidance rate.
DEFAULT_DIFFUSION_STEPS = 25
DEFAULT_CFG_RATE = 0.5


class SeedVC:
    """Voice conversion engine using Seed-VC.

    Converts source audio voice identity to match a target speaker
    while preserving the source's delivery, emotion, and pacing.

    Lifecycle: ``load()`` -> ``convert()`` (any number of times) -> ``unload()``.
    ``load()`` changes the process working directory to ``seedvc_path`` and
    ``unload()`` restores the previous one.
    """

    # Attributes of the ``app_vc`` module that hold loaded models; they are
    # deleted from the module on unload().
    _MODEL_ATTRS = (
        "model",
        "semantic_fn",
        "vocoder_fn",
        "campplus_model",
        "to_mel",
    )

    # Marker set on our BigVGAN wrapper function so that repeated load()
    # calls do not stack one wrapper on top of another.
    _PATCH_MARKER = "_seedvc_hub_compat_patched"

    def __init__(self, seedvc_path: Path = DEFAULT_SEEDVC_PATH):
        """Create an unloaded engine.

        Args:
            seedvc_path: Root of the Seed-VC checkout. A plain string is
                accepted and converted to a ``Path``.
        """
        # Coerce so the ``/`` path joins in load() also work for str input.
        self.seedvc_path = Path(seedvc_path)
        self._loaded = False
        self._original_cwd: str | None = None
        self._app_vc = None

    def load(self) -> None:
        """Load SeedVC models to GPU.

        Changes working directory to seedvc_path (required by SeedVC internals),
        stubs gradio, and loads all models via app_vc.load_models().

        Does nothing if the models are already loaded. If loading fails, the
        previous working directory is restored and the exception propagates.
        """
        if self._loaded:
            return

        logger.info("Loading SeedVC from %s", self.seedvc_path)

        previous_cwd = os.getcwd()
        os.chdir(self.seedvc_path)
        try:
            app_vc = self._import_and_load_models()
        except BaseException:
            # unload() is a no-op while _loaded is False, so nothing else
            # would ever move the process back out of seedvc_path.
            os.chdir(previous_cwd)
            raise

        self._original_cwd = previous_cwd
        self._app_vc = app_vc
        self._loaded = True
        logger.info("SeedVC loaded: sr=%d, device=%s", app_vc.sr, app_vc.device)

    def _import_and_load_models(self):
        """Import ``app_vc`` from the checkout, load its models, return the module."""
        # Register an empty stand-in so ``import gradio`` succeeds without the
        # real package (presumably app_vc imports it for its demo UI).
        if "gradio" not in sys.modules:
            sys.modules["gradio"] = types.ModuleType("gradio")

        seedvc_str = str(self.seedvc_path)
        if seedvc_str not in sys.path:
            sys.path.insert(0, seedvc_str)

        # Only applies when the caller has not already chosen a cache location.
        os.environ.setdefault(
            "HF_HUB_CACHE",
            str(self.seedvc_path / "checkpoints" / "hf_cache"),
        )

        self._patch_bigvgan_from_pretrained()

        # Load models (exact pattern from gpu_vc/seedvc_engine.py)
        import app_vc

        app_vc.device = torch.device("cuda")

        args = Namespace(checkpoint=None, config=None, fp16=True, gpu=0)
        (
            app_vc.model,
            app_vc.semantic_fn,
            app_vc.vocoder_fn,
            app_vc.campplus_model,
            app_vc.to_mel,
            app_vc.mel_fn_args,
        ) = app_vc.load_models(args)

        app_vc.max_context_window = app_vc.sr // app_vc.hop_length * 30
        app_vc.overlap_wave_len = app_vc.overlap_frame_len * app_vc.hop_length
        return app_vc

    @classmethod
    def _patch_bigvgan_from_pretrained(cls) -> None:
        """Patch BigVGAN for huggingface_hub compat (same as gpu_vc).

        Wraps ``BigVGAN._from_pretrained`` so that the ``proxies`` and
        ``resume_download`` keyword arguments are always present. The wrapper
        is installed at most once per process.
        """
        import modules.bigvgan.bigvgan as bigvgan_mod

        original = bigvgan_mod.BigVGAN._from_pretrained
        already_patched = getattr(
            getattr(original, "__func__", None), cls._PATCH_MARKER, False
        )
        if already_patched:
            return

        def _patched(model_cls, **kwargs):
            kwargs.setdefault("proxies", None)
            kwargs.setdefault("resume_download", False)
            return original.__func__(model_cls, **kwargs)

        setattr(_patched, cls._PATCH_MARKER, True)
        bigvgan_mod.BigVGAN._from_pretrained = classmethod(_patched)

    def unload(self) -> None:
        """Free SeedVC models from GPU.

        Deletes the model attributes from the ``app_vc`` module, empties the
        CUDA cache, and restores the working directory saved by load().
        Does nothing if the models are not loaded.
        """
        if not self._loaded:
            return

        if self._app_vc is not None:
            for attr in self._MODEL_ATTRS:
                if hasattr(self._app_vc, attr):
                    delattr(self._app_vc, attr)
            self._app_vc = None

        torch.cuda.empty_cache()

        if self._original_cwd is not None:
            os.chdir(self._original_cwd)
            self._original_cwd = None

        self._loaded = False
        logger.info("SeedVC unloaded")

    def convert(
        self,
        source_wav_path: str,
        target_wav_path: str,
        diffusion_steps: int = DEFAULT_DIFFUSION_STEPS,
        cfg_rate: float = DEFAULT_CFG_RATE,
    ) -> np.ndarray:
        """Convert voice identity of source to match target.

        Both files must be 22050Hz mono WAV. Because load() changes the
        working directory to ``seedvc_path``, relative paths are resolved
        against that directory; prefer absolute paths.

        Args:
            source_wav_path: Path to source audio (generated speech)
            target_wav_path: Path to target audio (reference voice)
            diffusion_steps: Number of diffusion steps (quality vs speed)
            cfg_rate: Classifier-free guidance rate

        Returns:
            Converted audio as float32 numpy array at 22050Hz mono, scaled
            down to a peak of 1.0 if it would otherwise exceed that.

        Raises:
            RuntimeError: If load() has not been called, or if SeedVC yields
                no audio or zero-length audio.
        """
        if not self._loaded:
            raise RuntimeError("SeedVC not loaded. Call load() first.")

        logger.info(
            "Converting voice: %s -> %s (%d steps, cfg_rate=%.2f)",
            source_wav_path,
            target_wav_path,
            diffusion_steps,
            cfg_rate,
        )

        vc_kwargs = {
            "source": source_wav_path,
            "target": target_wav_path,
            "diffusion_steps": diffusion_steps,
            "length_adjust": 1.0,
            "inference_cfg_rate": cfg_rate,
        }
        # n_quantizers removed in newer SeedVC versions
        sig = inspect.signature(self._app_vc.voice_conversion)
        if "n_quantizers" in sig.parameters:
            vc_kwargs["n_quantizers"] = 3

        # voice_conversion is iterated as a generator of 2-tuples. Keep the
        # last tuple whose second element is set, so that an item carrying
        # None there cannot discard audio that was already produced.
        audio_tuple = None
        for result in self._app_vc.voice_conversion(**vc_kwargs):
            if (
                isinstance(result, tuple)
                and len(result) == 2
                and result[1] is not None
            ):
                audio_tuple = result[1]

        if audio_tuple is None:
            raise RuntimeError("SeedVC produced no output")

        sample_rate, samples = audio_tuple
        samples = np.asarray(samples)
        if samples.size == 0:
            # Reducing a zero-size array with max() raises ValueError; report
            # the condition in the same way as a missing result instead.
            raise RuntimeError("SeedVC produced empty audio")

        if samples.dtype == np.int16:
            # Map the int16 PCM range onto [-1.0, 1.0).
            samples = samples.astype(np.float32) / 32768.0
        elif samples.dtype != np.float32:
            samples = samples.astype(np.float32)

        peak = np.abs(samples).max()
        if peak > 1.0:
            samples = samples / peak

        logger.info("Converted: %.1fs at %dHz", len(samples) / sample_rate, sample_rate)
        return samples