Spaces:
Paused
Paused
刘鑫 committed on
Commit ·
09dc185
1
Parent(s): 29b439e
fix: switch Gradio demo backend to nanovllm_voxcpm
Browse files
Keep the existing VoxCPM demo UI while routing inference through Nano-vLLM on persistent GPU Spaces. Add runtime/system dependency setup so the managed Gradio environment can attempt the non-Docker backend path.
Made-with: Cursor
- README.md +32 -1
- app.py +238 -136
- packages.txt +7 -0
- requirements.txt +13 -16
README.md
CHANGED
|
@@ -9,5 +9,36 @@ app_file: app.py
|
|
| 9 |
python_version: "3.10"
|
| 10 |
pinned: true
|
| 11 |
license: apache-2.0
|
| 12 |
-
short_description: VoxCPM2
|
| 13 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
python_version: "3.10"
|
| 10 |
pinned: true
|
| 11 |
license: apache-2.0
|
| 12 |
+
short_description: VoxCPM2 Nano-vLLM Demo
|
| 13 |
---
|
| 14 |
+
|
| 15 |
+
Experimental Gradio Space demo for `VoxCPM2` powered by `nanovllm-voxcpm`.
|
| 16 |
+
|
| 17 |
+
This repo keeps the existing Gradio frontend layout and swaps only the backend inference path to Nano-vLLM.
|
| 18 |
+
|
| 19 |
+
Notes:
|
| 20 |
+
|
| 21 |
+
- This is the non-Docker experiment path. It relies on a persistent GPU Gradio Space.
|
| 22 |
+
- `flash-attn` and `nanovllm-voxcpm` are installed at runtime on first backend initialization.
|
| 23 |
+
- The first generation request may spend extra time installing dependencies, downloading the model, and loading the server.
|
| 24 |
+
- `ASR_DEVICE` defaults to `cpu` so that the ASR model does not compete with the TTS backend for GPU memory.
|
| 25 |
+
- The `LocDiT flow-matching steps` slider is wired to Nano-vLLM server `inference_timesteps`; changing it rebuilds the backend server.
|
| 26 |
+
- The existing `normalize` / `denoise` frontend toggles are kept for UI compatibility, but Nano-vLLM currently ignores them.
|
| 27 |
+
- `packages.txt` is required because this path needs extra system build dependencies.
|
| 28 |
+
|
| 29 |
+
Recommended environment variables:
|
| 30 |
+
|
| 31 |
+
- `HF_REPO_ID`: Hugging Face model repo id. Defaults to `openbmb/VoxCPM2`
|
| 32 |
+
- `HF_TOKEN`: required if the model repo is private
|
| 33 |
+
- `NANOVLLM_MODEL`: optional direct model ref override. Can be a local path or HF repo id
|
| 34 |
+
- `NANOVLLM_MODEL_PATH`: optional local model path override
|
| 35 |
+
- `ASR_DEVICE`: defaults to `cpu`
|
| 36 |
+
- `NANOVLLM_INFERENCE_TIMESTEPS`: initial default is `10`
|
| 37 |
+
- `NANOVLLM_SERVERPOOL_MAX_NUM_BATCHED_TOKENS`: defaults to `8192`
|
| 38 |
+
- `NANOVLLM_SERVERPOOL_MAX_NUM_SEQS`: defaults to `16`
|
| 39 |
+
- `NANOVLLM_SERVERPOOL_MAX_MODEL_LEN`: defaults to `4096`
|
| 40 |
+
- `NANOVLLM_SERVERPOOL_GPU_MEMORY_UTILIZATION`: defaults to `0.95`
|
| 41 |
+
- `NANOVLLM_SERVERPOOL_ENFORCE_EAGER`: defaults to `false`
|
| 42 |
+
- `NANOVLLM_SERVERPOOL_DEVICES`: defaults to `0`
|
| 43 |
+
- `NANOVLLM_MAX_GENERATE_LENGTH`: defaults to `2000`
|
| 44 |
+
- `NANOVLLM_TEMPERATURE`: defaults to `1.0`
|
app.py
CHANGED
|
@@ -1,54 +1,27 @@
|
|
|
|
|
| 1 |
import logging
|
| 2 |
import os
|
| 3 |
import subprocess
|
| 4 |
import sys
|
| 5 |
from pathlib import Path
|
|
|
|
| 6 |
from typing import Optional, Tuple
|
| 7 |
|
| 8 |
-
def _ensure_torchaudio():
    """Install torchaudio matching ZeroGPU's pre-installed torch + CUDA version.

    Does nothing when ``torchaudio`` already imports cleanly. Otherwise the
    wheel index is chosen from the CUDA version reported by the installed
    ``torch`` (or the CPU index when there is none) and a ``torchaudio``
    release with the same base version as ``torch`` is installed via pip.

    Raises:
        subprocess.CalledProcessError: If the pip invocation exits non-zero.
    """
    try:
        import torchaudio  # noqa: F401
    except (ImportError, OSError):
        # Missing or unloadable: fall through to the install path below.
        pass
    else:
        return

    import torch

    cuda_version = torch.version.cuda
    wheel_tag = "cu" + cuda_version.replace(".", "") if cuda_version else "cpu"
    # Drop any local build suffix (the part after "+") from the torch version.
    base_version = torch.__version__.split("+")[0]

    pip_command = [
        sys.executable, "-m", "pip", "install", "--no-deps",
        "--index-url", f"https://download.pytorch.org/whl/{wheel_tag}",
        f"torchaudio=={base_version}",
    ]
    subprocess.check_call(pip_command)
|
| 28 |
-
|
| 29 |
-
_ensure_torchaudio()
|
| 30 |
-
|
| 31 |
-
try:
|
| 32 |
-
import voxcpm # noqa: F401
|
| 33 |
-
except ImportError:
|
| 34 |
-
subprocess.check_call([
|
| 35 |
-
sys.executable, "-m", "pip", "install", "--no-deps",
|
| 36 |
-
"voxcpm @ git+https://github.com/OpenBMB/VoxCPM.git@dev_2.0",
|
| 37 |
-
])
|
| 38 |
-
import voxcpm # noqa: F401
|
| 39 |
-
|
| 40 |
import gradio as gr
|
| 41 |
import numpy as np
|
| 42 |
-
import spaces
|
| 43 |
-
import torch
|
| 44 |
|
| 45 |
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
| 46 |
os.environ["OPENBLAS_NUM_THREADS"] = "4"
|
| 47 |
os.environ["OMP_NUM_THREADS"] = "4"
|
| 48 |
os.environ["MKL_NUM_THREADS"] = "4"
|
| 49 |
|
| 50 |
-
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
logging.basicConfig(
|
| 54 |
level=logging.INFO,
|
|
@@ -57,6 +30,132 @@ logging.basicConfig(
|
|
| 57 |
)
|
| 58 |
logger = logging.getLogger(__name__)
|
| 59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
# ---------- Inline i18n (en + zh-CN only) ----------
|
| 61 |
|
| 62 |
_USAGE_INSTRUCTIONS_EN = (
|
|
@@ -253,62 +352,15 @@ _APP_THEME = gr.themes.Soft(
|
|
| 253 |
font=[gr.themes.GoogleFont("Inter"), "Arial", "sans-serif"],
|
| 254 |
)
|
| 255 |
|
| 256 |
-
# ---------- Model Pre-download & Loading ----------
|
| 257 |
-
|
| 258 |
-
ASR_LOCAL_DIR = "./models/SenseVoiceSmall"
|
| 259 |
-
VOXCPM_LOCAL_DIR = "./models/VoxCPM2"
|
| 260 |
-
|
| 261 |
-
_asr_model = None
|
| 262 |
-
_voxcpm_model = None
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
def predownload_models():
    """Download the ASR and VoxCPM model snapshots if they are not on disk.

    A model counts as present when its local directory exists and contains
    its marker file (``model.pt`` for ASR, ``config.json`` for VoxCPM).
    Download failures are logged as warnings and never raised.
    """
    from huggingface_hub import snapshot_download

    def _fetch(label, repo_id, target_dir, marker_file, start_message):
        # One download attempt for one model; best-effort by design.
        marker_path = os.path.join(target_dir, marker_file)
        if os.path.isdir(target_dir) and os.path.exists(marker_path):
            logger.info(f"{label} model already at {target_dir}")
            return

        logger.info(start_message)
        os.makedirs(target_dir, exist_ok=True)
        try:
            snapshot_download(repo_id=repo_id, local_dir=target_dir)
            logger.info(f"{label} model downloaded.")
        except Exception as exc:
            logger.warning(f"Failed to pre-download {label} model: {exc}")

    _fetch(
        "ASR",
        "FunAudioLLM/SenseVoiceSmall",
        ASR_LOCAL_DIR,
        "model.pt",
        f"Pre-downloading ASR model to {ASR_LOCAL_DIR} ...",
    )

    voxcpm_repo_id = os.environ.get("HF_REPO_ID", "openbmb/VoxCPM2")
    _fetch(
        "VoxCPM",
        voxcpm_repo_id,
        VOXCPM_LOCAL_DIR,
        "config.json",
        f"Pre-downloading VoxCPM model {voxcpm_repo_id} to {VOXCPM_LOCAL_DIR} ...",
    )
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
predownload_models()
|
| 301 |
-
|
| 302 |
-
|
| 303 |
def get_asr_model():
|
| 304 |
global _asr_model
|
| 305 |
if _asr_model is None:
|
| 306 |
from funasr import AutoModel
|
| 307 |
|
| 308 |
-
device = "
|
| 309 |
logger.info(f"Loading ASR model on {device} ...")
|
| 310 |
_asr_model = AutoModel(
|
| 311 |
-
model=
|
| 312 |
disable_update=True,
|
| 313 |
log_level="INFO",
|
| 314 |
device=device,
|
|
@@ -317,40 +369,66 @@ def get_asr_model():
|
|
| 317 |
return _asr_model
|
| 318 |
|
| 319 |
|
| 320 |
-
def
|
| 321 |
-
global
|
| 322 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 323 |
logger.info(
|
| 324 |
-
f"
|
| 325 |
-
f"
|
| 326 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 327 |
|
| 328 |
-
if torch.cuda.is_available():
|
| 329 |
-
torch.backends.cuda.enable_flash_sdp(False)
|
| 330 |
-
torch.backends.cuda.enable_mem_efficient_sdp(False)
|
| 331 |
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
)
|
| 336 |
-
|
| 337 |
-
return
|
| 338 |
|
| 339 |
|
| 340 |
# ---------- GPU-accelerated inference ----------
|
| 341 |
|
| 342 |
|
| 343 |
-
@spaces.GPU
|
| 344 |
def prompt_wav_recognition(use_prompt_text: bool, prompt_wav: Optional[str]) -> str:
|
| 345 |
if not use_prompt_text or prompt_wav is None or not prompt_wav.strip():
|
| 346 |
return ""
|
| 347 |
|
| 348 |
asr_model = get_asr_model()
|
| 349 |
res = asr_model.generate(input=prompt_wav, language="auto", use_itn=True)
|
| 350 |
-
return
|
| 351 |
|
| 352 |
|
| 353 |
-
@spaces.GPU(duration=600)
|
| 354 |
def generate_tts_audio(
|
| 355 |
text_input: str,
|
| 356 |
control_instruction: str = "",
|
|
@@ -362,7 +440,9 @@ def generate_tts_audio(
|
|
| 362 |
denoise: bool = True,
|
| 363 |
inference_timesteps: int = 10,
|
| 364 |
) -> Tuple[int, np.ndarray]:
|
| 365 |
-
|
|
|
|
|
|
|
| 366 |
|
| 367 |
text = (text_input or "").strip()
|
| 368 |
if len(text) == 0:
|
|
@@ -371,40 +451,65 @@ def generate_tts_audio(
|
|
| 371 |
control = (control_instruction or "").strip()
|
| 372 |
final_text = f"({control}){text}" if control and not use_prompt_text else text
|
| 373 |
|
| 374 |
-
|
| 375 |
-
prompt_text_clean = (prompt_text_input or "").strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 376 |
if not use_prompt_text:
|
| 377 |
-
prompt_text_clean =
|
| 378 |
-
|
| 379 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 380 |
logger.info("[Ultimate Cloning] reference audio + transcript")
|
| 381 |
-
elif
|
| 382 |
logger.info("[Controllable Cloning] reference audio only")
|
| 383 |
else:
|
| 384 |
logger.info(f"[Voice Design] control: {control[:50] if control else 'None'}")
|
| 385 |
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 389 |
cfg_value=float(cfg_value_input),
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
)
|
| 394 |
-
if prompt_text_clean and audio_path:
|
| 395 |
-
generate_kwargs["prompt_wav_path"] = audio_path
|
| 396 |
-
generate_kwargs["prompt_text"] = prompt_text_clean
|
| 397 |
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
|
|
|
|
|
|
| 401 |
|
| 402 |
|
| 403 |
# ---------- UI ----------
|
| 404 |
|
| 405 |
|
| 406 |
def create_demo_interface():
|
| 407 |
-
|
|
|
|
|
|
|
| 408 |
|
| 409 |
def _on_toggle_instant(checked):
|
| 410 |
if checked:
|
|
@@ -420,21 +525,18 @@ def create_demo_interface():
|
|
| 420 |
def _run_asr_if_needed(checked, audio_path):
|
| 421 |
if not checked or not audio_path:
|
| 422 |
return gr.update()
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
return gr.update(value=asr_text)
|
| 428 |
-
except Exception as e:
|
| 429 |
-
logger.warning(f"ASR recognition failed: {e}")
|
| 430 |
-
return gr.update(value="")
|
| 431 |
|
| 432 |
with gr.Blocks() as interface:
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
|
|
|
| 438 |
|
| 439 |
gr.Markdown(I18N("usage_instructions"))
|
| 440 |
|
|
|
|
| 1 |
+
import atexit
|
| 2 |
import logging
|
| 3 |
import os
|
| 4 |
import subprocess
|
| 5 |
import sys
|
| 6 |
from pathlib import Path
|
| 7 |
+
from threading import Lock
|
| 8 |
from typing import Optional, Tuple
|
| 9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
import gradio as gr
|
| 11 |
import numpy as np
|
|
|
|
|
|
|
| 12 |
|
| 13 |
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
| 14 |
os.environ["OPENBLAS_NUM_THREADS"] = "4"
|
| 15 |
os.environ["OMP_NUM_THREADS"] = "4"
|
| 16 |
os.environ["MKL_NUM_THREADS"] = "4"
|
| 17 |
|
| 18 |
+
# Model reference used when no override is configured in the environment.
DEFAULT_MODEL_REF = "openbmb/VoxCPM2"

# Seed HF_REPO_ID only when none of the model-selection variables carries a
# non-blank value, so an explicit override is never clobbered.
_model_env_names = ("NANOVLLM_MODEL", "NANOVLLM_MODEL_PATH", "HF_REPO_ID")
if not any(os.environ.get(_env_name, "").strip() for _env_name in _model_env_names):
    os.environ["HF_REPO_ID"] = DEFAULT_MODEL_REF
|
| 25 |
|
| 26 |
logging.basicConfig(
|
| 27 |
level=logging.INFO,
|
|
|
|
| 30 |
)
|
| 31 |
logger = logging.getLogger(__name__)
|
| 32 |
|
| 33 |
+
_asr_model = None
|
| 34 |
+
_voxcpm_server = None
|
| 35 |
+
_model_info = None
|
| 36 |
+
_server_inference_timesteps = None
|
| 37 |
+
_server_lock = Lock()
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _get_int_env(name: str, default: int) -> int:
    """Read an integer setting from the environment.

    Args:
        name: Environment variable to read.
        default: Value returned when the variable is unset or blank.

    Returns:
        The parsed integer, or ``default``.

    Raises:
        ValueError: If the variable holds text that ``int()`` rejects. The
            message names the variable so a misconfigured Space is easy to
            diagnose.
    """
    value = os.environ.get(name, "").strip()
    if not value:
        return default
    try:
        return int(value)
    except ValueError:
        # Same error shape as _get_bool_env: say which variable is wrong.
        raise ValueError(f"Invalid integer env: {name}={value!r}") from None
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def _get_float_env(name: str, default: float) -> float:
    """Read a floating-point setting from the environment.

    Args:
        name: Environment variable to read.
        default: Value returned when the variable is unset or blank.

    Returns:
        The parsed float, or ``default``.

    Raises:
        ValueError: If the variable holds text that ``float()`` rejects. The
            message names the variable.
    """
    value = os.environ.get(name, "").strip()
    if not value:
        return default
    try:
        return float(value)
    except ValueError:
        # Same error shape as _get_bool_env: say which variable is wrong.
        raise ValueError(f"Invalid float env: {name}={value!r}") from None
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def _get_bool_env(name: str, default: bool) -> bool:
    """Read a boolean flag from the environment.

    Matching is case-insensitive and ignores surrounding whitespace.
    ``1/true/yes/on`` mean True and ``0/false/no/off`` mean False.

    Args:
        name: Environment variable to read.
        default: Value returned when the variable is unset or blank.

    Raises:
        ValueError: If the variable holds any other text.
    """
    value = os.environ.get(name, "").strip().lower()
    if value == "":
        return default

    truthy = ("1", "true", "yes", "on")
    falsy = ("0", "false", "no", "off")
    if value not in truthy and value not in falsy:
        raise ValueError(f"Invalid boolean env: {name}={value!r}")
    return value in truthy
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def _get_devices_env() -> list[int]:
    """Parse ``NANOVLLM_SERVERPOOL_DEVICES`` into a list of device indices.

    The variable is a comma-separated list of integers; blank entries are
    ignored. An unset or effectively empty variable yields ``[0]``.

    Raises:
        ValueError: If any entry is not an integer. The message names the
            variable and shows the raw value.
    """
    raw = os.environ.get("NANOVLLM_SERVERPOOL_DEVICES", "0").strip()
    values = [part.strip() for part in raw.split(",") if part.strip()]
    if not values:
        return [0]
    try:
        return [int(part) for part in values]
    except ValueError:
        # A bare int() error would not say which setting is malformed.
        raise ValueError(
            f"Invalid device list env: NANOVLLM_SERVERPOOL_DEVICES={raw!r}"
        ) from None
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def _resolve_model_ref() -> str:
    """Pick the model reference to load.

    Checks ``NANOVLLM_MODEL``, ``NANOVLLM_MODEL_PATH`` and ``HF_REPO_ID`` in
    that order and returns the first non-blank value with surrounding
    whitespace removed. Falls back to ``DEFAULT_MODEL_REF`` when all three
    are unset or blank.
    """
    env_names = ("NANOVLLM_MODEL", "NANOVLLM_MODEL_PATH", "HF_REPO_ID")
    stripped_values = (os.environ.get(env_name, "").strip() for env_name in env_names)
    chosen = next(filter(None, stripped_values), None)
    return DEFAULT_MODEL_REF if chosen is None else chosen
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def _ensure_nanovllm_runtime() -> None:
    """Install the Nano-vLLM backend packages on demand.

    Tries to import ``flash_attn`` and ``nanovllm_voxcpm``; whichever fails
    with ``ImportError`` is installed with pip using the running interpreter.
    ``flash-attn`` is installed with ``--no-build-isolation`` and
    ``nanovllm-voxcpm`` is installed from its Git repository with
    ``--no-deps``.

    Raises:
        subprocess.CalledProcessError: If a pip invocation exits non-zero.
    """
    import importlib

    def _pip_install(*pip_args: str) -> None:
        # Run "pip install" in this interpreter's environment.
        subprocess.check_call([sys.executable, "-m", "pip", "install", *pip_args])
        # The package appeared after interpreter start-up; drop the import
        # finders' cached directory listings so the caller's import sees it.
        importlib.invalidate_caches()

    try:
        import flash_attn  # noqa: F401
    except ImportError:
        logger.info("Installing flash-attn at runtime ...")
        _pip_install("--no-build-isolation", "flash-attn")

    try:
        import nanovllm_voxcpm  # noqa: F401
    except ImportError:
        logger.info("Installing nanovllm-voxcpm at runtime ...")
        _pip_install(
            "--no-deps",
            "git+https://github.com/a710128/nanovllm-voxcpm.git",
        )
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def _extract_asr_text(asr_result) -> str:
    """Pull the transcript string out of an ASR result.

    Only the first element of ``asr_result`` is inspected, and only if it is
    a dict. Its ``"text"`` value is cut at the last ``"|>"`` and the
    remainder is stripped (presumably this drops leading ``<|...|>`` tags
    emitted by the ASR model; confirm against the model's output format).

    Args:
        asr_result: Sequence returned by the ASR model's ``generate`` call,
            or any falsy value.

    Returns:
        The cleaned transcript. Returns ``""`` when the result is empty, the
        first item is not a dict, or its text is missing or ``None``.
    """
    if not asr_result:
        return ""

    first_item = asr_result[0]
    if not isinstance(first_item, dict):
        return ""

    text = first_item.get("text")
    if text is None:
        # str(None) would leak the literal "None" into the transcript box.
        return ""
    return str(text).rpartition("|>")[2].strip()
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
def _read_audio_bytes(audio_path: Optional[str]) -> tuple[bytes | None, str | None]:
|
| 124 |
+
if audio_path is None or not audio_path.strip():
|
| 125 |
+
return None, None
|
| 126 |
+
|
| 127 |
+
path = Path(audio_path)
|
| 128 |
+
audio_format = path.suffix.lstrip(".").lower() or "wav"
|
| 129 |
+
return path.read_bytes(), audio_format
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def _safe_prompt_wav_recognition(use_prompt_text: bool, prompt_wav: Optional[str]) -> str:
    """Run ``prompt_wav_recognition`` without letting a failure escape.

    Any exception raised by the recognition call is logged as a warning and
    an empty transcript is returned instead.
    """
    transcript = ""
    try:
        transcript = prompt_wav_recognition(use_prompt_text, prompt_wav)
    except Exception as exc:
        logger.warning(f"ASR recognition failed: {exc}")
    return transcript
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def _stop_server_if_needed() -> None:
    """Shut down the cached nano-vLLM server, if any, and clear the cache.

    If the server object has a callable ``stop`` attribute it is called; an
    exception from it is logged as a warning and otherwise ignored. The
    module-level server, model-info and timestep globals are then reset to
    ``None``.
    """
    global _voxcpm_server, _model_info, _server_inference_timesteps

    server = _voxcpm_server
    if server is None:
        return

    shutdown = getattr(server, "stop", None)
    if callable(shutdown):
        try:
            shutdown()
        except Exception as exc:
            logger.warning(f"Failed to stop nano-vLLM server cleanly: {exc}")

    _voxcpm_server = _model_info = _server_inference_timesteps = None
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
atexit.register(_stop_server_if_needed)
|
| 158 |
+
|
| 159 |
# ---------- Inline i18n (en + zh-CN only) ----------
|
| 160 |
|
| 161 |
_USAGE_INSTRUCTIONS_EN = (
|
|
|
|
| 352 |
font=[gr.themes.GoogleFont("Inter"), "Arial", "sans-serif"],
|
| 353 |
)
|
| 354 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 355 |
def get_asr_model():
|
| 356 |
global _asr_model
|
| 357 |
if _asr_model is None:
|
| 358 |
from funasr import AutoModel
|
| 359 |
|
| 360 |
+
device = os.environ.get("ASR_DEVICE", "cpu").strip() or "cpu"
|
| 361 |
logger.info(f"Loading ASR model on {device} ...")
|
| 362 |
_asr_model = AutoModel(
|
| 363 |
+
model="iic/SenseVoiceSmall",
|
| 364 |
disable_update=True,
|
| 365 |
log_level="INFO",
|
| 366 |
device=device,
|
|
|
|
| 369 |
return _asr_model
|
| 370 |
|
| 371 |
|
| 372 |
+
def get_voxcpm_server(inference_timesteps: int):
    """Return the shared nano-vLLM VoxCPM server, building it when required.

    One server is cached per process together with the ``inference_timesteps``
    it was built with. A request for a different value stops the cached
    server and builds a new one. Construction happens under ``_server_lock``
    and the cache is re-checked after the lock is acquired, so concurrent
    callers do not build twice.

    Args:
        inference_timesteps: Flow-matching step count forwarded to
            ``VoxCPM.from_pretrained``.

    Returns:
        The server object returned by ``VoxCPM.from_pretrained``.

    Raises:
        Exception: Whatever runtime installation, model loading or
            ``get_model_info()`` raises; the cache is left empty in that case.
    """
    global _voxcpm_server, _model_info, _server_inference_timesteps

    # Normalise once so the cache key matches the value given to the backend.
    inference_timesteps = int(inference_timesteps)

    if _voxcpm_server is not None and _server_inference_timesteps == inference_timesteps:
        return _voxcpm_server

    with _server_lock:
        # Another thread may have finished the build while we were waiting.
        if _voxcpm_server is not None and _server_inference_timesteps == inference_timesteps:
            return _voxcpm_server

        if _voxcpm_server is not None:
            logger.info(
                f"Rebuilding nano-vLLM server for inference_timesteps={inference_timesteps} "
                f"(previous={_server_inference_timesteps})"
            )
            _stop_server_if_needed()

        _ensure_nanovllm_runtime()
        from nanovllm_voxcpm import VoxCPM

        model_ref = _resolve_model_ref()
        logger.info(
            f"Loading nano-vLLM VoxCPM server from {model_ref} "
            f"with inference_timesteps={inference_timesteps} ..."
        )
        server = VoxCPM.from_pretrained(
            model=model_ref,
            inference_timesteps=inference_timesteps,
            max_num_batched_tokens=_get_int_env("NANOVLLM_SERVERPOOL_MAX_NUM_BATCHED_TOKENS", 8192),
            max_num_seqs=_get_int_env("NANOVLLM_SERVERPOOL_MAX_NUM_SEQS", 16),
            max_model_len=_get_int_env("NANOVLLM_SERVERPOOL_MAX_MODEL_LEN", 4096),
            gpu_memory_utilization=_get_float_env("NANOVLLM_SERVERPOOL_GPU_MEMORY_UTILIZATION", 0.95),
            enforce_eager=_get_bool_env("NANOVLLM_SERVERPOOL_ENFORCE_EAGER", False),
            devices=_get_devices_env(),
        )
        try:
            model_info = server.get_model_info()
        except Exception:
            # Do not publish or leak a server whose metadata is unavailable.
            stop = getattr(server, "stop", None)
            if callable(stop):
                try:
                    stop()
                except Exception as exc:
                    logger.warning(f"Failed to stop nano-vLLM server cleanly: {exc}")
            raise

        # Publish only after full initialisation. The server handle goes
        # last so the lock-free check above never sees a half-set cache.
        _model_info = model_info
        _server_inference_timesteps = inference_timesteps
        _voxcpm_server = server
        logger.info(f"nano-vLLM VoxCPM server loaded: {_model_info}")
        return _voxcpm_server
|
| 410 |
|
|
|
|
|
|
|
|
|
|
| 411 |
|
| 412 |
+
def get_model_info(inference_timesteps: int) -> dict:
    """Return the model metadata of the server built for ``inference_timesteps``.

    If no metadata is cached, or the cached server was built for a different
    step count, ``get_voxcpm_server`` is called first to (re)build it.

    Args:
        inference_timesteps: Step count the metadata must correspond to.

    Returns:
        The value cached from the server's ``get_model_info()`` call.

    Raises:
        RuntimeError: If the metadata is still missing after server start-up.
    """
    if _model_info is None or _server_inference_timesteps != inference_timesteps:
        get_voxcpm_server(inference_timesteps)
    if _model_info is None:
        # Explicit check instead of assert, which "python -O" would strip.
        raise RuntimeError("nano-vLLM server started without providing model info.")
    return _model_info
|
| 418 |
|
| 419 |
|
| 420 |
# ---------- GPU-accelerated inference ----------
|
| 421 |
|
| 422 |
|
|
|
|
| 423 |
def prompt_wav_recognition(use_prompt_text: bool, prompt_wav: Optional[str]) -> str:
    """Transcribe the reference clip with the ASR model.

    Args:
        use_prompt_text: Whether a transcript is wanted at all.
        prompt_wav: Path to the reference audio, or ``None`` or blank.

    Returns:
        The transcript extracted from the ASR result, or ``""`` when no
        transcript is wanted or no audio path was given.
    """
    if use_prompt_text and prompt_wav is not None and prompt_wav.strip():
        recognizer = get_asr_model()
        result = recognizer.generate(input=prompt_wav, language="auto", use_itn=True)
        return _extract_asr_text(result)
    return ""
|
| 430 |
|
| 431 |
|
|
|
|
| 432 |
def generate_tts_audio(
|
| 433 |
text_input: str,
|
| 434 |
control_instruction: str = "",
|
|
|
|
| 440 |
denoise: bool = True,
|
| 441 |
inference_timesteps: int = 10,
|
| 442 |
) -> Tuple[int, np.ndarray]:
|
| 443 |
+
timesteps = int(inference_timesteps)
|
| 444 |
+
server = get_voxcpm_server(timesteps)
|
| 445 |
+
model_info = get_model_info(timesteps)
|
| 446 |
|
| 447 |
text = (text_input or "").strip()
|
| 448 |
if len(text) == 0:
|
|
|
|
| 451 |
control = (control_instruction or "").strip()
|
| 452 |
final_text = f"({control}){text}" if control and not use_prompt_text else text
|
| 453 |
|
| 454 |
+
audio_bytes, audio_format = _read_audio_bytes(reference_wav_path_input)
|
| 455 |
+
prompt_text_clean = (prompt_text_input or "").strip()
|
| 456 |
+
if use_prompt_text and audio_bytes is None:
|
| 457 |
+
raise ValueError("Ultimate Cloning Mode requires a reference audio clip.")
|
| 458 |
+
if use_prompt_text and not prompt_text_clean:
|
| 459 |
+
raise ValueError(
|
| 460 |
+
"Ultimate Cloning Mode requires a transcript. Please wait for ASR or fill it in manually."
|
| 461 |
+
)
|
| 462 |
if not use_prompt_text:
|
| 463 |
+
prompt_text_clean = ""
|
| 464 |
+
|
| 465 |
+
if do_normalize:
|
| 466 |
+
logger.info("Ignoring normalize option: nano-vLLM backend does not support per-request text normalization.")
|
| 467 |
+
if denoise:
|
| 468 |
+
logger.info("Ignoring denoise option: nano-vLLM backend does not support per-request reference denoising.")
|
| 469 |
+
|
| 470 |
+
prompt_latents = None
|
| 471 |
+
ref_audio_latents = None
|
| 472 |
+
if audio_bytes is not None and audio_format is not None and use_prompt_text:
|
| 473 |
+
logger.info(f"[Ultimate Cloning] encoding prompt audio as {audio_format}")
|
| 474 |
+
prompt_latents = server.encode_latents(audio_bytes, audio_format)
|
| 475 |
+
elif audio_bytes is not None and audio_format is not None:
|
| 476 |
+
logger.info(f"[Controllable Cloning] encoding reference audio as {audio_format}")
|
| 477 |
+
ref_audio_latents = server.encode_latents(audio_bytes, audio_format)
|
| 478 |
+
|
| 479 |
+
if prompt_latents is not None:
|
| 480 |
logger.info("[Ultimate Cloning] reference audio + transcript")
|
| 481 |
+
elif ref_audio_latents is not None:
|
| 482 |
logger.info("[Controllable Cloning] reference audio only")
|
| 483 |
else:
|
| 484 |
logger.info(f"[Voice Design] control: {control[:50] if control else 'None'}")
|
| 485 |
|
| 486 |
+
chunks: list[np.ndarray] = []
|
| 487 |
+
logger.info(f"Generating: '{final_text[:80]}...'")
|
| 488 |
+
for chunk in server.generate(
|
| 489 |
+
target_text=final_text,
|
| 490 |
+
prompt_latents=prompt_latents,
|
| 491 |
+
prompt_text=prompt_text_clean if prompt_latents is not None else "",
|
| 492 |
+
max_generate_length=_get_int_env("NANOVLLM_MAX_GENERATE_LENGTH", 2000),
|
| 493 |
+
temperature=_get_float_env("NANOVLLM_TEMPERATURE", 1.0),
|
| 494 |
cfg_value=float(cfg_value_input),
|
| 495 |
+
ref_audio_latents=ref_audio_latents,
|
| 496 |
+
):
|
| 497 |
+
chunks.append(chunk)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 498 |
|
| 499 |
+
if not chunks:
|
| 500 |
+
raise RuntimeError("The model returned no audio chunks.")
|
| 501 |
+
|
| 502 |
+
wav = np.concatenate(chunks, axis=0).astype(np.float32, copy=False)
|
| 503 |
+
return (int(model_info["sample_rate"]), wav)
|
| 504 |
|
| 505 |
|
| 506 |
# ---------- UI ----------
|
| 507 |
|
| 508 |
|
| 509 |
def create_demo_interface():
|
| 510 |
+
assets_dir = Path.cwd().absolute() / "assets"
|
| 511 |
+
if assets_dir.exists():
|
| 512 |
+
gr.set_static_paths(paths=[assets_dir])
|
| 513 |
|
| 514 |
def _on_toggle_instant(checked):
|
| 515 |
if checked:
|
|
|
|
| 525 |
def _run_asr_if_needed(checked, audio_path):
|
| 526 |
if not checked or not audio_path:
|
| 527 |
return gr.update()
|
| 528 |
+
logger.info("Running ASR on reference audio...")
|
| 529 |
+
asr_text = _safe_prompt_wav_recognition(True, audio_path)
|
| 530 |
+
logger.info(f"ASR result: {asr_text[:60]}...")
|
| 531 |
+
return gr.update(value=asr_text)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 532 |
|
| 533 |
with gr.Blocks() as interface:
|
| 534 |
+
if (assets_dir / "voxcpm_logo.png").exists():
|
| 535 |
+
gr.HTML(
|
| 536 |
+
'<div class="logo-container">'
|
| 537 |
+
'<img src="/gradio_api/file=assets/voxcpm_logo.png" alt="VoxCPM Logo">'
|
| 538 |
+
"</div>"
|
| 539 |
+
)
|
| 540 |
|
| 541 |
gr.Markdown(I18N("usage_instructions"))
|
| 542 |
|
packages.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
build-essential
|
| 2 |
+
git
|
| 3 |
+
ninja-build
|
| 4 |
+
pkg-config
|
| 5 |
+
ffmpeg
|
| 6 |
+
libsndfile1-dev
|
| 7 |
+
python3-dev
|
requirements.txt
CHANGED
|
@@ -1,22 +1,19 @@
|
|
|
|
|
| 1 |
huggingface-hub
|
| 2 |
funasr
|
| 3 |
numpy>=1.21.0
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
wetext
|
| 10 |
-
modelscope>=1.22.0
|
| 11 |
-
datasets>=3,<4
|
| 12 |
-
pydantic
|
| 13 |
tqdm
|
| 14 |
-
simplejson
|
| 15 |
-
sortedcontainers
|
| 16 |
-
soundfile
|
| 17 |
librosa
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
safetensors
|
| 21 |
-
torchaudio==2.5.0
|
| 22 |
torchcodec
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio==6.0.0
|
| 2 |
huggingface-hub
|
| 3 |
funasr
|
| 4 |
numpy>=1.21.0
|
| 5 |
+
torch>=2.5.0,!=2.6.*
|
| 6 |
+
torchaudio
|
| 7 |
+
triton>=3.0.0
|
| 8 |
+
transformers>=4.51.0
|
| 9 |
+
xxhash
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
tqdm
|
|
|
|
|
|
|
|
|
|
| 11 |
librosa
|
| 12 |
+
pydantic
|
| 13 |
+
soundfile>=0.13.1
|
|
|
|
|
|
|
| 14 |
torchcodec
|
| 15 |
+
packaging
|
| 16 |
+
psutil
|
| 17 |
+
ninja
|
| 18 |
+
setuptools
|
| 19 |
+
wheel
|