刘鑫 committed on
Commit
09dc185
·
1 Parent(s): 29b439e

fix: switch Gradio demo backend to nanovllm_voxcpm

Browse files

Keep the existing VoxCPM demo UI while routing inference through Nano-vLLM on persistent GPU Spaces. Add runtime/system dependency setup so the managed Gradio environment can attempt the non-Docker backend path.

Made-with: Cursor

Files changed (4) hide show
  1. README.md +32 -1
  2. app.py +238 -136
  3. packages.txt +7 -0
  4. requirements.txt +13 -16
README.md CHANGED
@@ -9,5 +9,36 @@ app_file: app.py
9
  python_version: "3.10"
10
  pinned: true
11
  license: apache-2.0
12
- short_description: VoxCPM2 Speech Synthesis
13
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  python_version: "3.10"
10
  pinned: true
11
  license: apache-2.0
12
+ short_description: VoxCPM2 Nano-vLLM Demo
13
  ---
14
+
15
+ Experimental Gradio Space demo for `VoxCPM2` powered by `nanovllm-voxcpm`.
16
+
17
+ This repo keeps the existing Gradio frontend layout and swaps only the backend inference path to Nano-vLLM.
18
+
19
+ Notes:
20
+
21
+ - This is the non-Docker experiment path. It relies on a persistent GPU Gradio Space.
22
+ - `flash-attn` and `nanovllm-voxcpm` are installed at runtime on first backend initialization.
23
+ - The first generation request may spend extra time installing dependencies, downloading the model, and loading the server.
24
+ - `ASR_DEVICE` defaults to `cpu` to avoid competing with TTS GPU memory.
25
+ - The `LocDiT flow-matching steps` slider is wired to Nano-vLLM server `inference_timesteps`; changing it rebuilds the backend server.
26
+ - The existing `normalize` / `denoise` frontend toggles are kept for UI compatibility, but Nano-vLLM currently ignores them.
27
+ - `packages.txt` is required because this path needs extra system build dependencies.
28
+
29
+ Recommended environment variables:
30
+
31
+ - `HF_REPO_ID`: Hugging Face model repo id. Defaults to `openbmb/VoxCPM2`
32
+ - `HF_TOKEN`: required if the model repo is private
33
+ - `NANOVLLM_MODEL`: optional direct model ref override. Can be a local path or HF repo id
34
+ - `NANOVLLM_MODEL_PATH`: optional local model path override
35
+ - `ASR_DEVICE`: defaults to `cpu`
36
+ - `NANOVLLM_INFERENCE_TIMESTEPS`: initial default is `10`
37
+ - `NANOVLLM_SERVERPOOL_MAX_NUM_BATCHED_TOKENS`: defaults to `8192`
38
+ - `NANOVLLM_SERVERPOOL_MAX_NUM_SEQS`: defaults to `16`
39
+ - `NANOVLLM_SERVERPOOL_MAX_MODEL_LEN`: defaults to `4096`
40
+ - `NANOVLLM_SERVERPOOL_GPU_MEMORY_UTILIZATION`: defaults to `0.95`
41
+ - `NANOVLLM_SERVERPOOL_ENFORCE_EAGER`: defaults to `false`
42
+ - `NANOVLLM_SERVERPOOL_DEVICES`: defaults to `0`
43
+ - `NANOVLLM_MAX_GENERATE_LENGTH`: defaults to `2000`
44
+ - `NANOVLLM_TEMPERATURE`: defaults to `1.0`
app.py CHANGED
@@ -1,54 +1,27 @@
 
1
  import logging
2
  import os
3
  import subprocess
4
  import sys
5
  from pathlib import Path
 
6
  from typing import Optional, Tuple
7
 
8
- def _ensure_torchaudio():
9
- """Install torchaudio matching ZeroGPU's pre-installed torch + CUDA version."""
10
- try:
11
- import torchaudio # noqa: F401
12
- return
13
- except (ImportError, OSError):
14
- pass
15
- import torch
16
- torch_ver = torch.__version__.split("+")[0]
17
- cuda_ver = torch.version.cuda
18
- if cuda_ver:
19
- tag = "cu" + cuda_ver.replace(".", "")
20
- else:
21
- tag = "cpu"
22
- index = f"https://download.pytorch.org/whl/{tag}"
23
- subprocess.check_call([
24
- sys.executable, "-m", "pip", "install", "--no-deps",
25
- "--index-url", index,
26
- f"torchaudio=={torch_ver}",
27
- ])
28
-
29
- _ensure_torchaudio()
30
-
31
- try:
32
- import voxcpm # noqa: F401
33
- except ImportError:
34
- subprocess.check_call([
35
- sys.executable, "-m", "pip", "install", "--no-deps",
36
- "voxcpm @ git+https://github.com/OpenBMB/VoxCPM.git@dev_2.0",
37
- ])
38
- import voxcpm # noqa: F401
39
-
40
  import gradio as gr
41
  import numpy as np
42
- import spaces
43
- import torch
44
 
45
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
46
  os.environ["OPENBLAS_NUM_THREADS"] = "4"
47
  os.environ["OMP_NUM_THREADS"] = "4"
48
  os.environ["MKL_NUM_THREADS"] = "4"
49
 
50
- if os.environ.get("HF_REPO_ID", "").strip() == "":
51
- os.environ["HF_REPO_ID"] = "openbmb/VoxCPM2"
 
 
 
 
 
52
 
53
  logging.basicConfig(
54
  level=logging.INFO,
@@ -57,6 +30,132 @@ logging.basicConfig(
57
  )
58
  logger = logging.getLogger(__name__)
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  # ---------- Inline i18n (en + zh-CN only) ----------
61
 
62
  _USAGE_INSTRUCTIONS_EN = (
@@ -253,62 +352,15 @@ _APP_THEME = gr.themes.Soft(
253
  font=[gr.themes.GoogleFont("Inter"), "Arial", "sans-serif"],
254
  )
255
 
256
- # ---------- Model Pre-download & Loading ----------
257
-
258
- ASR_LOCAL_DIR = "./models/SenseVoiceSmall"
259
- VOXCPM_LOCAL_DIR = "./models/VoxCPM2"
260
-
261
- _asr_model = None
262
- _voxcpm_model = None
263
-
264
-
265
- def predownload_models():
266
- from huggingface_hub import snapshot_download
267
-
268
- if not os.path.isdir(ASR_LOCAL_DIR) or not os.path.exists(
269
- os.path.join(ASR_LOCAL_DIR, "model.pt")
270
- ):
271
- logger.info(f"Pre-downloading ASR model to {ASR_LOCAL_DIR} ...")
272
- os.makedirs(ASR_LOCAL_DIR, exist_ok=True)
273
- try:
274
- snapshot_download(
275
- repo_id="FunAudioLLM/SenseVoiceSmall", local_dir=ASR_LOCAL_DIR
276
- )
277
- logger.info("ASR model downloaded.")
278
- except Exception as exc:
279
- logger.warning(f"Failed to pre-download ASR model: {exc}")
280
- else:
281
- logger.info(f"ASR model already at {ASR_LOCAL_DIR}")
282
-
283
- voxcpm_repo_id = os.environ.get("HF_REPO_ID", "openbmb/VoxCPM2")
284
- if not os.path.isdir(VOXCPM_LOCAL_DIR) or not os.path.exists(
285
- os.path.join(VOXCPM_LOCAL_DIR, "config.json")
286
- ):
287
- logger.info(
288
- f"Pre-downloading VoxCPM model {voxcpm_repo_id} to {VOXCPM_LOCAL_DIR} ..."
289
- )
290
- os.makedirs(VOXCPM_LOCAL_DIR, exist_ok=True)
291
- try:
292
- snapshot_download(repo_id=voxcpm_repo_id, local_dir=VOXCPM_LOCAL_DIR)
293
- logger.info("VoxCPM model downloaded.")
294
- except Exception as exc:
295
- logger.warning(f"Failed to pre-download VoxCPM model: {exc}")
296
- else:
297
- logger.info(f"VoxCPM model already at {VOXCPM_LOCAL_DIR}")
298
-
299
-
300
- predownload_models()
301
-
302
-
303
  def get_asr_model():
304
  global _asr_model
305
  if _asr_model is None:
306
  from funasr import AutoModel
307
 
308
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
309
  logger.info(f"Loading ASR model on {device} ...")
310
  _asr_model = AutoModel(
311
- model=ASR_LOCAL_DIR,
312
  disable_update=True,
313
  log_level="INFO",
314
  device=device,
@@ -317,40 +369,66 @@ def get_asr_model():
317
  return _asr_model
318
 
319
 
320
- def get_voxcpm_model():
321
- global _voxcpm_model
322
- if _voxcpm_model is None:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
323
  logger.info(
324
- f"[DEBUG] CUDA available: {torch.cuda.is_available()}, "
325
- f"device count: {torch.cuda.device_count() if torch.cuda.is_available() else 0}"
326
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
327
 
328
- if torch.cuda.is_available():
329
- torch.backends.cuda.enable_flash_sdp(False)
330
- torch.backends.cuda.enable_mem_efficient_sdp(False)
331
 
332
- logger.info(f"Loading VoxCPM model from {VOXCPM_LOCAL_DIR} ...")
333
- _voxcpm_model = voxcpm.VoxCPM(
334
- voxcpm_model_path=VOXCPM_LOCAL_DIR, optimize=True
335
- )
336
- logger.info("VoxCPM model loaded.")
337
- return _voxcpm_model
338
 
339
 
340
  # ---------- GPU-accelerated inference ----------
341
 
342
 
343
- @spaces.GPU
344
  def prompt_wav_recognition(use_prompt_text: bool, prompt_wav: Optional[str]) -> str:
345
  if not use_prompt_text or prompt_wav is None or not prompt_wav.strip():
346
  return ""
347
 
348
  asr_model = get_asr_model()
349
  res = asr_model.generate(input=prompt_wav, language="auto", use_itn=True)
350
- return res[0]["text"].split("|>")[-1]
351
 
352
 
353
- @spaces.GPU(duration=600)
354
  def generate_tts_audio(
355
  text_input: str,
356
  control_instruction: str = "",
@@ -362,7 +440,9 @@ def generate_tts_audio(
362
  denoise: bool = True,
363
  inference_timesteps: int = 10,
364
  ) -> Tuple[int, np.ndarray]:
365
- voxcpm_model = get_voxcpm_model()
 
 
366
 
367
  text = (text_input or "").strip()
368
  if len(text) == 0:
@@ -371,40 +451,65 @@ def generate_tts_audio(
371
  control = (control_instruction or "").strip()
372
  final_text = f"({control}){text}" if control and not use_prompt_text else text
373
 
374
- audio_path = reference_wav_path_input if reference_wav_path_input else None
375
- prompt_text_clean = (prompt_text_input or "").strip() or None
 
 
 
 
 
 
376
  if not use_prompt_text:
377
- prompt_text_clean = None
378
-
379
- if audio_path and prompt_text_clean:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
380
  logger.info("[Ultimate Cloning] reference audio + transcript")
381
- elif audio_path:
382
  logger.info("[Controllable Cloning] reference audio only")
383
  else:
384
  logger.info(f"[Voice Design] control: {control[:50] if control else 'None'}")
385
 
386
- generate_kwargs = dict(
387
- text=final_text,
388
- reference_wav_path=audio_path,
 
 
 
 
 
389
  cfg_value=float(cfg_value_input),
390
- inference_timesteps=int(inference_timesteps),
391
- normalize=do_normalize,
392
- denoise=denoise,
393
- )
394
- if prompt_text_clean and audio_path:
395
- generate_kwargs["prompt_wav_path"] = audio_path
396
- generate_kwargs["prompt_text"] = prompt_text_clean
397
 
398
- logger.info(f"Generating: '{final_text[:80]}...'")
399
- wav = voxcpm_model.generate(**generate_kwargs)
400
- return (voxcpm_model.tts_model.sample_rate, wav)
 
 
401
 
402
 
403
  # ---------- UI ----------
404
 
405
 
406
  def create_demo_interface():
407
- gr.set_static_paths(paths=[Path.cwd().absolute() / "assets"])
 
 
408
 
409
  def _on_toggle_instant(checked):
410
  if checked:
@@ -420,21 +525,18 @@ def create_demo_interface():
420
  def _run_asr_if_needed(checked, audio_path):
421
  if not checked or not audio_path:
422
  return gr.update()
423
- try:
424
- logger.info("Running ASR on reference audio...")
425
- asr_text = prompt_wav_recognition(True, audio_path)
426
- logger.info(f"ASR result: {asr_text[:60]}...")
427
- return gr.update(value=asr_text)
428
- except Exception as e:
429
- logger.warning(f"ASR recognition failed: {e}")
430
- return gr.update(value="")
431
 
432
  with gr.Blocks() as interface:
433
- gr.HTML(
434
- '<div class="logo-container">'
435
- '<img src="/gradio_api/file=assets/voxcpm_logo.png" alt="VoxCPM Logo">'
436
- "</div>"
437
- )
 
438
 
439
  gr.Markdown(I18N("usage_instructions"))
440
 
 
1
+ import atexit
2
  import logging
3
  import os
4
  import subprocess
5
  import sys
6
  from pathlib import Path
7
+ from threading import Lock
8
  from typing import Optional, Tuple
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  import gradio as gr
11
  import numpy as np
 
 
12
 
13
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
14
  os.environ["OPENBLAS_NUM_THREADS"] = "4"
15
  os.environ["OMP_NUM_THREADS"] = "4"
16
  os.environ["MKL_NUM_THREADS"] = "4"
17
 
18
+ DEFAULT_MODEL_REF = "openbmb/VoxCPM2"
19
+ if (
20
+ os.environ.get("NANOVLLM_MODEL", "").strip() == ""
21
+ and os.environ.get("NANOVLLM_MODEL_PATH", "").strip() == ""
22
+ and os.environ.get("HF_REPO_ID", "").strip() == ""
23
+ ):
24
+ os.environ["HF_REPO_ID"] = DEFAULT_MODEL_REF
25
 
26
  logging.basicConfig(
27
  level=logging.INFO,
 
30
  )
31
  logger = logging.getLogger(__name__)
32
 
33
+ _asr_model = None
34
+ _voxcpm_server = None
35
+ _model_info = None
36
+ _server_inference_timesteps = None
37
+ _server_lock = Lock()
38
+
39
+
40
+ def _get_int_env(name: str, default: int) -> int:
41
+ value = os.environ.get(name, "").strip()
42
+ if not value:
43
+ return default
44
+ return int(value)
45
+
46
+
47
+ def _get_float_env(name: str, default: float) -> float:
48
+ value = os.environ.get(name, "").strip()
49
+ if not value:
50
+ return default
51
+ return float(value)
52
+
53
+
54
+ def _get_bool_env(name: str, default: bool) -> bool:
55
+ value = os.environ.get(name, "").strip().lower()
56
+ if not value:
57
+ return default
58
+ if value in {"1", "true", "yes", "on"}:
59
+ return True
60
+ if value in {"0", "false", "no", "off"}:
61
+ return False
62
+ raise ValueError(f"Invalid boolean env: {name}={value!r}")
63
+
64
+
65
+ def _get_devices_env() -> list[int]:
66
+ raw = os.environ.get("NANOVLLM_SERVERPOOL_DEVICES", "0").strip()
67
+ values = [part.strip() for part in raw.split(",") if part.strip()]
68
+ if not values:
69
+ return [0]
70
+ return [int(part) for part in values]
71
+
72
+
73
+ def _resolve_model_ref() -> str:
74
+ for env_name in ("NANOVLLM_MODEL", "NANOVLLM_MODEL_PATH", "HF_REPO_ID"):
75
+ value = os.environ.get(env_name, "").strip()
76
+ if value:
77
+ return value
78
+ return DEFAULT_MODEL_REF
79
+
80
+
81
+ def _ensure_nanovllm_runtime() -> None:
82
+ try:
83
+ import flash_attn # noqa: F401
84
+ except ImportError:
85
+ logger.info("Installing flash-attn at runtime ...")
86
+ subprocess.check_call(
87
+ [
88
+ sys.executable,
89
+ "-m",
90
+ "pip",
91
+ "install",
92
+ "--no-build-isolation",
93
+ "flash-attn",
94
+ ]
95
+ )
96
+
97
+ try:
98
+ import nanovllm_voxcpm # noqa: F401
99
+ except ImportError:
100
+ logger.info("Installing nanovllm-voxcpm at runtime ...")
101
+ subprocess.check_call(
102
+ [
103
+ sys.executable,
104
+ "-m",
105
+ "pip",
106
+ "install",
107
+ "--no-deps",
108
+ "git+https://github.com/a710128/nanovllm-voxcpm.git",
109
+ ]
110
+ )
111
+
112
+
113
+ def _extract_asr_text(asr_result) -> str:
114
+ if not asr_result:
115
+ return ""
116
+
117
+ first_item = asr_result[0]
118
+ if isinstance(first_item, dict):
119
+ return str(first_item.get("text", "")).split("|>")[-1].strip()
120
+ return ""
121
+
122
+
123
+ def _read_audio_bytes(audio_path: Optional[str]) -> tuple[bytes | None, str | None]:
124
+ if audio_path is None or not audio_path.strip():
125
+ return None, None
126
+
127
+ path = Path(audio_path)
128
+ audio_format = path.suffix.lstrip(".").lower() or "wav"
129
+ return path.read_bytes(), audio_format
130
+
131
+
132
+ def _safe_prompt_wav_recognition(use_prompt_text: bool, prompt_wav: Optional[str]) -> str:
133
+ try:
134
+ return prompt_wav_recognition(use_prompt_text, prompt_wav)
135
+ except Exception as exc:
136
+ logger.warning(f"ASR recognition failed: {exc}")
137
+ return ""
138
+
139
+
140
+ def _stop_server_if_needed() -> None:
141
+ global _voxcpm_server, _model_info, _server_inference_timesteps
142
+ if _voxcpm_server is None:
143
+ return
144
+
145
+ stop = getattr(_voxcpm_server, "stop", None)
146
+ if callable(stop):
147
+ try:
148
+ stop()
149
+ except Exception as exc:
150
+ logger.warning(f"Failed to stop nano-vLLM server cleanly: {exc}")
151
+
152
+ _voxcpm_server = None
153
+ _model_info = None
154
+ _server_inference_timesteps = None
155
+
156
+
157
+ atexit.register(_stop_server_if_needed)
158
+
159
  # ---------- Inline i18n (en + zh-CN only) ----------
160
 
161
  _USAGE_INSTRUCTIONS_EN = (
 
352
  font=[gr.themes.GoogleFont("Inter"), "Arial", "sans-serif"],
353
  )
354
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
355
  def get_asr_model():
356
  global _asr_model
357
  if _asr_model is None:
358
  from funasr import AutoModel
359
 
360
+ device = os.environ.get("ASR_DEVICE", "cpu").strip() or "cpu"
361
  logger.info(f"Loading ASR model on {device} ...")
362
  _asr_model = AutoModel(
363
+ model="iic/SenseVoiceSmall",
364
  disable_update=True,
365
  log_level="INFO",
366
  device=device,
 
369
  return _asr_model
370
 
371
 
372
+ def get_voxcpm_server(inference_timesteps: int):
373
+ global _voxcpm_server, _model_info, _server_inference_timesteps
374
+ if _voxcpm_server is not None and _server_inference_timesteps == inference_timesteps:
375
+ return _voxcpm_server
376
+
377
+ with _server_lock:
378
+ if _voxcpm_server is not None and _server_inference_timesteps == inference_timesteps:
379
+ return _voxcpm_server
380
+
381
+ if _voxcpm_server is not None and _server_inference_timesteps != inference_timesteps:
382
+ logger.info(
383
+ f"Rebuilding nano-vLLM server for inference_timesteps={inference_timesteps} "
384
+ f"(previous={_server_inference_timesteps})"
385
+ )
386
+ _stop_server_if_needed()
387
+
388
+ _ensure_nanovllm_runtime()
389
+ from nanovllm_voxcpm import VoxCPM
390
+
391
+ model_ref = _resolve_model_ref()
392
  logger.info(
393
+ f"Loading nano-vLLM VoxCPM server from {model_ref} "
394
+ f"with inference_timesteps={inference_timesteps} ..."
395
  )
396
+ _voxcpm_server = VoxCPM.from_pretrained(
397
+ model=model_ref,
398
+ inference_timesteps=int(inference_timesteps),
399
+ max_num_batched_tokens=_get_int_env("NANOVLLM_SERVERPOOL_MAX_NUM_BATCHED_TOKENS", 8192),
400
+ max_num_seqs=_get_int_env("NANOVLLM_SERVERPOOL_MAX_NUM_SEQS", 16),
401
+ max_model_len=_get_int_env("NANOVLLM_SERVERPOOL_MAX_MODEL_LEN", 4096),
402
+ gpu_memory_utilization=_get_float_env("NANOVLLM_SERVERPOOL_GPU_MEMORY_UTILIZATION", 0.95),
403
+ enforce_eager=_get_bool_env("NANOVLLM_SERVERPOOL_ENFORCE_EAGER", False),
404
+ devices=_get_devices_env(),
405
+ )
406
+ _model_info = _voxcpm_server.get_model_info()
407
+ _server_inference_timesteps = inference_timesteps
408
+ logger.info(f"nano-vLLM VoxCPM server loaded: {_model_info}")
409
+ return _voxcpm_server
410
 
 
 
 
411
 
412
+ def get_model_info(inference_timesteps: int) -> dict:
413
+ global _model_info
414
+ if _model_info is None or _server_inference_timesteps != inference_timesteps:
415
+ get_voxcpm_server(inference_timesteps)
416
+ assert _model_info is not None
417
+ return _model_info
418
 
419
 
420
  # ---------- GPU-accelerated inference ----------
421
 
422
 
 
423
  def prompt_wav_recognition(use_prompt_text: bool, prompt_wav: Optional[str]) -> str:
424
  if not use_prompt_text or prompt_wav is None or not prompt_wav.strip():
425
  return ""
426
 
427
  asr_model = get_asr_model()
428
  res = asr_model.generate(input=prompt_wav, language="auto", use_itn=True)
429
+ return _extract_asr_text(res)
430
 
431
 
 
432
  def generate_tts_audio(
433
  text_input: str,
434
  control_instruction: str = "",
 
440
  denoise: bool = True,
441
  inference_timesteps: int = 10,
442
  ) -> Tuple[int, np.ndarray]:
443
+ timesteps = int(inference_timesteps)
444
+ server = get_voxcpm_server(timesteps)
445
+ model_info = get_model_info(timesteps)
446
 
447
  text = (text_input or "").strip()
448
  if len(text) == 0:
 
451
  control = (control_instruction or "").strip()
452
  final_text = f"({control}){text}" if control and not use_prompt_text else text
453
 
454
+ audio_bytes, audio_format = _read_audio_bytes(reference_wav_path_input)
455
+ prompt_text_clean = (prompt_text_input or "").strip()
456
+ if use_prompt_text and audio_bytes is None:
457
+ raise ValueError("Ultimate Cloning Mode requires a reference audio clip.")
458
+ if use_prompt_text and not prompt_text_clean:
459
+ raise ValueError(
460
+ "Ultimate Cloning Mode requires a transcript. Please wait for ASR or fill it in manually."
461
+ )
462
  if not use_prompt_text:
463
+ prompt_text_clean = ""
464
+
465
+ if do_normalize:
466
+ logger.info("Ignoring normalize option: nano-vLLM backend does not support per-request text normalization.")
467
+ if denoise:
468
+ logger.info("Ignoring denoise option: nano-vLLM backend does not support per-request reference denoising.")
469
+
470
+ prompt_latents = None
471
+ ref_audio_latents = None
472
+ if audio_bytes is not None and audio_format is not None and use_prompt_text:
473
+ logger.info(f"[Ultimate Cloning] encoding prompt audio as {audio_format}")
474
+ prompt_latents = server.encode_latents(audio_bytes, audio_format)
475
+ elif audio_bytes is not None and audio_format is not None:
476
+ logger.info(f"[Controllable Cloning] encoding reference audio as {audio_format}")
477
+ ref_audio_latents = server.encode_latents(audio_bytes, audio_format)
478
+
479
+ if prompt_latents is not None:
480
  logger.info("[Ultimate Cloning] reference audio + transcript")
481
+ elif ref_audio_latents is not None:
482
  logger.info("[Controllable Cloning] reference audio only")
483
  else:
484
  logger.info(f"[Voice Design] control: {control[:50] if control else 'None'}")
485
 
486
+ chunks: list[np.ndarray] = []
487
+ logger.info(f"Generating: '{final_text[:80]}...'")
488
+ for chunk in server.generate(
489
+ target_text=final_text,
490
+ prompt_latents=prompt_latents,
491
+ prompt_text=prompt_text_clean if prompt_latents is not None else "",
492
+ max_generate_length=_get_int_env("NANOVLLM_MAX_GENERATE_LENGTH", 2000),
493
+ temperature=_get_float_env("NANOVLLM_TEMPERATURE", 1.0),
494
  cfg_value=float(cfg_value_input),
495
+ ref_audio_latents=ref_audio_latents,
496
+ ):
497
+ chunks.append(chunk)
 
 
 
 
498
 
499
+ if not chunks:
500
+ raise RuntimeError("The model returned no audio chunks.")
501
+
502
+ wav = np.concatenate(chunks, axis=0).astype(np.float32, copy=False)
503
+ return (int(model_info["sample_rate"]), wav)
504
 
505
 
506
  # ---------- UI ----------
507
 
508
 
509
  def create_demo_interface():
510
+ assets_dir = Path.cwd().absolute() / "assets"
511
+ if assets_dir.exists():
512
+ gr.set_static_paths(paths=[assets_dir])
513
 
514
  def _on_toggle_instant(checked):
515
  if checked:
 
525
  def _run_asr_if_needed(checked, audio_path):
526
  if not checked or not audio_path:
527
  return gr.update()
528
+ logger.info("Running ASR on reference audio...")
529
+ asr_text = _safe_prompt_wav_recognition(True, audio_path)
530
+ logger.info(f"ASR result: {asr_text[:60]}...")
531
+ return gr.update(value=asr_text)
 
 
 
 
532
 
533
  with gr.Blocks() as interface:
534
+ if (assets_dir / "voxcpm_logo.png").exists():
535
+ gr.HTML(
536
+ '<div class="logo-container">'
537
+ '<img src="/gradio_api/file=assets/voxcpm_logo.png" alt="VoxCPM Logo">'
538
+ "</div>"
539
+ )
540
 
541
  gr.Markdown(I18N("usage_instructions"))
542
 
packages.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ build-essential
2
+ git
3
+ ninja-build
4
+ pkg-config
5
+ ffmpeg
6
+ libsndfile1-dev
7
+ python3-dev
requirements.txt CHANGED
@@ -1,22 +1,19 @@
 
1
  huggingface-hub
2
  funasr
3
  numpy>=1.21.0
4
- spaces
5
- transformers>=4.36.2
6
- einops
7
- inflect
8
- addict
9
- wetext
10
- modelscope>=1.22.0
11
- datasets>=3,<4
12
- pydantic
13
  tqdm
14
- simplejson
15
- sortedcontainers
16
- soundfile
17
  librosa
18
- matplotlib
19
- argbind
20
- safetensors
21
- torchaudio==2.5.0
22
  torchcodec
 
 
 
 
 
 
1
+ gradio==6.0.0
2
  huggingface-hub
3
  funasr
4
  numpy>=1.21.0
5
+ torch>=2.5.0,!=2.6.*
6
+ torchaudio
7
+ triton>=3.0.0
8
+ transformers>=4.51.0
9
+ xxhash
 
 
 
 
10
  tqdm
 
 
 
11
  librosa
12
+ pydantic
13
+ soundfile>=0.13.1
 
 
14
  torchcodec
15
+ packaging
16
+ psutil
17
+ ninja
18
+ setuptools
19
+ wheel