smartwang commited on
Commit
27ddaa6
·
1 Parent(s): e0707a4
Files changed (2) hide show
  1. app.py +93 -511
  2. requirements.txt +1 -0
app.py CHANGED
@@ -1,8 +1,4 @@
1
  # coding=utf-8
2
- # Qwen3-TTS Gradio Demo for HuggingFace Spaces with Zero GPU
3
- # Supports: Voice Design, Voice Clone (Base), TTS (CustomVoice)
4
- #import subprocess
5
- #subprocess.run('pip install flash-attn==2.7.4.post1', shell=True)
6
  import os
7
  import sys
8
  import logging
@@ -18,33 +14,26 @@ import uuid
18
  import random
19
  import whisper
20
  import librosa
 
 
21
  # 配置日志
22
  logging.basicConfig(
23
  level=logging.INFO,
24
  format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
25
- handlers=[
26
- logging.StreamHandler(sys.stdout)
27
- ]
28
  )
29
- # 屏蔽第三方库的冗余日志
30
- logging.getLogger("httpx").setLevel(logging.WARNING)
31
- logging.getLogger("urllib3").setLevel(logging.WARNING)
32
- logging.getLogger("httpcore").setLevel(logging.WARNING)
33
- logging.getLogger("gradio").setLevel(logging.WARNING)
34
-
35
  logger = logging.getLogger("Qwen3-TTS-Demo")
36
 
 
 
 
37
  HF_TOKEN = os.environ.get('HF_TOKEN')
38
- login(token=HF_TOKEN)
 
39
 
40
- # Model size options
41
  MODEL_SIZES = ["0.6B", "1.7B"]
42
-
43
- # Speaker and language choices for CustomVoice model
44
- SPEAKERS = [
45
- "Aiden", "Dylan", "Eric", "Ono_anna", "Ryan", "Serena", "Sohee", "Uncle_fu", "Vivian"
46
- ]
47
  LANGUAGES = ["Auto", "Chinese", "English", "Japanese", "Korean", "French", "German", "Spanish", "Portuguese", "Russian"]
 
48
  def seed_everything(seed=42):
49
  random.seed(seed)
50
  np.random.seed(seed)
@@ -55,42 +44,14 @@ def seed_everything(seed=42):
55
  torch.backends.cudnn.benchmark = False
56
 
57
  def get_model_path(model_type: str, model_size: str) -> str:
58
- """Get model path based on type and size."""
59
  return snapshot_download(f"Qwen/Qwen3-TTS-12Hz-{model_size}-{model_type}")
60
 
61
-
62
- # ============================================================================
63
- # GLOBAL MODEL LOADING - Load all models at startup
64
- # ============================================================================
65
- logger.info("正在加载所有模型到 CUDA...")
66
-
67
- # # Voice Design model (1.7B only)
68
- # logger.info("正在加载 VoiceDesign 1.7B 模型...")
69
- # voice_design_model = Qwen3TTSModel.from_pretrained(
70
- # get_model_path("VoiceDesign", "1.7B"),
71
- # device_map="cuda",
72
- # dtype=torch.bfloat16,
73
- # token=HF_TOKEN,
74
- # attn_implementation="kernels-community/flash-attn3",
75
- # )
76
-
77
- # # Base (Voice Clone) models - both sizes
78
- # logger.info("正在加载 Base 0.6B 模型...")
79
- # base_model_0_6b = Qwen3TTSModel.from_pretrained(
80
- # get_model_path("Base", "0.6B"),
81
- # device_map="cuda",
82
- # dtype=torch.bfloat16,
83
- # token=HF_TOKEN,
84
- # attn_implementation="kernels-community/flash-attn3",
85
- # )
86
-
87
- @functools.lru_cache(maxsize=1) # 只缓存当前正在使用的模型,节省显存
88
  def load_model(model_type, model_size):
89
- logger.info(f"正在按需加载 {model_type} {model_size} 模型...")
90
  path = get_model_path(model_type, model_size)
91
  return Qwen3TTSModel.from_pretrained(
92
  path,
93
- device_map="cuda", # 注意:在 ZeroGPU 环境下,这行只有在被装饰的函数内执行才有效
94
  dtype=torch.bfloat16,
95
  token=HF_TOKEN,
96
  attn_implementation="kernels-community/flash-attn3"
@@ -98,67 +59,14 @@ def load_model(model_type, model_size):
98
 
99
  @functools.lru_cache(maxsize=1)
100
  def load_whisper_model(model_name="large-v3"):
101
- logger.info(f"正在加载 Whisper 模型: {model_name}...")
102
- # whisper.load_model 会自动处理下载和缓存
103
  model = whisper.load_model(model_name, device="cuda" if torch.cuda.is_available() else "cpu")
104
- logger.info("Whisper 模型加载成功!")
105
  return model
106
 
107
- # logger.info("正在加载 Base 1.7B 模型...")
108
- # base_model_1_7b = Qwen3TTSModel.from_pretrained(
109
- # get_model_path("Base", "1.7B"),
110
- # device_map="cuda",
111
- # dtype=torch.bfloat16,
112
- # token=HF_TOKEN,
113
- # attn_implementation="kernels-community/flash-attn3",
114
- # )
115
-
116
- # CustomVoice models - both sizes
117
- # logger.info("正在加载 CustomVoice 0.6B 模型...")
118
- # custom_voice_model_0_6b = Qwen3TTSModel.from_pretrained(
119
- # get_model_path("CustomVoice", "0.6B"),
120
- # device_map="cuda",
121
- # dtype=torch.bfloat16,
122
- # token=HF_TOKEN,
123
- # attn_implementation="kernels-community/flash-attn3",
124
- # )
125
-
126
- # logger.info("正在加载 CustomVoice 1.7B 模型...")
127
- # custom_voice_model_1_7b = Qwen3TTSModel.from_pretrained(
128
- # get_model_path("CustomVoice", "1.7B"),
129
- # device_map="cuda",
130
- # dtype=torch.bfloat16,
131
- # token=HF_TOKEN,
132
- # attn_implementation="kernels-community/flash-attn3",
133
- # )
134
-
135
- logger.info("所有模型加载成功!")
136
-
137
- # Model lookup dictionaries for easy access
138
- # BASE_MODELS = {
139
- # "0.6B": base_model_0_6b,
140
- # # "1.7B": base_model_1_7b,
141
- # }
142
-
143
- # CUSTOM_VOICE_MODELS = {
144
- # "0.6B": custom_voice_model_0_6b,
145
- # "1.7B": custom_voice_model_1_7b,
146
- # }
147
-
148
- # ============================================================================
149
-
150
-
151
  def _normalize_audio(wav, eps=1e-12, clip=True):
152
- """Normalize audio to float32 in [-1, 1] range."""
153
  x = np.asarray(wav)
154
-
155
  if np.issubdtype(x.dtype, np.integer):
156
  info = np.iinfo(x.dtype)
157
- if info.min < 0:
158
- y = x.astype(np.float32) / max(abs(info.min), info.max)
159
- else:
160
- mid = (info.max + 1) / 2.0
161
- y = (x.astype(np.float32) - mid) / mid
162
  elif np.issubdtype(x.dtype, np.floating):
163
  y = x.astype(np.float32)
164
  m = np.max(np.abs(y)) if y.size else 0.0
@@ -166,39 +74,27 @@ def _normalize_audio(wav, eps=1e-12, clip=True):
166
  y = y / (m + eps)
167
  else:
168
  raise TypeError(f"Unsupported dtype: {x.dtype}")
169
-
170
  if clip:
171
  y = np.clip(y, -1.0, 1.0)
172
-
173
  if y.ndim > 1:
174
  y = np.mean(y, axis=-1).astype(np.float32)
175
-
176
  return y
177
 
178
-
179
  def _audio_to_tuple(audio):
180
- """Convert Gradio audio input to (wav, sr) tuple."""
181
  if audio is None:
182
  return None
183
-
184
  if isinstance(audio, tuple) and len(audio) == 2 and isinstance(audio[0], int):
185
  sr, wav = audio
186
  wav = _normalize_audio(wav)
187
  return wav, int(sr)
188
-
189
  if isinstance(audio, dict) and "sampling_rate" in audio and "data" in audio:
190
  sr = int(audio["sampling_rate"])
191
  wav = _normalize_audio(audio["data"])
192
  return wav, sr
193
-
194
  return None
195
 
196
-
197
-
198
-
199
  @spaces.GPU
200
  def infer_voice_design(part, language, voice_description):
201
- """Single segment inference for Voice Design."""
202
  voice_design_model = load_model("VoiceDesign","1.7B")
203
  seed_everything(42)
204
  wavs, sr = voice_design_model.generate_voice_design(
@@ -210,13 +106,8 @@ def infer_voice_design(part, language, voice_description):
210
  )
211
  return wavs[0], sr
212
 
213
-
214
-
215
  @spaces.GPU
216
- def infer_voice_clone( part, language,audio_tuple,ref_text,use_xvector_only):
217
- """Single segment inference for Voice Clone using reference audio."""
218
- # tts = BASE_MODELS[model_size]
219
- # seed_everything(42)
220
  tts = load_model("Base", "0.6B")
221
  voice_clone_prompt = tts.create_voice_clone_prompt(
222
  ref_audio=audio_tuple,
@@ -228,97 +119,81 @@ def infer_voice_clone( part, language,audio_tuple,ref_text,use_xvector_only):
228
  language=language,
229
  voice_clone_prompt=voice_clone_prompt,
230
  max_new_tokens=2048,
231
- # 核心参数:固定 seed
232
  seed=42,
233
- temperature=0.3, # 配合低温度,音色会更稳
234
  top_p=0.85
235
  )
236
  return wavs[0], sr
237
 
238
  @spaces.GPU
239
  def infer_voice_clone_from_prompt(part, language, prompt_file_path):
240
- """Single segment inference for Voice Clone using pre-extracted prompt."""
241
- logger.info("正在加载音频特征文件...")
242
  loaded_data = torch.load(prompt_file_path, map_location='cuda', weights_only=False)
243
-
244
- # 兼容旧版本直接保存对象的情况
245
  if isinstance(loaded_data, list) and len(loaded_data) > 0 and isinstance(loaded_data[0], VoiceClonePromptItem):
246
  voice_clone_prompt = loaded_data
247
  elif isinstance(loaded_data, list) and len(loaded_data) > 0 and isinstance(loaded_data[0], dict):
248
- # 从字典列表重建对象
249
  voice_clone_prompt = [VoiceClonePromptItem(**item) for item in loaded_data]
250
  else:
251
- # 尝试作为单个对象处理
252
  voice_clone_prompt = loaded_data
253
-
254
- # 维度校正:确保 ref_code 是 2D 的 (Time, Q)
255
  if isinstance(voice_clone_prompt, list):
256
  for item in voice_clone_prompt:
257
  if item.ref_code is not None and item.ref_code.ndim == 3:
258
- # [1, T, Q] -> [T, Q]
259
  item.ref_code = item.ref_code.squeeze(0)
260
-
261
- logger.info("音频特征文件加载成功。")
262
-
263
  tts = load_model("Base", "0.6B")
264
- logger.info(f"克隆音频,目标文本:{part}")
265
  wavs, sr = tts.generate_voice_clone(
266
  text=part,
267
  language=language,
268
  voice_clone_prompt=voice_clone_prompt,
269
  max_new_tokens=2048,
270
- # 核心参数:固定 seed
271
  seed=42,
272
- temperature=0.3, # 配合低温度,音色会更稳
273
  top_p=0.85
274
  )
275
  return wavs[0], sr
276
 
277
  @spaces.GPU
278
- def extract_voice_clone_prompt(ref_audio,ref_text,use_xvector_only):
279
- logger.info("正在提取参考音频特征(仅执行一次)...")
280
  tts = load_model("Base", "0.6B")
281
  seed_everything(42)
282
  audio_tuple = _audio_to_tuple(ref_audio)
283
  if audio_tuple is None:
284
  return None, "错误:需要参考音频。"
285
-
286
- # if not use_xvector_only and (not ref_text or not ref_text.strip()):
287
- # return None, "错误:未启用 '仅使用 x-vector' 时需要参考文本。"
288
  r_text = ref_text
289
  uxo = use_xvector_only
290
-
291
- # 如果没有提供参考文本且未开启仅 x-vector 模式,尝试使用 Whisper 自动识别
292
  if not r_text or (isinstance(r_text, str) and not r_text.strip()):
293
  whisper_size = "base"
294
- logger.info(f"未提供参考文本,开始使用 Whisper 自动识别。模型: {whisper_size}")
295
  try:
296
  whisper_model = load_whisper_model(whisper_size)
297
  audio_data, sr = audio_tuple
298
-
299
- # 仅为 Whisper 识别进行重采样,不影响原始 audio_tuple
300
  if sr != 16000:
301
- logger.info(f"Whisper 识别:临时重采样音频 {sr}Hz -> 16000Hz")
302
  whisper_audio = librosa.resample(audio_data, orig_sr=sr, target_sr=16000)
303
  else:
304
  whisper_audio = audio_data
305
-
306
  result = whisper_model.transcribe(whisper_audio)
307
- r_text = result["text"].strip()
308
- logger.info(f"Whisper 识别成功:{r_text}")
 
 
 
 
 
309
  uxo = False
310
  except Exception as e:
311
  logger.error(f"Whisper 识别失败: {str(e)}", exc_info=True)
312
- return None, f"错误:语音识别失败且未提供参考文本。{str(e)}"
313
-
 
 
 
 
 
 
 
 
314
  voice_clone_prompt_items = tts.create_voice_clone_prompt(
315
  ref_audio=audio_tuple,
316
- ref_text=r_text.strip() if (isinstance(r_text, str) and r_text) else None,
317
  x_vector_only_mode=uxo
318
  )
319
- logger.info("参考音频特征提取完成。")
320
-
321
- # 转换为字典列表保存,避免对象序列化问题
322
  prompt_data = []
323
  for item in voice_clone_prompt_items:
324
  prompt_data.append({
@@ -328,413 +203,120 @@ def extract_voice_clone_prompt(ref_audio,ref_text,use_xvector_only):
328
  "icl_mode": item.icl_mode,
329
  "ref_text": item.ref_text
330
  })
331
-
332
- # 生成唯一的文件名
333
  file_id = str(uuid.uuid4())[:8]
334
  file_path = f"voice_clone_prompt_{file_id}.pt"
335
-
336
- # 保存到文件
337
  torch.save(prompt_data, file_path)
338
- logger.info(f"voice_clone_prompt 已保存到: {file_path}")
339
-
340
  return file_path
341
- # @spaces.GPU(duration=60)
342
- # def infer_custom_voice(model_size, part, language, speaker, instruct):
343
- # """Single segment inference for Custom Voice."""
344
- # tts = CUSTOM_VOICE_MODELS[model_size]
345
- # wavs, sr = tts.generate_custom_voice(
346
- # text=part,
347
- # language=language,
348
- # speaker=speaker.lower().replace(" ", "_"),
349
- # instruct=instruct.strip() if instruct else None,
350
- # non_streaming_mode=True,
351
- # max_new_tokens=2048,
352
- # )
353
- # return wavs[0], sr
354
 
355
-
356
- def generate_voice_design(text, language, voice_description, progress=gr.Progress(track_tqdm=True)):
357
- """Generate speech using Voice Design model (1.7B only)."""
358
  if not text or not text.strip():
359
  return None, "错误:文本不能为空。"
360
  if not voice_description or not voice_description.strip():
361
  return None, "错误:语音描述不能为空。"
362
-
363
- logger.info(f"开始 Voice Design 生成任务。语言: {language}, 文本长度: {len(text)}, 描述: {voice_description}")
364
  try:
365
  wav, sr = infer_voice_design(text.strip(), language, voice_description)
366
- logger.info("Voice Design 生成任务完成...")
367
  return (sr, wav), "语音设计生成成功!"
368
  except Exception as e:
369
  logger.error(f"Voice Design 生成失败: {str(e)}", exc_info=True)
370
- return None, f"错误: {type(e).__name__}: {e}"
371
-
372
 
373
- def generate_voice_clone(ref_audio, ref_text, target_text, language, use_xvector_only, model_size, progress=gr.Progress(track_tqdm=True)):
374
- """Generate speech using Base (Voice Clone) model."""
375
- if not target_text or not target_text.strip():
376
  return None, "错误:目标文本不能为空。"
377
-
378
  audio_tuple = _audio_to_tuple(ref_audio)
379
  if audio_tuple is None:
380
  return None, "错误:需要参考音频。"
381
-
382
- if not use_xvector_only and (not ref_text or not ref_text.strip()):
383
  return None, "错误:未启用 '仅使用 x-vector' 时需要参考文本。"
384
-
385
- logger.info(f"开始 Voice Clone 生成任务。模型大小: {model_size}, 语言: {language}, 目标文本长度: {len(target_text)}, 仅使用 x-vector: {use_xvector_only}")
386
  try:
387
- wav, sr = infer_voice_clone(target_text.strip(), language, audio_tuple, ref_text, use_xvector_only)
388
- logger.info("Voice Clone 生成任务完成...")
389
  return (sr, wav), "语音克隆生成成功!"
390
  except Exception as e:
391
  logger.error(f"Voice Clone 生成失败: {str(e)}", exc_info=True)
392
- return None, f"错误: {type(e).__name__}: {e}"
393
 
394
- def generate_voice_clone_from_prompt_file(prompt_file_path, target_text, language, progress=gr.Progress(track_tqdm=True)):
395
- """Generate speech using Base (Voice Clone) model with pre-extracted prompt file."""
396
- if not target_text or not target_text.strip():
397
  return None, "错误:目标文本不能为空。"
398
-
399
  if not prompt_file_path:
400
  return None, "错误:需要提供音频特征文件。"
401
-
402
- logger.info(f"开始 Voice Clone 生成任务(使用特征文件)。语言: {language}, 目标文本长度: {len(target_text)}, 特征文件: {prompt_file_path}")
403
  try:
404
- wav, sr = infer_voice_clone_from_prompt(target_text.strip(), language, prompt_file_path)
405
- logger.info("Voice Clone 生成任务完成...")
406
  return (sr, wav), "语音克隆生成成功(使用特征文件)!"
407
  except Exception as e:
408
  logger.error(f"Voice Clone 生成失败: {str(e)}", exc_info=True)
409
- return None, f"错误: {type(e).__name__}: {e}"
410
-
411
 
412
  @spaces.GPU
413
- def infer_whisper_audio(audio_path, model_size="large-v3"):
414
- """Transcribe audio using Whisper model."""
415
  if not audio_path:
416
  return "错误:请上传音频文件或进行录音。"
417
-
418
- logger.info(f"开始 Whisper 语音识别任务。模型: {model_size}, 音频路径: {audio_path}")
419
  try:
420
  model = load_whisper_model(model_size)
421
-
422
- # 使用 transcribe 方法进行转录
423
- # whisper 会自动处理音频加载和重采样
424
  result = model.transcribe(audio_path)
425
 
426
- text = result["text"]
427
- logger.info(f"Whisper 识别完成。文本长度: {len(text)}")
428
- return text.strip()
 
 
 
 
429
  except Exception as e:
430
  logger.error(f"Whisper 识别失败: {str(e)}", exc_info=True)
431
- return f"识别出错: {type(e).__name__}: {e}"
432
 
433
-
434
- # def generate_custom_voice(text, language, speaker, instruct, model_size, progress=gr.Progress(track_tqdm=True)):
435
- # """Generate speech using CustomVoice model with segment-based GPU allocation."""
436
- # if not text or not text.strip():
437
- # return None, "错误:文本不能为空。"
438
- # if not speaker:
439
- # return None, "错误:说话人不能为空。"
440
-
441
- # logger.info(f"开始 Custom Voice 生成任务。模型大小: {model_size}, 语言: {language}, 说话人: {speaker}, 指令: {instruct}, 文本长度: {len(text)}")
442
- # try:
443
- # text_parts = split_text(text.strip())
444
- # logger.info(f"文本已切分为 {len(text_parts)} 段。")
445
- # all_wavs = []
446
- # sr = 24000
447
-
448
- # for i, part in enumerate(progress.tqdm(text_parts, desc="正在生成分段")):
449
- # logger.info(f"正在处理第 {i+1}/{len(text_parts)} 段文本...")
450
- # wav, current_sr = infer_custom_voice(model_size, part, language, speaker, instruct)
451
- # all_wavs.append(wav)
452
- # sr = current_sr
453
-
454
- # combined_wav = np.concatenate(all_wavs)
455
- # logger.info("Custom Voice 生成任务完成,正在合并音频...")
456
- # return (sr, combined_wav), "语音生成成功!"
457
- # except Exception as e:
458
- # logger.error(f"Custom Voice 生成失败: {str(e)}", exc_info=True)
459
- # return None, f"错误: {type(e).__name__}: {e}"
460
-
461
-
462
- # Build Gradio UI
463
  def build_ui():
464
- theme = gr.themes.Soft(
465
- font=[gr.themes.GoogleFont("Source Sans Pro"), "Arial", "sans-serif"],
466
- )
467
-
468
- css = """
469
- .gradio-container {max-width: none !important;}
470
- .tab-content {padding: 20px;}
471
- """
472
-
473
- with gr.Blocks(theme=theme, css=css, title="Qwen3-TTS Demo") as demo:
474
- gr.Markdown(
475
- """
476
- # Qwen3-TTS Demo
477
- A unified Text-to-Speech demo featuring three powerful modes:
478
- - **Voice Design**: Create custom voices using natural language descriptions
479
- - **Voice Clone (Base)**: Clone any voice from a reference audio
480
- - **ASR (Whisper)**: Accurate speech-to-text using OpenAI's Whisper model
481
- - **TTS (CustomVoice)**: Generate speech with predefined speakers and optional style instructions
482
- Built with [Qwen3-TTS](https://github.com/QwenLM/Qwen3-TTS) by Alibaba Qwen Team.
483
- """
484
- )
485
-
486
  with gr.Tabs():
487
- # Tab 3: ASR (Whisper)
488
  with gr.Tab("ASR (Whisper)"):
489
- gr.Markdown("### 语音识别 (Speech Recognition)")
490
- gr.Markdown("使用 OpenAI Whisper 模型将语音转换为文本。")
491
-
492
  with gr.Row():
493
- with gr.Column(scale=1):
494
- asr_audio_input = gr.Audio(
495
- label="输入音频 (录音或上传)",
496
- type="filepath", # Whisper 需要文件路径
497
- sources=["microphone", "upload"]
498
- )
499
- asr_model_size = gr.Dropdown(
500
- label="Whisper 模型大小",
501
- choices=["base", "small", "medium", "large-v3"],
502
- value="large-v3",
503
- interactive=True,
504
- info="越大越准,但速度越慢"
505
- )
506
- asr_btn = gr.Button("开始识别 (Transcribe)", variant="primary")
507
-
508
- with gr.Column(scale=1):
509
- asr_text_output = gr.Textbox(
510
- label="识别结果",
511
- lines=10,
512
- show_copy_button=True
513
- )
514
-
515
- asr_btn.click(
516
- infer_whisper_audio,
517
- inputs=[asr_audio_input, asr_model_size],
518
- outputs=[asr_text_output],
519
- api_name="infer_whisper"
520
- )
521
-
522
- # Tab 1: Voice Design (Default, 1.7B only)
523
  with gr.Tab("Voice Design"):
524
- gr.Markdown("### Create Custom Voice with Natural Language")
525
  with gr.Row():
526
- with gr.Column(scale=2):
527
- design_text = gr.Textbox(
528
- label="Text to Synthesize",
529
- lines=4,
530
- placeholder="Enter the text you want to convert to speech...",
531
- value="It's in the top drawer... wait, it's empty? No way, that's impossible! I'm sure I put it there!"
532
- )
533
- design_language = gr.Dropdown(
534
- label="Language",
535
- choices=LANGUAGES,
536
- value="Auto",
537
- interactive=True,
538
- )
539
- design_instruct = gr.Textbox(
540
- label="Voice Description",
541
- lines=3,
542
- placeholder="Describe the voice characteristics you want...",
543
- value="Speak in an incredulous tone, but with a hint of panic beginning to creep into your voice."
544
- )
545
- design_btn = gr.Button("Generate with Custom Voice", variant="primary")
546
-
547
- with gr.Column(scale=2):
548
- design_audio_out = gr.Audio(label="Generated Audio", type="numpy")
549
- design_status = gr.Textbox(label="Status", lines=2, interactive=False)
550
-
551
- design_btn.click(
552
- generate_voice_design,
553
- inputs=[design_text, design_language, design_instruct],
554
- outputs=[design_audio_out, design_status],
555
- api_name="generate_voice_design"
556
- )
557
-
558
- # Tab 2: Voice Clone (Base)
559
  with gr.Tab("Voice Clone (Base)"):
560
- # Section 1: Extract Voice Features
561
  gr.Markdown("### 1. 提取音频特征")
562
- gr.Markdown("上传参考音频并提取特征,保存为文件供后续使用。")
563
  with gr.Row():
564
- with gr.Column(scale=2):
565
- extract_ref_audio = gr.Audio(
566
- label="参考音频",
567
- type="numpy",
568
- )
569
- extract_ref_text = gr.Textbox(
570
- label="参考文本(参考音频的文字内容)",
571
- lines=2,
572
- placeholder="输入参考音频中的确切文字...",
573
- )
574
- extract_xvector = gr.Checkbox(
575
- label="仅使用 x-vector(无需参考文本,但质量较低)",
576
- value=False,
577
- )
578
  extract_btn = gr.Button("提取音频特征", variant="primary")
579
-
580
- with gr.Column(scale=2):
581
- extract_file_out = gr.File(label="下载特征文件 (.pt)")
582
- extract_status = gr.Textbox(label="状态", lines=2, interactive=False)
583
-
584
- extract_btn.click(
585
- extract_voice_clone_prompt,
586
- inputs=[extract_ref_audio, extract_ref_text, extract_xvector],
587
- outputs=[extract_file_out],
588
- api_name="extract_voice_clone_prompt"
589
- )
590
-
591
- gr.Markdown("---")
592
-
593
- # Section 2: Generate Voice from Features
594
- gr.Markdown("### 2. 使用特征文件生成语音")
595
- gr.Markdown("上传之前提取的特征文件,快速生成语音(无需重复提取特征)。")
596
  with gr.Row():
597
- with gr.Column(scale=2):
598
- prompt_file = gr.File(
599
- label="音频特征件 (.pt)",
600
- )
601
- prompt_target_text = gr.Textbox(
602
- label="目标文本(要用克隆音色合成的文字)",
603
- lines=4,
604
- placeholder="输入要让克隆音色说话的文字...",
605
- )
606
- prompt_language = gr.Dropdown(
607
- label="语言",
608
- choices=LANGUAGES,
609
- value="Auto",
610
- interactive=True,
611
- )
612
  prompt_btn = gr.Button("使用特征文件生成", variant="primary")
613
-
614
- with gr.Column(scale=2):
615
- prompt_audio_out = gr.Audio(label="生成的音频", type="numpy")
616
- prompt_status = gr.Textbox(label="状态", lines=2, interactive=False)
617
-
618
- prompt_btn.click(
619
- generate_voice_clone_from_prompt_file,
620
- inputs=[prompt_file, prompt_target_text, prompt_language],
621
- outputs=[prompt_audio_out, prompt_status],
622
- api_name="generate_voice_clone_from_prompt"
623
- )
624
-
625
- gr.Markdown("---")
626
-
627
- # Section 3: Traditional Voice Clone (Original)
628
- gr.Markdown("### 3. 传统音色克隆(直接使用参考音频)")
629
- gr.Markdown("直接上传参考音频生成语音(每次都需要提取特征)。")
630
- with gr.Row():
631
- with gr.Column(scale=2):
632
- clone_ref_audio = gr.Audio(
633
- label="参考音频",
634
- type="numpy",
635
- )
636
- clone_ref_text = gr.Textbox(
637
- label="参考文本",
638
- lines=2,
639
- placeholder="输入参考音频中的确切文字...",
640
- )
641
- clone_xvector = gr.Checkbox(
642
- label="仅使用 x-vector",
643
- value=False,
644
- )
645
-
646
- with gr.Column(scale=2):
647
- clone_target_text = gr.Textbox(
648
- label="目标文本",
649
- lines=4,
650
- placeholder="输入要让克隆音色说话的文字...",
651
- )
652
- with gr.Row():
653
- clone_language = gr.Dropdown(
654
- label="语言",
655
- choices=LANGUAGES,
656
- value="Auto",
657
- interactive=True,
658
- )
659
- clone_model_size = gr.Dropdown(
660
- label="模型大小",
661
- choices=MODEL_SIZES,
662
- value="1.7B",
663
- interactive=True,
664
- )
665
- clone_btn = gr.Button("克隆并生成", variant="primary")
666
-
667
- with gr.Row():
668
- clone_audio_out = gr.Audio(label="生成的音频", type="numpy")
669
- clone_status = gr.Textbox(label="状态", lines=2, interactive=False)
670
-
671
- clone_btn.click(
672
- generate_voice_clone,
673
- inputs=[clone_ref_audio, clone_ref_text, clone_target_text, clone_language, clone_xvector, clone_model_size],
674
- outputs=[clone_audio_out, clone_status],
675
- api_name="generate_voice_clone"
676
- )
677
-
678
- # # Tab 3: TTS (CustomVoice)
679
- # with gr.Tab("TTS (CustomVoice)"):
680
- # gr.Markdown("### Text-to-Speech with Predefined Speakers")
681
- # with gr.Row():
682
- # with gr.Column(scale=2):
683
- # tts_text = gr.Textbox(
684
- # label="Text to Synthesize",
685
- # lines=4,
686
- # placeholder="Enter the text you want to convert to speech...",
687
- # value="Hello! Welcome to Text-to-Speech system. This is a demo of our TTS capabilities."
688
- # )
689
- # with gr.Row():
690
- # tts_language = gr.Dropdown(
691
- # label="Language",
692
- # choices=LANGUAGES,
693
- # value="English",
694
- # interactive=True,
695
- # )
696
- # tts_speaker = gr.Dropdown(
697
- # label="Speaker",
698
- # choices=SPEAKERS,
699
- # value="Ryan",
700
- # interactive=True,
701
- # )
702
- # with gr.Row():
703
- # tts_instruct = gr.Textbox(
704
- # label="Style Instruction (Optional)",
705
- # lines=2,
706
- # placeholder="e.g., Speak in a cheerful and energetic tone",
707
- # )
708
- # tts_model_size = gr.Dropdown(
709
- # label="Model Size",
710
- # choices=MODEL_SIZES,
711
- # value="1.7B",
712
- # interactive=True,
713
- # )
714
- # tts_btn = gr.Button("Generate Speech", variant="primary")
715
-
716
- # with gr.Column(scale=2):
717
- # tts_audio_out = gr.Audio(label="Generated Audio", type="numpy")
718
- # tts_status = gr.Textbox(label="Status", lines=2, interactive=False)
719
-
720
- # tts_btn.click(
721
- # generate_custom_voice,
722
- # inputs=[tts_text, tts_language, tts_speaker, tts_instruct, tts_model_size],
723
- # outputs=[tts_audio_out, tts_status],
724
- # api_name="generate_custom_voice"
725
- # )
726
-
727
- gr.Markdown(
728
- """
729
- ---
730
- **Note**: This demo uses HuggingFace Spaces Zero GPU. Each generation has a time limit.
731
- For longer texts, please split them into smaller segments.
732
- """
733
- )
734
-
735
  return demo
736
 
737
-
738
  if __name__ == "__main__":
739
- demo = build_ui()
740
- demo.launch()
 
1
  # coding=utf-8
 
 
 
 
2
  import os
3
  import sys
4
  import logging
 
14
  import random
15
  import whisper
16
  import librosa
17
+ from opencc import OpenCC
18
+
19
  # 配置日志
20
  logging.basicConfig(
21
  level=logging.INFO,
22
  format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
23
+ handlers=[logging.StreamHandler(sys.stdout)]
 
 
24
  )
 
 
 
 
 
 
25
  logger = logging.getLogger("Qwen3-TTS-Demo")
26
 
27
+ # 初始化简繁转换器
28
+ cc = OpenCC('t2s')
29
+
30
  HF_TOKEN = os.environ.get('HF_TOKEN')
31
+ if HF_TOKEN:
32
+ login(token=HF_TOKEN)
33
 
 
34
  MODEL_SIZES = ["0.6B", "1.7B"]
 
 
 
 
 
35
  LANGUAGES = ["Auto", "Chinese", "English", "Japanese", "Korean", "French", "German", "Spanish", "Portuguese", "Russian"]
36
+
37
  def seed_everything(seed=42):
38
  random.seed(seed)
39
  np.random.seed(seed)
 
44
  torch.backends.cudnn.benchmark = False
45
 
46
def get_model_path(model_type: str, model_size: str) -> str:
    """Resolve (downloading if needed) the local path of a Qwen3-TTS checkpoint.

    model_type: e.g. "Base", "VoiceDesign", "CustomVoice".
    model_size: e.g. "0.6B", "1.7B".
    """
    repo_id = f"Qwen/Qwen3-TTS-12Hz-{model_size}-{model_type}"
    return snapshot_download(repo_id)
48
 
49
+ @functools.lru_cache(maxsize=1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  def load_model(model_type, model_size):
 
51
  path = get_model_path(model_type, model_size)
52
  return Qwen3TTSModel.from_pretrained(
53
  path,
54
+ device_map="cuda",
55
  dtype=torch.bfloat16,
56
  token=HF_TOKEN,
57
  attn_implementation="kernels-community/flash-attn3"
 
59
 
60
@functools.lru_cache(maxsize=1)
def load_whisper_model(model_name="large-v3"):
    """Load an OpenAI Whisper checkpoint, cached so only one model stays resident.

    Picks CUDA when available, otherwise falls back to CPU.
    """
    target_device = "cuda" if torch.cuda.is_available() else "cpu"
    return whisper.load_model(model_name, device=target_device)
64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  def _normalize_audio(wav, eps=1e-12, clip=True):
 
66
  x = np.asarray(wav)
 
67
  if np.issubdtype(x.dtype, np.integer):
68
  info = np.iinfo(x.dtype)
69
+ y = x.astype(np.float32) / max(abs(info.min), info.max)
 
 
 
 
70
  elif np.issubdtype(x.dtype, np.floating):
71
  y = x.astype(np.float32)
72
  m = np.max(np.abs(y)) if y.size else 0.0
 
74
  y = y / (m + eps)
75
  else:
76
  raise TypeError(f"Unsupported dtype: {x.dtype}")
 
77
  if clip:
78
  y = np.clip(y, -1.0, 1.0)
 
79
  if y.ndim > 1:
80
  y = np.mean(y, axis=-1).astype(np.float32)
 
81
  return y
82
 
 
83
  def _audio_to_tuple(audio):
 
84
  if audio is None:
85
  return None
 
86
  if isinstance(audio, tuple) and len(audio) == 2 and isinstance(audio[0], int):
87
  sr, wav = audio
88
  wav = _normalize_audio(wav)
89
  return wav, int(sr)
 
90
  if isinstance(audio, dict) and "sampling_rate" in audio and "data" in audio:
91
  sr = int(audio["sampling_rate"])
92
  wav = _normalize_audio(audio["data"])
93
  return wav, sr
 
94
  return None
95
 
 
 
 
96
  @spaces.GPU
97
  def infer_voice_design(part, language, voice_description):
 
98
  voice_design_model = load_model("VoiceDesign","1.7B")
99
  seed_everything(42)
100
  wavs, sr = voice_design_model.generate_voice_design(
 
106
  )
107
  return wavs[0], sr
108
 
 
 
109
  @spaces.GPU
110
+ def infer_voice_clone(part, language, audio_tuple, ref_text, use_xvector_only):
 
 
 
111
  tts = load_model("Base", "0.6B")
112
  voice_clone_prompt = tts.create_voice_clone_prompt(
113
  ref_audio=audio_tuple,
 
119
  language=language,
120
  voice_clone_prompt=voice_clone_prompt,
121
  max_new_tokens=2048,
 
122
  seed=42,
123
+ temperature=0.3,
124
  top_p=0.85
125
  )
126
  return wavs[0], sr
127
 
128
@spaces.GPU
def infer_voice_clone_from_prompt(part, language, prompt_file_path):
    """Synthesize `part` using a voice described by a pre-extracted prompt file.

    The .pt file may hold VoiceClonePromptItem objects, plain dicts of their
    fields, or (legacy) a single serialized object; all three are accepted.
    """
    # NOTE(review): weights_only=False unpickles arbitrary objects from a
    # user-supplied file — safe only if uploads are trusted.
    payload = torch.load(prompt_file_path, map_location='cuda', weights_only=False)

    # Normalize every accepted on-disk layout to a list of prompt items.
    if isinstance(payload, list) and len(payload) > 0 and isinstance(payload[0], VoiceClonePromptItem):
        prompt_items = payload
    elif isinstance(payload, list) and len(payload) > 0 and isinstance(payload[0], dict):
        prompt_items = [VoiceClonePromptItem(**entry) for entry in payload]
    else:
        prompt_items = payload

    # Drop a leading batch axis so ref_code is 2-D, as generation expects.
    if isinstance(prompt_items, list):
        for entry in prompt_items:
            if entry.ref_code is not None and entry.ref_code.ndim == 3:
                entry.ref_code = entry.ref_code.squeeze(0)

    tts = load_model("Base", "0.6B")
    wavs, sr = tts.generate_voice_clone(
        text=part,
        language=language,
        voice_clone_prompt=prompt_items,
        max_new_tokens=2048,
        seed=42,
        temperature=0.3,
        top_p=0.85
    )
    return wavs[0], sr
152
 
153
  @spaces.GPU
154
+ def extract_voice_clone_prompt(ref_audio, ref_text, use_xvector_only):
 
155
  tts = load_model("Base", "0.6B")
156
  seed_everything(42)
157
  audio_tuple = _audio_to_tuple(ref_audio)
158
  if audio_tuple is None:
159
  return None, "错误:需要参考音频。"
 
 
 
160
  r_text = ref_text
161
  uxo = use_xvector_only
 
 
162
  if not r_text or (isinstance(r_text, str) and not r_text.strip()):
163
  whisper_size = "base"
 
164
  try:
165
  whisper_model = load_whisper_model(whisper_size)
166
  audio_data, sr = audio_tuple
 
 
167
  if sr != 16000:
 
168
  whisper_audio = librosa.resample(audio_data, orig_sr=sr, target_sr=16000)
169
  else:
170
  whisper_audio = audio_data
 
171
  result = whisper_model.transcribe(whisper_audio)
172
+
173
+ res_val = result.get("text", "")
174
+ if isinstance(res_val, list) and len(res_val) > 0:
175
+ res_val = res_val[0]
176
+ if not isinstance(res_val, str):
177
+ res_val = str(res_val)
178
+ r_text = cc.convert(res_val.strip())
179
  uxo = False
180
  except Exception as e:
181
  logger.error(f"Whisper 识别失败: {str(e)}", exc_info=True)
182
+ uxo = True
183
+ # return None, f"错误:语音识别失败且未提供参考文本。{str(e)}"
184
+
185
+ r_text_str = ""
186
+ if isinstance(r_text, str):
187
+ r_text_str = r_text.strip()
188
+ elif isinstance(r_text, list) and len(r_text) > 0 and isinstance(r_text[0], str):
189
+ r_text_str = r_text[0].strip()
190
+
191
+ logger.info(f"语音识别成功 :{r_text_str}")
192
  voice_clone_prompt_items = tts.create_voice_clone_prompt(
193
  ref_audio=audio_tuple,
194
+ ref_text=r_text_str if r_text_str else None,
195
  x_vector_only_mode=uxo
196
  )
 
 
 
197
  prompt_data = []
198
  for item in voice_clone_prompt_items:
199
  prompt_data.append({
 
203
  "icl_mode": item.icl_mode,
204
  "ref_text": item.ref_text
205
  })
 
 
206
  file_id = str(uuid.uuid4())[:8]
207
  file_path = f"voice_clone_prompt_{file_id}.pt"
 
 
208
  torch.save(prompt_data, file_path)
 
 
209
  return file_path
 
 
 
 
 
 
 
 
 
 
 
 
 
210
 
211
def generate_voice_design(text, language, voice_description):
    """Validate inputs, then run Voice Design synthesis.

    Returns ``((sr, wav), status)`` on success or ``(None, error_message)``.
    """
    if not (text and text.strip()):
        return None, "错误:文本不能为空。"
    if not (voice_description and voice_description.strip()):
        return None, "错误:语音描述不能为空。"
    try:
        wav, sr = infer_voice_design(text.strip(), language, voice_description)
    except Exception as e:
        logger.error(f"Voice Design 生成失败: {str(e)}", exc_info=True)
        return None, f"错误: {e}"
    return (sr, wav), "语音设计生成成功!"
 
222
 
223
def generate_voice_clone(ref_audio, ref_text, target_text, language, use_xvector_only):
    """Clone the voice of a reference recording onto new target text.

    Args:
        ref_audio: Reference audio as delivered by the Gradio Audio widget.
        ref_text: Transcript of the reference audio (may be empty when
            ``use_xvector_only`` is enabled).
        target_text: Text to synthesize in the cloned voice.
        language: Language name (one of LANGUAGES, e.g. "Auto").
        use_xvector_only: When True, clone from speaker embedding only and
            skip the reference-text requirement.

    Returns:
        ((sample_rate, waveform), status_message) on success, or
        (None, error_message) when input is invalid or inference fails.
    """
    target = target_text.strip() if isinstance(target_text, str) else ""
    if not target:
        return None, "错误:目标文本不能为空。"

    # Normalize whatever the Audio widget produced into (data, sr) or None.
    audio_tuple = _audio_to_tuple(ref_audio)
    if audio_tuple is None:
        return None, "错误:需要参考音频。"

    reference = ref_text.strip() if isinstance(ref_text, str) else ""
    # Without x-vector-only mode, a transcript of the reference is mandatory.
    if not (use_xvector_only or reference):
        return None, "错误:未启用 '仅使用 x-vector' 时需要参考文本。"

    try:
        wav, sr = infer_voice_clone(target, language, audio_tuple, reference, use_xvector_only)
    except Exception as e:
        logger.error(f"Voice Clone 生成失败: {str(e)}", exc_info=True)
        return None, f"错误: {e}"
    return (sr, wav), "语音克隆生成成功!"
239
 
240
def generate_voice_clone_from_prompt_file(prompt_file_path, target_text, language):
    """Clone a voice from a previously extracted feature (.pt) file.

    Args:
        prompt_file_path: Path to the saved voice-clone prompt file.
        target_text: Text to synthesize in the cloned voice.
        language: Language name (one of LANGUAGES, e.g. "Auto").

    Returns:
        ((sample_rate, waveform), status_message) on success, or
        (None, error_message) when input is invalid or inference fails.
    """
    target = target_text.strip() if isinstance(target_text, str) else ""
    if not target:
        return None, "错误:目标文本不能为空。"
    if not prompt_file_path:
        return None, "错误:需要提供音频特征文件。"

    try:
        wav, sr = infer_voice_clone_from_prompt(target, language, prompt_file_path)
    except Exception as e:
        logger.error(f"Voice Clone 生成失败: {str(e)}", exc_info=True)
        return None, f"错误: {e}"
    return (sr, wav), "语音克隆生成成功(使用特征文件)!"
 
252
 
253
@spaces.GPU
def infer_whisper_audio(audio_path, model_size="base"):
    """Transcribe an audio file with Whisper (runs on the Spaces GPU).

    Args:
        audio_path: Filesystem path of the uploaded/recorded audio.
        model_size: Whisper checkpoint size (e.g. "base", "large-v3").

    Returns:
        The recognized text on success, otherwise a Chinese error message
        (errors are reported as a string rather than raised, so the Gradio
        textbox always receives something displayable).
    """
    if not audio_path:
        return "错误:请上传音频文件或进行录音。"

    try:
        model = load_whisper_model(model_size)
        transcription = model.transcribe(audio_path)

        # Defensive normalization: the "text" field is usually a str, but
        # guard against list/other payloads before post-processing.
        text = transcription.get("text", "")
        if isinstance(text, list) and len(text) > 0:
            text = text[0]
        if not isinstance(text, str):
            text = str(text)

        # cc.convert applies an OpenCC conversion to the transcript
        # (presumably Traditional -> Simplified Chinese — confirm cc config).
        return cc.convert(text.strip())
    except Exception as e:
        logger.error(f"Whisper 识别失败: {str(e)}", exc_info=True)
        return f"识别出错: {e}"
271
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
def build_ui():
    """Assemble the Gradio Blocks demo: ASR, Voice Design and Voice Clone tabs.

    Returns:
        The constructed ``gr.Blocks`` application (not yet launched).
    """
    theme = gr.themes.Soft(font=[gr.themes.GoogleFont("Source Sans Pro"), "Arial", "sans-serif"])
    with gr.Blocks(theme=theme, title="Qwen3-TTS Demo") as demo:
        gr.Markdown("# Qwen3-TTS Demo")
        with gr.Tabs():
            # --- Tab 1: speech recognition via Whisper ---
            with gr.Tab("ASR (Whisper)"):
                with gr.Row():
                    with gr.Column():
                        asr_audio_input = gr.Audio(label="输入音频", type="filepath", sources=["microphone", "upload"])
                        asr_model_size = gr.Dropdown(label="Whisper 模型大小", choices=["base", "small", "medium", "large-v3"], value="large-v3")
                        asr_btn = gr.Button("开始识别", variant="primary")
                    with gr.Column():
                        asr_text_output = gr.Textbox(label="识别结果", lines=10, show_copy_button=True)
                asr_btn.click(infer_whisper_audio, inputs=[asr_audio_input, asr_model_size], outputs=[asr_text_output])

            # --- Tab 2: synthesize speech from a textual voice description ---
            with gr.Tab("Voice Design"):
                with gr.Row():
                    with gr.Column():
                        design_text = gr.Textbox(label="目标文本", lines=4, value="It's in the top drawer... wait, it's empty?")
                        design_language = gr.Dropdown(label="语言", choices=LANGUAGES, value="Auto")
                        design_instruct = gr.Textbox(label="语音描述", lines=3, value="Speak in an incredulous tone.")
                        design_btn = gr.Button("开始生成", variant="primary")
                    with gr.Column():
                        design_audio_out = gr.Audio(label="生成音频", type="numpy")
                        design_status = gr.Textbox(label="状态", interactive=False)
                design_btn.click(generate_voice_design, inputs=[design_text, design_language, design_instruct], outputs=[design_audio_out, design_status])

            # --- Tab 3: two-step voice cloning (extract features, then reuse) ---
            with gr.Tab("Voice Clone (Base)"):
                gr.Markdown("### 1. 提取音频特征")
                with gr.Row():
                    with gr.Column():
                        extract_ref_audio = gr.Audio(label="参考音频", type="numpy")
                        extract_ref_text = gr.Textbox(label="参考文本", lines=2)
                        extract_xvector = gr.Checkbox(label="仅使用 x-vector", value=False)
                        extract_btn = gr.Button("提取音频特征", variant="primary")
                    with gr.Column():
                        extract_file_out = gr.File(label="特征文件 (.pt)")
                extract_btn.click(extract_voice_clone_prompt, inputs=[extract_ref_audio, extract_ref_text, extract_xvector], outputs=[extract_file_out])

                gr.Markdown("### 2. 使用特征文件生成")
                with gr.Row():
                    with gr.Column():
                        prompt_file = gr.File(label="特征文件 (.pt)")
                        prompt_target_text = gr.Textbox(label="目标", lines=4)
                        prompt_language = gr.Dropdown(label="语言", choices=LANGUAGES, value="Auto")
                        prompt_btn = gr.Button("使用特征文件生成", variant="primary")
                    with gr.Column():
                        prompt_audio_out = gr.Audio(label="生成音频", type="numpy")
                        prompt_status = gr.Textbox(label="状态", interactive=False)
                prompt_btn.click(generate_voice_clone_from_prompt_file, inputs=[prompt_file, prompt_target_text, prompt_language], outputs=[prompt_audio_out, prompt_status])

    return demo
320
 
 
321
if __name__ == "__main__":
    # Build the Gradio app and serve it; HuggingFace Spaces uses the defaults.
    demo = build_ui()
    demo.launch()
 
requirements.txt CHANGED
@@ -13,3 +13,4 @@ spaces
13
  numpy
14
  kernels
15
  openai-whisper
 
 
13
  numpy
14
  kernels
15
  openai-whisper
16
+ opencc-python-reimplemented