llexieguo commited on
Commit
ed25084
·
1 Parent(s): f718c5e

updated audio

Browse files
Files changed (3) hide show
  1. README.md +40 -1
  2. app.py +160 -13
  3. requirements.txt +1 -0
README.md CHANGED
@@ -61,6 +61,46 @@ API_KEY="你的Key"
61
  - 讲解/MCQ 使用 OpenAI-compatible `/chat/completions`
62
  - TTS 优先尝试 `/audio/speech`,失败后回退 DashScope TTS 接口
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  ## 角色目录结构(自动发现)
65
 
66
  下拉/角色按钮会自动读取 `characters/` 下的所有子目录。
@@ -112,4 +152,3 @@ pip install pypdf
112
  ### 3. MCQ 生成失败 / JSON 错误
113
 
114
  模型可能返回不完整 JSON,代码里已做重试与解析兜底;如果仍失败可重试一次或更换角色 prompt。
115
-
 
61
  - 讲解/MCQ 使用 OpenAI-compatible `/chat/completions`
62
  - TTS 优先尝试 `/audio/speech`,失败后回退 DashScope TTS 接口
63
 
64
+ ## 使用同 Organization 的 HF Audio Space 做 TTS
65
+
66
+ 你的 `audio` 项目已经暴露 API:`/tts_chunk(text, voice, language)`。
67
+
68
+ ### 1. 部署 `/Users/lexi/workplace/audio` 到 HF Space
69
+
70
+ 先在 Hugging Face 里创建组织下的 Space(例如 `your-org/audio`),然后推送代码:
71
+
72
+ ```bash
73
+ cd /Users/lexi/workplace/audio
74
+ git remote add hf https://huggingface.co/spaces/your-org/audio
75
+ git push hf main
76
+ ```
77
+
78
+ 如果该 Space 是私有仓库,请在 HF 里创建一个可访问该组织 Space 的 token(read 权限即可调用)。
79
+
80
+ ### 2. 在 `/Users/lexi/workplace/genai/.env` 配置调用
81
+
82
+ ```env
83
+ # 讲解/MCQ 仍走 DashScope/OpenAI-compatible
84
+ API_URL="https://dashscope.aliyuncs.com/compatible-mode/v1"
85
+ API_KEY="你的Key"
86
+ USE_MOCK_MODELS=0
87
+
88
+ # TTS 走 HF Space(优先)
89
+ HF_TTS_SPACE_ID="your-org/audio"
90
+ # 如果是私有 Space,填 token;公开 Space 可不填
91
+ HF_TOKEN="hf_xxx"
92
+ HF_TTS_API_NAME="/tts_chunk"
93
+ HF_TTS_VOICE="male" # male 或 female
94
+ HF_TTS_LANGUAGE="Chinese"
95
+ # 1=HF失败时回退到原有TTS;0=只用HF,失败就报错
96
+ HF_TTS_ALLOW_FALLBACK=1
97
+ ```
98
+
99
+ 可选:
100
+
101
+ - 如果你更希望用完整 URL,可以改为 `HF_TTS_SPACE_URL="https://your-org-audio.hf.space"`。
102
+ - 如果不想回退到原 TTS 接口,设置 `HF_TTS_ALLOW_FALLBACK=0`。
103
+
104
  ## 角色目录结构(自动发现)
105
 
106
  下拉/角色按钮会自动读取 `characters/` 下的所有子目录。
 
152
  ### 3. MCQ 生成失败 / JSON 错误
153
 
154
  模型可能返回不完整 JSON,代码里已做重试与解析兜底;如果仍失败可重试一次或更换角色 prompt。
 
app.py CHANGED
@@ -14,6 +14,11 @@ from typing import Any, Dict, List, Optional
14
  import gradio as gr
15
  import requests
16
 
 
 
 
 
 
17
  try:
18
  import spaces # type: ignore
19
  except Exception:
@@ -62,6 +67,18 @@ CHAT_MODEL_ID = os.getenv("QWEN_VL_MODEL_ID", "qwen-vl-max")
62
  TTS_MODEL_ID = os.getenv("QWEN_TTS_MODEL_ID", "qwen-tts")
63
  TTS_SPEAKER = os.getenv("QWEN_TTS_SPEAKER", "longxiaochun_v2")
64
  TTS_FORMAT = os.getenv("QWEN_TTS_FORMAT", "wav")
 
 
 
 
 
 
 
 
 
 
 
 
65
  API_TIMEOUT_SEC = int(os.getenv("API_TIMEOUT_SEC", "180"))
66
  QWEN_VL_MAX_PAGES = int(os.getenv("QWEN_VL_MAX_PAGES", "4"))
67
  QWEN_VL_RENDER_SCALE = float(os.getenv("QWEN_VL_RENDER_SCALE", "1.5"))
@@ -388,6 +405,54 @@ def _save_binary_audio(audio_bytes: bytes, out_path: str) -> str:
388
  return out_path
389
 
390
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
391
  def split_text_for_tts(text: str, max_len: int = 480) -> List[str]:
392
  cleaned = re.sub(r"\s+", " ", (text or "")).strip()
393
  if not cleaned:
@@ -456,7 +521,7 @@ class QwenPipelineEngine:
456
  This ships with a mock mode by default so the workflow is runnable immediately.
457
  When USE_MOCK_MODELS=0, it calls remote APIs:
458
  - VL: OpenAI-compatible /chat/completions (works with DashScope compatible-mode and vLLM-style APIs)
459
- - TTS: DashScope multimodal generation API (returns audio URL)
460
  """
461
 
462
  def __init__(self) -> None:
@@ -464,6 +529,7 @@ class QwenPipelineEngine:
464
  self.vl_loaded = False
465
  self.tts_loaded = False
466
  self._pdf_page_cache: Dict[str, List[str]] = {}
 
467
 
468
  def ensure_vl_loaded(self) -> None:
469
  if self.vl_loaded:
@@ -479,6 +545,10 @@ class QwenPipelineEngine:
479
  def ensure_tts_loaded(self) -> None:
480
  if self.tts_loaded:
481
  return
 
 
 
 
482
  if self.mock_mode:
483
  self.tts_loaded = True
484
  return
@@ -487,16 +557,76 @@ class QwenPipelineEngine:
487
  raise RuntimeError("Missing API_KEY for TTS API calls.")
488
  self.tts_loaded = True
489
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
490
  def _mock_generate_lecture(self, pdf_excerpt: str) -> str:
491
  excerpt = re.sub(r"\s+", " ", pdf_excerpt).strip()
492
  excerpt = excerpt[:1000]
493
  return (
494
- "这是一段基于论文内容的课堂讲解(Mock 模式)。\n\n"
495
- "1. 论文问题与背景:该工作试图解决一个具体任务中的效率/性能/泛化问题,核心动机通常是现有方法在成本、准确性或可解释性方面存在不足。\n"
496
- "2. 核心方法:作者提出新的模型结构、训练策略或推理流程,并通过若干模块协同完成任务。\n"
497
- "3. 实验与结果:论文通常会在标准数据集上与基线比较,并报告性能提升、效率改善或更稳定的表现。\n"
498
- "4. 局限与适用场景:方法可能依赖特定数据分布、计算资源或任务设定,迁移到新领域需要额外验证。\n\n"
499
- f"论文节选(用于生成讲解): {excerpt}"
500
  )
501
 
502
  def _mock_generate_mcqs(self, lecture_text: str) -> List[MCQItem]:
@@ -609,6 +739,16 @@ class QwenPipelineEngine:
609
  def _real_tts_single(self, text: str, out_path: str) -> str:
610
  if not text.strip():
611
  return write_tone_wav("empty", out_path)
 
 
 
 
 
 
 
 
 
 
612
  openai_url = f"{_require_api_url()}/audio/speech"
613
  openai_payload = {
614
  "model": TTS_MODEL_ID,
@@ -787,7 +927,7 @@ class QwenPipelineEngine:
787
  def synthesize_tts(self, text: str, name_prefix: str = "audio") -> str:
788
  self.ensure_tts_loaded()
789
  out_path = str(TMP_DIR / f"{name_prefix}_{uuid.uuid4().hex}.wav")
790
- if self.mock_mode:
791
  return write_tone_wav(text, out_path)
792
  return self._real_tts(text, out_path)
793
 
@@ -1576,14 +1716,20 @@ def play_lecture_audio(state: Dict[str, Any]):
1576
  if not state.get("lecture_text"):
1577
  state["status"] = "No lecture text available."
1578
  return state, state["status"], state.get("lecture_audio_path"), "Generate lecture first."
 
1579
  try:
1580
- state["status"] = "Generating lecture audio..."
1581
  state["lecture_audio_path"] = engine.synthesize_tts(state["lecture_text"], name_prefix="lecture")
1582
  state["status"] = "Lecture audio ready."
1583
- return state, state["status"], state["lecture_audio_path"], "Lecture audio generated."
1584
  except Exception as exc:
1585
  state["status"] = "Lecture audio generation failed."
1586
- return state, state["status"], state.get("lecture_audio_path"), f"TTS error: {type(exc).__name__}: {exc}"
 
 
 
 
 
1587
 
1588
 
1589
  def play_explanation_audio(state: Dict[str, Any]):
@@ -2454,6 +2600,7 @@ with gr.Blocks(css=CSS) as demo:
2454
  )
2455
  with gr.Row(elem_id="lecture-actions"):
2456
  play_lecture_btn = gr.Button("Play Lecture Audio", interactive=False, scale=0)
 
2457
  with gr.Row(elem_id="exam-entry-wrap"):
2458
  exam_btn = gr.Button("Go to Exam", interactive=False, variant="secondary", scale=0)
2459
 
@@ -2591,8 +2738,8 @@ with gr.Blocks(css=CSS) as demo:
2591
  play_lecture_btn.click(
2592
  fn=play_lecture_audio,
2593
  inputs=[state],
2594
- outputs=[state, status_box, lecture_audio, feedback_box],
2595
- show_progress="hidden",
2596
  )
2597
 
2598
 
 
14
  import gradio as gr
15
  import requests
16
 
17
+ try:
18
+ from gradio_client import Client as HFSpaceClient
19
+ except Exception: # pragma: no cover
20
+ HFSpaceClient = None # type: ignore
21
+
22
  try:
23
  import spaces # type: ignore
24
  except Exception:
 
67
  TTS_MODEL_ID = os.getenv("QWEN_TTS_MODEL_ID", "qwen-tts")
68
  TTS_SPEAKER = os.getenv("QWEN_TTS_SPEAKER", "longxiaochun_v2")
69
  TTS_FORMAT = os.getenv("QWEN_TTS_FORMAT", "wav")
70
+ HF_TTS_SPACE_ID = os.getenv("HF_TTS_SPACE_ID", "").strip()
71
+ HF_TTS_SPACE_URL = os.getenv("HF_TTS_SPACE_URL", "").strip()
72
+ _hf_tts_api_name_raw = (os.getenv("HF_TTS_API_NAME", "/tts_chunk") or "").strip()
73
+ HF_TTS_API_NAME = f"/{_hf_tts_api_name_raw.lstrip('/')}" if _hf_tts_api_name_raw else "/tts_chunk"
74
+ HF_TTS_VOICE = os.getenv("HF_TTS_VOICE", "male")
75
+ HF_TTS_LANGUAGE = os.getenv("HF_TTS_LANGUAGE", "Chinese")
76
+ HF_TTS_ALLOW_FALLBACK = os.getenv("HF_TTS_ALLOW_FALLBACK", "1") == "1"
77
+ HF_TOKEN = (
78
+ os.getenv("HF_TOKEN")
79
+ or os.getenv("HUGGINGFACEHUB_API_TOKEN")
80
+ or os.getenv("HF_API_TOKEN", "")
81
+ )
82
  API_TIMEOUT_SEC = int(os.getenv("API_TIMEOUT_SEC", "180"))
83
  QWEN_VL_MAX_PAGES = int(os.getenv("QWEN_VL_MAX_PAGES", "4"))
84
  QWEN_VL_RENDER_SCALE = float(os.getenv("QWEN_VL_RENDER_SCALE", "1.5"))
 
405
  return out_path
406
 
407
 
408
+ def _is_hf_tts_enabled() -> bool:
409
+ return bool(HF_TTS_SPACE_ID or HF_TTS_SPACE_URL)
410
+
411
+
412
+ def _tts_backend_name() -> str:
413
+ if _is_hf_tts_enabled():
414
+ return f"hf_space:{HF_TTS_SPACE_ID or HF_TTS_SPACE_URL}"
415
+ if USE_MOCK_MODELS:
416
+ return "mock_tts"
417
+ return "api_tts"
418
+
419
+
420
+ def _extract_audio_source(result: Any) -> str:
421
+ if isinstance(result, str):
422
+ return result
423
+ if isinstance(result, dict):
424
+ for key in ("path", "name", "url"):
425
+ value = result.get(key)
426
+ if isinstance(value, str) and value.strip():
427
+ return value
428
+ nested = result.get("audio")
429
+ if nested is not None:
430
+ return _extract_audio_source(nested)
431
+ if isinstance(result, (list, tuple)):
432
+ for item in result:
433
+ try:
434
+ return _extract_audio_source(item)
435
+ except RuntimeError:
436
+ continue
437
+ raise RuntimeError(f"Unsupported HF Space audio output: {result!r}")
438
+
439
+
440
+ def _read_audio_bytes_from_source(source: str) -> bytes:
441
+ source = (source or "").strip()
442
+ if not source:
443
+ raise RuntimeError("HF Space returned an empty audio source.")
444
+ if source.startswith("http://") or source.startswith("https://"):
445
+ resp = requests.get(source, timeout=API_TIMEOUT_SEC)
446
+ if resp.status_code >= 400:
447
+ raise RuntimeError(f"Failed to fetch HF Space audio URL {resp.status_code}: {resp.text[:500]}")
448
+ return resp.content
449
+
450
+ path = Path(source)
451
+ if path.exists():
452
+ return path.read_bytes()
453
+ raise RuntimeError(f"HF Space audio path does not exist: {source}")
454
+
455
+
456
  def split_text_for_tts(text: str, max_len: int = 480) -> List[str]:
457
  cleaned = re.sub(r"\s+", " ", (text or "")).strip()
458
  if not cleaned:
 
521
  This ships with a mock mode by default so the workflow is runnable immediately.
522
  When USE_MOCK_MODELS=0, it calls remote APIs:
523
  - VL: OpenAI-compatible /chat/completions (works with DashScope compatible-mode and vLLM-style APIs)
524
+ - TTS: HF Space /tts_chunk (optional) or DashScope/OpenAI-compatible endpoints
525
  """
526
 
527
  def __init__(self) -> None:
 
529
  self.vl_loaded = False
530
  self.tts_loaded = False
531
  self._pdf_page_cache: Dict[str, List[str]] = {}
532
+ self._hf_tts_client: Any = None
533
 
534
  def ensure_vl_loaded(self) -> None:
535
  if self.vl_loaded:
 
545
  def ensure_tts_loaded(self) -> None:
546
  if self.tts_loaded:
547
  return
548
+ if _is_hf_tts_enabled():
549
+ self._ensure_hf_tts_client()
550
+ self.tts_loaded = True
551
+ return
552
  if self.mock_mode:
553
  self.tts_loaded = True
554
  return
 
557
  raise RuntimeError("Missing API_KEY for TTS API calls.")
558
  self.tts_loaded = True
559
 
560
+ def _ensure_hf_tts_client(self) -> Any:
561
+ if HFSpaceClient is None:
562
+ raise RuntimeError("Missing gradio_client. Please install with: pip install gradio_client")
563
+ if self._hf_tts_client is not None:
564
+ return self._hf_tts_client
565
+ src = HF_TTS_SPACE_URL or HF_TTS_SPACE_ID
566
+ if not src:
567
+ raise RuntimeError("Missing HF_TTS_SPACE_ID or HF_TTS_SPACE_URL.")
568
+ token = (HF_TOKEN or "").strip()
569
+ # gradio_client constructor args differ across versions; handle both old/new signatures.
570
+ if not token:
571
+ self._hf_tts_client = HFSpaceClient(src)
572
+ return self._hf_tts_client
573
+ try:
574
+ self._hf_tts_client = HFSpaceClient(src, hf_token=token)
575
+ except TypeError:
576
+ try:
577
+ self._hf_tts_client = HFSpaceClient(src, token=token)
578
+ except TypeError:
579
+ self._hf_tts_client = HFSpaceClient(src, headers={"Authorization": f"Bearer {token}"})
580
+ return self._hf_tts_client
581
+
582
+ def _hf_space_tts_single(self, text: str, out_path: str) -> str:
583
+ client = self._ensure_hf_tts_client()
584
+ configured = (HF_TTS_API_NAME or "").strip()
585
+ normalized = configured.lstrip("/")
586
+ api_candidates: List[str] = []
587
+ for cand in [configured, f"/{normalized}" if normalized else "", normalized, "/tts_chunk", "tts_chunk", "/predict", "predict"]:
588
+ cand = cand.strip()
589
+ if cand and cand not in api_candidates:
590
+ api_candidates.append(cand)
591
+
592
+ result: Any = None
593
+ last_exc: Optional[Exception] = None
594
+ for api_name in api_candidates:
595
+ try:
596
+ result = client.predict(
597
+ text,
598
+ HF_TTS_VOICE,
599
+ HF_TTS_LANGUAGE,
600
+ api_name=api_name,
601
+ )
602
+ last_exc = None
603
+ break
604
+ except Exception as exc:
605
+ msg = str(exc)
606
+ if "Cannot find a function with api_name" in msg:
607
+ last_exc = exc
608
+ continue
609
+ raise
610
+ if last_exc is not None:
611
+ available_hint = ""
612
+ view_api = getattr(client, "view_api", None)
613
+ if callable(view_api):
614
+ try:
615
+ api_info = view_api(return_format="dict")
616
+ available_hint = f" Available endpoints: {api_info}"
617
+ except Exception:
618
+ available_hint = ""
619
+ tried = ", ".join(api_candidates)
620
+ raise RuntimeError(f"No matching HF API endpoint. Tried: [{tried}].{available_hint}") from last_exc
621
+ source = _extract_audio_source(result)
622
+ audio_bytes = _read_audio_bytes_from_source(source)
623
+ return _save_binary_audio(audio_bytes, out_path)
624
+
625
  def _mock_generate_lecture(self, pdf_excerpt: str) -> str:
626
  excerpt = re.sub(r"\s+", " ", pdf_excerpt).strip()
627
  excerpt = excerpt[:1000]
628
  return (
629
+ f" {excerpt}"
 
 
 
 
 
630
  )
631
 
632
  def _mock_generate_mcqs(self, lecture_text: str) -> List[MCQItem]:
 
739
  def _real_tts_single(self, text: str, out_path: str) -> str:
740
  if not text.strip():
741
  return write_tone_wav("empty", out_path)
742
+
743
+ if _is_hf_tts_enabled():
744
+ try:
745
+ return self._hf_space_tts_single(text, out_path)
746
+ except Exception as exc:
747
+ if not HF_TTS_ALLOW_FALLBACK:
748
+ raise RuntimeError(f"HF Space TTS failed and fallback is disabled: {type(exc).__name__}: {exc}")
749
+ if self.mock_mode:
750
+ return write_tone_wav(text, out_path)
751
+
752
  openai_url = f"{_require_api_url()}/audio/speech"
753
  openai_payload = {
754
  "model": TTS_MODEL_ID,
 
927
  def synthesize_tts(self, text: str, name_prefix: str = "audio") -> str:
928
  self.ensure_tts_loaded()
929
  out_path = str(TMP_DIR / f"{name_prefix}_{uuid.uuid4().hex}.wav")
930
+ if self.mock_mode and not _is_hf_tts_enabled():
931
  return write_tone_wav(text, out_path)
932
  return self._real_tts(text, out_path)
933
 
 
1716
  if not state.get("lecture_text"):
1717
  state["status"] = "No lecture text available."
1718
  return state, state["status"], state.get("lecture_audio_path"), "Generate lecture first."
1719
+ backend = _tts_backend_name()
1720
  try:
1721
+ state["status"] = f"Generating lecture audio ({backend})..."
1722
  state["lecture_audio_path"] = engine.synthesize_tts(state["lecture_text"], name_prefix="lecture")
1723
  state["status"] = "Lecture audio ready."
1724
+ return state, state["status"], state["lecture_audio_path"], f"Lecture audio generated via `{backend}`."
1725
  except Exception as exc:
1726
  state["status"] = "Lecture audio generation failed."
1727
+ return (
1728
+ state,
1729
+ state["status"],
1730
+ state.get("lecture_audio_path"),
1731
+ f"TTS error via `{backend}`: {type(exc).__name__}: {exc}",
1732
+ )
1733
 
1734
 
1735
  def play_explanation_audio(state: Dict[str, Any]):
 
2600
  )
2601
  with gr.Row(elem_id="lecture-actions"):
2602
  play_lecture_btn = gr.Button("Play Lecture Audio", interactive=False, scale=0)
2603
+ lecture_feedback = gr.Markdown("")
2604
  with gr.Row(elem_id="exam-entry-wrap"):
2605
  exam_btn = gr.Button("Go to Exam", interactive=False, variant="secondary", scale=0)
2606
 
 
2738
  play_lecture_btn.click(
2739
  fn=play_lecture_audio,
2740
  inputs=[state],
2741
+ outputs=[state, status_box, lecture_audio, lecture_feedback],
2742
+ show_progress="minimal",
2743
  )
2744
 
2745
 
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  gradio
 
2
  spaces
3
  requests
4
  pypdf
 
1
  gradio
2
+ gradio_client
3
  spaces
4
  requests
5
  pypdf