Isshi14 commited on
Commit
3828c7d
Β·
verified Β·
1 Parent(s): 3e9d922

Upload 11 files

Browse files
Files changed (11) hide show
  1. README.md +57 -13
  2. app.py +393 -0
  3. convert_to_word.ps1 +58 -0
  4. gitattributes +35 -0
  5. ingestion.py +217 -0
  6. packages.txt +0 -0
  7. rag.py +198 -0
  8. requirements.txt +13 -0
  9. script_gen.py +310 -0
  10. tts.py +293 -0
  11. utils.py +62 -0
README.md CHANGED
@@ -1,13 +1,57 @@
1
- ---
2
- title: Voiceover Ai 2
3
- emoji: πŸ‘€
4
- colorFrom: red
5
- colorTo: pink
6
- sdk: gradio
7
- sdk_version: 6.6.0
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: VoiceVerse AI
3
+ emoji: πŸŽ™οΈ
4
+ colorFrom: indigo
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: "5.23.1"
8
+ python_version: "3.10"
9
+ app_file: app.py
10
+ pinned: false
11
+ ---
12
+
13
+ # πŸŽ™οΈ VoiceVerse AI β€” Document to Audio
14
+
15
+ Transform uploaded documents into engaging, emotionally expressive podcast-style audio narrations.
16
+
17
+ ## Pipeline
18
+
19
+ ```
20
+ PDF/TXT β†’ Text Extraction β†’ RAG (chunk + embed + retrieve) β†’ Script Generation (SmolLM3-3B) β†’ TTS (Qwen3-TTS / Edge-TTS) β†’ Audio Playback
21
+ ```
22
+
23
+ ## Models Used
24
+
25
+ | Component | Model | How |
26
+ |-----------|-------|-----|
27
+ | Embeddings | `all-MiniLM-L6-v2` | Local (CPU) |
28
+ | Script Gen | `HuggingFaceTB/SmolLM3-3B` | HF Inference API |
29
+ | TTS (primary) | `Qwen3-TTS` | HF Inference API |
30
+ | TTS (fallback) | `Edge-TTS (AriaNeural)` | Local (CPU) |
31
+
32
+ ## Setup
33
+
34
+ ```bash
35
+ pip install -r requirements.txt
36
+ export HF_TOKEN="your_huggingface_token_here"
37
+ python app.py
38
+ ```
39
+
40
+ ## Deployment on HF Spaces
41
+
42
+ 1. Create a new Space (Gradio SDK)
43
+ 2. Upload all project files
44
+ 3. Set `HF_TOKEN` as a Space Secret
45
+ 4. The app will auto-launch on port 7860
46
+
47
+ ## Project Structure
48
+
49
+ ```
50
+ app.py # Gradio UI entry point
51
+ rag.py # Document ingestion, chunking, embedding, retrieval
52
+ script_gen.py # LLM script generation (Mistral-7B-Instruct)
53
+ tts.py # Text-to-speech (Qwen3-TTS + Edge-TTS fallback)
54
+ utils.py # Helpers (temp files, validation, error formatting)
55
+ requirements.txt # Python dependencies
56
+ packages.txt # System packages (ffmpeg)
57
+ ```
app.py ADDED
@@ -0,0 +1,393 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ VoiceVerse AI β€” Main Application.
3
+
4
+ Input sources (three tabs):
5
+ Tab 1 β€” Upload PDF or TXT file
6
+ Tab 2 β€” URL / YouTube link
7
+ Tab 3 β€” Paste raw text
8
+
9
+ Delivery Modes:
10
+ Summary / Podcast / Song / Rap / Debate / Story
11
+
12
+ No status card shown. RAG pipeline unchanged.
13
+ """
14
+
15
+ import os
16
+ import gradio as gr
17
+ from utils import logger, validate_file, format_error
18
+ from rag import extract_text, RAGStore
19
+ from script_gen import generate_script
20
+ from tts import generate_audio, generate_audio_podcast, generate_audio_debate, generate_audio_rap, generate_audio_story
21
+ from ingestion import ingest_from_url_or_text, extract_pasted_text
22
+
23
+ # ── Global RAG Store ──────────────────────────────────────────────────────────
24
+ rag_store = RAGStore()
25
+
26
+
27
+ # ══════════════════════════════════════════════════════════════════════════════
28
+ # Shared RAG + Script + TTS pipeline
29
+ # ══════════════════════════════════════════════════════════════════════════════
30
+
31
def _run_pipeline(
    text: str,
    delivery_mode: str,
    song_rap_sub: str,
    progress,
) -> tuple[str, str]:
    """
    RAG β†’ script β†’ audio. Shared by all three input tabs.

    Args:
        text: Full plain-text content extracted from the chosen source.
        delivery_mode: One of the UI radio choices
            ("Summary", "Podcast", "Song / Rap", "Debate", "Story").
        song_rap_sub: "Song" or "Rap" — only consulted when delivery_mode
            is "Song / Rap".
        progress: Gradio progress callback: progress(fraction, desc=...).

    Returns:
        (script, audio_path) — the generated script text and the path to
        the synthesized audio file.
    """
    # RAG: chunk & embed the whole document into the module-level store.
    progress(0.30, desc="🧠 Building knowledge index…")
    rag_store.add_document(text)
    chunk_count = len(rag_store.chunks)
    logger.info("RAG index: %d chunks", chunk_count)

    # RAG: retrieve. Short documents (≀8 chunks) skip semantic search and
    # feed everything to the LLM; longer ones retrieve the top-6 chunks
    # against a generic "main topics" query.
    progress(0.45, desc="πŸ” Retrieving relevant content…")
    if chunk_count <= 8:
        context_chunks = rag_store.get_all_chunks()
    else:
        context_chunks = rag_store.query(
            "What are the main topics, key insights, and important details?",
            top_k=6,
        )

    # Script generation via the LLM (see script_gen.generate_script).
    progress(0.60, desc=f"✍️ Writing {_mode_label(delivery_mode, song_rap_sub)} script…")
    script = generate_script(
        context_chunks=context_chunks,
        mode=delivery_mode,
        sub_mode=song_rap_sub,
    )
    logger.info("Script: %d chars", len(script))

    # TTS — route by mode. Note "Song / Rap" only gets the rap engine when
    # the sub-mode is Rap; Song falls through to the generic single-voice TTS.
    progress(0.80, desc="πŸŽ™οΈ Synthesising audio…")
    m = delivery_mode.strip().lower()
    if m == "podcast":
        audio_path, engine = generate_audio_podcast(script)
    elif m == "debate":
        audio_path, engine = generate_audio_debate(script)
    elif m == "song / rap" and song_rap_sub.lower() == "rap":
        audio_path, engine = generate_audio_rap(script)
    elif m == "story":
        audio_path, engine = generate_audio_story(script)
    else:
        audio_path, engine = generate_audio(script)
    logger.info("Audio via %s: %s", engine, audio_path)

    progress(1.00, desc="βœ… Done!")
    return script, audio_path
83
+
84
+
85
+ def _mode_label(mode: str, sub_mode: str) -> str:
86
+ m = mode.lower()
87
+ if "podcast" in m:
88
+ return "podcast"
89
+ if "debate" in m:
90
+ return "debate"
91
+ if "story" in m:
92
+ return "story"
93
+ if "song" in m or "rap" in m:
94
+ return sub_mode.lower()
95
+ return "summary"
96
+
97
+
98
+ # ══════════════════════════════════════════════════════════════════════════════
99
+ # Per-tab handlers
100
+ # ══════════════════════════════════════════════════════════════════════════════
101
+
102
def process_file(file, delivery_mode, song_rap_sub, progress=gr.Progress()):
    """
    Tab 1 handler: uploaded PDF/TXT file β†’ (script, audio_path).

    All failures surface as gr.Error with a user-readable message.
    """
    if file is None:
        raise gr.Error("Please upload a PDF or TXT file first.")
    file_path = file.name if hasattr(file, "name") else str(file)
    is_valid, msg = validate_file(file_path)
    if not is_valid:
        raise gr.Error(msg)
    try:
        progress(0.10, desc="πŸ“„ Extracting text from document…")
        text = extract_text(file_path)
        # Guard against scanned/image-only PDFs that yield almost no text.
        if not text or len(text.strip()) < 50:
            raise gr.Error("Document has too little text. Please upload a richer file.")
        progress(0.20, desc="βœ… Text extracted")
        return _run_pipeline(text, delivery_mode, song_rap_sub, progress)
    except gr.Error:
        raise  # already user-facing
    except OSError as e:
        # EnvironmentError is a legacy alias of OSError since Python 3.3;
        # chain the cause so the original traceback is preserved in logs.
        raise gr.Error(str(e)) from e
    except Exception as e:
        raise gr.Error(format_error("pipeline", e)) from e
122
+
123
+
124
def process_url(url_input, delivery_mode, song_rap_sub, progress=gr.Progress()):
    """
    Tab 2 handler: URL / YouTube link β†’ (script, audio_path).

    All failures surface as gr.Error with a user-readable message.
    """
    if not url_input or not url_input.strip():
        raise gr.Error("Please enter a URL or YouTube link.")
    try:
        progress(0.05, desc="🌐 Fetching content…")
        text, source_label = ingest_from_url_or_text(url_input.strip())
        logger.info("Ingested from %s: %d chars", source_label, len(text))
        progress(0.20, desc=f"βœ… Content fetched from {source_label}")
        return _run_pipeline(text, delivery_mode, song_rap_sub, progress)
    except gr.Error:
        raise  # already user-facing
    except (ValueError, OSError) as e:
        # Both map to the same user message; EnvironmentError == OSError.
        # Chain the cause so the original traceback is preserved in logs.
        raise gr.Error(str(e)) from e
    except Exception as e:
        raise gr.Error(format_error("pipeline", e)) from e
141
+
142
+
143
def process_paste(pasted_text, delivery_mode, song_rap_sub, progress=gr.Progress()):
    """
    Tab 3 handler: pasted raw text β†’ (script, audio_path).

    All failures surface as gr.Error with a user-readable message.
    """
    if not pasted_text or not pasted_text.strip():
        raise gr.Error("Please paste some text first.")
    try:
        progress(0.10, desc="πŸ“‹ Processing pasted text…")
        text = extract_pasted_text(pasted_text)
        progress(0.20, desc="βœ… Text ready")
        return _run_pipeline(text, delivery_mode, song_rap_sub, progress)
    except gr.Error:
        raise  # already user-facing
    except (ValueError, OSError) as e:
        # Both map to the same user message; EnvironmentError == OSError.
        # Chain the cause so the original traceback is preserved in logs.
        raise gr.Error(str(e)) from e
    except Exception as e:
        raise gr.Error(format_error("pipeline", e)) from e
159
+
160
+
161
+ # ══════════════════════════════════════════════════════════════════════════════
162
+ # UI helpers
163
+ # ══════════════════════════════════════════════════════════════════════════════
164
+
165
+ def _mode_description(mode: str) -> str:
166
+ return {
167
+ "Summary": (
168
+ "*πŸ“‹ **Summary** β€” Structured narration: intro, key points, conclusion. "
169
+ "Single voice, neutral tone.*"
170
+ ),
171
+ "Podcast": (
172
+ "*πŸŽ™οΈ **Podcast** β€” Two-host conversation. Female host guides; "
173
+ "Male host explains. Dual voices.*"
174
+ ),
175
+ "Song / Rap": (
176
+ "*🎡 **Song / Rap** β€” Key ideas as a rhythmic track. "
177
+ "Song = smooth flow Β· Rap = fast, punchy, bass-boosted.*"
178
+ ),
179
+ "Debate": (
180
+ "*βš”οΈ **Debate** β€” Two debaters argue opposing sides. "
181
+ "Female voice (pro, assertive) vs Male voice (con, deliberate).*"
182
+ ),
183
+ "Story": (
184
+ "*πŸ“– **Story** β€” Content retold as an immersive narrative. "
185
+ "Slow, warm delivery with expressive pauses.*"
186
+ ),
187
+ }.get(mode, "")
188
+
189
+
190
def _on_mode_change(mode: str):
    """Toggle the Song/Rap sub-selector and refresh the description card."""
    lowered = mode.lower()
    needs_sub_choice = ("song" in lowered) or ("rap" in lowered)
    return gr.update(visible=needs_sub_choice), _mode_description(mode)
193
+
194
+
195
+ # ══════════════════════════════════════════════════════════════════════════════
196
+ # Gradio UI
197
+ # ══════════════════════════════════════════════════════════════════════════════
198
+
199
def build_ui() -> gr.Blocks:
    """
    Assemble the full Gradio Blocks application.

    Layout: left column holds three input tabs (file / URL / paste) plus the
    delivery-mode selector; right column shows the generated audio and script.
    Returns the Blocks object — the caller is responsible for .launch().
    """

    # Page-level CSS: gradient title, mode card, and hiding the default
    # "Radio" label Gradio attaches to the delivery-mode selector.
    css = """
    .main-header { text-align: center; margin-bottom: 1rem; }
    .main-header h1 {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
        font-size: 2.5rem;
        font-weight: 800;
        margin-bottom: 0.25rem;
    }
    .main-header p { color: #6b7280; font-size: 1.1rem; }

    .mode-card {
        background: linear-gradient(135deg, #f8f7ff 0%, #f0edff 100%);
        border: 1px solid #e0d9ff;
        border-radius: 12px;
        padding: 1rem 1.25rem;
        margin-top: 0.75rem;
        margin-bottom: 0.75rem;
    }

    /* Hide the "Radio" label Gradio adds automatically */
    #delivery-mode-radio .label-wrap { display: none !important; }

    .url-hint { color: #6b7280; font-size: 0.82rem; margin-top: 0.3rem; }
    """

    with gr.Blocks(
        title="VoiceVerse AI",
        theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="purple"),
        css=css,
    ) as app:

        # ── Header ───────────────────────────────────────────────────────────
        gr.HTML("""
        <div class="main-header">
            <h1>πŸŽ™οΈ VoiceVerse AI</h1>
            <p>Transform any content into an engaging audio experience</p>
        </div>
        """)

        with gr.Row(equal_height=False):

            # ── LEFT COLUMN: input tabs + delivery-mode selection ─────────────
            with gr.Column(scale=1):

                gr.Markdown("### πŸ“₯ Choose Your Content Source")

                with gr.Tabs():

                    # ── Tab 1: File upload ────────────────────────────────────
                    with gr.Tab("πŸ“„ File Upload"):
                        file_input = gr.File(
                            label="Upload a PDF or TXT file",
                            file_types=[".pdf", ".txt"],
                            type="filepath",
                        )
                        file_btn = gr.Button(
                            "πŸŽ™οΈ Generate Audio",
                            variant="primary",
                            size="lg",
                        )

                    # ── Tab 2: URL / YouTube ──────────────────────────────────
                    with gr.Tab("πŸ”— URL"):
                        url_input = gr.Textbox(
                            label=None,
                            placeholder=(
                                "Paste any link here…\n\n"
                                "β–Ά YouTube: https://youtube.com/watch?v=...\n"
                                "πŸ“° Article: https://example.com/article\n"
                                "🌐 Website: https://en.wikipedia.org/wiki/..."
                            ),
                            lines=5,
                            max_lines=6,
                            show_label=False,
                        )
                        gr.HTML(
                            "<p class='url-hint'>"
                            "βœ… Works with: YouTube (with captions), news articles, "
                            "blogs, Wikipedia, most public pages.<br>"
                            "❌ Won't work: paywalled or login-required pages."
                            "</p>"
                        )
                        url_btn = gr.Button(
                            "πŸŽ™οΈ Generate Audio",
                            variant="primary",
                            size="lg",
                        )

                    # ── Tab 3: Paste text ─────────────────────────────────────
                    with gr.Tab("πŸ“‹ Paste Text"):
                        paste_input = gr.Textbox(
                            label=None,
                            placeholder=(
                                "Paste any text here β€” article content, notes, "
                                "transcripts, research, anything…"
                            ),
                            lines=10,
                            max_lines=40,
                            show_label=False,
                        )
                        paste_btn = gr.Button(
                            "πŸŽ™οΈ Generate Audio",
                            variant="primary",
                            size="lg",
                        )

                # ── Delivery Mode card ────────────────────────────────────────
                gr.Markdown("### 🎨 Choose Audio Experience")

                delivery_mode = gr.Radio(
                    choices=["Summary", "Podcast", "Song / Rap", "Debate", "Story"],
                    value="Summary",
                    show_label=False,  # removes the "Radio" label
                    elem_id="delivery-mode-radio",
                )

                # Song/Rap sub-option β€” hidden unless Song/Rap is selected
                # (toggled by _on_mode_change via the .change event below).
                with gr.Row(visible=False) as song_rap_row:
                    song_rap_sub = gr.Radio(
                        choices=["Song", "Rap"],
                        value="Rap",
                        label="Style",
                    )

                mode_description = gr.Markdown(value=_mode_description("Summary"))

            # ── RIGHT COLUMN: outputs ─────────────────────────────────────────
            with gr.Column(scale=1):

                gr.Markdown("### 🎧 Generated Audio")
                audio_output = gr.Audio(
                    label="Audio",
                    type="filepath",
                    interactive=False,
                    show_download_button=True,
                )

                gr.Markdown("### ✍️ Generated Script")
                script_output = gr.Textbox(
                    label="Script",
                    lines=14,
                    max_lines=22,
                    interactive=False,
                    placeholder="Your generated script will appear here…",
                    show_copy_button=True,
                )

        # ── Footer ───────────────────────────────────────────────────────────
        gr.Markdown(
            "<center style='color:#9ca3af;margin-top:1rem;'>"
            "Built with ❀️ using SmolLM3-3B · Qwen3-TTS · Edge-TTS · Gradio"
            "</center>"
        )

        # ── Event wiring ─────────────────────────────────────────────────────
        # All three "Generate" buttons feed the same (script, audio) outputs.

        delivery_mode.change(
            fn=_on_mode_change,
            inputs=[delivery_mode],
            outputs=[song_rap_row, mode_description],
        )
        file_btn.click(
            fn=process_file,
            inputs=[file_input, delivery_mode, song_rap_sub],
            outputs=[script_output, audio_output],
        )
        url_btn.click(
            fn=process_url,
            inputs=[url_input, delivery_mode, song_rap_sub],
            outputs=[script_output, audio_output],
        )
        paste_btn.click(
            fn=process_paste,
            inputs=[paste_input, delivery_mode, song_rap_sub],
            outputs=[script_output, audio_output],
        )

    return app
381
+
382
+
383
+ # ── Entry point ───────────────────────────────────────────────────────────────
384
+
385
if __name__ == "__main__":
    # Launch the Gradio app bound to all interfaces on the Spaces port.
    logger.info("Starting VoiceVerse AI…")
    demo = build_ui()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
    )
convert_to_word.ps1 ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
# Convert the Markdown project report to a .docx via Word COM automation.
# Requires Microsoft Word installed; paths below are machine-specific.
$markdownPath = "C:\Users\hp\.gemini\antigravity\brain\04f3e1c4-7b81-497c-a7c5-5d0513033dfa\project_report.md"
$wordPath = "C:\Users\hp\.gemini\antigravity\brain\04f3e1c4-7b81-497c-a7c5-5d0513033dfa\VoiceVerse_AI_Project_Report.docx"

if (-not (Test-Path $markdownPath)) {
    Write-Error "Markdown file not found at $markdownPath"
    exit 1
}

$content = Get-Content -Path $markdownPath -Raw

# Create Word Object (headless) and type the parsed content into a new doc.
try {
    $word = New-Object -ComObject Word.Application
    $word.Visible = $false
    $doc = $word.Documents.Add()
    $selection = $word.Selection

    # Basic Markdown Parsing (Simplified): only #/##/### headings, tables
    # pass through as plain text, bold/italic markers are stripped.
    $lines = $content -split "`r?`n"
    foreach ($line in $lines) {
        if ($line -match "^# (.*)") {
            $selection.Style = "Title"
            $selection.TypeText($matches[1])
            $selection.TypeParagraph()
        } elseif ($line -match "^## (.*)") {
            $selection.Style = "Heading 1"
            $selection.TypeText($matches[1])
            $selection.TypeParagraph()
        } elseif ($line -match "^### (.*)") {
            $selection.Style = "Heading 2"
            $selection.TypeText($matches[1])
            $selection.TypeParagraph()
        } elseif ($line -match "^---") {
            # Skip horizontal rules or add a page break?
            # For now just skip
        } elseif ($line -match "^\|") {
            # Table handling is complex, for now just TypeText
            $selection.Style = "Normal"
            $selection.TypeText($line)
            $selection.TypeParagraph()
        } else {
            $selection.Style = "Normal"
            # Remove bold/italic markers for cleaner look
            $cleanLine = $line -replace "\*\*", "" -replace "\*", ""
            $selection.TypeText($cleanLine)
            $selection.TypeParagraph()
        }
    }

    # [ref] is required for SaveAs when calling COM from Windows PowerShell.
    $doc.SaveAs([ref]$wordPath)
    $doc.Close()
    $word.Quit()
    Write-Host "Word document created successfully at $wordPath"
} catch {
    Write-Error "Failed to create Word document: $_"
    # Best-effort cleanup so a hidden WINWORD.EXE is not left running.
    if ($word) { $word.Quit() }
}
gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
ingestion.py ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ VoiceVerse AI β€” Content Ingestion Module.
3
+
4
+ Handles all input sources beyond file upload:
5
+ - YouTube links β†’ transcript via youtube-transcript-api
6
+ - Article / website β†’ readable text via trafilatura + BeautifulSoup fallback
7
+ - Pasted raw text β†’ light cleaning and validation
8
+
9
+ Returns plain text string that feeds into RAGStore.add_document().
10
+ rag.py is completely unchanged.
11
+ """
12
+
13
+ import re
14
+ import urllib.parse
15
+ from utils import logger
16
+
17
+
18
+ # ══════════════════════════════════════════════════════════════════════════════
19
+ # URL type detection
20
+ # ══════════════════════════════════════════════════════════════════════════════
21
+
22
+ def _is_youtube(url: str) -> bool:
23
+ parsed = urllib.parse.urlparse(url.strip())
24
+ host = parsed.netloc.lower().replace("www.", "")
25
+ return host in ("youtube.com", "youtu.be")
26
+
27
+
28
+ def _extract_youtube_id(url: str) -> str | None:
29
+ patterns = [
30
+ r"(?:v=)([a-zA-Z0-9_-]{11})",
31
+ r"youtu\.be/([a-zA-Z0-9_-]{11})",
32
+ r"embed/([a-zA-Z0-9_-]{11})",
33
+ r"shorts/([a-zA-Z0-9_-]{11})",
34
+ ]
35
+ for pattern in patterns:
36
+ match = re.search(pattern, url)
37
+ if match:
38
+ return match.group(1)
39
+ return None
40
+
41
+
42
+ # ══════════════════════════════════════════════════════════════════════════════
43
+ # YouTube transcript
44
+ # ══════════════════════════════════════════════════════════════════════════════
45
+
46
def extract_youtube(url: str) -> str:
    """
    Fetch and clean the transcript of a YouTube video.

    Preference order: manual English captions β†’ auto-generated English β†’
    first transcript available in any language.

    Raises:
        ImportError: youtube-transcript-api is not installed.
        ValueError: no usable video ID, no transcript, or transcript too short.
    """
    try:
        from youtube_transcript_api import (
            YouTubeTranscriptApi, NoTranscriptFound, TranscriptsDisabled
        )
    except ImportError as exc:
        raise ImportError(
            "youtube-transcript-api is not installed. "
            "Add 'youtube-transcript-api' to requirements.txt and restart the Space."
        ) from exc

    video_id = _extract_youtube_id(url)
    if not video_id:
        raise ValueError(f"Could not extract a YouTube video ID from: {url}")

    logger.info("Fetching YouTube transcript: video_id=%s", video_id)

    try:
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

        # Prefer English manual captions, then English auto, then anything available
        try:
            transcript = transcript_list.find_manually_created_transcript(
                ["en", "en-US", "en-GB"]
            )
        except NoTranscriptFound:
            try:
                transcript = transcript_list.find_generated_transcript(
                    ["en", "en-US", "en-GB"]
                )
            except NoTranscriptFound:
                transcript = next(iter(transcript_list))
                logger.info("No English transcript β€” using: %s", transcript.language)

        entries = transcript.fetch()
        text = " ".join(entry["text"] for entry in entries)

        # Clean YouTube caption artifacts
        text = re.sub(r"\[.*?\]", "", text)  # [Music], [Applause] etc.
        text = re.sub(r"\s{2,}", " ", text).strip()

        if len(text) < 50:
            raise ValueError("YouTube transcript is too short to process.")

        logger.info("YouTube transcript: %d chars", len(text))
        return text

    except (NoTranscriptFound, TranscriptsDisabled) as e:
        # Chain the library error so the real cause survives in tracebacks.
        raise ValueError(
            "No transcript available for this video. "
            "The video may have captions disabled or be private.\n\n"
            "Tip: Copy the article/video text manually and use the Paste Text tab instead."
        ) from e
99
+
100
+
101
+ # ══════════════════════════════════════════════════════════════════════════════
102
+ # Article / website URL
103
+ # ══════════════════════════════════════════════════════════════════════════════
104
+
105
def extract_url(url: str) -> str:
    """
    Fetch a webpage and extract readable text.

    Tries trafilatura first (best article extractor), falls back to
    requests + BeautifulSoup if trafilatura is missing, fails, or yields
    less than ~100 characters.

    Raises:
        ValueError: when both strategies fail (network error, bot-blocked,
            login-required, or too little extractable text).
    """
    url = url.strip()
    logger.info("Fetching URL: %s", url)

    # Browser-like UA so simple bot filters don't reject the request.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0.0.0 Safari/537.36"
        )
    }

    # ── Attempt 1: trafilatura ────────────────────────────────────────────────
    try:
        import trafilatura
        downloaded = trafilatura.fetch_url(url)
        if downloaded:
            text = trafilatura.extract(
                downloaded,
                include_comments=False,
                include_tables=True,
                no_fallback=False,
            )
            # Only accept a substantive extraction; otherwise fall through.
            if text and len(text.strip()) > 100:
                logger.info("trafilatura extracted %d chars", len(text))
                return text.strip()
    except Exception as e:
        logger.warning("trafilatura failed (%s) β€” trying BeautifulSoup", e)

    # ── Attempt 2: requests + BeautifulSoup ──────────────────────────────────
    try:
        import requests
        from bs4 import BeautifulSoup

        resp = requests.get(url, headers=headers, timeout=15)
        resp.raise_for_status()

        soup = BeautifulSoup(resp.text, "html.parser")
        # Drop boilerplate/navigation elements before extracting text.
        for tag in soup(["script", "style", "nav", "footer", "header",
                         "aside", "form", "noscript", "iframe"]):
            tag.decompose()

        # Prefer semantic containers; fall back to the whole page.
        article = soup.find("article") or soup.find("main") or soup.find("body")
        text = (
            article.get_text(separator=" ", strip=True)
            if article
            else soup.get_text(separator=" ", strip=True)
        )
        # Collapse big whitespace runs into paragraph breaks / single spaces.
        text = re.sub(r"\s{3,}", "\n\n", text)
        text = re.sub(r" {2,}", " ", text).strip()

        if len(text) < 100:
            raise ValueError("Could not extract enough text from this page.")

        logger.info("BeautifulSoup extracted %d chars", len(text))
        return text

    except Exception as e:
        raise ValueError(
            f"Could not fetch content from: {url}\n\n"
            f"Reason: {e}\n\n"
            "The page may require a login or block bots. "
            "Try copying the article text and pasting it in the Paste Text tab."
        )
173
+
174
+
175
+ # ══════════════════════════════════════════════════════════════════════════════
176
+ # Pasted raw text
177
+ # ══════════════════════════════════════════════════════════════════════════════
178
+
179
def extract_pasted_text(text: str) -> str:
    """Normalize and validate user-pasted text; return the cleaned string."""
    if not text or not text.strip():
        raise ValueError("No text was pasted. Please paste some content.")

    # Unify line endings, then cap blank-line runs and collapse double spaces.
    cleaned = text.replace("\r\n", "\n").replace("\r", "\n")
    cleaned = re.sub(r"\n{4,}", "\n\n\n", cleaned)
    cleaned = re.sub(r" {2,}", " ", cleaned).strip()

    if len(cleaned) < 50:
        raise ValueError(
            "Pasted text is too short. Please paste at least a paragraph of content."
        )

    logger.info("Pasted text ingested: %d chars", len(cleaned))
    return cleaned
194
+
195
+
196
+ # ══════════════════════════════════════════════════════════════════════════════
197
+ # Unified entry point
198
+ # ══════════════════════════════════════════════════════════════════════════════
199
+
200
def ingest_from_url_or_text(raw_input: str) -> tuple[str, str]:
    """
    Auto-detect whether input is a YouTube URL, article URL, or plain text,
    and dispatch to the matching extractor.

    Returns:
        (extracted_text, source_label)
    """
    candidate = raw_input.strip()
    if not candidate:
        raise ValueError("Please enter a URL or paste some text.")

    # Anything not starting with http(s):// is treated as pasted text.
    if not re.match(r"https?://", candidate, re.IGNORECASE):
        return extract_pasted_text(candidate), "Pasted Text"
    if _is_youtube(candidate):
        return extract_youtube(candidate), "YouTube"
    return extract_url(candidate), "Article / Website"
packages.txt ADDED
File without changes
rag.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ VoiceVerse AI β€” RAG Pipeline.
3
+
4
+ Handles document ingestion, text chunking, embedding generation,
5
+ and semantic retrieval using an in-memory vector store.
6
+
7
+ Models used:
8
+ - sentence-transformers/all-MiniLM-L6-v2 for embeddings (22 MB, CPU-friendly)
9
+
10
+ Design decisions:
11
+ - NumPy cosine similarity instead of FAISS to avoid heavy native deps
12
+ - Overlapping chunks to preserve context across boundaries
13
+ - Single-document architecture (clear store on new upload)
14
+ """
15
+
16
+ import os
17
+ import numpy as np
18
+ from utils import logger
19
+
20
+ # ── Text Extraction ──────────────────────────────────────────────────────────
21
+
22
+ def extract_text(file_path: str) -> str:
23
+ """
24
+ Extract plain text from a PDF or TXT file.
25
+ Returns the full document text as a single string.
26
+ """
27
+ ext = os.path.splitext(file_path)[1].lower()
28
+
29
+ if ext == ".pdf":
30
+ return _extract_pdf(file_path)
31
+ elif ext == ".txt":
32
+ return _extract_txt(file_path)
33
+ else:
34
+ raise ValueError(f"Unsupported file type: {ext}")
35
+
36
+
37
def _extract_pdf(file_path: str) -> str:
    """
    Extract text from a PDF using PyMuPDF (imported as fitz).

    Pages are read in order; blank pages are skipped and the remaining
    page texts are joined with blank lines.
    """
    import fitz  # PyMuPDF

    text_parts = []
    # Context manager closes the document even if a page read fails.
    with fitz.open(file_path) as doc:
        for page_num, page in enumerate(doc):
            page_text = page.get_text("text")
            if page_text.strip():
                text_parts.append(page_text)
            logger.debug("Extracted page %d: %d chars", page_num + 1, len(page_text))

    full_text = "\n\n".join(text_parts)
    logger.info("PDF extraction complete: %d pages, %d chars total",
                len(text_parts), len(full_text))
    return full_text
53
+
54
+
55
def _extract_txt(file_path: str) -> str:
    """Read a plain-text file, trying a sequence of encodings in order."""
    # latin-1 accepts any byte, so in practice one of these will succeed.
    for candidate in ("utf-8", "utf-8-sig", "latin-1", "cp1252"):
        try:
            with open(file_path, "r", encoding=candidate) as handle:
                contents = handle.read()
        except UnicodeDecodeError:
            continue
        logger.info("TXT extraction complete (%s): %d chars", candidate, len(contents))
        return contents
    raise ValueError("Could not decode the text file with any supported encoding.")
66
+
67
+
68
+ # ── Text Chunking ────────────────────────────────────────────────────────────
69
+
70
def chunk_text(text: str, chunk_size: int = 512, overlap: int = 50) -> list[str]:
    """
    Split text into overlapping chunks of roughly `chunk_size` characters.
    Overlap ensures context isn't lost at chunk boundaries.

    Uses sentence-aware splitting: tries to break at sentence boundaries
    within the chunk window for more coherent chunks.

    Args:
        text: Source text; all whitespace runs are collapsed first.
        chunk_size: Target chunk length in characters.
        overlap: Characters of overlap carried into the next chunk.

    Returns:
        List of non-empty chunk strings (empty list for empty input).
    """
    if not text or not text.strip():
        return []

    # Clean up whitespace (also flattens newlines into single spaces).
    text = " ".join(text.split())

    chunks = []
    start = 0

    while start < len(text):
        end = start + chunk_size

        # If not at the end, try to break at a sentence boundary
        if end < len(text):
            # Look for sentence-ending punctuation near the end, scanning
            # backwards but never past the midpoint of the window (so a
            # chunk is never shorter than chunk_size // 2).
            search_start = max(start + chunk_size // 2, start)
            last_period = -1
            for i in range(min(end, len(text)) - 1, search_start - 1, -1):
                # Accept ./!/? only when followed by a space or end-of-text.
                if text[i] in ".!?" and (i + 1 >= len(text) or text[i + 1] == " "):
                    last_period = i
                    break
            if last_period > start:
                end = last_period + 1

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        # Move forward by (chunk length - overlap); the max() with start+1
        # guarantees forward progress even when overlap >= chunk length.
        start = max(start + 1, end - overlap)

    logger.info("Chunking complete: %d chunks (size=%d, overlap=%d)",
                len(chunks), chunk_size, overlap)
    return chunks
112
+
113
+
114
+ # ── Embedding & Vector Store ─────────────────────────────────────────────────
115
+
116
class RAGStore:
    """
    In-memory vector store using sentence-transformers embeddings
    and NumPy cosine similarity.

    Single-document design: add_document() clears any previous content.

    Usage:
        store = RAGStore()
        store.add_document("full document text here")
        results = store.query("what is this about?", top_k=5)
    """

    # Small CPU-friendly embedding model (see module docstring).
    MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

    def __init__(self):
        self._model = None                      # lazily created SentenceTransformer
        self.chunks: list[str] = []             # text chunks of current document
        self.embeddings: np.ndarray | None = None  # L2-normalized, row per chunk

    @property
    def model(self):
        """Lazy-load the embedding model to avoid startup cost."""
        if self._model is None:
            logger.info("Loading embedding model: %s", self.MODEL_NAME)
            from sentence_transformers import SentenceTransformer
            self._model = SentenceTransformer(self.MODEL_NAME)
            logger.info("Embedding model loaded successfully")
        return self._model

    def clear(self):
        """Clear the store for a new document (keeps the loaded model)."""
        self.chunks = []
        self.embeddings = None

    def add_document(self, text: str, chunk_size: int = 512, overlap: int = 50):
        """
        Process a document: chunk the text, generate embeddings, and store.
        Clears any previously stored document.

        Raises:
            ValueError: if chunking yields no text chunks.
        """
        self.clear()

        self.chunks = chunk_text(text, chunk_size=chunk_size, overlap=overlap)
        if not self.chunks:
            raise ValueError("No text chunks could be extracted from the document.")

        logger.info("Generating embeddings for %d chunks...", len(self.chunks))
        self.embeddings = self.model.encode(
            self.chunks,
            show_progress_bar=False,
            convert_to_numpy=True,
            normalize_embeddings=True,  # Pre-normalize for faster cosine sim
        )
        logger.info("Embeddings generated: shape %s", self.embeddings.shape)

    def query(self, question: str, top_k: int = 5) -> list[str]:
        """
        Retrieve the top-k most relevant chunks for the given question.
        Uses cosine similarity (dot product on normalized vectors).

        Returns [] when no document has been added yet.
        """
        if self.embeddings is None or len(self.chunks) == 0:
            return []

        # Embed the query with the same normalization as the chunks.
        query_embedding = self.model.encode(
            [question],
            convert_to_numpy=True,
            normalize_embeddings=True,
        )

        # Cosine similarity = dot product (vectors are pre-normalized)
        similarities = np.dot(self.embeddings, query_embedding.T).flatten()

        # Get top-k indices, best first (argsort is ascending, hence [::-1]).
        top_k = min(top_k, len(self.chunks))
        top_indices = np.argsort(similarities)[-top_k:][::-1]

        results = [self.chunks[i] for i in top_indices]
        logger.info("Retrieved %d chunks (top similarity: %.3f)",
                    len(results), similarities[top_indices[0]])
        return results

    def get_all_chunks(self) -> list[str]:
        """Return a copy of all stored chunks (useful for short documents)."""
        return self.chunks.copy()
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio>=5.23.1,<6.0
2
+ huggingface-hub>=0.25
3
+ pydantic>=2.0,<2.11
4
+ sentence-transformers
5
+ numpy
6
+ PyMuPDF
7
+ edge-tts
8
+ scipy
9
+ pydub
10
+ requests
11
+ beautifulsoup4
12
+ trafilatura
13
+ youtube-transcript-api
script_gen.py ADDED
@@ -0,0 +1,310 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ VoiceVerse AI β€” Script Generation Module.
3
+
4
+ Delivery Modes:
5
+ Summary β€” single-speaker structured narration
6
+ Podcast β€” HOST_1 / HOST_2 two-host dialogue
7
+ Song/Rap β€” rhythmic retention content
8
+ Debate β€” DEBATER_A (female, for) vs DEBATER_B (male, against) structured debate
9
+ """
10
+
11
+ import os
12
+ import re
13
+ from huggingface_hub import InferenceClient
14
+ from utils import logger
15
+
16
# Chat model served through the HF Inference API for all script modes.
MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
# Upper bound on generated tokens per script.
MAX_NEW_TOKENS = 1200
# Moderate temperature: varied phrasing while staying grounded in the source.
TEMPERATURE = 0.5
19
+
20
+
21
+ # ══════════════════════════════════════════════════════════════════════════════
22
+ # Prompts
23
+ # ══════════════════════════════════════════════════════════════════════════════
24
+
25
+ # ── Summary ───────────────────────────────────────────────────────────────────
26
+ _SUMMARY_SYSTEM = """\
27
+ You are a professional narrator. Produce a clear spoken summary strictly from the source material.
28
+ RULES:
29
+ 1. Use ONLY facts from the source. Do NOT add outside knowledge.
30
+ 2. Structure: short intro β†’ key points as natural spoken sentences β†’ concise conclusion.
31
+ 3. Plain text only β€” no markdown, no bullets, no headers.
32
+ 4. Write for the ear: short sentences, conversational tone.
33
+ 5. Never say "the document says". Speak as the expert.
34
+ 6. Output ONLY the narration text, nothing else."""
35
+
36
+ _SUMMARY_USER = """\
37
+ SOURCE MATERIAL:
38
+ {context}
39
+
40
+ Write a flowing spoken summary (intro, key points, conclusion) in plain sentences."""
41
+
42
+
43
+ # ── Podcast ───────────────────────────────────────────────────────────────────
44
+ _PODCAST_SYSTEM = """\
45
+ You are a podcast script writer. Write a two-host conversation strictly from the source material.
46
+
47
+ STRICT FORMAT β€” every single line must start with a speaker tag:
48
+ HOST_1: <what Host 1 says>
49
+ HOST_2: <what Host 2 says>
50
+
51
+ RULES:
52
+ 1. Alternate HOST_1 and HOST_2. Never same host twice in a row.
53
+ 2. HOST_1 introduces topics and asks questions.
54
+ 3. HOST_2 explains concepts and answers.
55
+ 4. Use ONLY information from the source. No hallucination.
56
+ 5. Conversational, engaging tone.
57
+ 6. No markdown, no stage directions, no lines without a HOST tag.
58
+ 7. Aim for 16–24 exchanges."""
59
+
60
+ _PODCAST_USER = """\
61
+ SOURCE MATERIAL:
62
+ {context}
63
+
64
+ Write the full podcast. Every line must start with HOST_1: or HOST_2:"""
65
+
66
+
67
+ # ── Song / Rap ────────────────────────────────────────────────────────────────
68
+ _SONG_SYSTEM = """\
69
+ You are a lyricist. Two steps:
70
+ STEP 1 β€” silently extract 5–7 key ideas from the source.
71
+ STEP 2 β€” write a smooth melodic SONG from those ideas.
72
+
73
+ RULES:
74
+ - Simple memorable language, rhyming couplets (AABB).
75
+ - Label sections [VERSE 1], [VERSE 2], [CHORUS].
76
+ - [CHORUS] repeats the main concept.
77
+ - Short lines (6–10 words). Use repetition.
78
+ - Do NOT invent facts not in the source.
79
+ - Output ONLY the lyrics with section labels."""
80
+
81
+ _RAP_SYSTEM = """\
82
+ You are a lyricist. Two steps:
83
+ STEP 1 β€” silently extract 5–7 key ideas from the source.
84
+ STEP 2 β€” write a punchy rhythmic RAP from those ideas.
85
+
86
+ RULES:
87
+ - Short punchy lines (5–8 words), fast-flow rhyme (AABB or ABAB).
88
+ - Label sections [VERSE 1], [VERSE 2], [HOOK].
89
+ - [HOOK] repeats the main concept.
90
+ - Wordplay and repetition to aid retention.
91
+ - Do NOT invent facts not in the source.
92
+ - Output ONLY the lyrics with section labels."""
93
+
94
+ _SONG_RAP_USER = """\
95
+ SOURCE MATERIAL:
96
+ {context}
97
+
98
+ Extract the key ideas, then write the full {form}."""
99
+
100
+
101
+ # ── Debate ────────────────────────────────────────────────────────────────────
102
+ _DEBATE_SYSTEM = """\
103
+ You are a debate script writer. Write a structured two-person debate strictly grounded \
104
+ in the provided source material.
105
+
106
+ STRICT FORMAT β€” every single line must start with a speaker tag:
107
+ DEBATER_A: <what Debater A says>
108
+ DEBATER_B: <what Debater B says>
109
+
110
+ CHARACTER PROFILES:
111
+ - DEBATER_A: Takes the PRO / supporting position. Tone is confident, optimistic, forward-thinking.
112
+ - DEBATER_B: Takes the CON / critical position. Tone is skeptical, cautious, questioning.
113
+
114
+ DEBATE STRUCTURE:
115
+ 1. DEBATER_A opens with a strong statement supporting the topic.
116
+ 2. DEBATER_B immediately challenges with a counterpoint.
117
+ 3. They alternate, each directly responding to the other's previous point.
118
+ 4. Both use evidence and logic from the source material only.
119
+ 5. End with each debater giving a brief closing statement.
120
+
121
+ RULES:
122
+ - Alternate DEBATER_A and DEBATER_B. Never same debater twice in a row.
123
+ - Use ONLY information from the source material. No hallucination.
124
+ - Each turn should be 1–3 sentences β€” punchy, not long speeches.
125
+ - No markdown, no stage directions, no narration outside the speaker tags.
126
+ - Aim for 16–22 exchanges total."""
127
+
128
+ _DEBATE_USER = """\
129
+ SOURCE MATERIAL:
130
+ {context}
131
+
132
+ Write the full debate on the key topics from this material. \
133
+ Every line must start with DEBATER_A: or DEBATER_B:"""
134
+
135
+
136
+ # ── Story ─────────────────────────────────────────────────────────────────────
137
+ _STORY_SYSTEM = """\
138
+ You are a master storyteller. Retell the ideas from the source material as an \
139
+ immersive narrative story written for slow, expressive audio delivery.
140
+
141
+ RULES:
142
+ 1. Transform factual content into a story β€” use characters, scenes, a narrative arc \
143
+ (beginning, middle, end). Characters can be fictional stand-ins for real concepts.
144
+ 2. Use ONLY information and ideas from the source. Do NOT invent new facts.
145
+ 3. Warm, descriptive storytelling voice. Vivid but calm.
146
+ 4. Short paragraphs, 1–3 sentences each, separated by blank lines.
147
+ 5. Plain text only β€” no markdown, no bullets, no headers.
148
+ 6. Begin with an evocative scene-setting sentence.
149
+ 7. End with a closing reflection or lesson drawn from the source.
150
+ 8. Output ONLY the story text, nothing else."""
151
+
152
+ _STORY_USER = """\
153
+ SOURCE MATERIAL:
154
+ {context}
155
+
156
+ Transform this into a rich narrative story for slow, expressive audio. \
157
+ Use short paragraphs with blank lines between them."""
158
+
159
+
160
+ # ══════════════════════════════════════════════════════════════════════════════
161
+ # Post-processing
162
+ # ══════════════════════════════════════════════════════════════════════════════
163
+
164
+ def _clean(text: str) -> str:
165
+ """Remove all markdown and XML artifacts from LLM output."""
166
+ text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
167
+ text = re.sub(r"<[^>]+>", "", text)
168
+ text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE)
169
+ text = re.sub(r"\*{1,3}([^*]+)\*{1,3}", r"\1", text)
170
+ text = re.sub(r"_{1,3}([^_]+)_{1,3}", r"\1", text)
171
+ text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)
172
+ text = re.sub(r"```[^`]*```", "", text, flags=re.DOTALL)
173
+ text = re.sub(r"`([^`]+)`", r"\1", text)
174
+ text = re.sub(r"^[\s]*[-*+]\s+", "", text, flags=re.MULTILINE)
175
+ text = re.sub(r"^[\s]*\d+\.\s+", "", text, flags=re.MULTILINE)
176
+ text = re.sub(r"^>\s+", "", text, flags=re.MULTILINE)
177
+ text = re.sub(r"^[-*_]{3,}\s*$", "", text, flags=re.MULTILINE)
178
+ text = re.sub(r"\n{3,}", "\n\n", text)
179
+ text = re.sub(r" {2,}", " ", text)
180
+ return text.strip()
181
+
182
+
183
def _clean_dialogue(text: str, tag_a: str, tag_b: str) -> str:
    """
    Clean tagged dialogue output (podcast or debate).

    Runs the generic markdown cleanup, canonicalizes the tag spellings the
    model tends to produce, then drops every line that does not begin with
    a valid speaker tag (blank lines are kept).
    """
    text = _clean(text)

    # Canonicalize speaker-tag variants for the mode in play.
    if tag_a == "HOST_1":
        replacements = [
            (r"(?i)\bhost[\s_-]*1\s*:", "HOST_1:"),
            (r"(?i)\bhost[\s_-]*2\s*:", "HOST_2:"),
        ]
    elif tag_a == "DEBATER_A":
        replacements = [
            (r"(?i)\bdebater[\s_-]*a\s*:", "DEBATER_A:"),
            (r"(?i)\bdebater[\s_-]*b\s*:", "DEBATER_B:"),
            # "Pro:" / "Con:" / "Speaker A:" also show up in model output.
            (r"(?i)\bpro\s*:", "DEBATER_A:"),
            (r"(?i)\bcon\s*:", "DEBATER_B:"),
            (r"(?i)\bspeaker[\s_-]*a\s*:", "DEBATER_A:"),
            (r"(?i)\bspeaker[\s_-]*b\s*:", "DEBATER_B:"),
        ]
    else:
        replacements = []

    for pattern, canonical in replacements:
        text = re.sub(pattern, canonical, text)

    # Keep only blank lines and lines that carry a valid tag.
    valid_prefixes = (f"{tag_a}:", f"{tag_b}:")
    kept = [
        line for line in text.splitlines()
        if not line.strip() or line.strip().startswith(valid_prefixes)
    ]
    return "\n".join(kept).strip()
212
+
213
+
214
+ # ══════════════════════════════════════════════════════════════════════════════
215
+ # LLM client
216
+ # ══════════════════════════════════════════════════════════════════════════════
217
+
218
def _get_client() -> InferenceClient:
    """Build an HF Inference client; requires the HF_TOKEN Space secret."""
    token = os.environ.get("HF_TOKEN")
    if token:
        return InferenceClient(provider="hf-inference", token=token)
    raise EnvironmentError(
        "HF_TOKEN not set. Add your Hugging Face token as a Space secret."
    )
225
+
226
+
227
def _call_llm(system: str, user: str) -> str:
    """
    Run one chat completion against MODEL_ID and return the assistant text.

    Raises:
        RuntimeError: If the model returns an empty response.
    """
    messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]
    reply = _get_client().chat_completion(
        model=MODEL_ID,
        messages=messages,
        max_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        top_p=0.9,
    )
    content = reply.choices[0].message.content.strip()
    if not content:
        raise RuntimeError("Model returned empty response. Please try again.")
    return content
243
+
244
+
245
+ # ══════════════════════════════════════════════════════════════════════════════
246
+ # Public entry point
247
+ # ══════════════════════════════════════════════════════════════════════════════
248
+
249
def generate_script(
    context_chunks: list[str],
    mode: str = "Summary",
    sub_mode: str = "Rap",
    topic: str = "the key ideas from this document",
) -> str:
    """
    Turn retrieved RAG chunks into a spoken-word script.

    Args:
        context_chunks: Chunks from RAGStore β€” read, never modified.
        mode: "Summary" | "Podcast" | "Song / Rap" | "Debate" | "Story".
        sub_mode: "Song" or "Rap"; only consulted for the Song/Rap mode.
        topic: Currently unused; kept for interface compatibility.

    Returns:
        Cleaned script text ready for the TTS layer. Podcast/Debate output
        keeps its HOST_1/HOST_2 or DEBATER_A/DEBATER_B speaker tags.

    Raises:
        ValueError: If no context chunks were provided.
        RuntimeError: If the model output is empty after cleaning.
    """
    if not context_chunks:
        raise ValueError("No document context. Please upload or paste content first.")

    context = "\n\n".join(context_chunks)
    if len(context) > 6000:
        # Keep the prompt within a safe budget for the hosted model.
        context = context[:6000]
        logger.warning("Context truncated to 6000 chars")

    logger.info("generate_script | mode=%s sub_mode=%s ctx=%d chars", mode, sub_mode, len(context))

    m = mode.strip().lower()

    if m == "podcast":
        raw = _call_llm(_PODCAST_SYSTEM, _PODCAST_USER.format(context=context))
        script = _clean_dialogue(raw, "HOST_1", "HOST_2")
    elif "song" in m or "rap" in m:
        form = sub_mode.lower()
        system = _SONG_SYSTEM if form == "song" else _RAP_SYSTEM
        raw = _call_llm(system, _SONG_RAP_USER.format(context=context, form=form))
        script = _clean(raw)
    elif "debate" in m:
        raw = _call_llm(_DEBATE_SYSTEM, _DEBATE_USER.format(context=context))
        script = _clean_dialogue(raw, "DEBATER_A", "DEBATER_B")
    elif "story" in m:
        raw = _call_llm(_STORY_SYSTEM, _STORY_USER.format(context=context))
        script = _clean(raw)
    else:
        # "summary" lands here by design; anything unrecognized also
        # narrates a summary, with a warning for visibility.
        if m != "summary":
            logger.warning("Unknown mode '%s' β€” falling back to Summary", mode)
        raw = _call_llm(_SUMMARY_SYSTEM, _SUMMARY_USER.format(context=context))
        script = _clean(raw)

    if not script:
        raise RuntimeError("Script was empty after cleaning. Please try again.")

    logger.info("Script ready: %d chars", len(script))
    return script
tts.py ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ VoiceVerse AI β€” TTS Module.
3
+
4
+ Primary: Qwen3-TTS via HF Inference API
5
+ Fallback: Edge-TTS (CPU, no key needed)
6
+
7
+ Voice + audio style per mode:
8
+ Summary β€” neutral female voice, normal rate
9
+ Podcast β€” HOST_1 female (AriaNeural) / HOST_2 male (GuyNeural)
10
+ Rap β€” male voice, faster rate (+40%), bass boost via pydub
11
+ Song β€” female voice, normal rate
12
Debate β€” DEBATER_A female (JennyNeural, +8%) / DEBATER_B male (DavisNeural, -5%)
13
+ Story β€” female voice, slow rate (-30%), long silence gaps between sentences
14
+ """
15
+
16
+ import os
17
+ import re
18
+ import asyncio
19
+ from utils import logger, get_temp_filepath
20
+
21
# Model id for the primary (HF Inference API) TTS path.
QWEN_TTS_MODEL = "Qwen/Qwen3-TTS"
# Hard cap on characters sent to any TTS engine per request.
TTS_MAX_CHARS = 3000

# ── Voice assignments ─────────────────────────────────────────────────────────
# Summary / Song / Story β€” single female voice
EDGE_VOICE_FEMALE = "en-US-AriaNeural"

# Podcast
EDGE_VOICE_HOST_FEMALE = "en-US-AriaNeural"  # HOST_1 β€” female
EDGE_VOICE_HOST_MALE = "en-US-GuyNeural"  # HOST_2 β€” male

# Rap β€” male voice reads the rap
EDGE_VOICE_RAP = "en-US-GuyNeural"
RAP_RATE = "+40%"  # fast delivery

# Debate
EDGE_VOICE_DEBATER_A = "en-US-JennyNeural"  # female, pro β€” assertive
EDGE_VOICE_DEBATER_B = "en-US-DavisNeural"  # male, con β€” skeptical
DEBATE_RATE_A = "+8%"  # slightly faster
DEBATE_RATE_B = "-5%"  # slightly slower, deliberate

# Story β€” slow, warm delivery
EDGE_VOICE_STORY = "en-US-AriaNeural"
STORY_RATE = "-30%"  # noticeably slower
45
+
46
+
47
+ # ══════════════════════════════════════════════════════════════════════════════
48
+ # Low-level TTS helpers
49
+ # ══════════════════════════════════════════════════════════════════════════════
50
+
51
def _qwen_tts(text: str) -> str | None:
    """
    Try Qwen3-TTS through the HF Inference API.

    Returns the path of the written .wav on success, or None when the
    token is missing, the API returns nothing, or the call fails β€” the
    caller then falls back to Edge-TTS.
    """
    token = os.environ.get("HF_TOKEN")
    if not token:
        return None
    try:
        from huggingface_hub import InferenceClient

        audio = InferenceClient(token=token).text_to_speech(
            text=text[:TTS_MAX_CHARS], model=QWEN_TTS_MODEL
        )
        if not audio:
            return None
        out_path = get_temp_filepath(suffix=".wav")
        with open(out_path, "wb") as fh:
            fh.write(audio)
        logger.info("Qwen TTS: %s (%d bytes)", out_path, len(audio))
        return out_path
    except Exception as exc:
        logger.warning("Qwen TTS failed: %s", exc)
        return None
69
+
70
+
71
def _edge_tts(text: str, voice: str = EDGE_VOICE_FEMALE, rate: str = "+0%") -> str:
    """
    Generate audio via Edge-TTS.

    Args:
        text: Text to speak (truncated to TTS_MAX_CHARS).
        voice: Edge neural voice name.
        rate: SSML prosody rate string, e.g. "+40%" faster, "-30%" slower.

    Returns:
        Path to the generated .mp3 file.

    Raises:
        RuntimeError: If Edge-TTS produces an empty audio file.
    """
    import edge_tts

    path = get_temp_filepath(suffix=".mp3")
    snippet = text[:TTS_MAX_CHARS]

    async def _run():
        communicate = edge_tts.Communicate(snippet, voice, rate=rate)
        await communicate.save(path)

    # asyncio.get_event_loop() is deprecated when no loop is running
    # (Python 3.10+, behavior changed again in 3.12); probe for a running
    # loop explicitly instead.
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop in this thread β€” safe to run directly.
        asyncio.run(_run())
    else:
        # Called from inside a running event loop (e.g. Gradio): run the
        # coroutine on a fresh loop in a worker thread to avoid
        # "event loop is already running" errors.
        import concurrent.futures
        with concurrent.futures.ThreadPoolExecutor() as pool:
            pool.submit(asyncio.run, _run()).result(timeout=120)

    if os.path.getsize(path) == 0:
        raise RuntimeError("Edge-TTS produced an empty audio file.")
    logger.info("Edge-TTS: %s (voice=%s rate=%s)", path, voice, rate)
    return path
99
+
100
+
101
+ # ══════════════════════════════════════════════════════════════════════════════
102
+ # Audio post-processing
103
+ # ══════════════════════════════════════════════════════════════════════════════
104
+
105
def _apply_rap_fx(path: str) -> str:
    """
    Apply a bass boost to rap audio using pydub.

    Mixes a +6 dB low-passed (<200 Hz) copy of the track back over the
    original, thickening the low end for a punchier rap feel.

    Returns:
        Path to a new processed file, or the original path if processing
        fails (e.g. pydub/ffmpeg unavailable).
    """
    try:
        from pydub import AudioSegment
        from pydub.effects import low_pass_filter

        audio = AudioSegment.from_file(path)

        # NOTE: pydub's `-` operator means *gain* subtraction in dB and
        # raises TypeError when given another AudioSegment, so the old
        # `audio - low_pass_filter(audio, 200)` always threw and the boost
        # silently never applied. Overlaying a boosted bass-only copy
        # achieves the intended low-frequency emphasis.
        bass = low_pass_filter(audio, 200).apply_gain(6)
        boosted = audio.overlay(bass)

        out = get_temp_filepath(suffix=".mp3")
        boosted.export(out, format="mp3")
        logger.info("Rap bass boost applied β†’ %s", out)
        return out
    except Exception as e:
        logger.warning("Rap FX failed (%s) β€” returning original audio", e)
        return path
131
+
132
+
133
def _concat(paths: list[str], silence_ms: int = 300) -> str:
    """
    Join audio files end-to-end, inserting silence after each segment.

    A single-element list is returned untouched; on any pydub failure the
    first segment is returned as a best-effort fallback.
    """
    if len(paths) == 1:
        return paths[0]
    try:
        from pydub import AudioSegment

        gap = AudioSegment.silent(duration=silence_ms)
        merged = AudioSegment.empty()
        for segment_path in paths:
            merged = merged + AudioSegment.from_file(segment_path) + gap
        out = get_temp_filepath(suffix=".mp3")
        merged.export(out, format="mp3")
        logger.info("Concatenated %d segments β†’ %s", len(paths), out)
        return out
    except Exception as e:
        logger.warning("pydub concat failed (%s) β€” returning first segment", e)
        return paths[0]
150
+
151
+
152
def _add_story_gaps(path: str) -> str:
    """
    Widen pauses in story audio for an unhurried storyteller feel.

    Inserts 600 ms of silence every ~5 s of audio. The split points are
    fixed-length, not aligned to sentence boundaries, so a gap can
    occasionally land mid-phrase. Returns the original path on failure.
    """
    try:
        from pydub import AudioSegment

        audio = AudioSegment.from_file(path)
        gap = AudioSegment.silent(duration=600)
        chunk_ms = 5000

        widened = AudioSegment.empty()
        for start in range(0, len(audio), chunk_ms):
            widened += audio[start:start + chunk_ms] + gap

        out = get_temp_filepath(suffix=".mp3")
        widened.export(out, format="mp3")
        logger.info("Story gaps applied β†’ %s", out)
        return out
    except Exception as e:
        logger.warning("Story gap insertion failed (%s) β€” returning original", e)
        return path
174
+
175
+
176
+ # ══════════════════════════════════════════════════════════════════════════════
177
+ # Dialogue script parser
178
+ # ══════════════════════════════════════════════════════════════════════════════
179
+
180
+ def _parse_dialogue(script: str, tag_a: str, tag_b: str) -> list[tuple[str, str]]:
181
+ """Parse a HOST_X / DEBATER_X tagged script into (speaker, text) segments."""
182
+ segments: list[tuple[str, str]] = []
183
+ prefix_a = f"{tag_a}:"
184
+ prefix_b = f"{tag_b}:"
185
+
186
+ for line in script.splitlines():
187
+ line = line.strip()
188
+ if line.startswith(prefix_a):
189
+ text = line[len(prefix_a):].strip()
190
+ if text:
191
+ if segments and segments[-1][0] == tag_a:
192
+ segments[-1] = (tag_a, segments[-1][1] + " " + text)
193
+ else:
194
+ segments.append((tag_a, text))
195
+ elif line.startswith(prefix_b):
196
+ text = line[len(prefix_b):].strip()
197
+ if text:
198
+ if segments and segments[-1][0] == tag_b:
199
+ segments[-1] = (tag_b, segments[-1][1] + " " + text)
200
+ else:
201
+ segments.append((tag_b, text))
202
+ return segments
203
+
204
+
205
+ # ══════════════════════════════════════════════════════════════════════════════
206
+ # Per-mode audio generators
207
+ # ══════════════════════════════════════════════════════════════════════════════
208
+
209
def generate_audio_podcast(script: str) -> tuple[str, str]:
    """
    Render a podcast: HOST_1 = female (Aria), HOST_2 = male (Guy),
    normal rate, 300 ms of silence between turns.

    Falls back to single-voice narration when no HOST tags are present.
    Returns (audio_path, engine_label).
    """
    segments = _parse_dialogue(script, "HOST_1", "HOST_2")
    if not segments:
        logger.warning("No HOST tags β€” falling back to single voice")
        return generate_audio(script)

    voices = {
        "HOST_1": (EDGE_VOICE_HOST_FEMALE, "+0%"),
        "HOST_2": (EDGE_VOICE_HOST_MALE, "+0%"),
    }
    rendered: list[str] = []
    for speaker, text in segments:
        voice, rate = voices[speaker]
        try:
            rendered.append(_edge_tts(text, voice=voice, rate=rate))
        except Exception as e:
            # Skip the failed turn; the rest of the episode still renders.
            logger.warning("Podcast segment failed %s: %s", speaker, e)

    if not rendered:
        raise RuntimeError("All podcast segments failed.")
    return _concat(rendered, silence_ms=300), "Edge-TTS (Podcast)"
234
+
235
+
236
def generate_audio_debate(script: str) -> tuple[str, str]:
    """
    Render a debate: DEBATER_A = female (Jenny, +8%, assertive),
    DEBATER_B = male (Davis, -5%, deliberate), 400 ms between turns.

    Falls back to single-voice narration when no DEBATER tags are present.
    Returns (audio_path, engine_label).
    """
    segments = _parse_dialogue(script, "DEBATER_A", "DEBATER_B")
    if not segments:
        logger.warning("No DEBATER tags β€” falling back to single voice")
        return generate_audio(script)

    voices = {
        "DEBATER_A": (EDGE_VOICE_DEBATER_A, DEBATE_RATE_A),
        "DEBATER_B": (EDGE_VOICE_DEBATER_B, DEBATE_RATE_B),
    }
    rendered: list[str] = []
    for speaker, text in segments:
        voice, rate = voices[speaker]
        try:
            rendered.append(_edge_tts(text, voice=voice, rate=rate))
        except Exception as e:
            # Skip the failed turn; keep the rest of the debate.
            logger.warning("Debate segment failed %s: %s", speaker, e)

    if not rendered:
        raise RuntimeError("All debate segments failed.")
    return _concat(rendered, silence_ms=400), "Edge-TTS (Debate)"
262
+
263
+
264
def generate_audio_rap(script: str) -> tuple[str, str]:
    """Render a rap: male voice at +40% rate, then pydub bass boost."""
    raw_path = _edge_tts(script, voice=EDGE_VOICE_RAP, rate=RAP_RATE)
    return _apply_rap_fx(raw_path), "Edge-TTS (Rap)"
271
+
272
+
273
def generate_audio_story(script: str) -> tuple[str, str]:
    """Render a story: female voice at -30% rate, then widened pauses."""
    raw_path = _edge_tts(script, voice=EDGE_VOICE_STORY, rate=STORY_RATE)
    return _add_story_gaps(raw_path), "Edge-TTS (Story)"
280
+
281
+
282
+ # ══════════════════════════════════════════════════════════════════════════════
283
+ # Unified public interface
284
+ # ══════════════════════════════════════════════════════════════════════════════
285
+
286
def generate_audio(text: str, voice_id: str | None = None) -> tuple[str, str]:
    """
    Single-voice TTS for Summary and Song modes.

    Tries Qwen3-TTS first; on failure falls back to Edge-TTS using the
    given voice (or the default female voice). Returns
    (audio_path, engine_label).

    Raises:
        ValueError: If *text* is empty or whitespace-only.
    """
    if not text or not text.strip():
        raise ValueError("No text provided for audio generation.")
    qwen_path = _qwen_tts(text)
    if qwen_path and os.path.exists(qwen_path):
        return qwen_path, "Qwen3-TTS"
    fallback_voice = voice_id or EDGE_VOICE_FEMALE
    return _edge_tts(text, voice=fallback_voice), "Edge-TTS"
utils.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ VoiceVerse AI β€” Utility helpers.
3
+
4
+ Provides temp file management and error formatting
5
+ used across the pipeline.
6
+ """
7
+
8
+ import os
9
+ import tempfile
10
+ import logging
11
+
12
# Configure root logging once at import time so every module that does
# `from utils import logger` shares the same format and level.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)
# Single shared application logger used across the pipeline modules.
logger = logging.getLogger("voiceverse")
17
+
18
+
19
def get_temp_filepath(suffix: str = ".wav") -> str:
    """
    Create a fresh temporary file and return its path.

    The file persists after this call (the caller owns cleanup); only the
    OS-level handle from mkstemp is closed here.
    """
    handle, temp_path = tempfile.mkstemp(suffix=suffix)
    os.close(handle)
    return temp_path
24
+
25
+
26
def format_error(stage: str, error: Exception) -> str:
    """
    Map a pipeline stage name to a user-friendly error string.

    The raw exception (with traceback) is written to the log; the returned
    message is safe to display in the UI.
    """
    logger.error("Error in %s: %s", stage, error, exc_info=True)
    messages = {
        "upload": "Could not read the uploaded file. Please try a different PDF or TXT file.",
        "rag": "Failed to process the document text. The file may be empty or corrupted.",
        "script": "Could not generate the audio script. Please check your HF_TOKEN and try again.",
        "tts": "Audio generation failed. The system will retry with a fallback voice.",
    }
    return messages.get(stage, f"An unexpected error occurred: {stage}")
39
+
40
+
41
def validate_file(file_path: str) -> tuple[bool, str]:
    """
    Validate an uploaded document path.

    Checks, in order: a path was given, the file exists, the extension is
    .pdf/.txt, the file is non-empty, and it is at most 20 MB.

    Returns:
        (is_valid, human-readable message).
    """
    if file_path is None:
        return False, "Please upload a PDF or TXT file first."

    if not os.path.exists(file_path):
        return False, "The uploaded file could not be found. Please try again."

    extension = os.path.splitext(file_path)[1].lower()
    if extension not in (".pdf", ".txt"):
        return False, f"Unsupported file format '{extension}'. Please upload a PDF or TXT file."

    num_bytes = os.path.getsize(file_path)
    if num_bytes == 0:
        return False, "The uploaded file is empty. Please upload a file with content."
    if num_bytes > 20 * 1024 * 1024:  # 20 MB upload ceiling
        return False, "File is too large (>20 MB). Please upload a smaller document."

    return True, "File is valid."