Isshi14 committed
Commit
ebd182e
·
verified ·
1 Parent(s): a247242

Upload 12 files

Browse files
Files changed (12)
  1. README.md +57 -12
  2. app (1).py +402 -0
  3. convert_to_word.ps1 +58 -0
  4. gitattributes +35 -0
  5. gitattributes (1) +35 -0
  6. ingestion.py +217 -0
  7. packages.txt +1 -0
  8. rag.py +198 -0
  9. requirements.txt +13 -0
  10. script_gen.py +316 -0
  11. tts.py +369 -0
  12. utils.py +62 -0
README.md CHANGED
@@ -1,12 +1,57 @@
1
- ---
2
- title: CHECK
3
- emoji: 📈
4
- colorFrom: indigo
5
- colorTo: pink
6
- sdk: gradio
7
- sdk_version: 6.6.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
+ ---
2
+ title: VoiceVerse AI
3
+ emoji: 🎙️
4
+ colorFrom: indigo
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: "5.23.1"
8
+ python_version: "3.10"
9
+ app_file: app.py
10
+ pinned: false
11
+ ---
12
+
13
+ # 🎙️ VoiceVerse AI — Document to Audio
14
+
15
+ Transform uploaded documents into engaging, emotionally expressive podcast-style audio narrations.
16
+
17
+ ## Pipeline
18
+
19
+ ```
20
+ PDF/TXT → Text Extraction → RAG (chunk + embed + retrieve) → Script Generation (SmolLM3-3B) → TTS (Qwen3-TTS / Edge-TTS) → Audio Playback
21
+ ```
22
+
23
+ ## Models Used
24
+
25
+ | Component | Model | How |
26
+ |-----------|-------|-----|
27
+ | Embeddings | `all-MiniLM-L6-v2` | Local (CPU) |
28
+ | Script Gen | `SmolLM3-3B` | HF Inference API |
29
+ | TTS (primary) | `Qwen3-TTS` | HF Inference API |
30
+ | TTS (fallback) | `Edge-TTS (AriaNeural)` | Local (CPU) |
31
+
32
+ ## Setup
33
+
34
+ ```bash
35
+ pip install -r requirements.txt
36
+ export HF_TOKEN="your_huggingface_token_here"
37
+ python app.py
38
+ ```
39
+
40
+ ## Deployment on HF Spaces
41
+
42
+ 1. Create a new Space (Gradio SDK)
43
+ 2. Upload all project files
44
+ 3. Set `HF_TOKEN` as a Space Secret
45
+ 4. The app will auto-launch on port 7860
46
+
47
+ ## Project Structure
48
+
49
+ ```
50
+ app.py # Gradio UI entry point
51
+ rag.py # Document ingestion, chunking, embedding, retrieval
+ ingestion.py # URL / YouTube / pasted-text ingestion
52
+ script_gen.py # LLM script generation (Mistral-7B-Instruct)
53
+ tts.py # Text-to-speech (Qwen3-TTS + Edge-TTS fallback)
54
+ utils.py # Helpers (temp files, validation, error formatting)
55
+ requirements.txt # Python dependencies
56
+ packages.txt # System packages (ffmpeg)
57
+ ```
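For reference, a minimal sketch of how the uploaded modules compose outside the Gradio UI. `sample.pdf` is a placeholder path and `HF_TOKEN` must be set in the environment; neither is part of this commit, and the exact outputs depend on the input document.

```python
from rag import extract_text, RAGStore
from script_gen import generate_script
from tts import generate_audio

text = extract_text("sample.pdf")                      # PDF/TXT -> plain text
store = RAGStore()
store.add_document(text)                               # chunk + embed (all-MiniLM-L6-v2, local CPU)
chunks = store.query("What are the main topics, key insights, and important details?", top_k=6)

script = generate_script(chunks, mode="Summary")       # SmolLM3-3B via the HF Inference API
audio_path, engine = generate_audio(script)            # Qwen3-TTS first, Edge-TTS fallback
print(engine, audio_path)
```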
app (1).py ADDED
@@ -0,0 +1,402 @@
1
+ """
2
+ VoiceVerse AI — Main Application.
3
+
4
+ Input sources (three tabs):
5
+ Tab 1 β€” Upload PDF or TXT file
6
+ Tab 2 β€” URL / YouTube link
7
+ Tab 3 β€” Paste raw text
8
+
9
+ Delivery Modes:
10
+ Summary / Podcast / Song / Rap / Debate
11
+
12
+ No status card shown. RAG pipeline unchanged.
13
+ """
14
+
15
+ import os
16
+ import gradio as gr
17
+ from utils import logger, validate_file, format_error
18
+ from rag import extract_text, RAGStore
19
+ from script_gen import generate_script
20
+ from tts import generate_audio, generate_audio_podcast, generate_audio_debate, generate_audio_rap, generate_audio_story, apply_pitch_shift
21
+ from ingestion import ingest_from_url_or_text, extract_pasted_text
22
+
23
+ # ── Global RAG Store ──────────────────────────────────────────────────────────
24
+ rag_store = RAGStore()
25
+
26
+
27
+ # ══════════════════════════════════════════════════════════════════════════════
28
+ # Shared RAG + Script + TTS pipeline
29
+ # ══════════════════════════════════════════════════════════════════════════════
30
+
31
+ def _run_pipeline(
32
+ text: str,
33
+ delivery_mode: str,
34
+ song_rap_sub: str,
35
+ pitch_shift: float,
36
+ progress,
37
+ ) -> tuple[str, str]:
38
+ """
39
+ RAG β†’ script β†’ audio. Shared by all three input tabs.
40
+ Returns (script, audio_path).
41
+ """
42
+ # RAG: chunk & embed
43
+ progress(0.30, desc="🧠 Building knowledge index…")
44
+ rag_store.add_document(text)
45
+ chunk_count = len(rag_store.chunks)
46
+ logger.info("RAG index: %d chunks", chunk_count)
47
+
48
+ # RAG: retrieve
49
+ progress(0.45, desc="πŸ” Retrieving relevant content…")
50
+ if chunk_count <= 8:
51
+ context_chunks = rag_store.get_all_chunks()
52
+ else:
53
+ context_chunks = rag_store.query(
54
+ "What are the main topics, key insights, and important details?",
55
+ top_k=6,
56
+ )
57
+
58
+ # Script generation
59
+ progress(0.60, desc=f"✍️ Writing {_mode_label(delivery_mode, song_rap_sub)} script…")
60
+ script = generate_script(
61
+ context_chunks=context_chunks,
62
+ mode=delivery_mode,
63
+ sub_mode=song_rap_sub,
64
+ )
65
+ logger.info("Script: %d chars", len(script))
66
+
67
+ # TTS β€” route by mode
68
+ progress(0.80, desc="πŸŽ™οΈ Synthesising audio…")
69
+ m = delivery_mode.strip().lower()
70
+ if m == "podcast":
71
+ audio_path, engine = generate_audio_podcast(script)
72
+ elif m == "debate":
73
+ audio_path, engine = generate_audio_debate(script)
74
+ elif "rap" in m:
75
+ audio_path, engine = generate_audio_rap(script)
76
+ elif m == "story":
77
+ audio_path, engine = generate_audio_story(script)
78
+ else:
79
+ audio_path, engine = generate_audio(script)
80
+ logger.info("Audio via %s: %s", engine, audio_path)
81
+
82
+ # Apply pitch shift if requested
83
+ if pitch_shift and abs(pitch_shift) >= 0.1:
84
+ progress(0.90, desc="🎡 Adjusting pitch…")
85
+ audio_path = apply_pitch_shift(audio_path, pitch_shift)
86
+
87
+ progress(1.00, desc="βœ… Done!")
88
+ return script, audio_path
89
+
90
+
91
+ def _mode_label(mode: str, sub_mode: str) -> str:
92
+ m = mode.lower()
93
+ if "podcast" in m:
94
+ return "podcast"
95
+ if "debate" in m:
96
+ return "debate"
97
+ if "story" in m:
98
+ return "story"
99
+ if "song" in m or "rap" in m:
100
+ return sub_mode.lower()
101
+ return "summary"
102
+
103
+
104
+ # ══════════════════════════════════════════════════════════════════════════════
105
+ # Per-tab handlers
106
+ # ══════════════════════════════════════════════════════════════════════════════
107
+
108
+ def process_file(file, delivery_mode, song_rap_sub, pitch_shift, progress=gr.Progress()):
109
+ if file is None:
110
+ raise gr.Error("Please upload a PDF or TXT file first.")
111
+ file_path = file.name if hasattr(file, "name") else str(file)
112
+ is_valid, msg = validate_file(file_path)
113
+ if not is_valid:
114
+ raise gr.Error(msg)
115
+ try:
116
+ progress(0.10, desc="πŸ“„ Extracting text from document…")
117
+ text = extract_text(file_path)
118
+ if not text or len(text.strip()) < 50:
119
+ raise gr.Error("Document has too little text. Please upload a richer file.")
120
+ progress(0.20, desc="βœ… Text extracted")
121
+ return _run_pipeline(text, delivery_mode, song_rap_sub, pitch_shift, progress)
122
+ except gr.Error:
123
+ raise
124
+ except EnvironmentError as e:
125
+ raise gr.Error(str(e))
126
+ except Exception as e:
127
+ raise gr.Error(format_error("pipeline", e))
128
+
129
+
130
+ def process_url(url_input, delivery_mode, song_rap_sub, pitch_shift, progress=gr.Progress()):
131
+ if not url_input or not url_input.strip():
132
+ raise gr.Error("Please enter a URL or YouTube link.")
133
+ try:
134
+ progress(0.05, desc="🌐 Fetching content…")
135
+ text, source_label = ingest_from_url_or_text(url_input.strip())
136
+ logger.info("Ingested from %s: %d chars", source_label, len(text))
137
+ progress(0.20, desc=f"βœ… Content fetched from {source_label}")
138
+ return _run_pipeline(text, delivery_mode, song_rap_sub, pitch_shift, progress)
139
+ except gr.Error:
140
+ raise
141
+ except ValueError as e:
142
+ raise gr.Error(str(e))
143
+ except EnvironmentError as e:
144
+ raise gr.Error(str(e))
145
+ except Exception as e:
146
+ raise gr.Error(format_error("pipeline", e))
147
+
148
+
149
+ def process_paste(pasted_text, delivery_mode, song_rap_sub, pitch_shift, progress=gr.Progress()):
150
+ if not pasted_text or not pasted_text.strip():
151
+ raise gr.Error("Please paste some text first.")
152
+ try:
153
+ progress(0.10, desc="πŸ“‹ Processing pasted text…")
154
+ text = extract_pasted_text(pasted_text)
155
+ progress(0.20, desc="βœ… Text ready")
156
+ return _run_pipeline(text, delivery_mode, song_rap_sub, pitch_shift, progress)
157
+ except gr.Error:
158
+ raise
159
+ except ValueError as e:
160
+ raise gr.Error(str(e))
161
+ except EnvironmentError as e:
162
+ raise gr.Error(str(e))
163
+ except Exception as e:
164
+ raise gr.Error(format_error("pipeline", e))
165
+
166
+
167
+ # ══════════════════════════════════════════════════════════════════════════════
168
+ # UI helpers
169
+ # ══════════════════════════════════════════════════════════════════════════════
170
+
171
+ def _mode_description(mode: str) -> str:
172
+ return {
173
+ "Summary": (
174
+ "*πŸ“‹ **Summary** β€” Structured narration: intro, key points, conclusion. "
175
+ "Single voice, neutral tone.*"
176
+ ),
177
+ "Podcast": (
178
+ "*πŸŽ™οΈ **Podcast** β€” Two-host conversation between Alex and Sam. "
179
+ "Alex guides; Sam explains. Dual voices.*"
180
+ ),
181
+ "Rap": (
182
+ "*🎡 **Rap** β€” Key ideas as a punchy rhythmic track. "
183
+ "Fast delivery, bass-boosted, line-by-line flow.*"
184
+ ),
185
+ "Debate": (
186
+ "*βš”οΈ **Debate** β€” Maya (pro) vs Ryan (con) argue opposing sides. "
187
+ "Female voice (assertive) vs Male voice (deliberate).*"
188
+ ),
189
+ "Story": (
190
+ "*πŸ“– **Story** β€” Content retold as an immersive narrative. "
191
+ "Slow, warm delivery with expressive pauses.*"
192
+ ),
193
+ }.get(mode, "")
194
+
195
+
196
+ def _on_mode_change(mode: str):
197
+ show_sub = "song" in mode.lower() or "rap" in mode.lower()
198
+ return gr.update(visible=show_sub), _mode_description(mode)
199
+
200
+
201
+ # ══════════════════════════════════════════════════════════════════════════════
202
+ # Gradio UI
203
+ # ══════════════════════════════════════════════════════════════════════════════
204
+
205
+ def build_ui() -> gr.Blocks:
206
+
207
+ css = """
208
+ .main-header { text-align: center; margin-bottom: 1rem; }
209
+ .main-header h1 {
210
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
211
+ -webkit-background-clip: text;
212
+ -webkit-text-fill-color: transparent;
213
+ font-size: 2.5rem;
214
+ font-weight: 800;
215
+ margin-bottom: 0.25rem;
216
+ }
217
+ .main-header p { color: #6b7280; font-size: 1.1rem; }
218
+
219
+ .mode-card {
220
+ background: linear-gradient(135deg, #f8f7ff 0%, #f0edff 100%);
221
+ border: 1px solid #e0d9ff;
222
+ border-radius: 12px;
223
+ padding: 1rem 1.25rem;
224
+ margin-top: 0.75rem;
225
+ margin-bottom: 0.75rem;
226
+ }
227
+
228
+ /* Hide the "Radio" label Gradio adds automatically */
229
+ #delivery-mode-radio .label-wrap { display: none !important; }
230
+
231
+ .url-hint { color: #6b7280; font-size: 0.82rem; margin-top: 0.3rem; }
232
+ """
233
+
234
+ with gr.Blocks(
235
+ title="VoiceVerse AI",
236
+ theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="purple"),
237
+ css=css,
238
+ ) as app:
239
+
240
+ # ── Header ───────────────────────────────────────────────────────────
241
+ gr.HTML("""
242
+ <div class="main-header">
243
+ <h1>πŸŽ™οΈ VoiceVerse AI</h1>
244
+ <p>Transform any content into an engaging audio experience</p>
245
+ </div>
246
+ """)
247
+
248
+ with gr.Row(equal_height=False):
249
+
250
+ # ── LEFT COLUMN ───────────────────────────────────────────────────
251
+ with gr.Column(scale=1):
252
+
253
+ gr.Markdown("### πŸ“₯ Choose Your Content Source")
254
+
255
+ with gr.Tabs():
256
+
257
+ # ── Tab 1: File upload ────────────────────────────────────
258
+ with gr.Tab("πŸ“„ File Upload"):
259
+ file_input = gr.File(
260
+ label="Upload a PDF or TXT file",
261
+ file_types=[".pdf", ".txt"],
262
+ type="filepath",
263
+ )
264
+ file_btn = gr.Button(
265
+ "πŸŽ™οΈ Generate Audio",
266
+ variant="primary",
267
+ size="lg",
268
+ )
269
+
270
+ # ── Tab 2: URL ────────────────────────────────────────────
271
+ with gr.Tab("πŸ”— URL"):
272
+ url_input = gr.Textbox(
273
+ label=None,
274
+ placeholder=(
275
+ "Paste any link here…\n\n"
276
+ "πŸ“° Article: https://example.com/article\n"
277
+ "🌐 Website: https://en.wikipedia.org/wiki/...\n"
278
+ "πŸ“ Blog: https://blog.example.com/post"
279
+ ),
280
+ lines=5,
281
+ max_lines=6,
282
+ show_label=False,
283
+ )
284
+ gr.HTML(
285
+ "<p class='url-hint'>"
286
+ "βœ… Works with: news articles, "
287
+ "blogs, Wikipedia, most public pages.<br>"
288
+ "❌ Won't work: paywalled or login-required pages."
289
+ "</p>"
290
+ )
291
+ url_btn = gr.Button(
292
+ "πŸŽ™οΈ Generate Audio",
293
+ variant="primary",
294
+ size="lg",
295
+ )
296
+
297
+ # ── Tab 3: Paste text ─────────────────────────────────────
298
+ with gr.Tab("πŸ“‹ Paste Text"):
299
+ paste_input = gr.Textbox(
300
+ label=None,
301
+ placeholder=(
302
+ "Paste any text here β€” article content, notes, "
303
+ "transcripts, research, anything…"
304
+ ),
305
+ lines=10,
306
+ max_lines=40,
307
+ show_label=False,
308
+ )
309
+ paste_btn = gr.Button(
310
+ "πŸŽ™οΈ Generate Audio",
311
+ variant="primary",
312
+ size="lg",
313
+ )
314
+
315
+ # ── Delivery Mode card ────────────────────────────────────────
316
+ gr.Markdown("### 🎨 Choose Audio Experience")
317
+
318
+ delivery_mode = gr.Radio(
319
+ choices=["Summary", "Podcast", "Rap", "Debate", "Story"],
320
+ value="Summary",
321
+ show_label=False,
322
+ elem_id="delivery-mode-radio",
323
+ )
324
+
325
+ # Hidden state for backward compat
326
+ song_rap_sub = gr.State("Rap")
327
+
328
+ mode_description = gr.Markdown(value=_mode_description("Summary"))
329
+
330
+ # ── Pitch adjustment ──────────────────────────────────────────
331
+ gr.Markdown("### 🎡 Audio Adjustments")
332
+ pitch_slider = gr.Slider(
333
+ minimum=-6, maximum=6, step=0.5, value=0,
334
+ label="Pitch Shift (semitones)",
335
+ info="Negative = deeper voice, Positive = higher voice",
336
+ )
337
+
338
+ # ── RIGHT COLUMN ──────────────────────────────────────────────────
339
+ with gr.Column(scale=1):
340
+
341
+ gr.Markdown("### 🎧 Generated Audio")
342
+ audio_output = gr.Audio(
343
+ label="Audio",
344
+ type="filepath",
345
+ interactive=False,
346
+ show_download_button=True,
347
+ )
348
+
349
+ gr.Markdown("### ✍️ Generated Script")
350
+ script_output = gr.Textbox(
351
+ label="Script",
352
+ lines=14,
353
+ max_lines=22,
354
+ interactive=False,
355
+ placeholder="Your generated script will appear here…",
356
+ show_copy_button=True,
357
+ )
358
+
359
+ # ── Footer ───────────────────────────────────────────────────────────
360
+ gr.Markdown(
361
+ "<center style='color:#9ca3af;margin-top:1rem;'>"
362
+ "Built with ❀️ using SmolLM3-3B · Qwen3-TTS · Edge-TTS · Gradio"
363
+ "</center>"
364
+ )
365
+
366
+ # ── Event wiring ─────────────────────────────────────────────────────
367
+
368
+ delivery_mode.change(
369
+ fn=lambda mode: _mode_description(mode),
370
+ inputs=[delivery_mode],
371
+ outputs=[mode_description],
372
+ )
373
+ file_btn.click(
374
+ fn=process_file,
375
+ inputs=[file_input, delivery_mode, song_rap_sub, pitch_slider],
376
+ outputs=[script_output, audio_output],
377
+ )
378
+ url_btn.click(
379
+ fn=process_url,
380
+ inputs=[url_input, delivery_mode, song_rap_sub, pitch_slider],
381
+ outputs=[script_output, audio_output],
382
+ )
383
+ paste_btn.click(
384
+ fn=process_paste,
385
+ inputs=[paste_input, delivery_mode, song_rap_sub, pitch_slider],
386
+ outputs=[script_output, audio_output],
387
+ )
388
+
389
+ return app
390
+
391
+
392
+ # ── Entry point ───────────────────────────────────────────────────────────────
393
+
394
+ if __name__ == "__main__":
395
+ logger.info("Starting VoiceVerse AI…")
396
+ app = build_ui()
397
+ app.launch(
398
+ server_name="0.0.0.0",
399
+ server_port=7860,
400
+ share=False,
401
+ show_error=True,
402
+ )
convert_to_word.ps1 ADDED
@@ -0,0 +1,58 @@
1
+
2
+ $markdownPath = "C:\Users\hp\.gemini\antigravity\brain\04f3e1c4-7b81-497c-a7c5-5d0513033dfa\project_report.md"
3
+ $wordPath = "C:\Users\hp\.gemini\antigravity\brain\04f3e1c4-7b81-497c-a7c5-5d0513033dfa\VoiceVerse_AI_Project_Report.docx"
4
+
5
+ if (-not (Test-Path $markdownPath)) {
6
+ Write-Error "Markdown file not found at $markdownPath"
7
+ exit 1
8
+ }
9
+
10
+ $content = Get-Content -Path $markdownPath -Raw
11
+
12
+ # Create Word Object
13
+ try {
14
+ $word = New-Object -ComObject Word.Application
15
+ $word.Visible = $false
16
+ $doc = $word.Documents.Add()
17
+ $selection = $word.Selection
18
+
19
+ # Basic Markdown Parsing (Simplified)
20
+ $lines = $content -split "`r?`n"
21
+ foreach ($line in $lines) {
22
+ if ($line -match "^# (.*)") {
23
+ $selection.Style = "Title"
24
+ $selection.TypeText($matches[1])
25
+ $selection.TypeParagraph()
26
+ } elseif ($line -match "^## (.*)") {
27
+ $selection.Style = "Heading 1"
28
+ $selection.TypeText($matches[1])
29
+ $selection.TypeParagraph()
30
+ } elseif ($line -match "^### (.*)") {
31
+ $selection.Style = "Heading 2"
32
+ $selection.TypeText($matches[1])
33
+ $selection.TypeParagraph()
34
+ } elseif ($line -match "^---") {
35
+ # Skip horizontal rules or add a page break?
36
+ # For now just skip
37
+ } elseif ($line -match "^\|") {
38
+ # Table handling is complex, for now just TypeText
39
+ $selection.Style = "Normal"
40
+ $selection.TypeText($line)
41
+ $selection.TypeParagraph()
42
+ } else {
43
+ $selection.Style = "Normal"
44
+ # Remove bold/italic markers for cleaner look
45
+ $cleanLine = $line -replace "\*\*", "" -replace "\*", ""
46
+ $selection.TypeText($cleanLine)
47
+ $selection.TypeParagraph()
48
+ }
49
+ }
50
+
51
+ $doc.SaveAs([ref]$wordPath)
52
+ $doc.Close()
53
+ $word.Quit()
54
+ Write-Host "Word document created successfully at $wordPath"
55
+ } catch {
56
+ Write-Error "Failed to create Word document: $_"
57
+ if ($word) { $word.Quit() }
58
+ }
gitattributes ADDED
@@ -0,0 +1,35 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
gitattributes (1) ADDED
@@ -0,0 +1,35 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
ingestion.py ADDED
@@ -0,0 +1,217 @@
1
+ """
2
+ VoiceVerse AI — Content Ingestion Module.
3
+
4
+ Handles all input sources beyond file upload:
5
+ - YouTube links β†’ transcript via youtube-transcript-api
6
+ - Article / website β†’ readable text via trafilatura + BeautifulSoup fallback
7
+ - Pasted raw text β†’ light cleaning and validation
8
+
9
+ Returns plain text string that feeds into RAGStore.add_document().
10
+ rag.py is completely unchanged.
11
+ """
12
+
13
+ import re
14
+ import urllib.parse
15
+ from utils import logger
16
+
17
+
18
+ # ══════════════════════════════════════════════════════════════════════════════
19
+ # URL type detection
20
+ # ══════════════════════════════════════════════════════════════════════════════
21
+
22
+ def _is_youtube(url: str) -> bool:
23
+ parsed = urllib.parse.urlparse(url.strip())
24
+ host = parsed.netloc.lower().replace("www.", "")
25
+ return host in ("youtube.com", "youtu.be")
26
+
27
+
28
+ def _extract_youtube_id(url: str) -> str | None:
29
+ patterns = [
30
+ r"(?:v=)([a-zA-Z0-9_-]{11})",
31
+ r"youtu\.be/([a-zA-Z0-9_-]{11})",
32
+ r"embed/([a-zA-Z0-9_-]{11})",
33
+ r"shorts/([a-zA-Z0-9_-]{11})",
34
+ ]
35
+ for pattern in patterns:
36
+ match = re.search(pattern, url)
37
+ if match:
38
+ return match.group(1)
39
+ return None
40
+
41
+
42
+ # ══════════════════════════════════════════════════════════════════════════════
43
+ # YouTube transcript
44
+ # ══════════════════════════════════════════════════════════════════════════════
45
+
46
+ def extract_youtube(url: str) -> str:
47
+ try:
48
+ from youtube_transcript_api import (
49
+ YouTubeTranscriptApi, NoTranscriptFound, TranscriptsDisabled
50
+ )
51
+ except ImportError:
52
+ raise ImportError(
53
+ "youtube-transcript-api is not installed. "
54
+ "Add 'youtube-transcript-api' to requirements.txt and restart the Space."
55
+ )
56
+
57
+ video_id = _extract_youtube_id(url)
58
+ if not video_id:
59
+ raise ValueError(f"Could not extract a YouTube video ID from: {url}")
60
+
61
+ logger.info("Fetching YouTube transcript: video_id=%s", video_id)
62
+
63
+ try:
64
+ transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
65
+
66
+ # Prefer English manual captions, then English auto, then anything available
67
+ try:
68
+ transcript = transcript_list.find_manually_created_transcript(
69
+ ["en", "en-US", "en-GB"]
70
+ )
71
+ except NoTranscriptFound:
72
+ try:
73
+ transcript = transcript_list.find_generated_transcript(
74
+ ["en", "en-US", "en-GB"]
75
+ )
76
+ except NoTranscriptFound:
77
+ transcript = next(iter(transcript_list))
78
+ logger.info("No English transcript β€” using: %s", transcript.language)
79
+
80
+ entries = transcript.fetch()
81
+ text = " ".join(entry["text"] for entry in entries)
82
+
83
+ # Clean YouTube caption artifacts
84
+ text = re.sub(r"\[.*?\]", "", text) # [Music], [Applause] etc.
85
+ text = re.sub(r"\s{2,}", " ", text).strip()
86
+
87
+ if len(text) < 50:
88
+ raise ValueError("YouTube transcript is too short to process.")
89
+
90
+ logger.info("YouTube transcript: %d chars", len(text))
91
+ return text
92
+
93
+ except (NoTranscriptFound, TranscriptsDisabled) as e:
94
+ raise ValueError(
95
+ f"No transcript available for this video. "
96
+ f"The video may have captions disabled or be private.\n\n"
97
+ f"Tip: Copy the article/video text manually and use the Paste Text tab instead."
98
+ )
99
+
100
+
101
+ # ══════════════════════════════════════════════════════════════════════════════
102
+ # Article / website URL
103
+ # ══════════════════════════════════════════════════════════════════════════════
104
+
105
+ def extract_url(url: str) -> str:
106
+ """
107
+ Fetch a webpage and extract readable text.
108
+ Tries trafilatura first (best article extractor), falls back to BeautifulSoup.
109
+ """
110
+ url = url.strip()
111
+ logger.info("Fetching URL: %s", url)
112
+
113
+ headers = {
114
+ "User-Agent": (
115
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
116
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
117
+ "Chrome/120.0.0.0 Safari/537.36"
118
+ )
119
+ }
120
+
121
+ # ── Attempt 1: trafilatura ────────────────────────────────────────────────
122
+ try:
123
+ import trafilatura
124
+ downloaded = trafilatura.fetch_url(url)
125
+ if downloaded:
126
+ text = trafilatura.extract(
127
+ downloaded,
128
+ include_comments=False,
129
+ include_tables=True,
130
+ no_fallback=False,
131
+ )
132
+ if text and len(text.strip()) > 100:
133
+ logger.info("trafilatura extracted %d chars", len(text))
134
+ return text.strip()
135
+ except Exception as e:
136
+ logger.warning("trafilatura failed (%s) β€” trying BeautifulSoup", e)
137
+
138
+ # ── Attempt 2: requests + BeautifulSoup ──────────────────────────────────
139
+ try:
140
+ import requests
141
+ from bs4 import BeautifulSoup
142
+
143
+ resp = requests.get(url, headers=headers, timeout=15)
144
+ resp.raise_for_status()
145
+
146
+ soup = BeautifulSoup(resp.text, "html.parser")
147
+ for tag in soup(["script", "style", "nav", "footer", "header",
148
+ "aside", "form", "noscript", "iframe"]):
149
+ tag.decompose()
150
+
151
+ article = soup.find("article") or soup.find("main") or soup.find("body")
152
+ text = (
153
+ article.get_text(separator=" ", strip=True)
154
+ if article
155
+ else soup.get_text(separator=" ", strip=True)
156
+ )
157
+ text = re.sub(r"\s{3,}", "\n\n", text)
158
+ text = re.sub(r" {2,}", " ", text).strip()
159
+
160
+ if len(text) < 100:
161
+ raise ValueError("Could not extract enough text from this page.")
162
+
163
+ logger.info("BeautifulSoup extracted %d chars", len(text))
164
+ return text
165
+
166
+ except Exception as e:
167
+ raise ValueError(
168
+ f"Could not fetch content from: {url}\n\n"
169
+ f"Reason: {e}\n\n"
170
+ "The page may require a login or block bots. "
171
+ "Try copying the article text and pasting it in the Paste Text tab."
172
+ )
173
+
174
+
175
+ # ══════════════════════════════════════════════════════════════════════════════
176
+ # Pasted raw text
177
+ # ══════════════════════════════════════════════════════════════════════════════
178
+
179
+ def extract_pasted_text(text: str) -> str:
180
+ if not text or not text.strip():
181
+ raise ValueError("No text was pasted. Please paste some content.")
182
+
183
+ text = text.replace("\r\n", "\n").replace("\r", "\n")
184
+ text = re.sub(r"\n{4,}", "\n\n\n", text)
185
+ text = re.sub(r" {2,}", " ", text).strip()
186
+
187
+ if len(text) < 50:
188
+ raise ValueError(
189
+ "Pasted text is too short. Please paste at least a paragraph of content."
190
+ )
191
+
192
+ logger.info("Pasted text ingested: %d chars", len(text))
193
+ return text
194
+
195
+
196
+ # ══════════════════════════════════════════════════════════════════════════════
197
+ # Unified entry point
198
+ # ══════════════════════════════════════════════════════════════════════════════
199
+
200
+ def ingest_from_url_or_text(raw_input: str) -> tuple[str, str]:
201
+ """
202
+ Auto-detect whether input is a YouTube URL, article URL, or plain text.
203
+
204
+ Returns:
205
+ (extracted_text, source_label)
206
+ """
207
+ raw = raw_input.strip()
208
+ if not raw:
209
+ raise ValueError("Please enter a URL or paste some text.")
210
+
211
+ if re.match(r"https?://", raw, re.IGNORECASE):
212
+ if _is_youtube(raw):
213
+ return extract_youtube(raw), "YouTube"
214
+ else:
215
+ return extract_url(raw), "Article / Website"
216
+ else:
217
+ return extract_pasted_text(raw), "Pasted Text"
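A short illustration of how the unified entry point dispatches the three input shapes; the URLs and the pasted string below are hypothetical placeholders, and the YouTube branch still requires the video to have accessible captions.

```python
from ingestion import ingest_from_url_or_text

# YouTube link -> transcript via youtube-transcript-api
text, src = ingest_from_url_or_text("https://www.youtube.com/watch?v=dQw4w9WgXcQ")
print(src)  # "YouTube"

# Any other http(s) URL -> readable article text via trafilatura / BeautifulSoup
text, src = ingest_from_url_or_text("https://en.wikipedia.org/wiki/Text-to-speech")
print(src)  # "Article / Website"

# Anything that is not an http(s) URL is treated as pasted text (minimum 50 characters)
text, src = ingest_from_url_or_text(
    "A paragraph of raw notes that is comfortably longer than the fifty-character minimum."
)
print(src)  # "Pasted Text"
```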
packages.txt ADDED
@@ -0,0 +1 @@
1
+ ffmpeg
rag.py ADDED
@@ -0,0 +1,198 @@
1
+ """
2
+ VoiceVerse AI — RAG Pipeline.
3
+
4
+ Handles document ingestion, text chunking, embedding generation,
5
+ and semantic retrieval using an in-memory vector store.
6
+
7
+ Models used:
8
+ - sentence-transformers/all-MiniLM-L6-v2 for embeddings (~22M parameters, CPU-friendly)
9
+
10
+ Design decisions:
11
+ - NumPy cosine similarity instead of FAISS to avoid heavy native deps
12
+ - Overlapping chunks to preserve context across boundaries
13
+ - Single-document architecture (clear store on new upload)
14
+ """
15
+
16
+ import os
17
+ import numpy as np
18
+ from utils import logger
19
+
20
+ # ── Text Extraction ──────────────────────────────────────────────────────────
21
+
22
+ def extract_text(file_path: str) -> str:
23
+ """
24
+ Extract plain text from a PDF or TXT file.
25
+ Returns the full document text as a single string.
26
+ """
27
+ ext = os.path.splitext(file_path)[1].lower()
28
+
29
+ if ext == ".pdf":
30
+ return _extract_pdf(file_path)
31
+ elif ext == ".txt":
32
+ return _extract_txt(file_path)
33
+ else:
34
+ raise ValueError(f"Unsupported file type: {ext}")
35
+
36
+
37
+ def _extract_pdf(file_path: str) -> str:
38
+ """Extract text from PDF using PyMuPDF."""
39
+ import fitz # PyMuPDF
40
+
41
+ text_parts = []
42
+ with fitz.open(file_path) as doc:
43
+ for page_num, page in enumerate(doc):
44
+ page_text = page.get_text("text")
45
+ if page_text.strip():
46
+ text_parts.append(page_text)
47
+ logger.debug("Extracted page %d: %d chars", page_num + 1, len(page_text))
48
+
49
+ full_text = "\n\n".join(text_parts)
50
+ logger.info("PDF extraction complete: %d pages, %d chars total",
51
+ len(text_parts), len(full_text))
52
+ return full_text
53
+
54
+
55
+ def _extract_txt(file_path: str) -> str:
56
+ """Read plain text file with encoding fallback."""
57
+ for encoding in ("utf-8", "utf-8-sig", "latin-1", "cp1252"):
58
+ try:
59
+ with open(file_path, "r", encoding=encoding) as f:
60
+ text = f.read()
61
+ logger.info("TXT extraction complete (%s): %d chars", encoding, len(text))
62
+ return text
63
+ except UnicodeDecodeError:
64
+ continue
65
+ raise ValueError("Could not decode the text file with any supported encoding.")
66
+
67
+
68
+ # ── Text Chunking ────────────────────────────────────────────────────────────
69
+
70
+ def chunk_text(text: str, chunk_size: int = 512, overlap: int = 50) -> list[str]:
71
+ """
72
+ Split text into overlapping chunks of roughly `chunk_size` characters.
73
+ Overlap ensures context isn't lost at chunk boundaries.
74
+
75
+ Uses sentence-aware splitting: tries to break at sentence boundaries
76
+ within the chunk window for more coherent chunks.
77
+ """
78
+ if not text or not text.strip():
79
+ return []
80
+
81
+ # Clean up whitespace
82
+ text = " ".join(text.split())
83
+
84
+ chunks = []
85
+ start = 0
86
+
87
+ while start < len(text):
88
+ end = start + chunk_size
89
+
90
+ # If not at the end, try to break at a sentence boundary
91
+ if end < len(text):
92
+ # Look for sentence-ending punctuation near the end
93
+ search_start = max(start + chunk_size // 2, start)
94
+ last_period = -1
95
+ for i in range(min(end, len(text)) - 1, search_start - 1, -1):
96
+ if text[i] in ".!?" and (i + 1 >= len(text) or text[i + 1] == " "):
97
+ last_period = i
98
+ break
99
+ if last_period > start:
100
+ end = last_period + 1
101
+
102
+ chunk = text[start:end].strip()
103
+ if chunk:
104
+ chunks.append(chunk)
105
+
106
+ # Move forward by (chunk length - overlap)
107
+ start = max(start + 1, end - overlap)
108
+
109
+ logger.info("Chunking complete: %d chunks (size=%d, overlap=%d)",
110
+ len(chunks), chunk_size, overlap)
111
+ return chunks
112
+
113
+
114
+ # ── Embedding & Vector Store ─────────────────────────────────────────────────
115
+
116
+ class RAGStore:
117
+ """
118
+ In-memory vector store using sentence-transformers embeddings
119
+ and NumPy cosine similarity.
120
+
121
+ Usage:
122
+ store = RAGStore()
123
+ store.add_document("full document text here")
124
+ results = store.query("what is this about?", top_k=5)
125
+ """
126
+
127
+ MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
128
+
129
+ def __init__(self):
130
+ self._model = None
131
+ self.chunks: list[str] = []
132
+ self.embeddings: np.ndarray | None = None
133
+
134
+ @property
135
+ def model(self):
136
+ """Lazy-load the embedding model to avoid startup cost."""
137
+ if self._model is None:
138
+ logger.info("Loading embedding model: %s", self.MODEL_NAME)
139
+ from sentence_transformers import SentenceTransformer
140
+ self._model = SentenceTransformer(self.MODEL_NAME)
141
+ logger.info("Embedding model loaded successfully")
142
+ return self._model
143
+
144
+ def clear(self):
145
+ """Clear the store for a new document."""
146
+ self.chunks = []
147
+ self.embeddings = None
148
+
149
+ def add_document(self, text: str, chunk_size: int = 512, overlap: int = 50):
150
+ """
151
+ Process a document: chunk the text, generate embeddings, and store.
152
+ Clears any previously stored document.
153
+ """
154
+ self.clear()
155
+
156
+ self.chunks = chunk_text(text, chunk_size=chunk_size, overlap=overlap)
157
+ if not self.chunks:
158
+ raise ValueError("No text chunks could be extracted from the document.")
159
+
160
+ logger.info("Generating embeddings for %d chunks...", len(self.chunks))
161
+ self.embeddings = self.model.encode(
162
+ self.chunks,
163
+ show_progress_bar=False,
164
+ convert_to_numpy=True,
165
+ normalize_embeddings=True, # Pre-normalize for faster cosine sim
166
+ )
167
+ logger.info("Embeddings generated: shape %s", self.embeddings.shape)
168
+
169
+ def query(self, question: str, top_k: int = 5) -> list[str]:
170
+ """
171
+ Retrieve the top-k most relevant chunks for the given question.
172
+ Uses cosine similarity (dot product on normalized vectors).
173
+ """
174
+ if self.embeddings is None or len(self.chunks) == 0:
175
+ return []
176
+
177
+ # Embed the query
178
+ query_embedding = self.model.encode(
179
+ [question],
180
+ convert_to_numpy=True,
181
+ normalize_embeddings=True,
182
+ )
183
+
184
+ # Cosine similarity = dot product (vectors are pre-normalized)
185
+ similarities = np.dot(self.embeddings, query_embedding.T).flatten()
186
+
187
+ # Get top-k indices
188
+ top_k = min(top_k, len(self.chunks))
189
+ top_indices = np.argsort(similarities)[-top_k:][::-1]
190
+
191
+ results = [self.chunks[i] for i in top_indices]
192
+ logger.info("Retrieved %d chunks (top similarity: %.3f)",
193
+ len(results), similarities[top_indices[0]])
194
+ return results
195
+
196
+ def get_all_chunks(self) -> list[str]:
197
+ """Return all stored chunks (useful for short documents)."""
198
+ return self.chunks.copy()
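A quick local sanity check of the chunking behaviour described in the docstrings above; the sample string and printed lengths are illustrative only and require no API token.

```python
from rag import chunk_text

sample = "The quick brown fox jumps over the lazy dog. " * 30   # ~1350 characters
chunks = chunk_text(sample, chunk_size=512, overlap=50)

# Consecutive chunks overlap by roughly `overlap` characters, so content near a
# boundary is not cut off, and each chunk tries to end on a sentence boundary.
for i, c in enumerate(chunks):
    print(i, len(c), c[:40])
```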
requirements.txt ADDED
@@ -0,0 +1,13 @@
1
+ gradio>=5.23.1,<6.0
2
+ huggingface-hub>=0.25
3
+ pydantic>=2.0,<2.11
4
+ sentence-transformers
5
+ numpy
6
+ PyMuPDF
7
+ edge-tts
8
+ scipy
9
+ pydub
10
+ requests
11
+ beautifulsoup4
12
+ trafilatura
13
+ youtube-transcript-api
script_gen.py ADDED
@@ -0,0 +1,316 @@
1
+ """
2
+ VoiceVerse AI — Script Generation Module.
3
+
4
+ Delivery Modes:
5
+ Summary — single-speaker structured narration
6
+ Podcast — ALEX / SAM two-host dialogue
7
+ Song/Rap — rhythmic retention content
8
+ Debate — MAYA (female, for) vs RYAN (male, against) structured debate
+ Story — immersive narrative retelling for slow, expressive delivery
9
+ """
10
+
11
+ import os
12
+ import re
13
+ from huggingface_hub import InferenceClient
14
+ from utils import logger
15
+
16
+ MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
17
+ MAX_NEW_TOKENS = 2048
18
+ TEMPERATURE = 0.5
19
+
20
+
21
+ # ══════════════════════════════════════════════════════════════════════════════
22
+ # Prompts
23
+ # ══════════════════════════════════════════════════════════════════════════════
24
+
25
+ # ── Summary ───────────────────────────────────────────────────────────────────
26
+ _SUMMARY_SYSTEM = """\
27
+ You are a professional narrator. Produce a clear spoken summary strictly from the source material.
28
+ RULES:
29
+ 1. Use ONLY facts from the source. Do NOT add outside knowledge.
30
+ 2. Write as one continuous flowing narration. Do NOT use any section headings, labels, or structural markers like "Introduction", "Intro", "Key Points", "Conclusion", "Summary", "Section 1", etc.
31
+ 3. Use smooth spoken transitions instead of headings. For example say "Let's start with..." or "Now moving on to..." or "To wrap things up..." instead of labeling sections.
32
+ 4. Plain text only β€” no markdown, no bullets, no headers, no labels of any kind.
33
+ 5. Write for the ear: short sentences, conversational tone.
34
+ 6. Never say "the document says". Speak as the expert.
35
+ 7. Output ONLY the spoken narration text, nothing else. It should read like someone is naturally talking."""
36
+
37
+ _SUMMARY_USER = """\
38
+ SOURCE MATERIAL:
39
+ {context}
40
+
41
+ Write a flowing spoken summary in plain sentences. Do NOT include any headings or labels like Intro, Conclusion, etc. Just speak naturally as if talking to a listener."""
42
+
43
+
44
+ # ── Podcast ───────────────────────────────────────────────────────────────────
45
+ _PODCAST_SYSTEM = """\
46
+ You are a podcast script writer. Write a two-host conversation strictly from the source material.
47
+
48
+ STRICT FORMAT β€” every single line must start with a speaker tag:
49
+ ALEX: <what Alex says>
50
+ SAM: <what Sam says>
51
+
52
+ RULES:
53
+ 1. Alternate ALEX and SAM. Never same host twice in a row.
54
+ 2. ALEX introduces topics and asks questions.
55
+ 3. SAM explains concepts and answers.
56
+ 4. Use ONLY information from the source. No hallucination.
57
+ 5. Conversational, engaging tone.
58
+ 6. No markdown, no stage directions, no lines without a speaker tag.
59
+ 7. Aim for 16–24 exchanges."""
60
+
61
+ _PODCAST_USER = """\
62
+ SOURCE MATERIAL:
63
+ {context}
64
+
65
+ Write the full podcast. Every line must start with ALEX: or SAM:"""
66
+
67
+
68
+ # ── Rap ───────────────────────────────────────────────────────────────────────
69
+ _RAP_SYSTEM = """\
70
+ You are a lyricist. Two steps:
71
+ STEP 1 β€” silently extract 5–7 key ideas from the source.
72
+ STEP 2 β€” write a punchy rhythmic RAP from those ideas.
73
+
74
+ RULES:
75
+ - Short punchy lines (5–8 words), fast-flow rhyme (AABB or ABAB).
76
+ - Do NOT use any section labels like [VERSE 1], [CHORUS], [HOOK], [BRIDGE] etc.
77
+ - Just write the rap lines continuously. Use a blank line to separate verses.
78
+ - The hook/chorus should repeat naturally without a label.
79
+ - Wordplay and repetition to aid retention.
80
+ - Do NOT invent facts not in the source.
81
+ - Output ONLY the lyrics, no labels, no headers."""
82
+
83
+ _RAP_USER = """\
84
+ SOURCE MATERIAL:
85
+ {context}
86
+
87
+ Extract the key ideas, then write the full rap. No section labels."""
88
+
89
+
90
+ # ── Debate ────────────────────────────────────────────────────────────────────
91
+ _DEBATE_SYSTEM = """\
92
+ You are a debate script writer. Write a structured two-person debate strictly grounded \
93
+ in the provided source material.
94
+
95
+ STRICT FORMAT β€” every single line must start with a speaker tag:
96
+ MAYA: <what Maya says>
97
+ RYAN: <what Ryan says>
98
+
99
+ CHARACTER PROFILES:
100
+ - MAYA: Takes the PRO / supporting position. Tone is confident, optimistic, forward-thinking.
101
+ - RYAN: Takes the CON / critical position. Tone is skeptical, cautious, questioning.
102
+
103
+ DEBATE STRUCTURE:
104
+ 1. MAYA opens with a strong statement supporting the topic.
105
+ 2. RYAN immediately challenges with a counterpoint.
106
+ 3. They alternate, each directly responding to the other's previous point.
107
+ 4. Both use evidence and logic from the source material only.
108
+ 5. End with each debater giving a brief closing statement.
109
+
110
+ RULES:
111
+ - Alternate MAYA and RYAN. Never same debater twice in a row.
112
+ - Use ONLY information from the source material. No hallucination.
113
+ - Each turn should be 1–3 sentences β€” punchy, not long speeches.
114
+ - No markdown, no stage directions, no narration outside the speaker tags.
115
+ - Aim for 16–22 exchanges total."""
116
+
117
+ _DEBATE_USER = """\
118
+ SOURCE MATERIAL:
119
+ {context}
120
+
121
+ Write the full debate on the key topics from this material. \
122
+ Every line must start with MAYA: or RYAN:"""
123
+
124
+
125
+ # ── Story ─────────────────────────────────────────────────────────────────────
126
+ _STORY_SYSTEM = """\
127
+ You are a master storyteller. Retell the ideas from the source material as an \
128
+ immersive narrative story written for slow, expressive audio delivery.
129
+
130
+ RULES:
131
+ 1. Transform factual content into a story β€” use characters, scenes, a narrative arc \
132
+ (beginning, middle, end). Characters can be fictional stand-ins for real concepts.
133
+ 2. Use ONLY information and ideas from the source. Do NOT invent new facts.
134
+ 3. Warm, descriptive storytelling voice. Vivid but calm.
135
+ 4. Short paragraphs, 1–3 sentences each, separated by blank lines.
136
+ 5. Plain text only β€” no markdown, no bullets, no headers.
137
+ 6. Begin with an evocative scene-setting sentence.
138
+ 7. End with a closing reflection or lesson drawn from the source.
139
+ 8. Output ONLY the story text, nothing else."""
140
+
141
+ _STORY_USER = """\
142
+ SOURCE MATERIAL:
143
+ {context}
144
+
145
+ Transform this into a rich narrative story for slow, expressive audio. \
146
+ Use short paragraphs with blank lines between them."""
147
+
148
+
149
+ # ══════════════════════════════════════════════════════════════════════════════
150
+ # Post-processing
151
+ # ══════════════════════════════════════════════════════════════════════════════
152
+
153
+ def _clean(text: str) -> str:
154
+ """Remove all markdown and XML artifacts from LLM output."""
155
+ text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
156
+ text = re.sub(r"<[^>]+>", "", text)
157
+ text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE)
158
+
159
+ # Remove heading-like labels that TTS would read aloud
160
+ # e.g. "Introduction:", "Intro:", "Conclusion:", "Key Points:", "Summary:" etc.
161
+ text = re.sub(
162
+ r"^(?:Introduction|Intro|Conclusion|Summary|Key\s*Points?|Overview|"
163
+ r"Closing|Opening|Final\s*Thoughts?|In\s*Summary|To\s*Conclude)\s*[:\-β€”]?\s*$",
164
+ "", text, flags=re.MULTILINE | re.IGNORECASE
165
+ )
166
+ # Also remove inline heading labels at the start of a line followed by content
167
+ text = re.sub(
168
+ r"^(?:Introduction|Intro|Conclusion|Summary|Key\s*Points?|Overview|"
169
+ r"Closing|Opening|Final\s*Thoughts?)\s*[:\-β€”]\s+",
170
+ "", text, flags=re.MULTILINE | re.IGNORECASE
171
+ )
172
+ # Remove [VERSE 1], [CHORUS], [HOOK], [BRIDGE] etc. labels from rap/song output
173
+ text = re.sub(r"\[(?:VERSE|CHORUS|HOOK|BRIDGE|INTRO|OUTRO)\s*\d*\]", "", text, flags=re.IGNORECASE)
174
+ text = re.sub(r"\*{1,3}([^*]+)\*{1,3}", r"\1", text)
175
+ text = re.sub(r"_{1,3}([^_]+)_{1,3}", r"\1", text)
176
+ text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)
177
+ text = re.sub(r"```[^`]*```", "", text, flags=re.DOTALL)
178
+ text = re.sub(r"`([^`]+)`", r"\1", text)
179
+ text = re.sub(r"^[\s]*[-*+]\s+", "", text, flags=re.MULTILINE)
180
+ text = re.sub(r"^[\s]*\d+\.\s+", "", text, flags=re.MULTILINE)
181
+ text = re.sub(r"^>\s+", "", text, flags=re.MULTILINE)
182
+ text = re.sub(r"^[-*_]{3,}\s*$", "", text, flags=re.MULTILINE)
183
+ text = re.sub(r"\n{3,}", "\n\n", text)
184
+ text = re.sub(r" {2,}", " ", text)
185
+ return text.strip()
186
+
187
+
188
+ def _clean_dialogue(text: str, tag_a: str, tag_b: str) -> str:
189
+ """
190
+ Clean output that must have speaker tags (podcast or debate).
191
+ Normalises tag variants, removes lines without valid tags.
192
+ """
193
+ text = _clean(text)
194
+
195
+ # Normalise tag variants the model might produce
196
+ if tag_a == "ALEX":
197
+ text = re.sub(r"(?i)\balex\s*:", "ALEX:", text)
198
+ text = re.sub(r"(?i)\bsam\s*:", "SAM:", text)
199
+ text = re.sub(r"(?i)\bhost[\s_-]*1\s*:", "ALEX:", text)
200
+ text = re.sub(r"(?i)\bhost[\s_-]*2\s*:", "SAM:", text)
201
+ elif tag_a == "MAYA":
202
+ text = re.sub(r"(?i)\bmaya\s*:", "MAYA:", text)
203
+ text = re.sub(r"(?i)\bryan\s*:", "RYAN:", text)
204
+ text = re.sub(r"(?i)\bdebater[\s_-]*a\s*:", "MAYA:", text)
205
+ text = re.sub(r"(?i)\bdebater[\s_-]*b\s*:", "RYAN:", text)
206
+ text = re.sub(r"(?i)\bpro\s*:", "MAYA:", text)
207
+ text = re.sub(r"(?i)\bcon\s*:", "RYAN:", text)
208
+ text = re.sub(r"(?i)\bspeaker[\s_-]*a\s*:", "MAYA:", text)
209
+ text = re.sub(r"(?i)\bspeaker[\s_-]*b\s*:", "RYAN:", text)
210
+
211
+ # Keep only lines that have a valid speaker tag
212
+ lines = text.splitlines()
213
+ clean_lines = [
214
+ ln for ln in lines
215
+ if ln.strip() == ""
216
+ or ln.strip().startswith(f"{tag_a}:")
217
+ or ln.strip().startswith(f"{tag_b}:")
218
+ ]
219
+ return "\n".join(clean_lines).strip()
220
+
221
+
222
+ # ══════════════════════════════════════════════════════════════════════════════
223
+ # LLM client
224
+ # ══════════════════════════════════════════════════════════════════════════════
225
+
226
+ def _get_client() -> InferenceClient:
227
+ token = os.environ.get("HF_TOKEN")
228
+ if not token:
229
+ raise EnvironmentError(
230
+ "HF_TOKEN not set. Add your Hugging Face token as a Space secret."
231
+ )
232
+ return InferenceClient(provider="hf-inference", token=token)
233
+
234
+
235
+ def _call_llm(system: str, user: str) -> str:
236
+ client = _get_client()
237
+ response = client.chat_completion(
238
+ model=MODEL_ID,
239
+ messages=[
240
+ {"role": "system", "content": system},
241
+ {"role": "user", "content": user},
242
+ ],
243
+ max_tokens=MAX_NEW_TOKENS,
244
+ temperature=TEMPERATURE,
245
+ top_p=0.9,
246
+ )
247
+ raw = response.choices[0].message.content.strip()
248
+ if not raw:
249
+ raise RuntimeError("Model returned empty response. Please try again.")
250
+ return raw
251
+
252
+
253
+ # ══════════════════════════════════════════════════════════════════════════════
254
+ # Public entry point
255
+ # ══════════════════════════════════════════════════════════════════════════════
256
+
257
+ def generate_script(
258
+ context_chunks: list[str],
259
+ mode: str = "Summary",
260
+ sub_mode: str = "Rap",
261
+ topic: str = "the key ideas from this document",
262
+ ) -> str:
263
+ """
264
+ Generate a spoken script from RAG chunks.
265
+
266
+ Args:
267
+ context_chunks : chunks from RAGStore β€” NOT modified here
268
+ mode : "Summary" | "Podcast" | "Rap" | "Debate" | "Story"
269
+ sub_mode : "Song" | "Rap" (only for Song/Rap mode)
270
+
271
+ Returns:
272
+ Clean string ready for tts.generate_audio() or tts.generate_audio_podcast()
273
+ Podcast/Debate modes keep their ALEX:/SAM: or MAYA:/RYAN: speaker tags for multi-voice TTS.
274
+ """
275
+ if not context_chunks:
276
+ raise ValueError("No document context. Please upload or paste content first.")
277
+
278
+ context = "\n\n".join(context_chunks)
279
+ if len(context) > 6000:
280
+ context = context[:6000]
281
+ logger.warning("Context truncated to 6000 chars")
282
+
283
+ logger.info("generate_script | mode=%s sub_mode=%s ctx=%d chars", mode, sub_mode, len(context))
284
+
285
+ m = mode.strip().lower()
286
+
287
+ if m == "summary":
288
+ raw = _call_llm(_SUMMARY_SYSTEM, _SUMMARY_USER.format(context=context))
289
+ script = _clean(raw)
290
+
291
+ elif m == "podcast":
292
+ raw = _call_llm(_PODCAST_SYSTEM, _PODCAST_USER.format(context=context))
293
+ script = _clean_dialogue(raw, "ALEX", "SAM")
294
+
295
+ elif "rap" in m:
296
+ raw = _call_llm(_RAP_SYSTEM, _RAP_USER.format(context=context))
297
+ script = _clean(raw)
298
+
299
+ elif "debate" in m:
300
+ raw = _call_llm(_DEBATE_SYSTEM, _DEBATE_USER.format(context=context))
301
+ script = _clean_dialogue(raw, "MAYA", "RYAN")
302
+
303
+ elif "story" in m:
304
+ raw = _call_llm(_STORY_SYSTEM, _STORY_USER.format(context=context))
305
+ script = _clean(raw)
306
+
307
+ else:
308
+ logger.warning("Unknown mode '%s' β€” falling back to Summary", mode)
309
+ raw = _call_llm(_SUMMARY_SYSTEM, _SUMMARY_USER.format(context=context))
310
+ script = _clean(raw)
311
+
312
+ if not script:
313
+ raise RuntimeError("Script was empty after cleaning. Please try again.")
314
+
315
+ logger.info("Script ready: %d chars", len(script))
316
+ return script
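A minimal call pattern for the entry point above, assuming `HF_TOKEN` is set as a Space secret; the chunk strings are placeholders standing in for RAGStore output.

```python
from script_gen import generate_script

chunks = [
    "Placeholder chunk describing the document's main argument.",
    "Placeholder chunk with supporting details and examples.",
]

summary = generate_script(chunks, mode="Summary")   # plain narration for tts.generate_audio()
podcast = generate_script(chunks, mode="Podcast")   # every line starts with ALEX: or SAM:
debate  = generate_script(chunks, mode="Debate")    # every line starts with MAYA: or RYAN:
```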
tts.py ADDED
@@ -0,0 +1,369 @@
1
+ """
2
+ VoiceVerse AI — TTS Module.
3
+
4
+ Primary: Qwen3-TTS via HF Inference API
5
+ Fallback: Edge-TTS (CPU, no key needed)
6
+
7
+ Voice + audio style per mode:
8
+ Summary β€” neutral female voice, normal rate
9
+ Podcast β€” HOST_1 female (AriaNeural) / HOST_2 male (GuyNeural)
10
+ Rap β€” male voice, faster rate (+40%), bass boost via pydub
11
+ Song β€” female voice, normal rate
12
+ Debate β€” DEBATER_A female (AriaNeural, +8%) / DEBATER_B male (GuyNeural, -5%)
13
+ Story β€” female voice, slow rate (-30%), long silence gaps between sentences
14
+ """
15
+
16
+ import os
17
+ import re
18
+ import asyncio
19
+ from utils import logger, get_temp_filepath
20
+
21
+ QWEN_TTS_MODEL = "Qwen/Qwen3-TTS"
22
+ TTS_MAX_CHARS = 3000
23
+
24
+ # ── Voice assignments ─────────────────────────────────────────────────────────
25
+ # Summary / Song / Story β€” single female voice
26
+ EDGE_VOICE_FEMALE = "en-US-AriaNeural"
27
+
28
+ # Podcast
29
+ EDGE_VOICE_HOST_FEMALE = "en-US-AriaNeural" # HOST_1 β€” female
30
+ EDGE_VOICE_HOST_MALE = "en-US-GuyNeural" # HOST_2 β€” male
31
+
32
+ # Rap β€” male voice reads the rap
33
+ EDGE_VOICE_RAP = "en-US-GuyNeural"
34
+ RAP_RATE = "+40%" # fast delivery
35
+
36
+ # Debate β€” use same reliable voices as podcast, just different rates
37
+ EDGE_VOICE_DEBATER_A = "en-US-AriaNeural" # female, pro β€” assertive
38
+ EDGE_VOICE_DEBATER_B = "en-US-GuyNeural" # male, con β€” skeptical
39
+ DEBATE_RATE_A = "+8%" # slightly faster
40
+ DEBATE_RATE_B = "-5%" # slightly slower, deliberate
41
+
42
+ # Story β€” slow, warm delivery
43
+ EDGE_VOICE_STORY = "en-US-AriaNeural"
44
+ STORY_RATE = "-30%" # noticeably slower
45
+
46
+
47
+ # ══════════════════════════════════════════════════════════════════════════════
48
+ # Low-level TTS helpers
49
+ # ══════════════════════════════════════════════════════════════════════════════
50
+
51
+ def _qwen_tts(text: str) -> str | None:
52
+ token = os.environ.get("HF_TOKEN")
53
+ if not token:
54
+ return None
55
+ try:
56
+ from huggingface_hub import InferenceClient
57
+ client = InferenceClient(token=token)
58
+ audio_bytes = client.text_to_speech(text=text[:TTS_MAX_CHARS], model=QWEN_TTS_MODEL)
59
+ if not audio_bytes:
60
+ return None
61
+ path = get_temp_filepath(suffix=".wav")
62
+ with open(path, "wb") as f:
63
+ f.write(audio_bytes)
64
+ logger.info("Qwen TTS: %s (%d bytes)", path, len(audio_bytes))
65
+ return path
66
+ except Exception as e:
67
+ logger.warning("Qwen TTS failed: %s", e)
68
+ return None
69
+
70
+
71
+ def _edge_tts(text: str, voice: str = EDGE_VOICE_FEMALE, rate: str = "+0%", pitch: str = "+0Hz") -> str:
72
+ """
73
+ Generate audio via Edge-TTS.
74
+ rate: SSML prosody rate string, e.g. "+40%" faster, "-30%" slower.
75
+ pitch: SSML prosody pitch string, e.g. "+50Hz" higher, "-50Hz" lower.
76
+ """
77
+ import edge_tts
78
+ path = get_temp_filepath(suffix=".mp3")
79
+ snippet = text[:TTS_MAX_CHARS]
80
+
81
+ async def _run():
82
+ communicate = edge_tts.Communicate(snippet, voice, rate=rate, pitch=pitch)
83
+ await communicate.save(path)
84
+
85
+ try:
86
+ loop = asyncio.get_event_loop()
87
+ if loop.is_running():
88
+ import concurrent.futures
89
+ with concurrent.futures.ThreadPoolExecutor() as pool:
90
+ pool.submit(asyncio.run, _run()).result(timeout=120)
91
+ else:
92
+ loop.run_until_complete(_run())
93
+ except RuntimeError:
94
+ asyncio.run(_run())
95
+
96
+ if os.path.getsize(path) == 0:
97
+ raise RuntimeError("Edge-TTS produced an empty audio file.")
98
+ logger.info("Edge-TTS: %s (voice=%s rate=%s)", path, voice, rate)
99
+ return path
100
+
101
+
102
+ # ══════════════════════════════════════════════════════════════════════════════
+ # Audio post-processing
+ # ══════════════════════════════════════════════════════════════════════════════
+
+ def _apply_rap_fx(path: str) -> str:
+     """
+     Apply bass boost to a rap audio file using pydub.
+     Low-frequency boost makes it sound punchier and more rap-like.
+     Returns path to processed file (new file).
+     """
+     try:
+         from pydub import AudioSegment
+         from pydub.effects import low_pass_filter, high_pass_filter
+
+         audio = AudioSegment.from_file(path)
+
+         # Split into bass (low) and mid/high frequencies
+         bass = low_pass_filter(audio, 200)    # frequencies below ~200 Hz
+         highs = high_pass_filter(audio, 200)  # frequencies above ~200 Hz
+
+         # Boost bass by 10 dB for a punchier feel, keep highs as-is, mix back together
+         boosted = (bass + 10).overlay(highs)
+
+         out = get_temp_filepath(suffix=".mp3")
+         boosted.export(out, format="mp3")
+         logger.info("Rap bass boost applied β†’ %s", out)
+         return out
+     except Exception as e:
+         logger.warning("Rap FX failed (%s) β€” returning original audio", e)
+         return path
+
+
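+ # Note on units: pydub gain is expressed in dB, so the "+ 10" in _apply_rap_fx raises the
+ # low band by 10 dB, roughly a 3.2x amplitude increase (10 ** (10 / 20) β‰ˆ 3.16).
+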
+ def _concat(paths: list[str], silence_ms: int = 300) -> str:
+     """Concatenate audio files with silence between each segment."""
+     if len(paths) == 1:
+         return paths[0]
+     try:
+         from pydub import AudioSegment
+         combined = AudioSegment.empty()
+         silence = AudioSegment.silent(duration=silence_ms)
+         for p in paths:
+             combined += AudioSegment.from_file(p) + silence
+         out = get_temp_filepath(suffix=".mp3")
+         combined.export(out, format="mp3")
+         logger.info("Concatenated %d segments β†’ %s", len(paths), out)
+         return out
+     except Exception as e:
+         logger.warning("pydub concat failed (%s) β€” trying ffmpeg fallback", e)
+         return _concat_ffmpeg(paths)
+
+
+ def _concat_ffmpeg(paths: list[str]) -> str:
+     """Fallback: concatenate audio files using ffmpeg directly via subprocess."""
+     import subprocess
+
+     out = get_temp_filepath(suffix=".mp3")
+
+     # Write a concat list file for ffmpeg
+     list_path = get_temp_filepath(suffix=".txt")
+     with open(list_path, "w") as f:
+         for p in paths:
+             f.write(f"file '{p}'\n")
+
+     try:
+         subprocess.run(
+             ["ffmpeg", "-y", "-f", "concat", "-safe", "0",
+              "-i", list_path, "-c", "copy", out],
+             check=True, capture_output=True, timeout=120,
+         )
+         logger.info("ffmpeg concat: %d segments β†’ %s", len(paths), out)
+         return out
+     except Exception as e2:
+         logger.warning("ffmpeg concat also failed (%s) β€” returning first segment", e2)
+         return paths[0]
+
+
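+ # The subprocess call above is equivalent to running (file names illustrative):
+ #   ffmpeg -y -f concat -safe 0 -i segments.txt -c copy combined.mp3
+ # where segments.txt lists one input per line in the form: file '/tmp/part1.mp3'
+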
+ def _add_story_gaps(path: str) -> str:
+     """
+     Insert extra silence into story audio at regular intervals.
+     Approximates the warm, unhurried pauses of a storyteller.
+     """
+     try:
+         from pydub import AudioSegment
+         audio = AudioSegment.from_file(path)
+         gap = AudioSegment.silent(duration=600)  # 600 ms of added silence per gap
+         # Split into fixed ~5-second chunks and re-join with gaps between them
+         chunk_ms = 5000
+         chunks = [audio[i:i + chunk_ms] for i in range(0, len(audio), chunk_ms)]
+         combined = AudioSegment.empty()
+         for chunk in chunks:
+             combined += chunk + gap
+         out = get_temp_filepath(suffix=".mp3")
+         combined.export(out, format="mp3")
+         logger.info("Story gaps applied β†’ %s", out)
+         return out
+     except Exception as e:
+         logger.warning("Story gap insertion failed (%s) β€” returning original", e)
+         return path
+
+
+ # ══════════════════════════════════════════════════════════════════════════════
+ # Dialogue script parser
+ # ══════════════════════════════════════════════════════════════════════════════
+
+ def _parse_dialogue(script: str, tag_a: str, tag_b: str) -> list[tuple[str, str]]:
+     """Parse a HOST_X / DEBATER_X tagged script into (speaker, text) segments."""
+     segments: list[tuple[str, str]] = []
+     prefix_a = f"{tag_a}:"
+     prefix_b = f"{tag_b}:"
+
+     for line in script.splitlines():
+         line = line.strip()
+         if line.startswith(prefix_a):
+             text = line[len(prefix_a):].strip()
+             if text:
+                 # Merge consecutive lines from the same speaker into one segment
+                 if segments and segments[-1][0] == tag_a:
+                     segments[-1] = (tag_a, segments[-1][1] + " " + text)
+                 else:
+                     segments.append((tag_a, text))
+         elif line.startswith(prefix_b):
+             text = line[len(prefix_b):].strip()
+             if text:
+                 if segments and segments[-1][0] == tag_b:
+                     segments[-1] = (tag_b, segments[-1][1] + " " + text)
+                 else:
+                     segments.append((tag_b, text))
+     return segments
+
+
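+ # Illustrative behaviour (not executed anywhere; the strings are made up):
+ #   _parse_dialogue("ALEX: Hi there\nALEX: and welcome.\nSAM: Thanks!", "ALEX", "SAM")
+ #   -> [("ALEX", "Hi there and welcome."), ("SAM", "Thanks!")]
+ # Lines without a recognised speaker tag are simply skipped.
+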
+ # ══════════════════════════════════════════════════════════════════════════════
+ # Per-mode audio generators
+ # ══════════════════════════════════════════════════════════════════════════════
+
+ def generate_audio_podcast(script: str) -> tuple[str, str]:
+     """
+     Podcast: ALEX = female (AriaNeural), SAM = male (GuyNeural).
+     Normal conversational rate, 300 ms silence between turns.
+     """
+     segments = _parse_dialogue(script, "ALEX", "SAM")
+     if not segments:
+         logger.warning("No ALEX/SAM tags β€” falling back to single voice")
+         return generate_audio(script)
+
+     voice_map = {
+         "ALEX": (EDGE_VOICE_HOST_FEMALE, "+0%"),
+         "SAM": (EDGE_VOICE_HOST_MALE, "+0%"),
+     }
+     paths = []
+     for speaker, text in segments:
+         voice, rate = voice_map[speaker]
+         try:
+             paths.append(_edge_tts(text, voice=voice, rate=rate))
+         except Exception as e:
+             logger.warning("Podcast segment failed %s: %s", speaker, e)
+
+     if not paths:
+         raise RuntimeError("All podcast segments failed.")
+     return _concat(paths, silence_ms=300), "Edge-TTS (Podcast)"
+
+
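+ # Expected script shape (illustrative only; the real lines come from the generated script):
+ #   ALEX: Welcome back to the show. Today we're unpacking the uploaded document.
+ #   SAM: Thanks, Alex. Let's start with the main idea.
+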
+ def generate_audio_debate(script: str) -> tuple[str, str]:
+     """
+     Debate: MAYA = female (AriaNeural, assertive +8%),
+             RYAN = male (GuyNeural, deliberate -5%).
+     400 ms silence between turns for debate feel.
+     """
+     segments = _parse_dialogue(script, "MAYA", "RYAN")
+     if not segments:
+         logger.warning("No MAYA/RYAN tags β€” falling back to single voice")
+         return generate_audio(script)
+
+     voice_map = {
+         "MAYA": (EDGE_VOICE_DEBATER_A, DEBATE_RATE_A),
+         "RYAN": (EDGE_VOICE_DEBATER_B, DEBATE_RATE_B),
+     }
+     paths = []
+     for speaker, text in segments:
+         voice, rate = voice_map[speaker]
+         try:
+             paths.append(_edge_tts(text, voice=voice, rate=rate))
+         except Exception as e:
+             logger.warning("Debate segment failed %s: %s", speaker, e)
+
+     if not paths:
+         raise RuntimeError("All debate segments failed.")
+     return _concat(paths, silence_ms=400), "Edge-TTS (Debate)"
+
+
+ def generate_audio_rap(script: str) -> tuple[str, str]:
+     """
+     Rap: TTS each line separately with short pauses for rhythm,
+     then concatenate and apply bass boost for a punchier sound.
+     """
+     # Split into non-empty lines for line-by-line TTS
+     lines = [ln.strip() for ln in script.splitlines() if ln.strip()]
+
+     if len(lines) <= 1:
+         # Very short rap β€” just TTS the whole thing
+         path = _edge_tts(script, voice=EDGE_VOICE_RAP, rate=RAP_RATE)
+         path = _apply_rap_fx(path)
+         return path, "Edge-TTS (Rap)"
+
+     # TTS each line separately
+     paths = []
+     for line in lines:
+         try:
+             paths.append(_edge_tts(line, voice=EDGE_VOICE_RAP, rate=RAP_RATE))
+         except Exception as e:
+             logger.warning("Rap line TTS failed: %s", e)
+
+     if not paths:
+         raise RuntimeError("All rap line TTS failed.")
+
+     # Concatenate with short pauses (200 ms between lines for rhythmic feel)
+     combined = _concat(paths, silence_ms=200)
+     # Apply bass boost
+     combined = _apply_rap_fx(combined)
+     return combined, "Edge-TTS (Rap)"
+
+
+ def generate_audio_story(script: str) -> tuple[str, str]:
+     """
+     Story: female voice, slow rate (-30%), then sentence gaps widened via pydub.
+     """
+     path = _edge_tts(script, voice=EDGE_VOICE_STORY, rate=STORY_RATE)
+     path = _add_story_gaps(path)
+     return path, "Edge-TTS (Story)"
+
+
+ # ══════════════════════════════════════════════════════════════════════════════
+ # Unified public interface
+ # ══════════════════════════════════════════════════════════════════════════════
+
+ def apply_pitch_shift(path: str, pitch_semitones: float) -> str:
+     """
+     Shift pitch of an audio file by the given number of semitones using pydub.
+     Positive = higher pitch, negative = lower pitch.
+     Returns path to new file, or original if processing fails.
+     """
+     if abs(pitch_semitones) < 0.1:
+         return path  # no change needed
+     try:
+         from pydub import AudioSegment
+         audio = AudioSegment.from_file(path)
+         # Naive pitch shift: reinterpret the samples at a scaled frame rate.
+         # This raises/lowers pitch but also speeds up/slows down the audio accordingly.
+         factor = 2 ** (pitch_semitones / 12.0)
+         new_sample_rate = int(audio.frame_rate * factor)
+         shifted = audio._spawn(audio.raw_data, overrides={"frame_rate": new_sample_rate})
+         # Resample back to the original frame rate so the file exports at a standard rate
+         shifted = shifted.set_frame_rate(audio.frame_rate)
+         out = get_temp_filepath(suffix=".mp3")
+         shifted.export(out, format="mp3")
+         logger.info("Pitch shifted by %.1f semitones β†’ %s", pitch_semitones, out)
+         return out
+     except Exception as e:
+         logger.warning("Pitch shift failed (%s) β€” returning original", e)
+         return path
+
+
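+ # Worked example of the scaling above (numbers are for illustration): a +3 semitone shift gives
+ # factor = 2 ** (3 / 12) β‰ˆ 1.189, so 44100 Hz audio is reinterpreted at about 52444 Hz
+ # before being resampled back to 44100 Hz for export.
+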
+ def generate_audio(text: str, voice_id: str | None = None) -> tuple[str, str]:
+     """Single-voice TTS for Summary and Song modes. Tries Qwen first."""
+     if not text or not text.strip():
+         raise ValueError("No text provided for audio generation.")
+     path = _qwen_tts(text)
+     if path and os.path.exists(path):
+         return path, "Qwen3-TTS"
+     return _edge_tts(text, voice=voice_id or EDGE_VOICE_FEMALE), "Edge-TTS"
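+
+
+ # Illustrative call sites (a sketch of how the app layer is assumed to use this module):
+ #   audio_path, engine = generate_audio_podcast(script_text)   # two-voice dialogue
+ #   audio_path, engine = generate_audio_debate(script_text)    # pro/con exchange
+ #   audio_path, engine = generate_audio("A short summary.")    # single voice, Qwen then Edge fallback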
utils.py ADDED
@@ -0,0 +1,62 @@
+ """
+ VoiceVerse AI β€” Utility helpers.
+
+ Provides temp file management, upload validation, and error formatting
+ used across the pipeline.
+ """
+
+ import os
+ import tempfile
+ import logging
+
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+ )
+ logger = logging.getLogger("voiceverse")
+
+
+ def get_temp_filepath(suffix: str = ".wav") -> str:
+     """Return a path to a new temporary file that won't be auto-deleted."""
+     fd, path = tempfile.mkstemp(suffix=suffix)
+     os.close(fd)
+     return path
+
+
+ def format_error(stage: str, error: Exception) -> str:
+     """
+     Return a user-friendly error string.
+     Hides raw tracebacks; logs the full error for debugging.
+     """
+     logger.error("Error in %s: %s", stage, error, exc_info=True)
+     friendly_messages = {
+         "upload": "Could not read the uploaded file. Please try a different PDF or TXT file.",
+         "rag": "Failed to process the document text. The file may be empty or corrupted.",
+         "script": "Could not generate the audio script. Please check your HF_TOKEN and try again.",
+         "tts": "Audio generation failed. The system will retry with a fallback voice.",
+     }
+     return friendly_messages.get(stage, f"An unexpected error occurred: {stage}")
+
+
+ def validate_file(file_path: str) -> tuple[bool, str]:
+     """
+     Validate an uploaded file. Returns (is_valid, message).
+     """
+     if file_path is None:
+         return False, "Please upload a PDF or TXT file first."
+
+     if not os.path.exists(file_path):
+         return False, "The uploaded file could not be found. Please try again."
+
+     ext = os.path.splitext(file_path)[1].lower()
+     if ext not in (".pdf", ".txt"):
+         return False, f"Unsupported file format '{ext}'. Please upload a PDF or TXT file."
+
+     size = os.path.getsize(file_path)
+     if size == 0:
+         return False, "The uploaded file is empty. Please upload a file with content."
+
+     if size > 20 * 1024 * 1024:  # 20 MB limit
+         return False, "File is too large (>20 MB). Please upload a smaller document."
+
+     return True, "File is valid."
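+
+
+ # Illustrative usage from the UI layer (the variable names here are assumptions):
+ #   ok, msg = validate_file(uploaded_path)
+ #   if not ok:
+ #       return msg   # surface the friendly message to the user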