Isshi14 commited on
Commit
fbe59b1
·
verified ·
1 Parent(s): 7723b85

Upload 9 files

Browse files
Files changed (4) hide show
  1. app.py +364 -239
  2. gitattributes +35 -0
  3. script_gen.py +253 -101
  4. tts.py +245 -157
app.py CHANGED
@@ -1,239 +1,364 @@
1
- """
2
- VoiceVerse AI β€” Main Application.
3
-
4
- Gradio-based UI that orchestrates the full document-to-audio pipeline:
5
- 1. Upload PDF/TXT β†’ extract text
6
- 2. RAG: chunk, embed, retrieve relevant context
7
- 3. Generate a spoken-style script via Mistral-7B-Instruct
8
- 4. Convert script to expressive audio via Qwen TTS / Edge-TTS
9
- 5. Play audio in the browser
10
-
11
- Entry point for Hugging Face Spaces deployment.
12
- """
13
-
14
- import os
15
- import gradio as gr
16
- from utils import logger, validate_file, format_error
17
- from rag import extract_text, RAGStore
18
- from script_gen import generate_script
19
- from tts import generate_audio
20
-
21
- # ── Global RAG Store (single-user demo) ──────────────────────────────────────
22
- rag_store = RAGStore()
23
-
24
-
25
- # ── Pipeline Orchestration ───────────────────────────────────────────────────
26
-
27
- def process_document(file, progress=gr.Progress()):
28
- """
29
- Full pipeline: upload β†’ extract β†’ RAG β†’ script β†’ audio.
30
-
31
- Args:
32
- file: Gradio uploaded file object (has .name attribute)
33
-
34
- Returns:
35
- Tuple of (script_text, audio_file_path, status_message)
36
- """
37
- # ── Step 0: Validate ─────────────────────────────────────────────────
38
- if file is None:
39
- raise gr.Error("Please upload a PDF or TXT file first.")
40
-
41
- file_path = file.name if hasattr(file, "name") else str(file)
42
- is_valid, msg = validate_file(file_path)
43
- if not is_valid:
44
- raise gr.Error(msg)
45
-
46
- try:
47
- # ── Step 1: Extract Text ─────────────────────────────────────────
48
- progress(0.1, desc="πŸ“„ Extracting text from document...")
49
- logger.info("Processing file: %s", file_path)
50
-
51
- text = extract_text(file_path)
52
- if not text or len(text.strip()) < 50:
53
- raise gr.Error(
54
- "The document contains too little text to generate audio. "
55
- "Please upload a document with more content."
56
- )
57
-
58
- progress(0.2, desc="βœ… Text extracted successfully")
59
-
60
- # ── Step 2: RAG β€” Chunk & Embed ──────────────────────────────────
61
- progress(0.3, desc="🧠 Processing document with AI...")
62
- rag_store.add_document(text)
63
-
64
- chunk_count = len(rag_store.chunks)
65
- logger.info("Document processed: %d chunks created", chunk_count)
66
-
67
- # ── Step 3: Retrieve Context ─────────────────────────────────────
68
- progress(0.4, desc="πŸ” Retrieving key content...")
69
-
70
- # For short documents, use all chunks; for longer ones, retrieve smartly
71
- if chunk_count <= 8:
72
- context_chunks = rag_store.get_all_chunks()
73
- else:
74
- context_chunks = rag_store.query(
75
- "What are the main topics, key insights, and important details?",
76
- top_k=6,
77
- )
78
-
79
- progress(0.5, desc="βœ… Context retrieved")
80
-
81
- # ── Step 4: Generate Script ──────────────────────────────────────
82
- progress(0.6, desc="✍️ Writing spoken script...")
83
-
84
- script = generate_script(context_chunks)
85
- logger.info("Script generated: %d characters", len(script))
86
-
87
- progress(0.75, desc="βœ… Script ready")
88
-
89
- # ── Step 5: Generate Audio ───────────────────────────────────────
90
- progress(0.8, desc="πŸŽ™οΈ Generating expressive audio...")
91
-
92
- audio_path, engine = generate_audio(script)
93
- logger.info("Audio generated via %s: %s", engine, audio_path)
94
-
95
- progress(1.0, desc="βœ… Audio ready!")
96
-
97
- # ── Build status message ─────────────────────────────────────────
98
- status = (
99
- f"βœ… **Generation complete!**\n\n"
100
- f"- πŸ“„ Document: {os.path.basename(file_path)}\n"
101
- f"- πŸ“ Text extracted: {len(text):,} characters\n"
102
- f"- 🧩 Chunks created: {chunk_count}\n"
103
- f"- ✍️ Script length: {len(script):,} characters\n"
104
- f"- πŸŽ™οΈ Voice engine: {engine}\n"
105
- )
106
-
107
- return script, audio_path, status
108
-
109
- except gr.Error:
110
- raise # Re-raise Gradio errors as-is
111
- except EnvironmentError as e:
112
- raise gr.Error(str(e))
113
- except Exception as e:
114
- error_msg = format_error("pipeline", e)
115
- raise gr.Error(error_msg)
116
-
117
-
118
- # ── Gradio UI ────────────────────────────────────────────────────────────────
119
-
120
- def build_ui() -> gr.Blocks:
121
- """Build and return the Gradio Blocks interface."""
122
-
123
- # Custom CSS for a clean, polished look
124
- css = """
125
- .main-header {
126
- text-align: center;
127
- margin-bottom: 1rem;
128
- }
129
- .main-header h1 {
130
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
131
- -webkit-background-clip: text;
132
- -webkit-text-fill-color: transparent;
133
- font-size: 2.5rem;
134
- font-weight: 800;
135
- margin-bottom: 0.25rem;
136
- }
137
- .main-header p {
138
- color: #6b7280;
139
- font-size: 1.1rem;
140
- }
141
- .status-box {
142
- border-left: 3px solid #667eea;
143
- padding-left: 1rem;
144
- margin: 0.5rem 0;
145
- }
146
- """
147
-
148
- with gr.Blocks(
149
- title="VoiceVerse AI β€” Document to Audio",
150
- theme=gr.themes.Soft(
151
- primary_hue="indigo",
152
- secondary_hue="purple",
153
- ),
154
- css=css,
155
- ) as app:
156
-
157
- # ── Header ───────────────────────────────────────────────────────
158
- gr.HTML("""
159
- <div class="main-header">
160
- <h1>πŸŽ™οΈ VoiceVerse AI</h1>
161
- <p>Transform your documents into engaging podcast-style audio</p>
162
- </div>
163
- """)
164
-
165
- with gr.Row():
166
- # ── Left Column: Input ───────────────────────────────────────
167
- with gr.Column(scale=1):
168
- gr.Markdown("### πŸ“€ Upload Document")
169
-
170
- file_input = gr.File(
171
- label="Upload a PDF or TXT file",
172
- file_types=[".pdf", ".txt"],
173
- type="filepath",
174
- elem_id="file-upload",
175
- )
176
-
177
- generate_btn = gr.Button(
178
- "πŸŽ™οΈ Generate Audio",
179
- variant="primary",
180
- size="lg",
181
- elem_id="generate-btn",
182
- )
183
-
184
- status_output = gr.Markdown(
185
- value="*Upload a document and click Generate to start.*",
186
- elem_classes=["status-box"],
187
- )
188
-
189
- # ── Right Column: Output ─────────────────────────────────────
190
- with gr.Column(scale=1):
191
- gr.Markdown("### 🎧 Generated Audio")
192
-
193
- audio_output = gr.Audio(
194
- label="Audio Narration",
195
- type="filepath",
196
- elem_id="audio-player",
197
- interactive=False,
198
- )
199
-
200
- gr.Markdown("### ✍️ Generated Script")
201
-
202
- script_output = gr.Textbox(
203
- label="Spoken Script",
204
- lines=12,
205
- max_lines=20,
206
- interactive=False,
207
- placeholder="The generated script will appear here...",
208
- elem_id="script-display",
209
- )
210
-
211
- # ── Wire up the generate button ──────────────────────────────────
212
- generate_btn.click(
213
- fn=process_document,
214
- inputs=[file_input],
215
- outputs=[script_output, audio_output, status_output],
216
- )
217
-
218
- # ── Footer ───────────────────────────────────────────────────────
219
- gr.Markdown(
220
- "<center style='color: #9ca3af; margin-top: 1rem;'>"
221
- "Built with ❀️ using Mistral-7B-Instruct · Qwen3-TTS · Edge-TTS · Gradio"
222
- "</center>"
223
- )
224
-
225
- return app
226
-
227
-
228
- # ── Entry Point ──────────────────────────────────────────────────────────────
229
-
230
- if __name__ == "__main__":
231
- logger.info("Starting VoiceVerse AI...")
232
-
233
- app = build_ui()
234
- app.launch(
235
- server_name="0.0.0.0",
236
- server_port=7860,
237
- share=False,
238
- show_error=True,
239
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ VoiceVerse AI β€” Main Application.
3
+
4
+ Gradio-based UI that orchestrates the full document-to-audio pipeline:
5
+ 1. Upload PDF/TXT β†’ extract text
6
+ 2. RAG: chunk, embed, retrieve relevant context ← UNCHANGED
7
+ 3. Delivery Mode selector routes to mode-specific prompt ← NEW
8
+ 4. Generate a spoken/podcast/song script via SmolLM3-3B
9
+ 5. Convert script to audio via Qwen TTS / Edge-TTS
10
+ 6. Play audio in the browser
11
+
12
+ Delivery Modes:
13
+ - Summary : single-voice structured narration
14
+ - Podcast : two-host dialogue (HOST_1 / HOST_2), dual voice TTS
15
+ - Song / Rap : rhythmic retention content, single voice
16
+
17
+ Entry point for Hugging Face Spaces deployment.
18
+ """
19
+
20
+ import os
21
+ import gradio as gr
22
+ from utils import logger, validate_file, format_error
23
+ from rag import extract_text, RAGStore
24
+ from script_gen import generate_script
25
+ from tts import generate_audio, generate_audio_podcast
26
+
27
+ # ── Global RAG Store (single-user demo) ──────────────────────────────────────
28
+ rag_store = RAGStore()
29
+
30
+
31
+ # ══════════════════════════════════════════════════════════════════════════════
32
+ # Pipeline Orchestration
33
+ # ══════════════════════════════════════════════════════════════════════════════
34
+
35
def process_document(
    file,
    delivery_mode: str,
    song_rap_sub: str,
    progress=gr.Progress(),
):
    """
    Run the full document-to-audio pipeline:
    upload -> extract text -> RAG index/retrieve -> mode-specific script -> TTS.

    Args:
        file: Gradio uploaded file object (has a ``.name`` attribute) or a path.
        delivery_mode: "Summary" | "Podcast" | "Song / Rap".
        song_rap_sub: "Song" | "Rap" — only consulted in Song/Rap mode.
        progress: Gradio progress tracker (injected by Gradio at call time).

    Returns:
        Tuple of ``(script_text, audio_file_path, status_markdown)``.

    Raises:
        gr.Error: for validation failures and any wrapped pipeline error.
    """
    # Guard clauses: reject missing or invalid uploads before doing any work.
    if file is None:
        raise gr.Error("Please upload a PDF or TXT file first.")

    file_path = file.name if hasattr(file, "name") else str(file)
    ok, validation_msg = validate_file(file_path)
    if not ok:
        raise gr.Error(validation_msg)

    try:
        # Stage 1 — pull raw text out of the uploaded document.
        progress(0.10, desc="πŸ“„ Extracting text from document…")
        logger.info("Processing file: %s | mode: %s", file_path, delivery_mode)

        text = extract_text(file_path)
        # Fewer than 50 meaningful characters is not enough to narrate.
        if not text or len(text.strip()) < 50:
            raise gr.Error(
                "The document contains too little text. "
                "Please upload a document with more content."
            )
        progress(0.20, desc="βœ… Text extracted")

        # Stage 2 — chunk & embed into the shared RAG store (pipeline unchanged).
        progress(0.30, desc="🧠 Building knowledge index…")
        rag_store.add_document(text)
        num_chunks = len(rag_store.chunks)
        logger.info("RAG index built: %d chunks", num_chunks)

        # Stage 3 — retrieve context: small docs use every chunk, larger ones
        # go through semantic retrieval so the prompt stays focused.
        progress(0.40, desc="πŸ” Retrieving relevant content…")
        if num_chunks <= 8:
            context_chunks = rag_store.get_all_chunks()
        else:
            context_chunks = rag_store.query(
                "What are the main topics, key insights, and important details?",
                top_k=6,
            )
        progress(0.50, desc="βœ… Context retrieved")

        # Stage 4 — mode-aware script generation.
        progress(
            0.60,
            desc=f"✍️ Writing {_mode_progress_label(delivery_mode, song_rap_sub)} script…",
        )
        script = generate_script(
            context_chunks=context_chunks,
            mode=delivery_mode,
            sub_mode=song_rap_sub,
        )
        logger.info("Script generated: %d chars", len(script))
        progress(0.75, desc="βœ… Script ready")

        # Stage 5 — mode-aware audio synthesis (podcast uses the dual-voice path).
        progress(0.80, desc="πŸŽ™οΈ Synthesising audio…")
        if delivery_mode.strip().lower() == "podcast":
            audio_path, engine = generate_audio_podcast(script)
        else:
            audio_path, engine = generate_audio(script)
        logger.info("Audio generated via %s: %s", engine, audio_path)
        progress(1.00, desc="βœ… Done!")

        # Assemble the Markdown status card shown under the Generate button.
        lowered_mode = delivery_mode.lower()
        mode_icon = {"summary": "πŸ“‹", "podcast": "πŸŽ™οΈ", "song / rap": "🎡"}.get(
            lowered_mode, "🎧"
        )
        sub_suffix = (
            f" β€” {song_rap_sub}"
            if "song" in lowered_mode or "rap" in lowered_mode
            else ""
        )
        status = (
            f"### βœ… Generation complete!\n\n"
            f"| | |\n|---|---|\n"
            f"| {mode_icon} **Mode** | {delivery_mode}{sub_suffix} |\n"
            f"| πŸ“„ **Document** | {os.path.basename(file_path)} |\n"
            f"| 🧩 **Chunks** | {num_chunks} |\n"
            f"| ✍️ **Script length** | {len(script):,} chars |\n"
            f"| πŸ”Š **Voice engine** | {engine} |\n"
        )
        return script, audio_path, status

    except gr.Error:
        raise  # already user-facing; pass through untouched
    except EnvironmentError as e:
        # Missing tokens / misconfiguration — surface the message verbatim.
        raise gr.Error(str(e))
    except Exception as e:
        # Anything else gets normalised into a friendly pipeline error.
        raise gr.Error(format_error("pipeline", e))
143
+
144
+
145
+ def _mode_progress_label(mode: str, sub_mode: str) -> str:
146
+ m = mode.lower()
147
+ if "podcast" in m:
148
+ return "podcast"
149
+ if "song" in m or "rap" in m:
150
+ return sub_mode.lower()
151
+ return "summary"
152
+
153
+
154
+ # ══════════════════════════════════════════════════════════════════════════════
155
+ # Conditional UI visibility helpers
156
+ # ══════════════════════════════════════════════════════════════════════════════
157
+
158
def _on_mode_change(mode: str):
    """
    Visibility update for the Song/Rap sub-controls.

    Fired when the delivery-mode radio changes; the sub-style row is shown
    only while a Song/Rap mode is selected.
    """
    wants_sub_controls = any(key in mode.lower() for key in ("song", "rap"))
    return gr.update(visible=wants_sub_controls)
165
+
166
+
167
+ # ══════════════════════════════════════════════════════════════════════════════
168
+ # Gradio UI
169
+ # ══════════════════════════════════════════════════════════════════════════════
170
+
171
def build_ui() -> gr.Blocks:
    """
    Build and return the Gradio Blocks interface.

    Layout: header, then a two-column row (left: upload + delivery-mode
    selector + generate button + status; right: audio player + script),
    then a footer. Event wiring: the mode radio toggles the Song/Rap
    sub-controls and description; the generate button runs the pipeline.

    Fix vs. previous revision: the mode-selector "card" was emitted as two
    separate ``gr.HTML('<div …>')`` / ``gr.HTML('</div>')`` components.
    Gradio renders each HTML snippet as an isolated component, so the div
    never actually wrapped the radio/description and the ``.mode-card`` CSS
    never applied. A real container — ``gr.Group(elem_classes=["mode-card"])``
    used as a context manager — wraps the components correctly.
    """
    css = """
    /* ── Header ─────────────────────────────────────────────── */
    .main-header { text-align: center; margin-bottom: 1rem; }
    .main-header h1 {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
        font-size: 2.5rem;
        font-weight: 800;
        margin-bottom: 0.25rem;
    }
    .main-header p { color: #6b7280; font-size: 1.1rem; }

    /* ── Mode selector card ──────────────────────────────────── */
    .mode-card {
        background: linear-gradient(135deg, #f8f7ff 0%, #f0edff 100%);
        border: 1px solid #e0d9ff;
        border-radius: 12px;
        padding: 1rem 1.25rem;
        margin-top: 0.5rem;
    }
    .mode-card h3 { color: #4c3d99; margin-bottom: 0.5rem; }

    /* ── Status ──────────────────────────────────────────────── */
    .status-box {
        border-left: 3px solid #667eea;
        padding-left: 1rem;
        margin: 0.5rem 0;
    }

    /* ── Sub-mode row ────────────────────────────────────────── */
    .sub-mode-row { margin-top: 0.5rem; }
    """

    with gr.Blocks(
        title="VoiceVerse AI β€” Document to Audio",
        theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="purple"),
        css=css,
    ) as app:

        # ── Header ───────────────────────────────────────────────────────
        gr.HTML("""
            <div class="main-header">
                <h1>πŸŽ™οΈ VoiceVerse AI</h1>
                <p>Transform your documents into engaging audio experiences</p>
            </div>
        """)

        with gr.Row(equal_height=False):

            # ── LEFT COLUMN — upload + mode selector ─────────────────────
            with gr.Column(scale=1):

                gr.Markdown("### πŸ“€ Upload Document")
                file_input = gr.File(
                    label="Upload a PDF or TXT file",
                    file_types=[".pdf", ".txt"],
                    type="filepath",
                )

                # Real container so the .mode-card CSS wraps its children
                # (gr.HTML div fragments do not nest other components).
                with gr.Group(elem_classes=["mode-card"]):
                    gr.Markdown("### 🎨 Choose Audio Experience")

                    delivery_mode = gr.Radio(
                        choices=["Summary", "Podcast", "Song / Rap"],
                        value="Summary",
                        label=None,
                        elem_id="delivery-mode-radio",
                    )

                    # Song/Rap sub-style — hidden unless Song/Rap is selected.
                    with gr.Row(visible=False, elem_classes=["sub-mode-row"]) as song_rap_row:
                        song_rap_sub = gr.Radio(
                            choices=["Song", "Rap"],
                            value="Rap",
                            label="Style",
                            scale=1,
                        )

                    # One-line blurb describing the selected mode.
                    mode_description = gr.Markdown(
                        value=_mode_description("Summary"),
                        elem_id="mode-desc",
                    )

                generate_btn = gr.Button(
                    "πŸŽ™οΈ Generate Audio",
                    variant="primary",
                    size="lg",
                )

                status_output = gr.Markdown(
                    value="*Upload a document, choose your audio experience, then click Generate.*",
                    elem_classes=["status-box"],
                )

            # ── RIGHT COLUMN — audio + script output ─────────────────────
            with gr.Column(scale=1):

                gr.Markdown("### 🎧 Generated Audio")
                audio_output = gr.Audio(
                    label="Audio",
                    type="filepath",
                    interactive=False,
                )

                gr.Markdown("### ✍️ Generated Script")
                script_output = gr.Textbox(
                    label="Script",
                    lines=14,
                    max_lines=22,
                    interactive=False,
                    placeholder="The generated script will appear here…",
                )

        # ── Footer ───────────────────────────────────────────────────────
        gr.Markdown(
            "<center style='color:#9ca3af;margin-top:1rem;'>"
            "Built with ❀️ using SmolLM3-3B · Qwen3-TTS · Edge-TTS · Gradio"
            "</center>"
        )

        # ── Event wiring ─────────────────────────────────────────────────
        # Mode change: toggle sub-style row + refresh the description blurb.
        delivery_mode.change(
            fn=_on_mode_change_full,
            inputs=[delivery_mode],
            outputs=[song_rap_row, mode_description],
        )

        # Generate: run the full pipeline.
        generate_btn.click(
            fn=process_document,
            inputs=[file_input, delivery_mode, song_rap_sub],
            outputs=[script_output, audio_output, status_output],
        )

    return app
324
+
325
+
326
+ # ── Mode description helper ───────────────────────────────────────────────────
327
+
328
+ def _mode_description(mode: str) -> str:
329
+ descriptions = {
330
+ "Summary": (
331
+ "*πŸ“‹ **Summary** β€” A clear, structured spoken narration covering "
332
+ "the intro, key points, and conclusion. Single voice, neutral tone.*"
333
+ ),
334
+ "Podcast": (
335
+ "*πŸŽ™οΈ **Podcast** β€” A two-host conversation. Host 1 guides and "
336
+ "asks questions; Host 2 explains and elaborates. Dual voices.*"
337
+ ),
338
+ "Song / Rap": (
339
+ "*🎡 **Song / Rap** β€” Key ideas transformed into a rhythmic, "
340
+ "memorable format. Choose Song for smooth flow or Rap for punchy lines.*"
341
+ ),
342
+ }
343
+ return descriptions.get(mode, "")
344
+
345
+
346
def _on_mode_change_full(mode: str):
    """Return (Song/Rap row visibility update, mode description markdown)."""
    sub_row_visible = any(token in mode.lower() for token in ("song", "rap"))
    return gr.update(visible=sub_row_visible), _mode_description(mode)
350
+
351
+
352
+ # ══════════════════════════════════════════════════════════════════════════════
353
+ # Entry point
354
+ # ══════════════════════════════════════════════════════════════════════════════
355
+
356
if __name__ == "__main__":
    # Entry point for Hugging Face Spaces: bind all interfaces on the
    # standard Spaces port so the container's health check can reach it.
    logger.info("Starting VoiceVerse AI…")
    voiceverse = build_ui()
    voiceverse.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
    )
gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
script_gen.py CHANGED
@@ -4,13 +4,19 @@ VoiceVerse AI β€” Script Generation Module.
4
  Generates spoken-style scripts from retrieved document chunks
5
  using SmolLM3-3B via the Hugging Face Inference API.
6
 
 
 
 
 
 
 
 
 
7
  Design decisions:
8
- - Serverless HF Inference API avoids loading a large model locally
9
- - SmolLM3-3B is deployed on the free hf-inference provider
10
- - Prompt template enforces podcast/narration structure
11
- - Max 1024 new tokens keeps scripts a reasonable length for TTS
12
- - Temperature 0.4 keeps output grounded and factual
13
- - Post-processing strips markdown/XML artifacts for clean TTS
14
  """
15
 
16
  import os
@@ -18,157 +24,303 @@ import re
18
  from huggingface_hub import InferenceClient
19
  from utils import logger
20
 
21
- # β€” Configuration β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
22
- MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
23
- MAX_NEW_TOKENS = 1024
24
- TEMPERATURE = 0.4
25
 
26
- # β€” Prompt Template β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
 
 
27
 
28
- SYSTEM_PROMPT = """You are a podcast host narrating content to listeners. Convert the provided document content into a smooth, flowing spoken narration.
29
 
30
- CRITICAL RULES:
31
- 1. ONLY use facts, ideas, and information from the provided content. Do NOT add outside knowledge or invent details.
32
- 2. Write as one continuous flowing narration. Do NOT use section headings, labels, or titles like "Intro", "Conclusion", "Section 1", etc.
33
- 3. Use smooth spoken transitions between topics instead of headings. For example say "Now let's talk about..." or "Moving on to..." or "Here's where it gets interesting..."
34
- 4. Write in plain text only. No markdown, no bullet points, no asterisks, no hashtags, no HTML/XML tags.
35
- 5. Write naturally as if speaking aloud to a listener. Use short sentences and conversational language.
36
- 6. Never say "the document says" or "according to the text". Speak as the expert.
37
- 7. If the content is limited, keep the script short rather than inventing information.
38
- 8. Do NOT include any labels, headers, or structural markers. The output should read like someone is talking without breaks.
39
- 9. Output ONLY the spoken narration text, nothing else."""
40
 
41
- USER_PROMPT_TEMPLATE = """Here is the document content to convert into a spoken podcast script:
 
 
42
 
43
- --- CONTENT ---
 
 
 
 
 
 
 
 
 
 
 
44
  {context}
45
- --- END ---
46
 
47
- Topic: {topic}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
- Now write ONLY the spoken script based strictly on the content above. Do not add information that is not in the content."""
 
50
 
 
 
51
 
52
- # β€” Post-processing β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
- def _clean_script_for_tts(text: str) -> str:
 
 
 
 
 
55
  """
56
- Remove markdown, XML/HTML tags, and other artifacts that would be
57
- read aloud by TTS engines.
58
  """
59
- # Remove <think>...</think> blocks entirely (SmolLM3 reasoning traces)
60
- text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
61
 
62
- # Remove any remaining XML/HTML-style tags
63
- text = re.sub(r'<[^>]+>', '', text)
64
 
65
- # Remove markdown headers (# ## ### etc.)
66
- text = re.sub(r'^#{1,6}\s+', '', text, flags=re.MULTILINE)
67
 
68
- # Remove markdown bold/italic markers
69
- text = re.sub(r'\*{1,3}([^*]+)\*{1,3}', r'\1', text)
70
- text = re.sub(r'_{1,3}([^_]+)_{1,3}', r'\1', text)
71
 
72
- # Remove markdown links [text](url) -> text
73
- text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
74
 
75
- # Remove markdown code blocks and inline code
76
- text = re.sub(r'```[^`]*```', '', text, flags=re.DOTALL)
77
- text = re.sub(r'`([^`]+)`', r'\1', text)
78
 
79
- # Remove bullet point markers
80
- text = re.sub(r'^[\s]*[-*+]\s+', '', text, flags=re.MULTILINE)
 
81
 
82
- # Remove numbered list markers
83
- text = re.sub(r'^[\s]*\d+\.\s+', '', text, flags=re.MULTILINE)
 
84
 
85
- # Remove blockquote markers
86
- text = re.sub(r'^>\s+', '', text, flags=re.MULTILINE)
 
87
 
88
- # Remove horizontal rules
89
- text = re.sub(r'^[-*_]{3,}\s*$', '', text, flags=re.MULTILINE)
90
 
91
- # Collapse multiple newlines into one
92
- text = re.sub(r'\n{3,}', '\n\n', text)
93
 
94
- # Collapse multiple spaces
95
- text = re.sub(r' {2,}', ' ', text)
96
 
97
- return text.strip()
98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
 
100
- # β€” Script Generation β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
 
 
 
 
 
 
 
 
 
 
 
101
 
102
  def _get_client() -> InferenceClient:
103
- """Create an HF Inference client with the user's token."""
104
  token = os.environ.get("HF_TOKEN")
105
  if not token:
106
  raise EnvironmentError(
107
  "HF_TOKEN environment variable is not set. "
108
- "Please set your Hugging Face API token to use the script generation feature."
109
  )
110
- return InferenceClient(
111
- provider="hf-inference",
112
- token=token,
 
 
 
 
 
 
 
 
 
 
 
 
113
  )
 
 
 
 
114
 
115
 
 
 
 
 
116
  def generate_script(
117
  context_chunks: list[str],
 
 
118
  topic: str = "the key ideas and insights from this document",
119
  ) -> str:
120
  """
121
- Generate a spoken-style podcast script from retrieved document chunks.
122
 
123
  Args:
124
- context_chunks: List of relevant text chunks from the RAG store
125
- topic: Optional focus topic for the script
 
 
 
126
 
127
  Returns:
128
- A spoken script string ready for TTS conversion
 
129
  """
130
  if not context_chunks:
131
  raise ValueError("No document context provided. Please upload a document first.")
132
 
133
- # Combine chunks into a single context block
134
  context = "\n\n".join(context_chunks)
135
-
136
- # Truncate if too long
137
- max_context_chars = 6000
138
- if len(context) > max_context_chars:
139
- context = context[:max_context_chars]
140
- logger.warning("Context truncated to %d characters", max_context_chars)
141
-
142
- # Build the prompt
143
- user_message = USER_PROMPT_TEMPLATE.format(context=context, topic=topic)
144
-
145
- logger.info("Generating script via %s (context: %d chars, topic: '%s')",
146
- MODEL_ID, len(context), topic[:50])
147
-
148
- client = _get_client()
149
-
150
- # Call the model using chat_completion
151
- response = client.chat_completion(
152
- model=MODEL_ID,
153
- messages=[
154
- {"role": "system", "content": SYSTEM_PROMPT},
155
- {"role": "user", "content": user_message},
156
- ],
157
- max_tokens=MAX_NEW_TOKENS,
158
- temperature=TEMPERATURE,
159
- top_p=0.9,
160
  )
161
 
162
- raw_script = response.choices[0].message.content.strip()
 
163
 
164
- if not raw_script:
165
- raise RuntimeError("The model returned an empty script. Please try again.")
 
 
 
 
166
 
167
- # Clean the script for TTS (remove markdown, XML tags, etc.)
168
- script = _clean_script_for_tts(raw_script)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
  if not script:
171
  raise RuntimeError("Script was empty after cleaning. Please try again.")
172
 
173
- logger.info("Script generated: %d chars (raw: %d chars)", len(script), len(raw_script))
174
- return script
 
4
  Generates spoken-style scripts from retrieved document chunks
5
  using SmolLM3-3B via the Hugging Face Inference API.
6
 
7
+ Delivery Modes:
8
+ - Summary : Single-speaker structured narration
9
+ - Podcast : Two-host dialogue (HOST_1 / HOST_2 tags)
10
+ - Song / Rap : Rhythmic retention-style content
11
+
12
+ The core RAG pipeline (rag.py) is NOT modified.
13
+ Only this generation stage switches behaviour based on `mode`.
14
+
15
  Design decisions:
16
+ - generate_script() is the single public entry point
17
+ - Each mode has its own system + user prompt pair
18
+ - Post-processing cleans markdown / XML artifacts for TTS
19
+ - Podcast mode preserves HOST_1 / HOST_2 tags (tts.py splits on them)
 
 
20
  """
21
 
22
  import os
 
24
  from huggingface_hub import InferenceClient
25
  from utils import logger
26
 
27
# ── Configuration ────────────────────────────────────────────────────────────

# Chat model served through the HF Inference API (see _call_llm).
MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
# Generation cap per request; scripts longer than this get cut by the model.
MAX_NEW_TOKENS = 1200
# Mild creativity; output must stay grounded in the provided source chunks.
TEMPERATURE = 0.5


# ══════════════════════════════════════════════════════════════════════════════
# Mode A β€” Summary
# ══════════════════════════════════════════════════════════════════════════════

# System prompt: narrator persona + grounding rules. Output is fed straight
# to TTS, hence the "plain text only" constraints.
_SUMMARY_SYSTEM = """\
You are a professional narrator. Your task is to produce a clear, structured \
spoken summary strictly grounded in the provided source material.

RULES:
1. Use ONLY facts present in the source. Do NOT add outside knowledge.
2. Structure: a short introduction, key points spoken as natural sentences, \
then a concise conclusion.
3. Write in plain text only β€” no markdown, no bullet symbols, no headers.
4. Write for the ear: short sentences, conversational language.
5. Never say "the document says" or "according to the text". Speak as the expert.
6. Output ONLY the spoken narration text, nothing else.\
"""

# User prompt: {context} is filled with the concatenated RAG chunks.
_SUMMARY_USER = """\
SOURCE MATERIAL:
{context}

Write a spoken summary that flows naturally. Cover the introduction, the key \
points, and a short conclusion β€” all in plain spoken sentences without headings \
or labels.\
"""


# ══════════════════════════════════════════════════════════════════════════════
# Mode B β€” Podcast (Multi-Host)
# ══════════════════════════════════════════════════════════════════════════════

# System prompt: enforces the HOST_1:/HOST_2: line format that tts.py's
# podcast parser splits on. Any untagged line is dropped downstream.
_PODCAST_SYSTEM = """\
You are a podcast script writer. Produce an engaging two-host conversation \
strictly grounded in the provided source material.

STRICT OUTPUT FORMAT β€” every line must start with a speaker tag:
HOST_1: <what Host 1 says>
HOST_2: <what Host 2 says>

RULES:
1. Alternate HOST_1 and HOST_2 throughout. Never have the same host speak twice in a row.
2. HOST_1 introduces topics, asks questions, and guides the conversation.
3. HOST_2 explains concepts, provides detail, and answers HOST_1's questions.
4. Use ONLY information present in the source material. No hallucination.
5. Tone: conversational, curious, engaging β€” like a real podcast.
6. Do NOT add lines that are not prefixed with HOST_1: or HOST_2:.
7. No markdown, no stage directions, no asterisks.
8. Aim for 16–24 exchanges (lines) so the conversation feels substantial.\
"""

# User prompt for podcast mode; {context} is filled with the RAG chunks.
_PODCAST_USER = """\
SOURCE MATERIAL:
{context}

Write the full podcast conversation. Every single line must start with either \
HOST_1: or HOST_2: β€” no exceptions.\
"""


# ══════════════════════════════════════════════════════════════════════════════
# Mode C β€” Song / Rap (Retention Mode)
# ══════════════════════════════════════════════════════════════════════════════

# Song variant: section labels ([CHORUS], [VERSE n]) are kept by the cleaner
# because they aid TTS pacing when read aloud.
_SONG_SYSTEM = """\
You are a creative lyricist. Your task has two steps:

STEP 1 β€” silently extract 5 to 7 key ideas from the source material.
STEP 2 β€” turn those key ideas into a smooth, melodic SONG.

SONG RULES:
- Simple, memorable language.
- Rhyming couplets or AABB scheme.
- Include a CHORUS (label it [CHORUS]) that repeats the main concept.
- Label verses [VERSE 1], [VERSE 2], etc.
- Short lines (6–10 words each).
- Use repetition to aid retention.
- Do NOT invent facts not in the source.
- Output ONLY the song lyrics with section labels. No explanations.\
"""

# Rap variant: same two-step structure, different form/labels ([HOOK]).
_RAP_SYSTEM = """\
You are a creative lyricist. Your task has two steps:

STEP 1 β€” silently extract 5 to 7 key ideas from the source material.
STEP 2 β€” turn those key ideas into a punchy, rhythmic RAP.

RAP RULES:
- Short, punchy lines (5–8 words each).
- Fast-flow rhyme scheme (AABB or ABAB).
- Include a HOOK (label it [HOOK]) that repeats the main concept.
- Label verses [VERSE 1], [VERSE 2], etc.
- Use repetition and wordplay to aid retention.
- Do NOT invent facts not in the source.
- Output ONLY the rap lyrics with section labels. No explanations.\
"""

# Shared user prompt: {form} is the lowercase sub-mode ("song" or "rap").
_SONG_RAP_USER = """\
SOURCE MATERIAL:
{context}

Extract the key ideas, then write the full {form} based strictly on those ideas.\
"""
137
 
138
+
139
+ # ══════════════════════════════════════════════════════════════════════════════
140
+ # Post-processing
141
+ # ══════════════════════════════════════════════════════════════════════════════
142
+
143
+ def _clean_for_tts(text: str, preserve_host_tags: bool = False) -> str:
144
  """
145
+ Remove markdown and XML/HTML artifacts that TTS engines would read aloud.
146
+ When preserve_host_tags=True, HOST_1: / HOST_2: prefixes are kept intact.
147
  """
148
+ # Remove <think>…</think> reasoning traces (SmolLM3)
149
+ text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
150
 
151
+ # Remove remaining XML/HTML tags (but NOT HOST_1/HOST_2 lines)
152
+ text = re.sub(r"<[^>]+>", "", text)
153
 
154
+ # Markdown headers
155
+ text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE)
156
 
157
+ # Bold / italic
158
+ text = re.sub(r"\*{1,3}([^*]+)\*{1,3}", r"\1", text)
159
+ text = re.sub(r"_{1,3}([^_]+)_{1,3}", r"\1", text)
160
 
161
+ # Links
162
+ text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)
163
 
164
+ # Code blocks
165
+ text = re.sub(r"```[^`]*```", "", text, flags=re.DOTALL)
166
+ text = re.sub(r"`([^`]+)`", r"\1", text)
167
 
168
+ # Bullet / numbered lists
169
+ text = re.sub(r"^[\s]*[-*+]\s+", "", text, flags=re.MULTILINE)
170
+ text = re.sub(r"^[\s]*\d+\.\s+", "", text, flags=re.MULTILINE)
171
 
172
+ # Blockquotes / horizontal rules
173
+ text = re.sub(r"^>\s+", "", text, flags=re.MULTILINE)
174
+ text = re.sub(r"^[-*_]{3,}\s*$", "", text, flags=re.MULTILINE)
175
 
176
+ # Collapse whitespace
177
+ text = re.sub(r"\n{3,}", "\n\n", text)
178
+ text = re.sub(r" {2,}", " ", text)
179
 
180
+ return text.strip()
 
181
 
 
 
182
 
183
def _clean_summary(text: str) -> str:
    """Strip markdown/XML artifacts from a summary script (no speaker tags)."""
    cleaned = _clean_for_tts(text, preserve_host_tags=False)
    return cleaned
185
 
 
186
 
187
def _clean_podcast(text: str) -> str:
    """
    Clean a podcast script while keeping the HOST_1: / HOST_2: speaker
    prefixes that tts.py relies on for voice splitting.
    """
    text = _clean_for_tts(text, preserve_host_tags=True)

    # Canonicalise tag spellings: "Host 1:", "host_1:", "HOST1:" β†’ "HOST_1:"
    for n in ("1", "2"):
        text = re.sub(rf"(?i)\bhost[\s_-]*{n}\s*:", f"HOST_{n}:", text)

    # Drop any non-blank line that carries no speaker tag (stray stage
    # directions and similar clutter).
    kept = []
    for raw_line in text.splitlines():
        stripped = raw_line.strip()
        if not stripped or stripped.startswith(("HOST_1:", "HOST_2:")):
            kept.append(raw_line)
    return "\n".join(kept).strip()
205
 
206
+
207
def _clean_song_rap(text: str) -> str:
    """
    Strip artifacts from song/rap lyrics before TTS.

    Section labels such as [VERSE 1], [CHORUS] and [HOOK] survive cleaning
    (square brackets are never stripped) and help TTS pacing when read
    aloud, so they are deliberately kept.
    """
    lyrics = _clean_for_tts(text, preserve_host_tags=False)
    return lyrics
213
+
214
+
215
+ # ══════════════════════════════════════════════════════════════════════════════
216
+ # LLM Client
217
+ # ══════════════════════════════════════════════════════════════════════════════
218
 
219
def _get_client() -> InferenceClient:
    """Build an InferenceClient from the HF_TOKEN env var (Space secret)."""
    token = os.environ.get("HF_TOKEN")
    if token:
        return InferenceClient(provider="hf-inference", token=token)
    raise EnvironmentError(
        "HF_TOKEN environment variable is not set. "
        "Please add your Hugging Face API token as a Space secret."
    )
227
+
228
+
229
def _call_llm(system_prompt: str, user_prompt: str) -> str:
    """Send a chat completion request and return the raw response text."""
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    reply = _get_client().chat_completion(
        model=MODEL_ID,
        messages=messages,
        max_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        top_p=0.9,
    )
    content = reply.choices[0].message.content.strip()
    if not content:
        raise RuntimeError("The model returned an empty response. Please try again.")
    return content
246
 
247
 
248
+ # ══════════════════════════════════════════════════════════════════════════════
249
+ # Public Entry Point
250
+ # ══════════════════════════════════════════════════════════════════════════════
251
+
252
def generate_script(
    context_chunks: list[str],
    mode: str = "Summary",
    sub_mode: str = "Rap",
    topic: str = "the key ideas and insights from this document",
) -> str:
    """
    Generate a spoken script from retrieved RAG chunks.

    Args:
        context_chunks : Chunks returned by RAGStore.query() β€” NOT modified here.
        mode           : "Summary" | "Podcast" | "Song / Rap"
        sub_mode       : "Song" | "Rap" (only used when mode == "Song / Rap")
        topic          : Optional human-readable topic label (kept for API
                         compatibility; not interpolated into the prompts).

    Returns:
        A clean string ready to hand to tts.generate_audio().
        Podcast mode preserves HOST_1: / HOST_2: prefixes.

    Raises:
        ValueError   : If context_chunks is empty.
        RuntimeError : If the model output is empty after cleaning.
    """
    if not context_chunks:
        raise ValueError("No document context provided. Please upload a document first.")

    # ── Combine & truncate context ───────────────────────────────────────────
    context = "\n\n".join(context_chunks)
    max_ctx = 6000
    if len(context) > max_ctx:
        context = context[:max_ctx]
        logger.warning("Context truncated to %d chars for LLM call.", max_ctx)

    logger.info(
        "Generating script | mode=%s sub_mode=%s context=%d chars",
        mode, sub_mode, len(context),
    )

    # ── Route to the correct prompt pair ────────────────────────────────────
    mode_key = mode.strip().lower()

    if mode_key == "podcast":
        raw = _call_llm(
            _PODCAST_SYSTEM,
            _PODCAST_USER.format(context=context),
        )
        script = _clean_podcast(raw)

    elif "song" in mode_key or "rap" in mode_key:
        form = sub_mode.lower()  # "song" or "rap"
        system = _SONG_SYSTEM if form == "song" else _RAP_SYSTEM
        raw = _call_llm(
            system,
            _SONG_RAP_USER.format(context=context, form=form),
        )
        script = _clean_song_rap(raw)

    else:
        # Summary is both the explicit mode and the fallback for any unknown
        # mode string, so the pipeline never crashes on a bad value.
        if mode_key != "summary":
            logger.warning("Unknown mode '%s' β€” falling back to Summary.", mode)
        raw = _call_llm(
            _SUMMARY_SYSTEM,
            _SUMMARY_USER.format(context=context),
        )
        script = _clean_summary(raw)

    if not script:
        raise RuntimeError("Script was empty after cleaning. Please try again.")

    logger.info("Script ready: %d chars (raw %d chars)", len(script), len(raw))
    return script
tts.py CHANGED
@@ -1,157 +1,245 @@
1
- """
2
- VoiceVerse AI β€” Voice Generation Module (TTS).
3
-
4
- Converts generated scripts into emotionally expressive audio.
5
-
6
- Primary: Qwen3-TTS via HF Inference API (expressive, emotional)
7
- Fallback: Edge-TTS (Microsoft neural voices, CPU-only, reliable)
8
-
9
- Design decisions:
10
- - Qwen3-TTS is called through the Inference API (needs GPU, can't run locally on free tier)
11
- - Edge-TTS is the demo-safe fallback β€” runs on CPU, no API key needed
12
- - Architecture accepts a voice_id parameter for future multi-voice support
13
- - Audio is saved as WAV for maximum compatibility
14
- """
15
-
16
- import os
17
- import asyncio
18
- import tempfile
19
- from utils import logger, get_temp_filepath
20
-
21
- # ── Configuration ────────────────────────────────────────────────────────────
22
-
23
- QWEN_TTS_MODEL = "Qwen/Qwen3-TTS"
24
- EDGE_TTS_VOICE = "en-US-AriaNeural" # Expressive female neural voice
25
-
26
- # Chunk size for TTS (too-long text can cause issues)
27
- TTS_MAX_CHARS = 3000
28
-
29
-
30
- # ── Qwen TTS (Primary β€” via HF Inference API) ───────────────────────────────
31
-
32
- def generate_audio_qwen(text: str, voice_id: str | None = None) -> str | None:
33
- """
34
- Generate audio using Qwen3-TTS via the HF Inference API.
35
-
36
- Args:
37
- text: The script text to convert to speech
38
- voice_id: Reserved for future multi-voice support
39
-
40
- Returns:
41
- Path to the generated audio file, or None if failed
42
- """
43
- token = os.environ.get("HF_TOKEN")
44
- if not token:
45
- logger.warning("HF_TOKEN not set β€” skipping Qwen TTS")
46
- return None
47
-
48
- try:
49
- from huggingface_hub import InferenceClient
50
-
51
- client = InferenceClient(token=token)
52
- logger.info("Calling Qwen3-TTS API (%d chars)...", len(text))
53
-
54
- # Truncate if needed
55
- tts_text = text[:TTS_MAX_CHARS] if len(text) > TTS_MAX_CHARS else text
56
-
57
- # Call the TTS endpoint
58
- audio_bytes = client.text_to_speech(
59
- text=tts_text,
60
- model=QWEN_TTS_MODEL,
61
- )
62
-
63
- if audio_bytes and len(audio_bytes) > 0:
64
- output_path = get_temp_filepath(suffix=".wav")
65
- with open(output_path, "wb") as f:
66
- f.write(audio_bytes)
67
- logger.info("Qwen TTS audio saved: %s (%d bytes)", output_path, len(audio_bytes))
68
- return output_path
69
- else:
70
- logger.warning("Qwen TTS returned empty audio")
71
- return None
72
-
73
- except Exception as e:
74
- logger.warning("Qwen TTS failed: %s β€” will fall back to Edge-TTS", e)
75
- return None
76
-
77
-
78
- # ── Edge TTS (Fallback β€” CPU-only, no API key) ──────────────────────────────
79
-
80
- def generate_audio_edge(text: str, voice_id: str | None = None) -> str:
81
- """
82
- Generate audio using Edge-TTS (Microsoft neural voices).
83
- Runs entirely on CPU, no API key required.
84
-
85
- Args:
86
- text: The script text to convert to speech
87
- voice_id: Edge-TTS voice name (default: en-US-AriaNeural)
88
-
89
- Returns:
90
- Path to the generated audio file
91
- """
92
- import edge_tts
93
-
94
- voice = voice_id or EDGE_TTS_VOICE
95
- output_path = get_temp_filepath(suffix=".mp3")
96
-
97
- # Truncate if needed
98
- tts_text = text[:TTS_MAX_CHARS] if len(text) > TTS_MAX_CHARS else text
99
-
100
- logger.info("Generating audio via Edge-TTS (voice: %s, %d chars)...", voice, len(tts_text))
101
-
102
- # Edge-TTS is async, so we need to run it in an event loop
103
- async def _generate():
104
- communicate = edge_tts.Communicate(tts_text, voice)
105
- await communicate.save(output_path)
106
-
107
- # Handle event loop β€” works whether called from sync or async context
108
- try:
109
- loop = asyncio.get_event_loop()
110
- if loop.is_running():
111
- # We're inside an existing event loop (e.g., Gradio)
112
- import concurrent.futures
113
- with concurrent.futures.ThreadPoolExecutor() as executor:
114
- future = executor.submit(asyncio.run, _generate())
115
- future.result(timeout=120)
116
- else:
117
- loop.run_until_complete(_generate())
118
- except RuntimeError:
119
- asyncio.run(_generate())
120
-
121
- file_size = os.path.getsize(output_path)
122
- logger.info("Edge-TTS audio saved: %s (%d bytes)", output_path, file_size)
123
-
124
- if file_size == 0:
125
- raise RuntimeError("Edge-TTS generated an empty audio file")
126
-
127
- return output_path
128
-
129
-
130
- # ── Unified Interface ────────────────────────────────────────────────────────
131
-
132
- def generate_audio(text: str, voice_id: str | None = None) -> tuple[str, str]:
133
- """
134
- Generate audio from text, trying Qwen TTS first, falling back to Edge-TTS.
135
-
136
- Args:
137
- text: The script text to convert to speech
138
- voice_id: Optional voice identifier
139
-
140
- Returns:
141
- Tuple of (audio_file_path, engine_used)
142
- """
143
- if not text or not text.strip():
144
- raise ValueError("No text provided for audio generation.")
145
-
146
- # Try Qwen TTS first (expressive, emotional)
147
- logger.info("Attempting Qwen3-TTS (primary)...")
148
- audio_path = generate_audio_qwen(text, voice_id)
149
-
150
- if audio_path and os.path.exists(audio_path):
151
- return audio_path, "Qwen3-TTS"
152
-
153
- # Fall back to Edge-TTS (reliable, CPU-only)
154
- logger.info("Falling back to Edge-TTS...")
155
- audio_path = generate_audio_edge(text, voice_id)
156
-
157
- return audio_path, "Edge-TTS"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
VoiceVerse AI β€” Voice Generation Module (TTS).

Converts generated scripts into audio.

Primary: Qwen3-TTS via HF Inference API
Fallback: Edge-TTS (CPU-only, no API key needed)

Delivery Mode additions:
- Podcast mode : splits script on HOST_1/HOST_2 tags, generates each
  segment with a distinct voice, then concatenates.
- Summary/Song : single voice, unchanged from original behaviour.

Public API (unchanged signature):
    generate_audio(text, voice_id=None) β†’ (path, engine_name)

New internal API:
    generate_audio_podcast(script) β†’ (path, engine_name)
"""

import os
import re        # NOTE(review): appears unused in this module β€” confirm before removing
import asyncio
import tempfile  # NOTE(review): appears unused in this module β€” confirm before removing
from utils import logger, get_temp_filepath

# ── Configuration ────────────────────────────────────────────────────────────

# Model id for the primary (Inference API) TTS path.
QWEN_TTS_MODEL = "Qwen/Qwen3-TTS"

# Edge-TTS voices
EDGE_VOICE_DEFAULT = "en-US-AriaNeural"  # Host 1 / single voice
EDGE_VOICE_HOST2 = "en-US-GuyNeural"  # Host 2 (podcast)

# Hard cap per TTS call; longer scripts are truncated before synthesis.
TTS_MAX_CHARS = 3000
36
+
37
+
38
+ # ══════════════════════════════════════════════════════════════════════════════
39
+ # Low-level TTS helpers
40
+ # ══════════════════════════════════════════════════════════════════════════════
41
+
42
def _qwen_tts(text: str) -> str | None:
    """
    Call Qwen3-TTS via the HF Inference API.

    Returns:
        Path to a WAV file on success, or None on any failure (missing
        token, API error, empty payload) so the caller can fall back.
    """
    token = os.environ.get("HF_TOKEN")
    if not token:
        logger.warning("HF_TOKEN not set β€” skipping Qwen TTS")
        return None

    try:
        from huggingface_hub import InferenceClient

        snippet = text[:TTS_MAX_CHARS]
        logger.info("Calling Qwen3-TTS (%d chars)…", len(snippet))
        audio_bytes = InferenceClient(token=token).text_to_speech(
            text=snippet, model=QWEN_TTS_MODEL
        )
        if not audio_bytes:
            logger.warning("Qwen TTS returned empty bytes")
            return None

        path = get_temp_filepath(suffix=".wav")
        with open(path, "wb") as f:
            f.write(audio_bytes)
        logger.info("Qwen TTS saved: %s (%d bytes)", path, len(audio_bytes))
        return path
    except Exception as exc:
        logger.warning("Qwen TTS failed (%s) β€” will use Edge-TTS fallback", exc)
        return None
68
+
69
+
70
def _edge_tts(text: str, voice: str = EDGE_VOICE_DEFAULT) -> str:
    """
    Generate audio with Edge-TTS (CPU, no key needed).

    Args:
        text  : Script text; truncated to TTS_MAX_CHARS.
        voice : Edge-TTS neural voice name.

    Returns:
        Path to an MP3 file.

    Raises:
        RuntimeError: If the synthesised file is empty.
    """
    import edge_tts

    snippet = text[:TTS_MAX_CHARS]
    path = get_temp_filepath(suffix=".mp3")
    logger.info("Edge-TTS: voice=%s, %d chars β†’ %s", voice, len(snippet), path)

    async def _run():
        comm = edge_tts.Communicate(snippet, voice)
        await comm.save(path)

    # Edge-TTS is async. asyncio.get_event_loop() is deprecated and raises
    # in recent Pythons when no loop exists, so detect a running loop with
    # get_running_loop() instead. Inside a running loop (e.g. Gradio) we
    # cannot call asyncio.run() directly β€” hand it to a worker thread.
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No event loop in this thread β€” safe to run directly.
        asyncio.run(_run())
    else:
        import concurrent.futures
        with concurrent.futures.ThreadPoolExecutor() as pool:
            pool.submit(asyncio.run, _run()).result(timeout=120)

    size = os.path.getsize(path)
    if size == 0:
        raise RuntimeError("Edge-TTS produced an empty audio file.")
    logger.info("Edge-TTS saved: %s (%d bytes)", path, size)
    return path
102
+
103
+
104
+ # ══════════════════════════════════════════════════════════════════════════════
+ # Audio concatenation (for podcast multi-segment audio)
+ # ══════════════════════════════════════════════════════════════════════════════
107
+
108
+ def _concat_audio_files(paths: list[str]) -> str:
109
+ """
110
+ Concatenate a list of audio files (WAV or MP3) into a single MP3.
111
+ Uses pydub; ffmpeg must be available (packages.txt: ffmpeg).
112
+ Falls back to copying the first file if pydub fails.
113
+ """
114
+ if len(paths) == 1:
115
+ return paths[0]
116
+
117
+ try:
118
+ from pydub import AudioSegment
119
+
120
+ combined = AudioSegment.empty()
121
+ silence = AudioSegment.silent(duration=300) # 300 ms between speakers
122
+
123
+ for p in paths:
124
+ seg = AudioSegment.from_file(p)
125
+ combined += seg + silence
126
+
127
+ out = get_temp_filepath(suffix=".mp3")
128
+ combined.export(out, format="mp3")
129
+ logger.info("Concatenated %d segments β†’ %s", len(paths), out)
130
+ return out
131
+
132
+ except Exception as exc:
133
+ logger.warning("pydub concat failed (%s) β€” returning first segment", exc)
134
+ return paths[0]
135
+
136
+
137
+ # ══════════════════════════════════════════════════════════════════════════════
138
+ # Podcast TTS (multi-voice)
139
+ # ══════════════════════════════════════════════════════════════════════════════
140
+
141
+ def _parse_podcast_script(script: str) -> list[tuple[str, str]]:
142
+ """
143
+ Parse a podcast script into a list of (speaker, text) tuples.
144
+ Expects lines like:
145
+ HOST_1: Some text here.
146
+ HOST_2: Reply text here.
147
+ Consecutive lines from the same speaker are merged.
148
+ """
149
+ segments: list[tuple[str, str]] = []
150
+
151
+ for line in script.splitlines():
152
+ line = line.strip()
153
+ if not line:
154
+ continue
155
+ if line.startswith("HOST_1:"):
156
+ text = line[len("HOST_1:"):].strip()
157
+ if text:
158
+ if segments and segments[-1][0] == "HOST_1":
159
+ segments[-1] = ("HOST_1", segments[-1][1] + " " + text)
160
+ else:
161
+ segments.append(("HOST_1", text))
162
+ elif line.startswith("HOST_2:"):
163
+ text = line[len("HOST_2:"):].strip()
164
+ if text:
165
+ if segments and segments[-1][0] == "HOST_2":
166
+ segments[-1] = ("HOST_2", segments[-1][1] + " " + text)
167
+ else:
168
+ segments.append(("HOST_2", text))
169
+ # Lines without a valid tag are silently skipped
170
+
171
+ return segments
172
+
173
+
174
def generate_audio_podcast(script: str) -> tuple[str, str]:
    """
    Generate multi-voice audio for Podcast mode.

    Strategy:
      1. Parse the script into (speaker, text) segments.
      2. Synthesise each segment with Edge-TTS using a per-host voice.
         (Qwen TTS doesn't expose per-call voice selection, so Edge-TTS
         is used directly to guarantee two distinct voices.)
      3. Concatenate the segments with a short silence between speakers.

    Returns:
        (audio_file_path, engine_label)

    Raises:
        RuntimeError: If every segment fails to synthesise.
    """
    segments = _parse_podcast_script(script)
    if not segments:
        # No parseable speaker tags β€” degrade to the single-voice pipeline.
        logger.warning("Podcast parser found no HOST_1/HOST_2 lines β€” using single voice")
        return generate_audio(script)

    logger.info("Podcast mode: %d speaker segments to synthesise", len(segments))

    voices = {
        "HOST_1": EDGE_VOICE_DEFAULT,
        "HOST_2": EDGE_VOICE_HOST2,
    }

    clips: list[str] = []
    for speaker, line in segments:
        try:
            clips.append(_edge_tts(line, voice=voices.get(speaker, EDGE_VOICE_DEFAULT)))
        except Exception as exc:
            # Best-effort: one bad segment shouldn't sink the whole episode.
            logger.warning("Segment TTS failed for %s: %s β€” skipping", speaker, exc)

    if not clips:
        raise RuntimeError("All podcast audio segments failed to generate.")

    return _concat_audio_files(clips), "Edge-TTS (Podcast)"
215
+
216
+
217
+ # ══════════════════════════════════════════════════════════════════════════════
218
+ # Unified public interface (unchanged signature from original tts.py)
219
+ # ══════════════════════════════════════════════════════════════════════════════
220
+
221
def generate_audio(text: str, voice_id: str | None = None) -> tuple[str, str]:
    """
    Generate single-voice audio (Summary / Song / Rap modes).

    Tries Qwen3-TTS first; falls back to Edge-TTS when the primary path
    is unavailable or fails.

    Args:
        text     : Script text to synthesise.
        voice_id : Optional Edge-TTS voice override (fallback path only).

    Returns:
        (audio_file_path, engine_label)

    Raises:
        ValueError: If text is empty or whitespace-only.
    """
    if not text or not text.strip():
        raise ValueError("No text provided for audio generation.")

    # Primary engine: Qwen via the Inference API.
    primary = _qwen_tts(text)
    if primary and os.path.exists(primary):
        return primary, "Qwen3-TTS"

    # Fallback engine: Edge-TTS, always available on CPU.
    fallback = _edge_tts(text, voice=voice_id or EDGE_VOICE_DEFAULT)
    return fallback, "Edge-TTS"