Spaces:
Sleeping
Sleeping
Upload 9 files
Browse files- app.py +364 -239
- gitattributes +35 -0
- script_gen.py +253 -101
- tts.py +245 -157
app.py
CHANGED
|
@@ -1,239 +1,364 @@
|
|
| 1 |
-
"""
|
| 2 |
-
VoiceVerse AI β Main Application.
|
| 3 |
-
|
| 4 |
-
Gradio-based UI that orchestrates the full document-to-audio pipeline:
|
| 5 |
-
1. Upload PDF/TXT β extract text
|
| 6 |
-
2. RAG: chunk, embed, retrieve relevant context
|
| 7 |
-
3.
|
| 8 |
-
4.
|
| 9 |
-
5.
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
progress(
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
)
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
# ββ
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
# ββ
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
VoiceVerse AI β Main Application.
|
| 3 |
+
|
| 4 |
+
Gradio-based UI that orchestrates the full document-to-audio pipeline:
|
| 5 |
+
1. Upload PDF/TXT β extract text
|
| 6 |
+
2. RAG: chunk, embed, retrieve relevant context β UNCHANGED
|
| 7 |
+
3. Delivery Mode selector routes to mode-specific prompt β NEW
|
| 8 |
+
4. Generate a spoken/podcast/song script via SmolLM3-3B
|
| 9 |
+
5. Convert script to audio via Qwen TTS / Edge-TTS
|
| 10 |
+
6. Play audio in the browser
|
| 11 |
+
|
| 12 |
+
Delivery Modes:
|
| 13 |
+
- Summary : single-voice structured narration
|
| 14 |
+
- Podcast : two-host dialogue (HOST_1 / HOST_2), dual voice TTS
|
| 15 |
+
- Song / Rap : rhythmic retention content, single voice
|
| 16 |
+
|
| 17 |
+
Entry point for Hugging Face Spaces deployment.
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
import os
|
| 21 |
+
import gradio as gr
|
| 22 |
+
from utils import logger, validate_file, format_error
|
| 23 |
+
from rag import extract_text, RAGStore
|
| 24 |
+
from script_gen import generate_script
|
| 25 |
+
from tts import generate_audio, generate_audio_podcast
|
| 26 |
+
|
| 27 |
+
# ββ Global RAG Store (single-user demo) ββββββββββββββββββββββββββββββββββββββ
|
| 28 |
+
rag_store = RAGStore()
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 32 |
+
# Pipeline Orchestration
|
| 33 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 34 |
+
|
| 35 |
+
def process_document(
    file,
    delivery_mode: str,
    song_rap_sub: str,
    progress=gr.Progress(),
):
    """
    Run the full pipeline: upload -> extract -> RAG -> mode-specific script -> audio.

    Args:
        file          : Gradio uploaded file object (has a .name attribute) or a path.
        delivery_mode : "Summary" | "Podcast" | "Song / Rap".
        song_rap_sub  : "Song" | "Rap" (only consulted in Song/Rap mode).
        progress      : Gradio progress tracker (injected by Gradio).

    Returns:
        (script_text, audio_file_path, status_markdown)

    Raises:
        gr.Error: for all user-facing failures (bad input, empty document,
                  missing HF token, or any unexpected pipeline error).
    """
    # -- Validate input ------------------------------------------------------
    if file is None:
        raise gr.Error("Please upload a PDF or TXT file first.")

    file_path = file.name if hasattr(file, "name") else str(file)
    is_valid, msg = validate_file(file_path)
    if not is_valid:
        raise gr.Error(msg)

    try:
        # -- Step 1: Extract text -------------------------------------------
        progress(0.10, desc="📄 Extracting text from document…")
        logger.info("Processing file: %s | mode: %s", file_path, delivery_mode)

        text = extract_text(file_path)
        # Guard against near-empty documents before building the index.
        if not text or len(text.strip()) < 50:
            raise gr.Error(
                "The document contains too little text. "
                "Please upload a document with more content."
            )
        progress(0.20, desc="✅ Text extracted")

        # -- Step 2: RAG — chunk & embed (UNCHANGED) ------------------------
        progress(0.30, desc="🧠 Building knowledge index…")
        rag_store.add_document(text)
        chunk_count = len(rag_store.chunks)
        logger.info("RAG index built: %d chunks", chunk_count)

        # -- Step 3: Retrieve context (UNCHANGED) ---------------------------
        progress(0.40, desc="🔎 Retrieving relevant content…")
        # Small documents fit entirely in the prompt; larger ones are queried.
        if chunk_count <= 8:
            context_chunks = rag_store.get_all_chunks()
        else:
            context_chunks = rag_store.query(
                "What are the main topics, key insights, and important details?",
                top_k=6,
            )
        progress(0.50, desc="✅ Context retrieved")

        # -- Step 4: Generate script (mode-aware) ---------------------------
        mode_label = _mode_progress_label(delivery_mode, song_rap_sub)
        progress(0.60, desc=f"✍️ Writing {mode_label} script…")

        script = generate_script(
            context_chunks=context_chunks,
            mode=delivery_mode,
            sub_mode=song_rap_sub,
        )
        logger.info("Script generated: %d chars", len(script))
        progress(0.75, desc="✅ Script ready")

        # -- Step 5: Generate audio (mode-aware) ----------------------------
        progress(0.80, desc="🎙️ Synthesising audio…")

        # Podcast scripts carry HOST_1/HOST_2 tags and need the dual-voice path.
        is_podcast = delivery_mode.strip().lower() == "podcast"
        if is_podcast:
            audio_path, engine = generate_audio_podcast(script)
        else:
            audio_path, engine = generate_audio(script)

        logger.info("Audio generated via %s: %s", engine, audio_path)
        progress(1.00, desc="✅ Done!")

        # -- Build status card ----------------------------------------------
        mode_icon = {"summary": "📄", "podcast": "🎙️", "song / rap": "🎵"}.get(
            delivery_mode.lower(), "🎧"
        )
        lowered_mode = delivery_mode.lower()
        sub_suffix = (
            f" — {song_rap_sub}"
            if "song" in lowered_mode or "rap" in lowered_mode
            else ""
        )
        status = (
            f"### ✅ Generation complete!\n\n"
            f"| | |\n|---|---|\n"
            f"| {mode_icon} **Mode** | {delivery_mode}{sub_suffix} |\n"
            f"| 📄 **Document** | {os.path.basename(file_path)} |\n"
            f"| 🧩 **Chunks** | {chunk_count} |\n"
            f"| ✍️ **Script length** | {len(script):,} chars |\n"
            f"| 🔊 **Voice engine** | {engine} |\n"
        )

        return script, audio_path, status

    except gr.Error:
        # Already user-facing — re-raise untouched.
        raise
    except EnvironmentError as e:
        # e.g. missing HF_TOKEN raised by the script generator.
        raise gr.Error(str(e))
    except Exception as e:
        error_msg = format_error("pipeline", e)
        raise gr.Error(error_msg)
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
def _mode_progress_label(mode: str, sub_mode: str) -> str:
|
| 146 |
+
m = mode.lower()
|
| 147 |
+
if "podcast" in m:
|
| 148 |
+
return "podcast"
|
| 149 |
+
if "song" in m or "rap" in m:
|
| 150 |
+
return sub_mode.lower()
|
| 151 |
+
return "summary"
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 155 |
+
# Conditional UI visibility helpers
|
| 156 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 157 |
+
|
| 158 |
+
def _on_mode_change(mode: str):
    """
    Return a visibility update for the Song/Rap sub-controls.

    NOTE(review): the event wiring in build_ui() uses _on_mode_change_full
    instead of this helper — this appears to be superseded; confirm before
    removing.
    """
    is_song_rap = "song" in mode.lower() or "rap" in mode.lower()
    return gr.update(visible=is_song_rap)
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 168 |
+
# Gradio UI
|
| 169 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 170 |
+
|
| 171 |
+
def build_ui() -> gr.Blocks:
    """Assemble the VoiceVerse Gradio interface and wire its events.

    Layout: header, then a two-column row (left: upload + mode selector +
    generate button + status; right: audio player + script textbox), then a
    footer. Returns the un-launched Blocks app.
    """

    css = """
    /* -- Header ----------------------------------------------------- */
    .main-header { text-align: center; margin-bottom: 1rem; }
    .main-header h1 {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
        font-size: 2.5rem;
        font-weight: 800;
        margin-bottom: 0.25rem;
    }
    .main-header p { color: #6b7280; font-size: 1.1rem; }

    /* -- Mode selector card ------------------------------------------ */
    .mode-card {
        background: linear-gradient(135deg, #f8f7ff 0%, #f0edff 100%);
        border: 1px solid #e0d9ff;
        border-radius: 12px;
        padding: 1rem 1.25rem;
        margin-top: 0.5rem;
    }
    .mode-card h3 { color: #4c3d99; margin-bottom: 0.5rem; }

    /* -- Status ------------------------------------------------------ */
    .status-box {
        border-left: 3px solid #667eea;
        padding-left: 1rem;
        margin: 0.5rem 0;
    }

    /* -- Sub-mode row ------------------------------------------------ */
    .sub-mode-row { margin-top: 0.5rem; }
    """

    with gr.Blocks(
        title="VoiceVerse AI — Document to Audio",
        theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="purple"),
        css=css,
    ) as app:

        # -- Header ----------------------------------------------------------
        gr.HTML("""
        <div class="main-header">
            <h1>🎙️ VoiceVerse AI</h1>
            <p>Transform your documents into engaging audio experiences</p>
        </div>
        """)

        with gr.Row(equal_height=False):

            # ----------------------------------------------------------------
            # LEFT COLUMN — Upload + Mode Selector
            # ----------------------------------------------------------------
            with gr.Column(scale=1):

                gr.Markdown("### 📤 Upload Document")
                file_input = gr.File(
                    label="Upload a PDF or TXT file",
                    file_types=[".pdf", ".txt"],
                    type="filepath",
                )

                # Mode-selector card (opened/closed via raw HTML wrappers).
                gr.HTML('<div class="mode-card">')
                gr.Markdown("### 🎨 Choose Audio Experience")

                delivery_mode = gr.Radio(
                    choices=["Summary", "Podcast", "Song / Rap"],
                    value="Summary",
                    label=None,
                    elem_id="delivery-mode-radio",
                )

                # Song / Rap sub-option — hidden unless Song/Rap is selected.
                with gr.Row(visible=False, elem_classes=["sub-mode-row"]) as song_rap_row:
                    song_rap_sub = gr.Radio(
                        choices=["Song", "Rap"],
                        value="Rap",
                        label="Style",
                        scale=1,
                    )

                # Mode description — refreshed whenever the radio changes.
                mode_description = gr.Markdown(
                    value=_mode_description("Summary"),
                    elem_id="mode-desc",
                )

                gr.HTML("</div>")  # close .mode-card

                generate_btn = gr.Button(
                    "🎙️ Generate Audio",
                    variant="primary",
                    size="lg",
                )

                status_output = gr.Markdown(
                    value="*Upload a document, choose your audio experience, then click Generate.*",
                    elem_classes=["status-box"],
                )

            # ----------------------------------------------------------------
            # RIGHT COLUMN — Audio + Script Output
            # ----------------------------------------------------------------
            with gr.Column(scale=1):

                gr.Markdown("### 🎧 Generated Audio")
                audio_output = gr.Audio(
                    label="Audio",
                    type="filepath",
                    interactive=False,
                )

                gr.Markdown("### ✍️ Generated Script")
                script_output = gr.Textbox(
                    label="Script",
                    lines=14,
                    max_lines=22,
                    interactive=False,
                    placeholder="The generated script will appear here…",
                )

        # -- Footer ----------------------------------------------------------
        gr.Markdown(
            "<center style='color:#9ca3af;margin-top:1rem;'>"
            "Built with ❤️ using SmolLM3-3B · Qwen3-TTS · Edge-TTS · Gradio"
            "</center>"
        )

        # -- Event wiring ----------------------------------------------------
        # Show/hide the Song/Rap sub-option and refresh the description.
        delivery_mode.change(
            fn=_on_mode_change_full,
            inputs=[delivery_mode],
            outputs=[song_rap_row, mode_description],
        )

        # Kick off the full document-to-audio pipeline.
        generate_btn.click(
            fn=process_document,
            inputs=[file_input, delivery_mode, song_rap_sub],
            outputs=[script_output, audio_output, status_output],
        )

    return app
|
| 324 |
+
|
| 325 |
+
|
| 326 |
+
# ββ Mode description helper βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 327 |
+
|
| 328 |
+
def _mode_description(mode: str) -> str:
|
| 329 |
+
descriptions = {
|
| 330 |
+
"Summary": (
|
| 331 |
+
"*π **Summary** β A clear, structured spoken narration covering "
|
| 332 |
+
"the intro, key points, and conclusion. Single voice, neutral tone.*"
|
| 333 |
+
),
|
| 334 |
+
"Podcast": (
|
| 335 |
+
"*ποΈ **Podcast** β A two-host conversation. Host 1 guides and "
|
| 336 |
+
"asks questions; Host 2 explains and elaborates. Dual voices.*"
|
| 337 |
+
),
|
| 338 |
+
"Song / Rap": (
|
| 339 |
+
"*π΅ **Song / Rap** β Key ideas transformed into a rhythmic, "
|
| 340 |
+
"memorable format. Choose Song for smooth flow or Rap for punchy lines.*"
|
| 341 |
+
),
|
| 342 |
+
}
|
| 343 |
+
return descriptions.get(mode, "")
|
| 344 |
+
|
| 345 |
+
|
| 346 |
+
def _on_mode_change_full(mode: str):
    """Return (song_rap_row visibility update, description markdown) for a mode."""
    lowered = mode.lower()
    show_sub_controls = "song" in lowered or "rap" in lowered
    return gr.update(visible=show_sub_controls), _mode_description(mode)
|
| 350 |
+
|
| 351 |
+
|
| 352 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 353 |
+
# Entry point
|
| 354 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 355 |
+
|
| 356 |
+
if __name__ == "__main__":
    # Spaces entry point: bind all interfaces on the conventional HF port.
    logger.info("Starting VoiceVerse AI…")
    voiceverse_app = build_ui()
    voiceverse_app.launch(
        server_name="0.0.0.0",  # container must be reachable from outside
        server_port=7860,       # Hugging Face Spaces default port
        share=False,
        show_error=True,
    )
|
gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
script_gen.py
CHANGED
|
@@ -4,13 +4,19 @@ VoiceVerse AI β Script Generation Module.
|
|
| 4 |
Generates spoken-style scripts from retrieved document chunks
|
| 5 |
using SmolLM3-3B via the Hugging Face Inference API.
|
| 6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
Design decisions:
|
| 8 |
-
-
|
| 9 |
-
-
|
| 10 |
-
-
|
| 11 |
-
-
|
| 12 |
-
- Temperature 0.4 keeps output grounded and factual
|
| 13 |
-
- Post-processing strips markdown/XML artifacts for clean TTS
|
| 14 |
"""
|
| 15 |
|
| 16 |
import os
|
|
@@ -18,157 +24,303 @@ import re
|
|
| 18 |
from huggingface_hub import InferenceClient
|
| 19 |
from utils import logger
|
| 20 |
|
| 21 |
-
#
|
| 22 |
-
MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
|
| 23 |
-
MAX_NEW_TOKENS = 1024
|
| 24 |
-
TEMPERATURE = 0.4
|
| 25 |
|
| 26 |
-
|
|
|
|
|
|
|
| 27 |
|
| 28 |
-
SYSTEM_PROMPT = """You are a podcast host narrating content to listeners. Convert the provided document content into a smooth, flowing spoken narration.
|
| 29 |
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
3. Use smooth spoken transitions between topics instead of headings. For example say "Now let's talk about..." or "Moving on to..." or "Here's where it gets interesting..."
|
| 34 |
-
4. Write in plain text only. No markdown, no bullet points, no asterisks, no hashtags, no HTML/XML tags.
|
| 35 |
-
5. Write naturally as if speaking aloud to a listener. Use short sentences and conversational language.
|
| 36 |
-
6. Never say "the document says" or "according to the text". Speak as the expert.
|
| 37 |
-
7. If the content is limited, keep the script short rather than inventing information.
|
| 38 |
-
8. Do NOT include any labels, headers, or structural markers. The output should read like someone is talking without breaks.
|
| 39 |
-
9. Output ONLY the spoken narration text, nothing else."""
|
| 40 |
|
| 41 |
-
|
|
|
|
|
|
|
| 42 |
|
| 43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
{context}
|
| 45 |
-
--- END ---
|
| 46 |
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
-
|
|
|
|
| 50 |
|
|
|
|
|
|
|
| 51 |
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
"""
|
| 56 |
-
Remove markdown
|
| 57 |
-
|
| 58 |
"""
|
| 59 |
-
# Remove <think>
|
| 60 |
-
text = re.sub(r
|
| 61 |
|
| 62 |
-
# Remove
|
| 63 |
-
text = re.sub(r
|
| 64 |
|
| 65 |
-
#
|
| 66 |
-
text = re.sub(r
|
| 67 |
|
| 68 |
-
#
|
| 69 |
-
text = re.sub(r
|
| 70 |
-
text = re.sub(r
|
| 71 |
|
| 72 |
-
#
|
| 73 |
-
text = re.sub(r
|
| 74 |
|
| 75 |
-
#
|
| 76 |
-
text = re.sub(r
|
| 77 |
-
text = re.sub(r
|
| 78 |
|
| 79 |
-
#
|
| 80 |
-
text = re.sub(r
|
|
|
|
| 81 |
|
| 82 |
-
#
|
| 83 |
-
text = re.sub(r
|
|
|
|
| 84 |
|
| 85 |
-
#
|
| 86 |
-
text = re.sub(r
|
|
|
|
| 87 |
|
| 88 |
-
|
| 89 |
-
text = re.sub(r'^[-*_]{3,}\s*$', '', text, flags=re.MULTILINE)
|
| 90 |
|
| 91 |
-
# Collapse multiple newlines into one
|
| 92 |
-
text = re.sub(r'\n{3,}', '\n\n', text)
|
| 93 |
|
| 94 |
-
|
| 95 |
-
|
| 96 |
|
| 97 |
-
return text.strip()
|
| 98 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
|
| 102 |
def _get_client() -> InferenceClient:
|
| 103 |
-
"""Create an HF Inference client with the user's token."""
|
| 104 |
token = os.environ.get("HF_TOKEN")
|
| 105 |
if not token:
|
| 106 |
raise EnvironmentError(
|
| 107 |
"HF_TOKEN environment variable is not set. "
|
| 108 |
-
"Please
|
| 109 |
)
|
| 110 |
-
return InferenceClient(
|
| 111 |
-
|
| 112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
|
| 115 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
def generate_script(
|
| 117 |
context_chunks: list[str],
|
|
|
|
|
|
|
| 118 |
topic: str = "the key ideas and insights from this document",
|
| 119 |
) -> str:
|
| 120 |
"""
|
| 121 |
-
Generate a spoken
|
| 122 |
|
| 123 |
Args:
|
| 124 |
-
context_chunks:
|
| 125 |
-
|
|
|
|
|
|
|
|
|
|
| 126 |
|
| 127 |
Returns:
|
| 128 |
-
A
|
|
|
|
| 129 |
"""
|
| 130 |
if not context_chunks:
|
| 131 |
raise ValueError("No document context provided. Please upload a document first.")
|
| 132 |
|
| 133 |
-
# Combine
|
| 134 |
context = "\n\n".join(context_chunks)
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
user_message = USER_PROMPT_TEMPLATE.format(context=context, topic=topic)
|
| 144 |
-
|
| 145 |
-
logger.info("Generating script via %s (context: %d chars, topic: '%s')",
|
| 146 |
-
MODEL_ID, len(context), topic[:50])
|
| 147 |
-
|
| 148 |
-
client = _get_client()
|
| 149 |
-
|
| 150 |
-
# Call the model using chat_completion
|
| 151 |
-
response = client.chat_completion(
|
| 152 |
-
model=MODEL_ID,
|
| 153 |
-
messages=[
|
| 154 |
-
{"role": "system", "content": SYSTEM_PROMPT},
|
| 155 |
-
{"role": "user", "content": user_message},
|
| 156 |
-
],
|
| 157 |
-
max_tokens=MAX_NEW_TOKENS,
|
| 158 |
-
temperature=TEMPERATURE,
|
| 159 |
-
top_p=0.9,
|
| 160 |
)
|
| 161 |
|
| 162 |
-
|
|
|
|
| 163 |
|
| 164 |
-
if
|
| 165 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
|
| 167 |
-
|
| 168 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
|
| 170 |
if not script:
|
| 171 |
raise RuntimeError("Script was empty after cleaning. Please try again.")
|
| 172 |
|
| 173 |
-
logger.info("Script
|
| 174 |
-
return script
|
|
|
|
| 4 |
Generates spoken-style scripts from retrieved document chunks
|
| 5 |
using SmolLM3-3B via the Hugging Face Inference API.
|
| 6 |
|
| 7 |
+
Delivery Modes:
|
| 8 |
+
- Summary : Single-speaker structured narration
|
| 9 |
+
- Podcast : Two-host dialogue (HOST_1 / HOST_2 tags)
|
| 10 |
+
- Song / Rap : Rhythmic retention-style content
|
| 11 |
+
|
| 12 |
+
The core RAG pipeline (rag.py) is NOT modified.
|
| 13 |
+
Only this generation stage switches behaviour based on `mode`.
|
| 14 |
+
|
| 15 |
Design decisions:
|
| 16 |
+
- generate_script() is the single public entry point
|
| 17 |
+
- Each mode has its own system + user prompt pair
|
| 18 |
+
- Post-processing cleans markdown / XML artifacts for TTS
|
| 19 |
+
- Podcast mode preserves HOST_1 / HOST_2 tags (tts.py splits on them)
|
|
|
|
|
|
|
| 20 |
"""
|
| 21 |
|
| 22 |
import os
|
|
|
|
| 24 |
from huggingface_hub import InferenceClient
|
| 25 |
from utils import logger
|
| 26 |
|
| 27 |
+
# ββ Configuration ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
+
MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
|
| 30 |
+
MAX_NEW_TOKENS = 1200
|
| 31 |
+
TEMPERATURE = 0.5
|
| 32 |
|
|
|
|
| 33 |
|
| 34 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 35 |
+
# Mode A β Summary
|
| 36 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
+
_SUMMARY_SYSTEM = """\
|
| 39 |
+
You are a professional narrator. Your task is to produce a clear, structured \
|
| 40 |
+
spoken summary strictly grounded in the provided source material.
|
| 41 |
|
| 42 |
+
RULES:
|
| 43 |
+
1. Use ONLY facts present in the source. Do NOT add outside knowledge.
|
| 44 |
+
2. Structure: a short introduction, key points spoken as natural sentences, \
|
| 45 |
+
then a concise conclusion.
|
| 46 |
+
3. Write in plain text only β no markdown, no bullet symbols, no headers.
|
| 47 |
+
4. Write for the ear: short sentences, conversational language.
|
| 48 |
+
5. Never say "the document says" or "according to the text". Speak as the expert.
|
| 49 |
+
6. Output ONLY the spoken narration text, nothing else.\
|
| 50 |
+
"""
|
| 51 |
+
|
| 52 |
+
_SUMMARY_USER = """\
|
| 53 |
+
SOURCE MATERIAL:
|
| 54 |
{context}
|
|
|
|
| 55 |
|
| 56 |
+
Write a spoken summary that flows naturally. Cover the introduction, the key \
|
| 57 |
+
points, and a short conclusion β all in plain spoken sentences without headings \
|
| 58 |
+
or labels.\
|
| 59 |
+
"""
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 63 |
+
# Mode B β Podcast (Multi-Host)
|
| 64 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 65 |
+
|
| 66 |
+
_PODCAST_SYSTEM = """\
|
| 67 |
+
You are a podcast script writer. Produce an engaging two-host conversation \
|
| 68 |
+
strictly grounded in the provided source material.
|
| 69 |
+
|
| 70 |
+
STRICT OUTPUT FORMAT β every line must start with a speaker tag:
|
| 71 |
+
HOST_1: <what Host 1 says>
|
| 72 |
+
HOST_2: <what Host 2 says>
|
| 73 |
+
|
| 74 |
+
RULES:
|
| 75 |
+
1. Alternate HOST_1 and HOST_2 throughout. Never have the same host speak twice in a row.
|
| 76 |
+
2. HOST_1 introduces topics, asks questions, and guides the conversation.
|
| 77 |
+
3. HOST_2 explains concepts, provides detail, and answers HOST_1's questions.
|
| 78 |
+
4. Use ONLY information present in the source material. No hallucination.
|
| 79 |
+
5. Tone: conversational, curious, engaging β like a real podcast.
|
| 80 |
+
6. Do NOT add lines that are not prefixed with HOST_1: or HOST_2:.
|
| 81 |
+
7. No markdown, no stage directions, no asterisks.
|
| 82 |
+
8. Aim for 16β24 exchanges (lines) so the conversation feels substantial.\
|
| 83 |
+
"""
|
| 84 |
+
|
| 85 |
+
_PODCAST_USER = """\
|
| 86 |
+
SOURCE MATERIAL:
|
| 87 |
+
{context}
|
| 88 |
+
|
| 89 |
+
Write the full podcast conversation. Every single line must start with either \
|
| 90 |
+
HOST_1: or HOST_2: β no exceptions.\
|
| 91 |
+
"""
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 95 |
+
# Mode C β Song / Rap (Retention Mode)
|
| 96 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 97 |
|
| 98 |
+
_SONG_SYSTEM = """\
|
| 99 |
+
You are a creative lyricist. Your task has two steps:
|
| 100 |
|
| 101 |
+
STEP 1 β silently extract 5 to 7 key ideas from the source material.
|
| 102 |
+
STEP 2 β turn those key ideas into a smooth, melodic SONG.
|
| 103 |
|
| 104 |
+
SONG RULES:
|
| 105 |
+
- Simple, memorable language.
|
| 106 |
+
- Rhyming couplets or AABB scheme.
|
| 107 |
+
- Include a CHORUS (label it [CHORUS]) that repeats the main concept.
|
| 108 |
+
- Label verses [VERSE 1], [VERSE 2], etc.
|
| 109 |
+
- Short lines (6β10 words each).
|
| 110 |
+
- Use repetition to aid retention.
|
| 111 |
+
- Do NOT invent facts not in the source.
|
| 112 |
+
- Output ONLY the song lyrics with section labels. No explanations.\
|
| 113 |
+
"""
|
| 114 |
+
|
| 115 |
+
_RAP_SYSTEM = """\
|
| 116 |
+
You are a creative lyricist. Your task has two steps:
|
| 117 |
+
|
| 118 |
+
STEP 1 β silently extract 5 to 7 key ideas from the source material.
|
| 119 |
+
STEP 2 β turn those key ideas into a punchy, rhythmic RAP.
|
| 120 |
+
|
| 121 |
+
RAP RULES:
|
| 122 |
+
- Short, punchy lines (5β8 words each).
|
| 123 |
+
- Fast-flow rhyme scheme (AABB or ABAB).
|
| 124 |
+
- Include a HOOK (label it [HOOK]) that repeats the main concept.
|
| 125 |
+
- Label verses [VERSE 1], [VERSE 2], etc.
|
| 126 |
+
- Use repetition and wordplay to aid retention.
|
| 127 |
+
- Do NOT invent facts not in the source.
|
| 128 |
+
- Output ONLY the rap lyrics with section labels. No explanations.\
|
| 129 |
+
"""
|
| 130 |
+
|
| 131 |
+
_SONG_RAP_USER = """\
|
| 132 |
+
SOURCE MATERIAL:
|
| 133 |
+
{context}
|
| 134 |
+
|
| 135 |
+
Extract the key ideas, then write the full {form} based strictly on those ideas.\
|
| 136 |
+
"""
|
| 137 |
|
| 138 |
+
|
| 139 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 140 |
+
# Post-processing
|
| 141 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 142 |
+
|
| 143 |
+
def _clean_for_tts(text: str, preserve_host_tags: bool = False) -> str:
|
| 144 |
"""
|
| 145 |
+
Remove markdown and XML/HTML artifacts that TTS engines would read aloud.
|
| 146 |
+
When preserve_host_tags=True, HOST_1: / HOST_2: prefixes are kept intact.
|
| 147 |
"""
|
| 148 |
+
# Remove <think>β¦</think> reasoning traces (SmolLM3)
|
| 149 |
+
text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
|
| 150 |
|
| 151 |
+
# Remove remaining XML/HTML tags (but NOT HOST_1/HOST_2 lines)
|
| 152 |
+
text = re.sub(r"<[^>]+>", "", text)
|
| 153 |
|
| 154 |
+
# Markdown headers
|
| 155 |
+
text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE)
|
| 156 |
|
| 157 |
+
# Bold / italic
|
| 158 |
+
text = re.sub(r"\*{1,3}([^*]+)\*{1,3}", r"\1", text)
|
| 159 |
+
text = re.sub(r"_{1,3}([^_]+)_{1,3}", r"\1", text)
|
| 160 |
|
| 161 |
+
# Links
|
| 162 |
+
text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)
|
| 163 |
|
| 164 |
+
# Code blocks
|
| 165 |
+
text = re.sub(r"```[^`]*```", "", text, flags=re.DOTALL)
|
| 166 |
+
text = re.sub(r"`([^`]+)`", r"\1", text)
|
| 167 |
|
| 168 |
+
# Bullet / numbered lists
|
| 169 |
+
text = re.sub(r"^[\s]*[-*+]\s+", "", text, flags=re.MULTILINE)
|
| 170 |
+
text = re.sub(r"^[\s]*\d+\.\s+", "", text, flags=re.MULTILINE)
|
| 171 |
|
| 172 |
+
# Blockquotes / horizontal rules
|
| 173 |
+
text = re.sub(r"^>\s+", "", text, flags=re.MULTILINE)
|
| 174 |
+
text = re.sub(r"^[-*_]{3,}\s*$", "", text, flags=re.MULTILINE)
|
| 175 |
|
| 176 |
+
# Collapse whitespace
|
| 177 |
+
text = re.sub(r"\n{3,}", "\n\n", text)
|
| 178 |
+
text = re.sub(r" {2,}", " ", text)
|
| 179 |
|
| 180 |
+
return text.strip()
|
|
|
|
| 181 |
|
|
|
|
|
|
|
| 182 |
|
| 183 |
+
def _clean_summary(text: str) -> str:
    """Sanitize summary text for TTS (no speaker tags to protect)."""
    return _clean_for_tts(text)
|
| 185 |
|
|
|
|
| 186 |
|
| 187 |
+
def _clean_podcast(text: str) -> str:
    """
    Clean a podcast script for TTS while keeping the HOST_1: / HOST_2:
    speaker tags that tts.py needs for voice splitting.
    """
    text = _clean_for_tts(text, preserve_host_tags=True)

    # Canonicalise tag spellings: "Host 1:", "host_1:", "HOST1:" -> "HOST_1:"
    text = re.sub(r"(?i)\bhost[\s_-]*1\s*:", "HOST_1:", text)
    text = re.sub(r"(?i)\bhost[\s_-]*2\s*:", "HOST_2:", text)

    # Drop lines carrying no speaker tag (stray stage directions etc.);
    # blank lines are kept so paragraph structure survives.
    kept = []
    for raw_line in text.splitlines():
        stripped = raw_line.strip()
        if not stripped or stripped.startswith(("HOST_1:", "HOST_2:")):
            kept.append(raw_line)
    return "\n".join(kept).strip()
|
| 205 |
|
| 206 |
+
|
| 207 |
+
def _clean_song_rap(text: str) -> str:
    """
    Clean song/rap lyrics for TTS. Section labels such as [VERSE 1],
    [CHORUS] and [HOOK] are left in place — they help pacing when read
    aloud and are harmless.
    """
    return _clean_for_tts(text)
|
| 213 |
+
|
| 214 |
+
|
| 215 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 216 |
+
# LLM Client
|
| 217 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 218 |
|
| 219 |
def _get_client() -> InferenceClient:
    """
    Build an InferenceClient for the hf-inference provider.

    Raises:
        EnvironmentError: if the HF_TOKEN secret is not configured.
    """
    token = os.environ.get("HF_TOKEN")
    if token:
        return InferenceClient(provider="hf-inference", token=token)
    raise EnvironmentError(
        "HF_TOKEN environment variable is not set. "
        "Please add your Hugging Face API token as a Space secret."
    )
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
def _call_llm(system_prompt: str, user_prompt: str) -> str:
    """
    Send a chat completion request and return the raw response text.

    Args:
        system_prompt : System message establishing the task.
        user_prompt   : User message carrying the document context.

    Returns:
        The stripped, non-empty model response.

    Raises:
        RuntimeError: if the model returns an empty or missing message.
    """
    client = _get_client()
    response = client.chat_completion(
        model=MODEL_ID,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        max_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        top_p=0.9,
    )
    # message.content may be None on some providers/responses — guard before
    # calling .strip(), otherwise we'd raise AttributeError instead of the
    # intended "empty response" error below.
    content = response.choices[0].message.content
    raw = content.strip() if content else ""
    if not raw:
        raise RuntimeError("The model returned an empty response. Please try again.")
    return raw
|
| 246 |
|
| 247 |
|
| 248 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 249 |
+
# Public Entry Point
|
| 250 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 251 |
+
|
| 252 |
def generate_script(
    context_chunks: list[str],
    mode: str = "Summary",
    sub_mode: str = "Rap",
    topic: str = "the key ideas and insights from this document",
) -> str:
    """
    Generate a spoken script from retrieved RAG chunks.

    Args:
        context_chunks : Chunks returned by RAGStore.query() — NOT modified here.
        mode           : "Summary" | "Podcast" | "Song / Rap"
        sub_mode       : "Song" | "Rap" (only used when mode == "Song / Rap")
        topic          : Optional human-readable topic label (unused in prompts
                         except summary, kept for logging).

    Returns:
        A clean string ready to hand to tts.generate_audio().
        Podcast mode preserves HOST_1: / HOST_2: prefixes.

    Raises:
        ValueError  : if no context chunks were provided.
        RuntimeError: if the cleaned script ends up empty.
    """
    if not context_chunks:
        raise ValueError("No document context provided. Please upload a document first.")

    # ── Combine & truncate context ──────────────────────────────────────────
    context = "\n\n".join(context_chunks)
    max_ctx = 6000
    if len(context) > max_ctx:
        context = context[:max_ctx]
        logger.warning("Context truncated to %d chars for LLM call.", max_ctx)

    logger.info(
        "Generating script | mode=%s sub_mode=%s context=%d chars",
        mode, sub_mode, len(context),
    )

    # ── Route to the correct prompt pair ────────────────────────────────────
    mode_key = mode.strip().lower()

    # Normalise unknown modes up front so the fallback shares the Summary
    # path below instead of duplicating it.
    is_lyric = "song" in mode_key or "rap" in mode_key
    if mode_key not in ("summary", "podcast") and not is_lyric:
        logger.warning("Unknown mode '%s' — falling back to Summary.", mode)
        mode_key = "summary"

    if mode_key == "summary":
        raw = _call_llm(
            _SUMMARY_SYSTEM,
            _SUMMARY_USER.format(context=context),
        )
        script = _clean_summary(raw)

    elif mode_key == "podcast":
        raw = _call_llm(
            _PODCAST_SYSTEM,
            _PODCAST_USER.format(context=context),
        )
        script = _clean_podcast(raw)

    else:  # Song / Rap
        form = sub_mode.lower()  # "song" or "rap"
        system = _SONG_SYSTEM if form == "song" else _RAP_SYSTEM
        raw = _call_llm(
            system,
            _SONG_RAP_USER.format(context=context, form=form),
        )
        script = _clean_song_rap(raw)

    if not script:
        raise RuntimeError("Script was empty after cleaning. Please try again.")

    logger.info("Script ready: %d chars (raw %d chars)", len(script), len(raw))
    return script
|
tts.py
CHANGED
|
@@ -1,157 +1,245 @@
|
|
| 1 |
-
"""
|
| 2 |
-
VoiceVerse AI β Voice Generation Module (TTS).
|
| 3 |
-
|
| 4 |
-
Converts generated scripts into
|
| 5 |
-
|
| 6 |
-
Primary: Qwen3-TTS via HF Inference API
|
| 7 |
-
Fallback: Edge-TTS (
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
-
|
| 11 |
-
|
| 12 |
-
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
""
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
)
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
logger.info("
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
"""
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
VoiceVerse AI β Voice Generation Module (TTS).
|
| 3 |
+
|
| 4 |
+
Converts generated scripts into audio.
|
| 5 |
+
|
| 6 |
+
Primary: Qwen3-TTS via HF Inference API
|
| 7 |
+
Fallback: Edge-TTS (CPU-only, no API key needed)
|
| 8 |
+
|
| 9 |
+
Delivery Mode additions:
|
| 10 |
+
- Podcast mode : splits script on HOST_1/HOST_2 tags, generates each
|
| 11 |
+
segment with a distinct voice, then concatenates.
|
| 12 |
+
- Summary/Song : single voice, unchanged from original behaviour.
|
| 13 |
+
|
| 14 |
+
Public API (unchanged signature):
|
| 15 |
+
generate_audio(text, voice_id=None) β (path, engine_name)
|
| 16 |
+
|
| 17 |
+
New internal API:
|
| 18 |
+
generate_audio_podcast(script) β (path, engine_name)
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
import os
|
| 22 |
+
import re
|
| 23 |
+
import asyncio
|
| 24 |
+
import tempfile
|
| 25 |
+
from utils import logger, get_temp_filepath
|
| 26 |
+
|
| 27 |
+
# ββ Configuration ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 28 |
+
|
| 29 |
+
# Primary TTS model served through the HF Inference API.
QWEN_TTS_MODEL = "Qwen/Qwen3-TTS"

# Edge-TTS voices
EDGE_VOICE_DEFAULT = "en-US-AriaNeural"  # Host 1 / single voice
EDGE_VOICE_HOST2 = "en-US-GuyNeural"  # Host 2 (podcast)

TTS_MAX_CHARS = 3000  # hard cap per TTS call
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 39 |
+
# Low-level TTS helpers
|
| 40 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 41 |
+
|
| 42 |
+
def _qwen_tts(text: str) -> str | None:
    """
    Synthesise speech with Qwen3-TTS via the HF Inference API.

    Returns the path of the written WAV file on success, or None whenever
    the call cannot be made or fails (callers fall back to Edge-TTS).
    """
    token = os.environ.get("HF_TOKEN")
    if not token:
        logger.warning("HF_TOKEN not set — skipping Qwen TTS")
        return None

    snippet = text[:TTS_MAX_CHARS]
    try:
        from huggingface_hub import InferenceClient

        client = InferenceClient(token=token)
        logger.info("Calling Qwen3-TTS (%d chars)…", len(snippet))
        audio_bytes = client.text_to_speech(text=snippet, model=QWEN_TTS_MODEL)
        if not audio_bytes:
            logger.warning("Qwen TTS returned empty bytes")
            return None

        out_path = get_temp_filepath(suffix=".wav")
        with open(out_path, "wb") as fh:
            fh.write(audio_bytes)
        logger.info("Qwen TTS saved: %s (%d bytes)", out_path, len(audio_bytes))
        return out_path
    except Exception as err:
        logger.warning("Qwen TTS failed (%s) — will use Edge-TTS fallback", err)
        return None
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def _edge_tts(text: str, voice: str = EDGE_VOICE_DEFAULT) -> str:
    """
    Generate audio with Edge-TTS (CPU, no key needed).

    Args:
        text  : Text to synthesise (truncated to TTS_MAX_CHARS).
        voice : Edge-TTS voice short-name.

    Returns:
        Path to the written MP3 file.

    Raises:
        RuntimeError: if the produced file is empty.
    """
    import edge_tts

    snippet = text[:TTS_MAX_CHARS]
    path = get_temp_filepath(suffix=".mp3")
    logger.info("Edge-TTS: voice=%s, %d chars → %s", voice, len(snippet), path)

    async def _run():
        comm = edge_tts.Communicate(snippet, voice)
        await comm.save(path)

    # asyncio.get_event_loop() is deprecated outside a running loop (3.10+)
    # and raises in non-main threads; probe with get_running_loop() instead.
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No event loop running — drive the coroutine directly.
        asyncio.run(_run())
    else:
        # Called from inside a running loop (e.g. Gradio): run the coroutine
        # on a worker thread with its own loop.
        import concurrent.futures
        with concurrent.futures.ThreadPoolExecutor() as pool:
            pool.submit(asyncio.run, _run()).result(timeout=120)

    size = os.path.getsize(path)
    if size == 0:
        raise RuntimeError("Edge-TTS produced an empty audio file.")
    logger.info("Edge-TTS saved: %s (%d bytes)", path, size)
    return path
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 105 |
+
# Audio concatenation (for podcast multi-segment audio)
|
| 106 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββοΏ½οΏ½οΏ½βββββββββββββββββββββββββ
|
| 107 |
+
|
| 108 |
+
def _concat_audio_files(paths: list[str]) -> str:
    """
    Join several audio files (WAV or MP3) into one MP3, inserting a short
    pause between segments. Relies on pydub (ffmpeg from packages.txt);
    if that fails for any reason, the first segment is returned unchanged.
    """
    if len(paths) == 1:
        return paths[0]

    try:
        from pydub import AudioSegment

        gap = AudioSegment.silent(duration=300)  # 300 ms between speakers
        merged = AudioSegment.empty()
        for src in paths:
            merged = merged + AudioSegment.from_file(src) + gap

        out = get_temp_filepath(suffix=".mp3")
        merged.export(out, format="mp3")
        logger.info("Concatenated %d segments → %s", len(paths), out)
        return out

    except Exception as err:
        logger.warning("pydub concat failed (%s) — returning first segment", err)
        return paths[0]
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 138 |
+
# Podcast TTS (multi-voice)
|
| 139 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 140 |
+
|
| 141 |
+
def _parse_podcast_script(script: str) -> list[tuple[str, str]]:
|
| 142 |
+
"""
|
| 143 |
+
Parse a podcast script into a list of (speaker, text) tuples.
|
| 144 |
+
Expects lines like:
|
| 145 |
+
HOST_1: Some text here.
|
| 146 |
+
HOST_2: Reply text here.
|
| 147 |
+
Consecutive lines from the same speaker are merged.
|
| 148 |
+
"""
|
| 149 |
+
segments: list[tuple[str, str]] = []
|
| 150 |
+
|
| 151 |
+
for line in script.splitlines():
|
| 152 |
+
line = line.strip()
|
| 153 |
+
if not line:
|
| 154 |
+
continue
|
| 155 |
+
if line.startswith("HOST_1:"):
|
| 156 |
+
text = line[len("HOST_1:"):].strip()
|
| 157 |
+
if text:
|
| 158 |
+
if segments and segments[-1][0] == "HOST_1":
|
| 159 |
+
segments[-1] = ("HOST_1", segments[-1][1] + " " + text)
|
| 160 |
+
else:
|
| 161 |
+
segments.append(("HOST_1", text))
|
| 162 |
+
elif line.startswith("HOST_2:"):
|
| 163 |
+
text = line[len("HOST_2:"):].strip()
|
| 164 |
+
if text:
|
| 165 |
+
if segments and segments[-1][0] == "HOST_2":
|
| 166 |
+
segments[-1] = ("HOST_2", segments[-1][1] + " " + text)
|
| 167 |
+
else:
|
| 168 |
+
segments.append(("HOST_2", text))
|
| 169 |
+
# Lines without a valid tag are silently skipped
|
| 170 |
+
|
| 171 |
+
return segments
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def generate_audio_podcast(script: str) -> tuple[str, str]:
    """
    Produce multi-voice audio for Podcast mode.

    The script is split into HOST_1/HOST_2 segments, each segment is
    synthesised with its own Edge-TTS voice, and the pieces are stitched
    together with a short pause between speakers. Edge-TTS is used directly
    (Qwen TTS does not expose per-call voice selection, so Edge-TTS
    guarantees two distinct voices).

    Returns:
        (audio_file_path, engine_label)
    """
    segments = _parse_podcast_script(script)

    if not segments:
        logger.warning("Podcast parser found no HOST_1/HOST_2 lines — using single voice")
        return generate_audio(script)

    logger.info("Podcast mode: %d speaker segments to synthesise", len(segments))

    voices = {
        "HOST_1": EDGE_VOICE_DEFAULT,
        "HOST_2": EDGE_VOICE_HOST2,
    }

    pieces: list[str] = []
    for speaker, seg_text in segments:
        try:
            pieces.append(_edge_tts(seg_text, voice=voices.get(speaker, EDGE_VOICE_DEFAULT)))
        except Exception as err:
            logger.warning("Segment TTS failed for %s: %s — skipping", speaker, err)

    if not pieces:
        raise RuntimeError("All podcast audio segments failed to generate.")

    return _concat_audio_files(pieces), "Edge-TTS (Podcast)"
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 218 |
+
# Unified public interface (unchanged signature from original tts.py)
|
| 219 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 220 |
+
|
| 221 |
+
def generate_audio(text: str, voice_id: str | None = None) -> tuple[str, str]:
    """
    Single-voice audio generation (Summary / Song / Rap).

    Qwen3-TTS is attempted first; on any failure the function falls back
    to Edge-TTS.

    Args:
        text     : Script text to synthesise.
        voice_id : Optional Edge-TTS voice override.

    Returns:
        (audio_file_path, engine_label)

    Raises:
        ValueError: if text is empty or whitespace-only.
    """
    if not text or not text.strip():
        raise ValueError("No text provided for audio generation.")

    # Primary engine (Qwen via HF Inference API).
    qwen_path = _qwen_tts(text)
    if qwen_path and os.path.exists(qwen_path):
        return qwen_path, "Qwen3-TTS"

    # Fallback engine (local Edge-TTS).
    edge_path = _edge_tts(text, voice=voice_id or EDGE_VOICE_DEFAULT)
    return edge_path, "Edge-TTS"
|