Spaces:
Sleeping
Sleeping
Upload 11 files
Browse files- README.md +57 -13
- app.py +393 -0
- convert_to_word.ps1 +58 -0
- gitattributes +35 -0
- ingestion.py +217 -0
- packages.txt +0 -0
- rag.py +198 -0
- requirements.txt +13 -0
- script_gen.py +310 -0
- tts.py +293 -0
- utils.py +62 -0
README.md
CHANGED
|
@@ -1,13 +1,57 @@
|
|
| 1 |
-
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
-
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
---
|
| 12 |
-
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: VoiceVerse AI
|
| 3 |
+
emoji: ποΈ
|
| 4 |
+
colorFrom: indigo
|
| 5 |
+
colorTo: purple
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: "5.23.1"
|
| 8 |
+
python_version: "3.10"
|
| 9 |
+
app_file: app.py
|
| 10 |
+
pinned: false
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# ποΈ VoiceVerse AI β Document to Audio
|
| 14 |
+
|
| 15 |
+
Transform uploaded documents into engaging, emotionally expressive podcast-style audio narrations.
|
| 16 |
+
|
| 17 |
+
## Pipeline
|
| 18 |
+
|
| 19 |
+
```
|
| 20 |
+
PDF/TXT β Text Extraction β RAG (chunk + embed + retrieve) β Script Generation (Mistral-7B) β TTS (Qwen3-TTS / Edge-TTS) β Audio Playback
|
| 21 |
+
```
|
| 22 |
+
|
| 23 |
+
## Models Used
|
| 24 |
+
|
| 25 |
+
| Component | Model | How |
|
| 26 |
+
|-----------|-------|-----|
|
| 27 |
+
| Embeddings | `all-MiniLM-L6-v2` | Local (CPU) |
|
| 28 |
+
| Script Gen | `Mistral-7B-Instruct-v0.3` | HF Inference API |
|
| 29 |
+
| TTS (primary) | `Qwen3-TTS` | HF Inference API |
|
| 30 |
+
| TTS (fallback) | `Edge-TTS (AriaNeural)` | Local (CPU) |
|
| 31 |
+
|
| 32 |
+
## Setup
|
| 33 |
+
|
| 34 |
+
```bash
|
| 35 |
+
pip install -r requirements.txt
|
| 36 |
+
export HF_TOKEN="your_huggingface_token_here"
|
| 37 |
+
python app.py
|
| 38 |
+
```
|
| 39 |
+
|
| 40 |
+
## Deployment on HF Spaces
|
| 41 |
+
|
| 42 |
+
1. Create a new Space (Gradio SDK)
|
| 43 |
+
2. Upload all project files
|
| 44 |
+
3. Set `HF_TOKEN` as a Space Secret
|
| 45 |
+
4. The app will auto-launch on port 7860
|
| 46 |
+
|
| 47 |
+
## Project Structure
|
| 48 |
+
|
| 49 |
+
```
|
| 50 |
+
app.py # Gradio UI entry point
|
| 51 |
+
ingestion.py     # URL / YouTube / pasted-text ingestion
rag.py               # Document ingestion, chunking, embedding, retrieval
|
| 52 |
+
script_gen.py # LLM script generation (Mistral-7B-Instruct)
|
| 53 |
+
tts.py # Text-to-speech (Qwen3-TTS + Edge-TTS fallback)
|
| 54 |
+
utils.py # Helpers (temp files, validation, error formatting)
|
| 55 |
+
requirements.txt # Python dependencies
|
| 56 |
+
packages.txt # System packages (ffmpeg)
|
| 57 |
+
```
|
app.py
ADDED
|
@@ -0,0 +1,393 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
VoiceVerse AI β Main Application.
|
| 3 |
+
|
| 4 |
+
Input sources (three tabs):
|
| 5 |
+
Tab 1 β Upload PDF or TXT file
|
| 6 |
+
Tab 2 β URL / YouTube link
|
| 7 |
+
Tab 3 β Paste raw text
|
| 8 |
+
|
| 9 |
+
Delivery Modes:
|
| 10 |
+
Summary / Podcast / Song / Rap / Debate
|
| 11 |
+
|
| 12 |
+
No status card shown. RAG pipeline unchanged.
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
import os
|
| 16 |
+
import gradio as gr
|
| 17 |
+
from utils import logger, validate_file, format_error
|
| 18 |
+
from rag import extract_text, RAGStore
|
| 19 |
+
from script_gen import generate_script
|
| 20 |
+
from tts import generate_audio, generate_audio_podcast, generate_audio_debate, generate_audio_rap, generate_audio_story
|
| 21 |
+
from ingestion import ingest_from_url_or_text, extract_pasted_text
|
| 22 |
+
|
| 23 |
+
# ββ Global RAG Store ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 24 |
+
rag_store = RAGStore()
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 28 |
+
# Shared RAG + Script + TTS pipeline
|
| 29 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 30 |
+
|
| 31 |
+
def _run_pipeline(
    text: str,
    delivery_mode: str,
    song_rap_sub: str,
    progress,
) -> tuple[str, str]:
    """
    RAG β script β audio. Shared by all three input tabs.

    Args:
        text: Raw document text already extracted by the caller.
        delivery_mode: One of the UI radio choices
            ("Summary", "Podcast", "Song / Rap", "Debate", "Story").
        song_rap_sub: "Song" or "Rap" β only consulted when
            delivery_mode is "Song / Rap".
        progress: Gradio progress callback; called as progress(frac, desc=...).

    Returns (script, audio_path).
    """
    # RAG: chunk & embed β note the module-level rag_store is mutated here,
    # so concurrent requests share one index (acceptable for a single-user Space).
    progress(0.30, desc="π§ Building knowledge indexβ¦")
    rag_store.add_document(text)
    chunk_count = len(rag_store.chunks)
    logger.info("RAG index: %d chunks", chunk_count)

    # RAG: retrieve. Small documents skip retrieval entirely and use every
    # chunk; larger ones are queried with a fixed "overview" question.
    progress(0.45, desc="π Retrieving relevant contentβ¦")
    if chunk_count <= 8:
        context_chunks = rag_store.get_all_chunks()
    else:
        context_chunks = rag_store.query(
            "What are the main topics, key insights, and important details?",
            top_k=6,
        )

    # Script generation (LLM) β mode/sub_mode select the prompt template.
    progress(0.60, desc=f"βοΈ Writing {_mode_label(delivery_mode, song_rap_sub)} scriptβ¦")
    script = generate_script(
        context_chunks=context_chunks,
        mode=delivery_mode,
        sub_mode=song_rap_sub,
    )
    logger.info("Script: %d chars", len(script))

    # TTS β route by mode. The string compares must match the UI radio
    # choices after strip().lower(); "song / rap" keeps the inner spaces.
    progress(0.80, desc="ποΈ Synthesising audioβ¦")
    m = delivery_mode.strip().lower()
    if m == "podcast":
        audio_path, engine = generate_audio_podcast(script)
    elif m == "debate":
        audio_path, engine = generate_audio_debate(script)
    elif m == "song / rap" and song_rap_sub.lower() == "rap":
        audio_path, engine = generate_audio_rap(script)
    elif m == "story":
        audio_path, engine = generate_audio_story(script)
    else:
        # Fallback covers "summary" and the "Song / Rap"+"Song" combination.
        audio_path, engine = generate_audio(script)
    logger.info("Audio via %s: %s", engine, audio_path)

    progress(1.00, desc="β Done!")
    return script, audio_path
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def _mode_label(mode: str, sub_mode: str) -> str:
    """Short lowercase label for the chosen delivery mode.

    Used only in progress messages. For "Song / Rap" the label comes from
    the sub-mode radio; anything unrecognised falls back to "summary".
    """
    lowered = mode.lower()
    for keyword in ("podcast", "debate", "story"):
        if keyword in lowered:
            return keyword
    if "song" in lowered or "rap" in lowered:
        return sub_mode.lower()
    return "summary"
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 99 |
+
# Per-tab handlers
|
| 100 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 101 |
+
|
| 102 |
+
def process_file(file, delivery_mode, song_rap_sub, progress=gr.Progress()):
    """Tab 1 handler: uploaded PDF/TXT file β (script, audio_path).

    Validates the upload, extracts its text, then delegates to
    _run_pipeline. All failures surface to the UI as gr.Error.
    The gr.Progress() default is the standard Gradio injection idiom.
    """
    if file is None:
        raise gr.Error("Please upload a PDF or TXT file first.")
    # gr.File may hand back a tempfile-like object or a plain path string.
    file_path = file.name if hasattr(file, "name") else str(file)
    is_valid, msg = validate_file(file_path)
    if not is_valid:
        raise gr.Error(msg)
    try:
        progress(0.10, desc="π Extracting text from documentβ¦")
        text = extract_text(file_path)
        # 50 chars is the minimum useful input for the RAG/script stages.
        if not text or len(text.strip()) < 50:
            raise gr.Error("Document has too little text. Please upload a richer file.")
        progress(0.20, desc="β Text extracted")
        return _run_pipeline(text, delivery_mode, song_rap_sub, progress)
    except gr.Error:
        # Already user-facing β re-raise untouched.
        raise
    except EnvironmentError as e:
        # e.g. missing HF_TOKEN β show the raw message.
        raise gr.Error(str(e))
    except Exception as e:
        # Anything else gets a formatted, user-friendly wrapper.
        raise gr.Error(format_error("pipeline", e))
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def process_url(url_input, delivery_mode, song_rap_sub, progress=gr.Progress()):
    """Tab 2 handler: URL / YouTube link β (script, audio_path).

    Ingestion (YouTube transcript or article scrape) happens in
    ingest_from_url_or_text; the rest is the shared _run_pipeline.
    """
    if not url_input or not url_input.strip():
        raise gr.Error("Please enter a URL or YouTube link.")
    try:
        progress(0.05, desc="π Fetching contentβ¦")
        text, source_label = ingest_from_url_or_text(url_input.strip())
        logger.info("Ingested from %s: %d chars", source_label, len(text))
        progress(0.20, desc=f"β Content fetched from {source_label}")
        return _run_pipeline(text, delivery_mode, song_rap_sub, progress)
    except gr.Error:
        # Already user-facing β re-raise untouched.
        raise
    except ValueError as e:
        # Ingestion raises ValueError with actionable, user-readable messages.
        raise gr.Error(str(e))
    except EnvironmentError as e:
        raise gr.Error(str(e))
    except Exception as e:
        raise gr.Error(format_error("pipeline", e))
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def process_paste(pasted_text, delivery_mode, song_rap_sub, progress=gr.Progress()):
    """Tab 3 handler: pasted raw text β (script, audio_path).

    extract_pasted_text normalises whitespace and rejects
    too-short input (ValueError); the rest is the shared pipeline.
    """
    if not pasted_text or not pasted_text.strip():
        raise gr.Error("Please paste some text first.")
    try:
        progress(0.10, desc="π Processing pasted textβ¦")
        text = extract_pasted_text(pasted_text)
        progress(0.20, desc="β Text ready")
        return _run_pipeline(text, delivery_mode, song_rap_sub, progress)
    except gr.Error:
        # Already user-facing β re-raise untouched.
        raise
    except ValueError as e:
        # Validation failures carry readable messages.
        raise gr.Error(str(e))
    except EnvironmentError as e:
        raise gr.Error(str(e))
    except Exception as e:
        raise gr.Error(format_error("pipeline", e))
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 162 |
+
# UI helpers
|
| 163 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 164 |
+
|
| 165 |
+
def _mode_description(mode: str) -> str:
    """Markdown blurb shown under the delivery-mode radio.

    Returns an empty string for any mode not in the known set.
    """
    blurbs = {
        "Summary": (
            "*π **Summary** β Structured narration: intro, key points, conclusion. "
            "Single voice, neutral tone.*"
        ),
        "Podcast": (
            "*ποΈ **Podcast** β Two-host conversation. Female host guides; "
            "Male host explains. Dual voices.*"
        ),
        "Song / Rap": (
            "*π΅ **Song / Rap** β Key ideas as a rhythmic track. "
            "Song = smooth flow Β· Rap = fast, punchy, bass-boosted.*"
        ),
        "Debate": (
            "*βοΈ **Debate** β Two debaters argue opposing sides. "
            "Female voice (pro, assertive) vs Male voice (con, deliberate).*"
        ),
        "Story": (
            "*π **Story** β Content retold as an immersive narrative. "
            "Slow, warm delivery with expressive pauses.*"
        ),
    }
    return blurbs.get(mode, "")
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
def _on_mode_change(mode: str):
    """Radio-change callback: toggle the Song/Rap sub-selector and refresh
    the mode description Markdown below the radio."""
    lowered = mode.lower()
    needs_sub_choice = ("song" in lowered) or ("rap" in lowered)
    return gr.update(visible=needs_sub_choice), _mode_description(mode)
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 196 |
+
# Gradio UI
|
| 197 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 198 |
+
|
| 199 |
+
def build_ui() -> gr.Blocks:
    """Construct and return the Gradio Blocks application.

    Layout: left column holds three input tabs (file / URL / paste) plus the
    delivery-mode selector; right column holds the audio player and the
    generated script. Event wiring at the bottom routes each tab's button to
    its handler and the mode radio to _on_mode_change.
    """

    # Custom CSS: gradient page header, card styling, hide the auto "Radio"
    # label, and a small muted hint style used on the URL tab.
    css = """
    .main-header { text-align: center; margin-bottom: 1rem; }
    .main-header h1 {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
        font-size: 2.5rem;
        font-weight: 800;
        margin-bottom: 0.25rem;
    }
    .main-header p { color: #6b7280; font-size: 1.1rem; }

    .mode-card {
        background: linear-gradient(135deg, #f8f7ff 0%, #f0edff 100%);
        border: 1px solid #e0d9ff;
        border-radius: 12px;
        padding: 1rem 1.25rem;
        margin-top: 0.75rem;
        margin-bottom: 0.75rem;
    }

    /* Hide the "Radio" label Gradio adds automatically */
    #delivery-mode-radio .label-wrap { display: none !important; }

    .url-hint { color: #6b7280; font-size: 0.82rem; margin-top: 0.3rem; }
    """

    with gr.Blocks(
        title="VoiceVerse AI",
        theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="purple"),
        css=css,
    ) as app:

        # ββ Header βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
        gr.HTML("""
        <div class="main-header">
            <h1>ποΈ VoiceVerse AI</h1>
            <p>Transform any content into an engaging audio experience</p>
        </div>
        """)

        with gr.Row(equal_height=False):

            # ββ LEFT COLUMN: input sources + delivery mode βββββββββββββββββββ
            with gr.Column(scale=1):

                gr.Markdown("### π₯ Choose Your Content Source")

                with gr.Tabs():

                    # ββ Tab 1: File upload ββββββββββββββββββββββββββββββββββββ
                    with gr.Tab("π File Upload"):
                        file_input = gr.File(
                            label="Upload a PDF or TXT file",
                            file_types=[".pdf", ".txt"],
                            type="filepath",
                        )
                        file_btn = gr.Button(
                            "ποΈ Generate Audio",
                            variant="primary",
                            size="lg",
                        )

                    # ββ Tab 2: URL / YouTube ββββββββββββββββββββββββββββββββββ
                    with gr.Tab("π URL"):
                        # label=None plus show_label=False hides the caption;
                        # the multi-line placeholder doubles as usage help.
                        url_input = gr.Textbox(
                            label=None,
                            placeholder=(
                                "Paste any link hereβ¦\n\n"
                                "βΆ YouTube: https://youtube.com/watch?v=...\n"
                                "π° Article: https://example.com/article\n"
                                "π Website: https://en.wikipedia.org/wiki/..."
                            ),
                            lines=5,
                            max_lines=6,
                            show_label=False,
                        )
                        gr.HTML(
                            "<p class='url-hint'>"
                            "β Works with: YouTube (with captions), news articles, "
                            "blogs, Wikipedia, most public pages.<br>"
                            "β Won't work: paywalled or login-required pages."
                            "</p>"
                        )
                        url_btn = gr.Button(
                            "ποΈ Generate Audio",
                            variant="primary",
                            size="lg",
                        )

                    # ββ Tab 3: Paste text βββββββββββββββββββββββββββββββββββββ
                    with gr.Tab("π Paste Text"):
                        paste_input = gr.Textbox(
                            label=None,
                            placeholder=(
                                "Paste any text here β article content, notes, "
                                "transcripts, research, anythingβ¦"
                            ),
                            lines=10,
                            max_lines=40,
                            show_label=False,
                        )
                        paste_btn = gr.Button(
                            "ποΈ Generate Audio",
                            variant="primary",
                            size="lg",
                        )

                # ββ Delivery Mode card ββββββββββββββββββββββββββββββββββββββββ
                gr.Markdown("### π¨ Choose Audio Experience")

                # Choice strings must match the comparisons in _run_pipeline.
                delivery_mode = gr.Radio(
                    choices=["Summary", "Podcast", "Song / Rap", "Debate", "Story"],
                    value="Summary",
                    show_label=False,  # removes the "Radio" label
                    elem_id="delivery-mode-radio",
                )

                # Song/Rap sub-option β hidden unless Song/Rap is selected
                # (toggled by _on_mode_change via the .change event below).
                with gr.Row(visible=False) as song_rap_row:
                    song_rap_sub = gr.Radio(
                        choices=["Song", "Rap"],
                        value="Rap",
                        label="Style",
                    )

                mode_description = gr.Markdown(value=_mode_description("Summary"))

            # ββ RIGHT COLUMN: outputs ββββββββββββββββββββββββββββββββββββββββ
            with gr.Column(scale=1):

                gr.Markdown("### π§ Generated Audio")
                audio_output = gr.Audio(
                    label="Audio",
                    type="filepath",
                    interactive=False,
                    show_download_button=True,
                )

                gr.Markdown("### βοΈ Generated Script")
                script_output = gr.Textbox(
                    label="Script",
                    lines=14,
                    max_lines=22,
                    interactive=False,
                    placeholder="Your generated script will appear hereβ¦",
                    show_copy_button=True,
                )

        # ββ Footer βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
        # NOTE(review): this footer credits SmolLM3-3B while the README's
        # model table credits Mistral-7B-Instruct β confirm against script_gen.py.
        gr.Markdown(
            "<center style='color:#9ca3af;margin-top:1rem;'>"
            "Built with β€οΈ using SmolLM3-3B Β· Qwen3-TTS Β· Edge-TTS Β· Gradio"
            "</center>"
        )

        # ββ Event wiring: mode toggle + one click handler per input tab ββββββ
        delivery_mode.change(
            fn=_on_mode_change,
            inputs=[delivery_mode],
            outputs=[song_rap_row, mode_description],
        )
        file_btn.click(
            fn=process_file,
            inputs=[file_input, delivery_mode, song_rap_sub],
            outputs=[script_output, audio_output],
        )
        url_btn.click(
            fn=process_url,
            inputs=[url_input, delivery_mode, song_rap_sub],
            outputs=[script_output, audio_output],
        )
        paste_btn.click(
            fn=process_paste,
            inputs=[paste_input, delivery_mode, song_rap_sub],
            outputs=[script_output, audio_output],
        )

    return app
|
| 381 |
+
|
| 382 |
+
|
| 383 |
+
# ββ Entry point βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 384 |
+
|
| 385 |
+
if __name__ == "__main__":
    # Script entry point: build the UI and serve it. 0.0.0.0:7860 is the
    # address/port Hugging Face Spaces expects for a Gradio app.
    logger.info("Starting VoiceVerse AIβ¦")
    app = build_ui()
    app.launch(
        server_name="0.0.0.0",   # bind all interfaces (required in the Space container)
        server_port=7860,
        share=False,             # no public gradio.live tunnel
        show_error=True,         # surface handler exceptions in the UI
    )
|
convert_to_word.ps1
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Convert the Markdown project report into a Word (.docx) document via the
# Word COM automation API. Windows-only; requires Microsoft Word installed.
# NOTE(review): both paths are hard-coded to one machine's profile β adjust
# before reusing this script elsewhere.

$markdownPath = "C:\Users\hp\.gemini\antigravity\brain\04f3e1c4-7b81-497c-a7c5-5d0513033dfa\project_report.md"
$wordPath = "C:\Users\hp\.gemini\antigravity\brain\04f3e1c4-7b81-497c-a7c5-5d0513033dfa\VoiceVerse_AI_Project_Report.docx"

if (-not (Test-Path $markdownPath)) {
    Write-Error "Markdown file not found at $markdownPath"
    exit 1
}

# -Raw reads the whole file as one string (split manually below).
$content = Get-Content -Path $markdownPath -Raw

# Create Word Object
try {
    $word = New-Object -ComObject Word.Application
    $word.Visible = $false                  # run Word headless
    $doc = $word.Documents.Add()
    $selection = $word.Selection

    # Basic Markdown Parsing (Simplified): line-by-line, headings mapped to
    # Word styles, everything else typed as Normal text.
    $lines = $content -split "`r?`n"
    foreach ($line in $lines) {
        if ($line -match "^# (.*)") {
            # "# "  β document Title style
            $selection.Style = "Title"
            $selection.TypeText($matches[1])
            $selection.TypeParagraph()
        } elseif ($line -match "^## (.*)") {
            # "## " β Heading 1
            $selection.Style = "Heading 1"
            $selection.TypeText($matches[1])
            $selection.TypeParagraph()
        } elseif ($line -match "^### (.*)") {
            # "### " β Heading 2
            $selection.Style = "Heading 2"
            $selection.TypeText($matches[1])
            $selection.TypeParagraph()
        } elseif ($line -match "^---") {
            # Skip horizontal rules or add a page break?
            # For now just skip
        } elseif ($line -match "^\|") {
            # Table handling is complex, for now just TypeText
            $selection.Style = "Normal"
            $selection.TypeText($line)
            $selection.TypeParagraph()
        } else {
            $selection.Style = "Normal"
            # Remove bold/italic markers for cleaner look
            $cleanLine = $line -replace "\*\*", "" -replace "\*", ""
            $selection.TypeText($cleanLine)
            $selection.TypeParagraph()
        }
    }

    $doc.SaveAs([ref]$wordPath)
    $doc.Close()
    $word.Quit()
    Write-Host "Word document created successfully at $wordPath"
} catch {
    Write-Error "Failed to create Word document: $_"
    # Best-effort cleanup so a Word process isn't left running in the background.
    if ($word) { $word.Quit() }
}
|
gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
ingestion.py
ADDED
|
@@ -0,0 +1,217 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
VoiceVerse AI β Content Ingestion Module.
|
| 3 |
+
|
| 4 |
+
Handles all input sources beyond file upload:
|
| 5 |
+
- YouTube links β transcript via youtube-transcript-api
|
| 6 |
+
- Article / website β readable text via trafilatura + BeautifulSoup fallback
|
| 7 |
+
- Pasted raw text β light cleaning and validation
|
| 8 |
+
|
| 9 |
+
Returns plain text string that feeds into RAGStore.add_document().
|
| 10 |
+
rag.py is completely unchanged.
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import re
|
| 14 |
+
import urllib.parse
|
| 15 |
+
from utils import logger
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 19 |
+
# URL type detection
|
| 20 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 21 |
+
|
| 22 |
+
def _is_youtube(url: str) -> bool:
    """Return True if *url* points at a YouTube property.

    Fixes two defects in the original:
    - ``host.replace("www.", "")`` removed the first "www." occurring
      *anywhere* in the host, not just a leading prefix.
    - Subdomain hosts such as m.youtube.com / music.youtube.com were not
      recognised; they are accepted now via the suffix check (backward
      compatible β everything previously accepted is still accepted).
    """
    host = urllib.parse.urlparse(url.strip()).netloc.lower()
    if host.startswith("www."):
        host = host[4:]
    return host in ("youtube.com", "youtu.be") or host.endswith(".youtube.com")
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def _extract_youtube_id(url: str) -> str | None:
|
| 29 |
+
patterns = [
|
| 30 |
+
r"(?:v=)([a-zA-Z0-9_-]{11})",
|
| 31 |
+
r"youtu\.be/([a-zA-Z0-9_-]{11})",
|
| 32 |
+
r"embed/([a-zA-Z0-9_-]{11})",
|
| 33 |
+
r"shorts/([a-zA-Z0-9_-]{11})",
|
| 34 |
+
]
|
| 35 |
+
for pattern in patterns:
|
| 36 |
+
match = re.search(pattern, url)
|
| 37 |
+
if match:
|
| 38 |
+
return match.group(1)
|
| 39 |
+
return None
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 43 |
+
# YouTube transcript
|
| 44 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 45 |
+
|
| 46 |
+
def extract_youtube(url: str) -> str:
    """Fetch and clean the caption transcript of a YouTube video.

    Preference order: English manual captions, then English auto-generated,
    then the first transcript of any language.

    Args:
        url: Any YouTube URL form (watch, youtu.be, embed, shorts).

    Returns:
        The transcript as a single cleaned plain-text string.

    Raises:
        ImportError: youtube-transcript-api is not installed.
        ValueError: no video ID in the URL, no transcript available,
            or the transcript is too short to be useful.
    """
    try:
        from youtube_transcript_api import (
            YouTubeTranscriptApi, NoTranscriptFound, TranscriptsDisabled
        )
    except ImportError as exc:
        # Chain the cause so the real import failure stays in the traceback.
        raise ImportError(
            "youtube-transcript-api is not installed. "
            "Add 'youtube-transcript-api' to requirements.txt and restart the Space."
        ) from exc

    video_id = _extract_youtube_id(url)
    if not video_id:
        raise ValueError(f"Could not extract a YouTube video ID from: {url}")

    logger.info("Fetching YouTube transcript: video_id=%s", video_id)

    try:
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

        # Prefer English manual captions, then English auto, then anything available
        try:
            transcript = transcript_list.find_manually_created_transcript(
                ["en", "en-US", "en-GB"]
            )
        except NoTranscriptFound:
            try:
                transcript = transcript_list.find_generated_transcript(
                    ["en", "en-US", "en-GB"]
                )
            except NoTranscriptFound:
                transcript = next(iter(transcript_list))
                logger.info("No English transcript β using: %s", transcript.language)

        entries = transcript.fetch()
        # Robustness fix: newer releases of youtube-transcript-api return
        # snippet objects exposing `.text` instead of dicts β support both
        # so the module works regardless of the pinned version.
        text = " ".join(
            entry["text"] if isinstance(entry, dict) else entry.text
            for entry in entries
        )

        # Clean YouTube caption artifacts
        text = re.sub(r"\[.*?\]", "", text)  # [Music], [Applause] etc.
        text = re.sub(r"\s{2,}", " ", text).strip()

        if len(text) < 50:
            raise ValueError("YouTube transcript is too short to process.")

        logger.info("YouTube transcript: %d chars", len(text))
        return text

    except (NoTranscriptFound, TranscriptsDisabled) as e:
        # Chain the original library error (fix: was raised without `from e`).
        raise ValueError(
            "No transcript available for this video. "
            "The video may have captions disabled or be private.\n\n"
            "Tip: Copy the article/video text manually and use the Paste Text tab instead."
        ) from e
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 102 |
+
# Article / website URL
|
| 103 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 104 |
+
|
| 105 |
+
def extract_url(url: str) -> str:
    """
    Fetch a webpage and extract readable text.

    Tries trafilatura first (best article extractor), falls back to
    requests + BeautifulSoup.

    Returns:
        The page's readable text (at least ~100 chars).

    Raises:
        ValueError: the page could not be fetched or yields too little text.
    """
    url = url.strip()
    logger.info("Fetching URL: %s", url)

    # Browser-like UA: many sites serve blocked/empty pages to default clients.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0.0.0 Safari/537.36"
        )
    }

    # ββ Attempt 1: trafilatura ββββββββββββββββββββββββββββββββββββββββββββββββ
    try:
        import trafilatura
        downloaded = trafilatura.fetch_url(url)
        if downloaded:
            text = trafilatura.extract(
                downloaded,
                include_comments=False,
                include_tables=True,
                no_fallback=False,
            )
            # Require a meaningful amount of text before accepting this path.
            if text and len(text.strip()) > 100:
                logger.info("trafilatura extracted %d chars", len(text))
                return text.strip()
    except Exception as e:
        # Best-effort: log and fall through to the BeautifulSoup attempt.
        logger.warning("trafilatura failed (%s) β trying BeautifulSoup", e)

    # ββ Attempt 2: requests + BeautifulSoup ββββββββββββββββββββββββββββββββββ
    try:
        import requests
        from bs4 import BeautifulSoup

        resp = requests.get(url, headers=headers, timeout=15)
        resp.raise_for_status()

        soup = BeautifulSoup(resp.text, "html.parser")
        # Drop boilerplate containers before extracting visible text.
        for tag in soup(["script", "style", "nav", "footer", "header",
                         "aside", "form", "noscript", "iframe"]):
            tag.decompose()

        # Prefer semantic article/main containers over the whole body.
        article = soup.find("article") or soup.find("main") or soup.find("body")
        text = (
            article.get_text(separator=" ", strip=True)
            if article
            else soup.get_text(separator=" ", strip=True)
        )
        text = re.sub(r"\s{3,}", "\n\n", text)
        text = re.sub(r" {2,}", " ", text).strip()

        if len(text) < 100:
            raise ValueError("Could not extract enough text from this page.")

        logger.info("BeautifulSoup extracted %d chars", len(text))
        return text

    except Exception as e:
        # Fix: chain the cause (`from e`) so logs keep the underlying
        # network/parse error instead of swallowing it.
        raise ValueError(
            f"Could not fetch content from: {url}\n\n"
            f"Reason: {e}\n\n"
            "The page may require a login or block bots. "
            "Try copying the article text and pasting it in the Paste Text tab."
        ) from e
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 176 |
+
# Pasted raw text
|
| 177 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 178 |
+
|
| 179 |
+
def extract_pasted_text(text: str) -> str:
    """
    Validate and normalise raw text pasted by the user.

    Converts Windows / old-Mac line endings to \\n, collapses excessive
    blank lines and runs of spaces, and enforces a minimum length.

    Raises:
        ValueError: if nothing (or too little) was pasted.
    """
    if not text or not text.strip():
        raise ValueError("No text was pasted. Please paste some content.")

    normalised = text.replace("\r\n", "\n").replace("\r", "\n")
    # Allow at most two consecutive blank lines; squeeze space runs.
    normalised = re.sub(r"\n{4,}", "\n\n\n", normalised)
    normalised = re.sub(r" {2,}", " ", normalised).strip()

    if len(normalised) < 50:
        raise ValueError(
            "Pasted text is too short. Please paste at least a paragraph of content."
        )

    logger.info("Pasted text ingested: %d chars", len(normalised))
    return normalised
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 197 |
+
# Unified entry point
|
| 198 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 199 |
+
|
| 200 |
+
def ingest_from_url_or_text(raw_input: str) -> tuple[str, str]:
    """
    Dispatch free-form user input to the right extractor.

    Detects whether the input is a YouTube link, a generic article URL,
    or plain pasted text, and routes it accordingly.

    Returns:
        (extracted_text, source_label)

    Raises:
        ValueError: if the input is empty.
    """
    cleaned = raw_input.strip()
    if not cleaned:
        raise ValueError("Please enter a URL or paste some text.")

    # Anything that doesn't start with http(s):// is treated as pasted text.
    if not re.match(r"https?://", cleaned, re.IGNORECASE):
        return extract_pasted_text(cleaned), "Pasted Text"

    if _is_youtube(cleaned):
        return extract_youtube(cleaned), "YouTube"
    return extract_url(cleaned), "Article / Website"
|
packages.txt
ADDED
|
File without changes
|
rag.py
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
VoiceVerse AI β RAG Pipeline.
|
| 3 |
+
|
| 4 |
+
Handles document ingestion, text chunking, embedding generation,
|
| 5 |
+
and semantic retrieval using an in-memory vector store.
|
| 6 |
+
|
| 7 |
+
Models used:
|
| 8 |
+
- sentence-transformers/all-MiniLM-L6-v2 for embeddings (22 MB, CPU-friendly)
|
| 9 |
+
|
| 10 |
+
Design decisions:
|
| 11 |
+
- NumPy cosine similarity instead of FAISS to avoid heavy native deps
|
| 12 |
+
- Overlapping chunks to preserve context across boundaries
|
| 13 |
+
- Single-document architecture (clear store on new upload)
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
import os
|
| 17 |
+
import numpy as np
|
| 18 |
+
from utils import logger
|
| 19 |
+
|
| 20 |
+
# ββ Text Extraction ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 21 |
+
|
| 22 |
+
def extract_text(file_path: str) -> str:
    """
    Extract the full plain-text content of a supported document.

    Dispatches on file extension: ``.pdf`` → PyMuPDF, ``.txt`` → plain read.

    Raises:
        ValueError: for any other extension.
    """
    extension = os.path.splitext(file_path)[1].lower()

    if extension == ".pdf":
        return _extract_pdf(file_path)
    if extension == ".txt":
        return _extract_txt(file_path)
    raise ValueError(f"Unsupported file type: {extension}")
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def _extract_pdf(file_path: str) -> str:
    """Extract text from every page of a PDF via PyMuPDF (fitz)."""
    import fitz  # PyMuPDF

    pages: list[str] = []
    with fitz.open(file_path) as doc:
        for index, page in enumerate(doc):
            content = page.get_text("text")
            # Skip pages with no extractable text (e.g. pure images).
            if content.strip():
                pages.append(content)
                logger.debug("Extracted page %d: %d chars", index + 1, len(content))

    full_text = "\n\n".join(pages)
    logger.info("PDF extraction complete: %d pages, %d chars total",
                len(pages), len(full_text))
    return full_text
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def _extract_txt(file_path: str) -> str:
|
| 56 |
+
"""Read plain text file with encoding fallback."""
|
| 57 |
+
for encoding in ("utf-8", "utf-8-sig", "latin-1", "cp1252"):
|
| 58 |
+
try:
|
| 59 |
+
with open(file_path, "r", encoding=encoding) as f:
|
| 60 |
+
text = f.read()
|
| 61 |
+
logger.info("TXT extraction complete (%s): %d chars", encoding, len(text))
|
| 62 |
+
return text
|
| 63 |
+
except UnicodeDecodeError:
|
| 64 |
+
continue
|
| 65 |
+
raise ValueError("Could not decode the text file with any supported encoding.")
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
# ββ Text Chunking ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 69 |
+
|
| 70 |
+
def chunk_text(text: str, chunk_size: int = 512, overlap: int = 50) -> list[str]:
    """
    Split text into overlapping, roughly ``chunk_size``-character chunks.

    Prefers to end each chunk at a sentence terminator (. ! ? followed by
    a space or end-of-text) found in the second half of the window, so
    chunks stay coherent. Consecutive chunks share ~``overlap`` characters
    so context survives the boundary.
    """
    if not text or not text.strip():
        return []

    # Collapse all whitespace runs to single spaces.
    normalised = " ".join(text.split())
    total = len(normalised)

    pieces: list[str] = []
    start = 0
    while start < total:
        end = start + chunk_size

        if end < total:
            # Scan backwards from the window end, no further back than
            # the window midpoint, looking for a sentence terminator.
            floor = max(start + chunk_size // 2, start)
            boundary = -1
            for pos in range(min(end, total) - 1, floor - 1, -1):
                is_terminator = normalised[pos] in ".!?"
                followed_by_gap = pos + 1 >= total or normalised[pos + 1] == " "
                if is_terminator and followed_by_gap:
                    boundary = pos
                    break
            if boundary > start:
                end = boundary + 1

        piece = normalised[start:end].strip()
        if piece:
            pieces.append(piece)

        # Step forward keeping `overlap` chars of context; max() guarantees
        # forward progress even when the chunk collapses to nothing.
        start = max(start + 1, end - overlap)

    logger.info("Chunking complete: %d chunks (size=%d, overlap=%d)",
                len(pieces), chunk_size, overlap)
    return pieces
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
# ββ Embedding & Vector Store βββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 115 |
+
|
| 116 |
+
class RAGStore:
    """
    Minimal in-memory vector store.

    Embeds document chunks with sentence-transformers and answers queries
    via cosine similarity computed with NumPy — vectors are normalised at
    encode time, so similarity reduces to a plain dot product.

    Usage:
        store = RAGStore()
        store.add_document("full document text here")
        results = store.query("what is this about?", top_k=5)
    """

    MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

    def __init__(self):
        # Model is loaded lazily on first access (see `model` property).
        self._model = None
        self.chunks: list[str] = []
        self.embeddings: np.ndarray | None = None

    @property
    def model(self):
        """Load and cache the embedding model on first access."""
        if self._model is None:
            logger.info("Loading embedding model: %s", self.MODEL_NAME)
            from sentence_transformers import SentenceTransformer
            self._model = SentenceTransformer(self.MODEL_NAME)
            logger.info("Embedding model loaded successfully")
        return self._model

    def clear(self):
        """Drop all chunks and embeddings so a new document can be loaded."""
        self.chunks = []
        self.embeddings = None

    def add_document(self, text: str, chunk_size: int = 512, overlap: int = 50):
        """
        Chunk ``text``, embed every chunk, and store the results.

        Any previously stored document is discarded first.

        Raises:
            ValueError: if no usable chunks could be produced.
        """
        self.clear()

        self.chunks = chunk_text(text, chunk_size=chunk_size, overlap=overlap)
        if not self.chunks:
            raise ValueError("No text chunks could be extracted from the document.")

        logger.info("Generating embeddings for %d chunks...", len(self.chunks))
        # normalize_embeddings=True lets query() use raw dot products as
        # cosine similarity without re-normalising.
        self.embeddings = self.model.encode(
            self.chunks,
            show_progress_bar=False,
            convert_to_numpy=True,
            normalize_embeddings=True,
        )
        logger.info("Embeddings generated: shape %s", self.embeddings.shape)

    def query(self, question: str, top_k: int = 5) -> list[str]:
        """
        Return the ``top_k`` chunks most similar to ``question``.

        Returns an empty list when no document has been loaded.
        """
        if self.embeddings is None or len(self.chunks) == 0:
            return []

        question_vec = self.model.encode(
            [question],
            convert_to_numpy=True,
            normalize_embeddings=True,
        )

        # Dot product == cosine similarity on pre-normalised vectors.
        scores = np.dot(self.embeddings, question_vec.T).flatten()

        k = min(top_k, len(self.chunks))
        best = np.argsort(scores)[-k:][::-1]

        hits = [self.chunks[i] for i in best]
        logger.info("Retrieved %d chunks (top similarity: %.3f)",
                    len(hits), scores[best[0]])
        return hits

    def get_all_chunks(self) -> list[str]:
        """Return a copy of every stored chunk."""
        return self.chunks.copy()
|
requirements.txt
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio>=5.23.1,<6.0
|
| 2 |
+
huggingface-hub>=0.25
|
| 3 |
+
pydantic>=2.0,<2.11
|
| 4 |
+
sentence-transformers
|
| 5 |
+
numpy
|
| 6 |
+
PyMuPDF
|
| 7 |
+
edge-tts
|
| 8 |
+
scipy
|
| 9 |
+
pydub
|
| 10 |
+
requests
|
| 11 |
+
beautifulsoup4
|
| 12 |
+
trafilatura
|
| 13 |
+
youtube-transcript-api
|
script_gen.py
ADDED
|
@@ -0,0 +1,310 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
VoiceVerse AI β Script Generation Module.
|
| 3 |
+
|
| 4 |
+
Delivery Modes:
|
| 5 |
+
Summary β single-speaker structured narration
|
| 6 |
+
Podcast β HOST_1 / HOST_2 two-host dialogue
|
| 7 |
+
Song/Rap β rhythmic retention content
|
| 8 |
+
Debate β DEBATER_A (female, for) vs DEBATER_B (male, against) structured debate
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import os
|
| 12 |
+
import re
|
| 13 |
+
from huggingface_hub import InferenceClient
|
| 14 |
+
from utils import logger
|
| 15 |
+
|
| 16 |
+
MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
|
| 17 |
+
MAX_NEW_TOKENS = 1200
|
| 18 |
+
TEMPERATURE = 0.5
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 22 |
+
# Prompts
|
| 23 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 24 |
+
|
| 25 |
+
# ββ Summary βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 26 |
+
_SUMMARY_SYSTEM = """\
|
| 27 |
+
You are a professional narrator. Produce a clear spoken summary strictly from the source material.
|
| 28 |
+
RULES:
|
| 29 |
+
1. Use ONLY facts from the source. Do NOT add outside knowledge.
|
| 30 |
+
2. Structure: short intro β key points as natural spoken sentences β concise conclusion.
|
| 31 |
+
3. Plain text only β no markdown, no bullets, no headers.
|
| 32 |
+
4. Write for the ear: short sentences, conversational tone.
|
| 33 |
+
5. Never say "the document says". Speak as the expert.
|
| 34 |
+
6. Output ONLY the narration text, nothing else."""
|
| 35 |
+
|
| 36 |
+
_SUMMARY_USER = """\
|
| 37 |
+
SOURCE MATERIAL:
|
| 38 |
+
{context}
|
| 39 |
+
|
| 40 |
+
Write a flowing spoken summary (intro, key points, conclusion) in plain sentences."""
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
# ββ Podcast βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 44 |
+
_PODCAST_SYSTEM = """\
|
| 45 |
+
You are a podcast script writer. Write a two-host conversation strictly from the source material.
|
| 46 |
+
|
| 47 |
+
STRICT FORMAT β every single line must start with a speaker tag:
|
| 48 |
+
HOST_1: <what Host 1 says>
|
| 49 |
+
HOST_2: <what Host 2 says>
|
| 50 |
+
|
| 51 |
+
RULES:
|
| 52 |
+
1. Alternate HOST_1 and HOST_2. Never same host twice in a row.
|
| 53 |
+
2. HOST_1 introduces topics and asks questions.
|
| 54 |
+
3. HOST_2 explains concepts and answers.
|
| 55 |
+
4. Use ONLY information from the source. No hallucination.
|
| 56 |
+
5. Conversational, engaging tone.
|
| 57 |
+
6. No markdown, no stage directions, no lines without a HOST tag.
|
| 58 |
+
7. Aim for 16β24 exchanges."""
|
| 59 |
+
|
| 60 |
+
_PODCAST_USER = """\
|
| 61 |
+
SOURCE MATERIAL:
|
| 62 |
+
{context}
|
| 63 |
+
|
| 64 |
+
Write the full podcast. Every line must start with HOST_1: or HOST_2:"""
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
# ββ Song / Rap ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 68 |
+
_SONG_SYSTEM = """\
|
| 69 |
+
You are a lyricist. Two steps:
|
| 70 |
+
STEP 1 β silently extract 5β7 key ideas from the source.
|
| 71 |
+
STEP 2 β write a smooth melodic SONG from those ideas.
|
| 72 |
+
|
| 73 |
+
RULES:
|
| 74 |
+
- Simple memorable language, rhyming couplets (AABB).
|
| 75 |
+
- Label sections [VERSE 1], [VERSE 2], [CHORUS].
|
| 76 |
+
- [CHORUS] repeats the main concept.
|
| 77 |
+
- Short lines (6β10 words). Use repetition.
|
| 78 |
+
- Do NOT invent facts not in the source.
|
| 79 |
+
- Output ONLY the lyrics with section labels."""
|
| 80 |
+
|
| 81 |
+
_RAP_SYSTEM = """\
|
| 82 |
+
You are a lyricist. Two steps:
|
| 83 |
+
STEP 1 β silently extract 5β7 key ideas from the source.
|
| 84 |
+
STEP 2 β write a punchy rhythmic RAP from those ideas.
|
| 85 |
+
|
| 86 |
+
RULES:
|
| 87 |
+
- Short punchy lines (5β8 words), fast-flow rhyme (AABB or ABAB).
|
| 88 |
+
- Label sections [VERSE 1], [VERSE 2], [HOOK].
|
| 89 |
+
- [HOOK] repeats the main concept.
|
| 90 |
+
- Wordplay and repetition to aid retention.
|
| 91 |
+
- Do NOT invent facts not in the source.
|
| 92 |
+
- Output ONLY the lyrics with section labels."""
|
| 93 |
+
|
| 94 |
+
_SONG_RAP_USER = """\
|
| 95 |
+
SOURCE MATERIAL:
|
| 96 |
+
{context}
|
| 97 |
+
|
| 98 |
+
Extract the key ideas, then write the full {form}."""
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
# ββ Debate ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 102 |
+
_DEBATE_SYSTEM = """\
|
| 103 |
+
You are a debate script writer. Write a structured two-person debate strictly grounded \
|
| 104 |
+
in the provided source material.
|
| 105 |
+
|
| 106 |
+
STRICT FORMAT β every single line must start with a speaker tag:
|
| 107 |
+
DEBATER_A: <what Debater A says>
|
| 108 |
+
DEBATER_B: <what Debater B says>
|
| 109 |
+
|
| 110 |
+
CHARACTER PROFILES:
|
| 111 |
+
- DEBATER_A: Takes the PRO / supporting position. Tone is confident, optimistic, forward-thinking.
|
| 112 |
+
- DEBATER_B: Takes the CON / critical position. Tone is skeptical, cautious, questioning.
|
| 113 |
+
|
| 114 |
+
DEBATE STRUCTURE:
|
| 115 |
+
1. DEBATER_A opens with a strong statement supporting the topic.
|
| 116 |
+
2. DEBATER_B immediately challenges with a counterpoint.
|
| 117 |
+
3. They alternate, each directly responding to the other's previous point.
|
| 118 |
+
4. Both use evidence and logic from the source material only.
|
| 119 |
+
5. End with each debater giving a brief closing statement.
|
| 120 |
+
|
| 121 |
+
RULES:
|
| 122 |
+
- Alternate DEBATER_A and DEBATER_B. Never same debater twice in a row.
|
| 123 |
+
- Use ONLY information from the source material. No hallucination.
|
| 124 |
+
- Each turn should be 1β3 sentences β punchy, not long speeches.
|
| 125 |
+
- No markdown, no stage directions, no narration outside the speaker tags.
|
| 126 |
+
- Aim for 16β22 exchanges total."""
|
| 127 |
+
|
| 128 |
+
_DEBATE_USER = """\
|
| 129 |
+
SOURCE MATERIAL:
|
| 130 |
+
{context}
|
| 131 |
+
|
| 132 |
+
Write the full debate on the key topics from this material. \
|
| 133 |
+
Every line must start with DEBATER_A: or DEBATER_B:"""
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
# ββ Story βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 137 |
+
_STORY_SYSTEM = """\
|
| 138 |
+
You are a master storyteller. Retell the ideas from the source material as an \
|
| 139 |
+
immersive narrative story written for slow, expressive audio delivery.
|
| 140 |
+
|
| 141 |
+
RULES:
|
| 142 |
+
1. Transform factual content into a story β use characters, scenes, a narrative arc \
|
| 143 |
+
(beginning, middle, end). Characters can be fictional stand-ins for real concepts.
|
| 144 |
+
2. Use ONLY information and ideas from the source. Do NOT invent new facts.
|
| 145 |
+
3. Warm, descriptive storytelling voice. Vivid but calm.
|
| 146 |
+
4. Short paragraphs, 1β3 sentences each, separated by blank lines.
|
| 147 |
+
5. Plain text only β no markdown, no bullets, no headers.
|
| 148 |
+
6. Begin with an evocative scene-setting sentence.
|
| 149 |
+
7. End with a closing reflection or lesson drawn from the source.
|
| 150 |
+
8. Output ONLY the story text, nothing else."""
|
| 151 |
+
|
| 152 |
+
_STORY_USER = """\
|
| 153 |
+
SOURCE MATERIAL:
|
| 154 |
+
{context}
|
| 155 |
+
|
| 156 |
+
Transform this into a rich narrative story for slow, expressive audio. \
|
| 157 |
+
Use short paragraphs with blank lines between them."""
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 161 |
+
# Post-processing
|
| 162 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 163 |
+
|
| 164 |
+
def _clean(text: str) -> str:
|
| 165 |
+
"""Remove all markdown and XML artifacts from LLM output."""
|
| 166 |
+
text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
|
| 167 |
+
text = re.sub(r"<[^>]+>", "", text)
|
| 168 |
+
text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE)
|
| 169 |
+
text = re.sub(r"\*{1,3}([^*]+)\*{1,3}", r"\1", text)
|
| 170 |
+
text = re.sub(r"_{1,3}([^_]+)_{1,3}", r"\1", text)
|
| 171 |
+
text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)
|
| 172 |
+
text = re.sub(r"```[^`]*```", "", text, flags=re.DOTALL)
|
| 173 |
+
text = re.sub(r"`([^`]+)`", r"\1", text)
|
| 174 |
+
text = re.sub(r"^[\s]*[-*+]\s+", "", text, flags=re.MULTILINE)
|
| 175 |
+
text = re.sub(r"^[\s]*\d+\.\s+", "", text, flags=re.MULTILINE)
|
| 176 |
+
text = re.sub(r"^>\s+", "", text, flags=re.MULTILINE)
|
| 177 |
+
text = re.sub(r"^[-*_]{3,}\s*$", "", text, flags=re.MULTILINE)
|
| 178 |
+
text = re.sub(r"\n{3,}", "\n\n", text)
|
| 179 |
+
text = re.sub(r" {2,}", " ", text)
|
| 180 |
+
return text.strip()
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
def _clean_dialogue(text: str, tag_a: str, tag_b: str) -> str:
    """
    Clean dialogue-mode output (podcast or debate).

    Runs the generic markdown cleaner, normalises the speaker-tag
    variants the model tends to emit, then drops every line that does
    not begin with one of the two expected tags (blank lines are kept).
    """
    text = _clean(text)

    # Map sloppy tag spellings onto the canonical tags.
    if tag_a == "HOST_1":
        text = re.sub(r"(?i)\bhost[\s_-]*1\s*:", "HOST_1:", text)
        text = re.sub(r"(?i)\bhost[\s_-]*2\s*:", "HOST_2:", text)
    elif tag_a == "DEBATER_A":
        text = re.sub(r"(?i)\bdebater[\s_-]*a\s*:", "DEBATER_A:", text)
        text = re.sub(r"(?i)\bdebater[\s_-]*b\s*:", "DEBATER_B:", text)
        # The model sometimes labels sides "Pro/Con" or "Speaker A/B".
        text = re.sub(r"(?i)\bpro\s*:", "DEBATER_A:", text)
        text = re.sub(r"(?i)\bcon\s*:", "DEBATER_B:", text)
        text = re.sub(r"(?i)\bspeaker[\s_-]*a\s*:", "DEBATER_A:", text)
        text = re.sub(r"(?i)\bspeaker[\s_-]*b\s*:", "DEBATER_B:", text)

    prefix_a, prefix_b = f"{tag_a}:", f"{tag_b}:"

    def _keep(line: str) -> bool:
        stripped = line.strip()
        return stripped == "" or stripped.startswith((prefix_a, prefix_b))

    kept = [line for line in text.splitlines() if _keep(line)]
    return "\n".join(kept).strip()
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 215 |
+
# LLM client
|
| 216 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 217 |
+
|
| 218 |
+
def _get_client() -> InferenceClient:
    """
    Build a Hugging Face inference client from the HF_TOKEN env variable.

    Raises:
        EnvironmentError: if no token is configured.
    """
    token = os.environ.get("HF_TOKEN")
    if token:
        return InferenceClient(provider="hf-inference", token=token)
    raise EnvironmentError(
        "HF_TOKEN not set. Add your Hugging Face token as a Space secret."
    )
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
def _call_llm(system: str, user: str) -> str:
    """
    Run a single chat completion against MODEL_ID and return the raw text.

    Args:
        system: system prompt (mode-specific instructions).
        user:   user prompt containing the source material.

    Raises:
        RuntimeError: if the model returns no usable text.
    """
    client = _get_client()
    response = client.chat_completion(
        model=MODEL_ID,
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
        max_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        top_p=0.9,
    )
    # `content` can legitimately be None (refusal / empty choice), which
    # would make a bare .strip() raise AttributeError — coalesce first.
    raw = (response.choices[0].message.content or "").strip()
    if not raw:
        raise RuntimeError("Model returned empty response. Please try again.")
    return raw
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 246 |
+
# Public entry point
|
| 247 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 248 |
+
|
| 249 |
+
def generate_script(
    context_chunks: list[str],
    mode: str = "Summary",
    sub_mode: str = "Rap",
    topic: str = "the key ideas from this document",
) -> str:
    """
    Generate a spoken script from RAG chunks.

    Args:
        context_chunks : chunks from RAGStore — NOT modified here.
        mode           : "Summary" | "Podcast" | "Song / Rap" | "Debate" | "Story"
                         (unknown values fall back to Summary).
        sub_mode       : "Song" | "Rap" — only consulted in Song/Rap mode.
        topic          : currently unused; kept for backward compatibility.

    Returns:
        Clean string ready for tts.generate_audio() or
        tts.generate_audio_podcast(). Podcast/Debate modes preserve the
        HOST_1/HOST_2 or DEBATER_A/DEBATER_B speaker tags.

    Raises:
        ValueError:   if no context chunks were provided.
        RuntimeError: if the model output is empty after cleaning.
    """
    if not context_chunks:
        raise ValueError("No document context. Please upload or paste content first.")

    context = "\n\n".join(context_chunks)
    # Keep the prompt within the model's comfortable context budget.
    if len(context) > 6000:
        context = context[:6000]
        logger.warning("Context truncated to 6000 chars")

    logger.info("generate_script | mode=%s sub_mode=%s ctx=%d chars", mode, sub_mode, len(context))

    m = mode.strip().lower()

    if m == "podcast":
        raw = _call_llm(_PODCAST_SYSTEM, _PODCAST_USER.format(context=context))
        script = _clean_dialogue(raw, "HOST_1", "HOST_2")

    elif "song" in m or "rap" in m:
        form = sub_mode.lower()
        sys_prompt = _SONG_SYSTEM if form == "song" else _RAP_SYSTEM
        raw = _call_llm(sys_prompt, _SONG_RAP_USER.format(context=context, form=form))
        script = _clean(raw)

    elif "debate" in m:
        raw = _call_llm(_DEBATE_SYSTEM, _DEBATE_USER.format(context=context))
        script = _clean_dialogue(raw, "DEBATER_A", "DEBATER_B")

    elif "story" in m:
        raw = _call_llm(_STORY_SYSTEM, _STORY_USER.format(context=context))
        script = _clean(raw)

    else:
        # "summary" and any unrecognised mode both land here.
        if m != "summary":
            logger.warning("Unknown mode '%s' → falling back to Summary", mode)
        raw = _call_llm(_SUMMARY_SYSTEM, _SUMMARY_USER.format(context=context))
        script = _clean(raw)

    if not script:
        raise RuntimeError("Script was empty after cleaning. Please try again.")

    logger.info("Script ready: %d chars", len(script))
    return script
|
tts.py
ADDED
|
@@ -0,0 +1,293 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
VoiceVerse AI β TTS Module.
|
| 3 |
+
|
| 4 |
+
Primary: Qwen3-TTS via HF Inference API
|
| 5 |
+
Fallback: Edge-TTS (CPU, no key needed)
|
| 6 |
+
|
| 7 |
+
Voice + audio style per mode:
|
| 8 |
+
Summary β neutral female voice, normal rate
|
| 9 |
+
Podcast β HOST_1 female (AriaNeural) / HOST_2 male (GuyNeural)
|
| 10 |
+
Rap β male voice, faster rate (+40%), bass boost via pydub
|
| 11 |
+
Song β female voice, normal rate
|
| 12 |
+
Debate β DEBATER_A female (JennyNeural, +5%) / DEBATER_B male (DavisNeural, -5%)
|
| 13 |
+
Story β female voice, slow rate (-30%), long silence gaps between sentences
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
import os
|
| 17 |
+
import re
|
| 18 |
+
import asyncio
|
| 19 |
+
from utils import logger, get_temp_filepath
|
| 20 |
+
|
| 21 |
+
# Primary TTS model id used via the HF Inference API (see _qwen_tts).
QWEN_TTS_MODEL = "Qwen/Qwen3-TTS"
# Hard cap on characters sent to any TTS backend per request.
TTS_MAX_CHARS = 3000

# ββ Voice assignments βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# Summary / Song / Story β single female voice
EDGE_VOICE_FEMALE = "en-US-AriaNeural"

# Podcast
EDGE_VOICE_HOST_FEMALE = "en-US-AriaNeural"   # HOST_1 β female
EDGE_VOICE_HOST_MALE = "en-US-GuyNeural"      # HOST_2 β male

# Rap β male voice reads the rap
EDGE_VOICE_RAP = "en-US-GuyNeural"
RAP_RATE = "+40%"   # fast delivery (SSML prosody rate delta)

# Debate
EDGE_VOICE_DEBATER_A = "en-US-JennyNeural"    # female, pro β assertive
EDGE_VOICE_DEBATER_B = "en-US-DavisNeural"    # male, con β skeptical
DEBATE_RATE_A = "+8%"   # slightly faster
DEBATE_RATE_B = "-5%"   # slightly slower, deliberate

# Story β slow, warm delivery
EDGE_VOICE_STORY = "en-US-AriaNeural"
STORY_RATE = "-30%"   # noticeably slower
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 48 |
+
# Low-level TTS helpers
|
| 49 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 50 |
+
|
| 51 |
+
def _qwen_tts(text: str) -> str | None:
    """
    Try Qwen3-TTS over the HF Inference API.

    Returns the path to a generated .wav file, or None when no HF_TOKEN
    is set, the API returns nothing, or any error occurs (best-effort).
    """
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        return None
    try:
        from huggingface_hub import InferenceClient

        client = InferenceClient(token=hf_token)
        audio_bytes = client.text_to_speech(
            text=text[:TTS_MAX_CHARS], model=QWEN_TTS_MODEL
        )
        if not audio_bytes:
            return None

        out_path = get_temp_filepath(suffix=".wav")
        with open(out_path, "wb") as fh:
            fh.write(audio_bytes)
        logger.info("Qwen TTS: %s (%d bytes)", out_path, len(audio_bytes))
        return out_path
    except Exception as exc:
        # Qwen is optional; callers fall back to Edge-TTS on None.
        logger.warning("Qwen TTS failed: %s", exc)
        return None
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def _edge_tts(text: str, voice: str = EDGE_VOICE_FEMALE, rate: str = "+0%") -> str:
    """
    Generate audio via Edge-TTS.

    Args:
        text: Text to narrate (truncated to TTS_MAX_CHARS).
        voice: Edge-TTS voice name, e.g. "en-US-AriaNeural".
        rate: SSML prosody rate string, e.g. "+40%" faster, "-30%" slower.

    Returns:
        Path to the generated MP3 file.

    Raises:
        RuntimeError: If the generated audio file is empty.
    """
    import edge_tts

    path = get_temp_filepath(suffix=".mp3")
    snippet = text[:TTS_MAX_CHARS]

    async def _run():
        communicate = edge_tts.Communicate(snippet, voice, rate=rate)
        await communicate.save(path)

    # asyncio.get_event_loop() is deprecated outside a running loop since
    # Python 3.10; detect a running loop explicitly instead.
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop running in this thread -- safe to run the coroutine directly.
        asyncio.run(_run())
    else:
        # Already inside an event loop (e.g. a Gradio worker): run on a
        # fresh loop in a helper thread to avoid "loop already running".
        import concurrent.futures

        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
            pool.submit(asyncio.run, _run()).result(timeout=120)

    if os.path.getsize(path) == 0:
        raise RuntimeError("Edge-TTS produced an empty audio file.")
    logger.info("Edge-TTS: %s (voice=%s rate=%s)", path, voice, rate)
    return path
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 102 |
+
# Audio post-processing
|
| 103 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 104 |
+
|
| 105 |
+
def _apply_rap_fx(path: str) -> str:
    """
    Apply bass boost to a rap audio file using pydub.

    Splits the audio into a low band (<200 Hz) and a high band (>200 Hz),
    boosts the bass by 6 dB, and overlays the bands back together for a
    punchier, more rap-like sound.

    Returns:
        Path to the processed file (new file), or the original path
        unchanged if processing fails (best-effort).
    """
    try:
        from pydub import AudioSegment
        from pydub.effects import high_pass_filter, low_pass_filter

        audio = AudioSegment.from_file(path)

        # BUGFIX: pydub's AudioSegment.__sub__ raises TypeError when given
        # another AudioSegment ("audio - low_pass_filter(...)" never worked,
        # so the FX was silently skipped). Use complementary filters instead.
        bass = low_pass_filter(audio, 200)    # frequencies below 200 Hz
        highs = high_pass_filter(audio, 200)  # frequencies above 200 Hz

        # Boost bass by 6 dB, keep highs as-is, combine
        boosted = (bass + 6).overlay(highs)

        out = get_temp_filepath(suffix=".mp3")
        boosted.export(out, format="mp3")
        logger.info("Rap bass boost applied β %s", out)
        return out
    except Exception as e:
        logger.warning("Rap FX failed (%s) β returning original audio", e)
        return path
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def _concat(paths: list[str], silence_ms: int = 300) -> str:
|
| 134 |
+
"""Concatenate audio files with silence between each segment."""
|
| 135 |
+
if len(paths) == 1:
|
| 136 |
+
return paths[0]
|
| 137 |
+
try:
|
| 138 |
+
from pydub import AudioSegment
|
| 139 |
+
combined = AudioSegment.empty()
|
| 140 |
+
silence = AudioSegment.silent(duration=silence_ms)
|
| 141 |
+
for p in paths:
|
| 142 |
+
combined += AudioSegment.from_file(p) + silence
|
| 143 |
+
out = get_temp_filepath(suffix=".mp3")
|
| 144 |
+
combined.export(out, format="mp3")
|
| 145 |
+
logger.info("Concatenated %d segments β %s", len(paths), out)
|
| 146 |
+
return out
|
| 147 |
+
except Exception as e:
|
| 148 |
+
logger.warning("pydub concat failed (%s) β returning first segment", e)
|
| 149 |
+
return paths[0]
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
def _add_story_gaps(path: str) -> str:
    """
    Insert longer silence gaps between sentences in story audio.
    Gives the warm, unhurried feel of a storyteller.

    Returns the path to a new processed file, or the original path
    unchanged if pydub processing fails (best-effort).
    """
    try:
        from pydub import AudioSegment
        audio = AudioSegment.from_file(path)
        gap = AudioSegment.silent(duration=600)  # 600 ms between sentences
        # NOTE(review): despite the docstring, this does NOT detect sentence
        # boundaries -- it slices the audio into fixed 5-second chunks, so a
        # gap can land mid-word. pydub.silence.split_on_silence would give
        # true pause detection; confirm before changing behavior.
        chunk_ms = 5000
        chunks = [audio[i:i + chunk_ms] for i in range(0, len(audio), chunk_ms)]
        combined = AudioSegment.empty()
        for chunk in chunks:
            combined += chunk + gap
        out = get_temp_filepath(suffix=".mp3")
        combined.export(out, format="mp3")
        logger.info("Story gaps applied β %s", out)
        return out
    except Exception as e:
        logger.warning("Story gap insertion failed (%s) β returning original", e)
        return path
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 177 |
+
# Dialogue script parser
|
| 178 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 179 |
+
|
| 180 |
+
def _parse_dialogue(script: str, tag_a: str, tag_b: str) -> list[tuple[str, str]]:
|
| 181 |
+
"""Parse a HOST_X / DEBATER_X tagged script into (speaker, text) segments."""
|
| 182 |
+
segments: list[tuple[str, str]] = []
|
| 183 |
+
prefix_a = f"{tag_a}:"
|
| 184 |
+
prefix_b = f"{tag_b}:"
|
| 185 |
+
|
| 186 |
+
for line in script.splitlines():
|
| 187 |
+
line = line.strip()
|
| 188 |
+
if line.startswith(prefix_a):
|
| 189 |
+
text = line[len(prefix_a):].strip()
|
| 190 |
+
if text:
|
| 191 |
+
if segments and segments[-1][0] == tag_a:
|
| 192 |
+
segments[-1] = (tag_a, segments[-1][1] + " " + text)
|
| 193 |
+
else:
|
| 194 |
+
segments.append((tag_a, text))
|
| 195 |
+
elif line.startswith(prefix_b):
|
| 196 |
+
text = line[len(prefix_b):].strip()
|
| 197 |
+
if text:
|
| 198 |
+
if segments and segments[-1][0] == tag_b:
|
| 199 |
+
segments[-1] = (tag_b, segments[-1][1] + " " + text)
|
| 200 |
+
else:
|
| 201 |
+
segments.append((tag_b, text))
|
| 202 |
+
return segments
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 206 |
+
# Per-mode audio generators
|
| 207 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 208 |
+
|
| 209 |
+
def generate_audio_podcast(script: str) -> tuple[str, str]:
    """
    Render a two-host podcast script to audio.

    HOST_1 = female (AriaNeural), HOST_2 = male (GuyNeural), both at the
    normal conversational rate, joined with 300 ms silence between turns.
    Falls back to the single-voice pipeline when no HOST tags are found.
    """
    turns = _parse_dialogue(script, "HOST_1", "HOST_2")
    if not turns:
        logger.warning("No HOST tags β falling back to single voice")
        return generate_audio(script)

    speaker_voices = {
        "HOST_1": (EDGE_VOICE_HOST_FEMALE, "+0%"),
        "HOST_2": (EDGE_VOICE_HOST_MALE, "+0%"),
    }
    rendered: list[str] = []
    for speaker, text in turns:
        voice, rate = speaker_voices[speaker]
        try:
            rendered.append(_edge_tts(text, voice=voice, rate=rate))
        except Exception as e:
            # Skip failed turns; the episode still plays with the rest.
            logger.warning("Podcast segment failed %s: %s", speaker, e)

    if not rendered:
        raise RuntimeError("All podcast segments failed.")
    return _concat(rendered, silence_ms=300), "Edge-TTS (Podcast)"
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
def generate_audio_debate(script: str) -> tuple[str, str]:
    """
    Render a two-debater script to audio.

    DEBATER_A = female (JennyNeural) at an assertive +8% rate,
    DEBATER_B = male (DavisNeural) at a deliberate -5% rate,
    joined with 400 ms silence between turns for debate pacing.
    Falls back to the single-voice pipeline when no DEBATER tags are found.
    """
    turns = _parse_dialogue(script, "DEBATER_A", "DEBATER_B")
    if not turns:
        logger.warning("No DEBATER tags β falling back to single voice")
        return generate_audio(script)

    speaker_voices = {
        "DEBATER_A": (EDGE_VOICE_DEBATER_A, DEBATE_RATE_A),
        "DEBATER_B": (EDGE_VOICE_DEBATER_B, DEBATE_RATE_B),
    }
    rendered: list[str] = []
    for speaker, text in turns:
        voice, rate = speaker_voices[speaker]
        try:
            rendered.append(_edge_tts(text, voice=voice, rate=rate))
        except Exception as e:
            # Skip failed turns; the debate still plays with the rest.
            logger.warning("Debate segment failed %s: %s", speaker, e)

    if not rendered:
        raise RuntimeError("All debate segments failed.")
    return _concat(rendered, silence_ms=400), "Edge-TTS (Debate)"
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
def generate_audio_rap(script: str) -> tuple[str, str]:
    """
    Render a rap script: male voice at a fast +40% rate, then a bass
    boost is applied via pydub for extra punch.
    """
    raw_path = _edge_tts(script, voice=EDGE_VOICE_RAP, rate=RAP_RATE)
    return _apply_rap_fx(raw_path), "Edge-TTS (Rap)"
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
def generate_audio_story(script: str) -> tuple[str, str]:
    """
    Render a story script: female voice at a slow -30% rate, then
    sentence gaps are widened via pydub for a storyteller feel.
    """
    raw_path = _edge_tts(script, voice=EDGE_VOICE_STORY, rate=STORY_RATE)
    return _add_story_gaps(raw_path), "Edge-TTS (Story)"
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 283 |
+
# Unified public interface
|
| 284 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 285 |
+
|
| 286 |
+
def generate_audio(text: str, voice_id: str | None = None) -> tuple[str, str]:
    """
    Single-voice TTS for Summary and Song modes.

    Tries Qwen3-TTS first; falls back to Edge-TTS when Qwen is
    unavailable. Returns (audio_path, engine_label).

    Raises:
        ValueError: If text is empty or whitespace-only.
    """
    if not (text and text.strip()):
        raise ValueError("No text provided for audio generation.")

    qwen_path = _qwen_tts(text)
    if qwen_path and os.path.exists(qwen_path):
        return qwen_path, "Qwen3-TTS"

    fallback_voice = voice_id if voice_id else EDGE_VOICE_FEMALE
    return _edge_tts(text, voice=fallback_voice), "Edge-TTS"
|
utils.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
VoiceVerse AI β Utility helpers.
|
| 3 |
+
|
| 4 |
+
Provides temp file management and error formatting
|
| 5 |
+
used across the pipeline.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import os
|
| 9 |
+
import tempfile
|
| 10 |
+
import logging
|
| 11 |
+
|
| 12 |
+
# Configure root logging once at import time; every VoiceVerse module that
# imports utils shares this handler and format.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)
# Shared application logger used across the pipeline modules.
logger = logging.getLogger("voiceverse")
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def get_temp_filepath(suffix: str = ".wav") -> str:
    """Return a path to a new temporary file that won't be auto-deleted."""
    # mkstemp creates the file and returns an open descriptor; close it
    # immediately so downstream code can (re)open the path itself.
    handle, tmp_path = tempfile.mkstemp(suffix=suffix)
    os.close(handle)
    return tmp_path
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def format_error(stage: str, error: Exception) -> str:
    """
    Return a user-friendly error string for a pipeline stage.

    The raw traceback is hidden from the user but logged in full
    for debugging.
    """
    logger.error("Error in %s: %s", stage, error, exc_info=True)

    stage_messages = {
        "upload": "Could not read the uploaded file. Please try a different PDF or TXT file.",
        "rag": "Failed to process the document text. The file may be empty or corrupted.",
        "script": "Could not generate the audio script. Please check your HF_TOKEN and try again.",
        "tts": "Audio generation failed. The system will retry with a fallback voice.",
    }
    return stage_messages.get(stage, f"An unexpected error occurred: {stage}")
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def validate_file(file_path: str) -> tuple[bool, str]:
    """
    Validate an uploaded file path.

    Checks presence, existence on disk, extension (.pdf / .txt), and
    size (non-empty, at most 20 MB).

    Returns:
        (is_valid, message) -- message explains the failure, or confirms
        validity.
    """
    if file_path is None:
        return False, "Please upload a PDF or TXT file first."

    if not os.path.exists(file_path):
        return False, "The uploaded file could not be found. Please try again."

    extension = os.path.splitext(file_path)[1].lower()
    if extension not in (".pdf", ".txt"):
        return False, f"Unsupported file format '{extension}'. Please upload a PDF or TXT file."

    file_size = os.path.getsize(file_path)
    if file_size == 0:
        return False, "The uploaded file is empty. Please upload a file with content."
    if file_size > 20 * 1024 * 1024:  # 20 MB limit
        return False, "File is too large (>20 MB). Please upload a smaller document."

    return True, "File is valid."
|