Spaces:
Sleeping
Sleeping
Upload 11 files
Browse files- README.md +57 -13
- app.py +393 -0
- convert_to_word.ps1 +58 -0
- gitattributes +35 -0
- ingestion.py +217 -0
- packages.txt +0 -0
- rag.py +198 -0
- requirements.txt +13 -0
- script_gen.py +310 -0
- tts.py +293 -0
- utils.py +62 -0
README.md
CHANGED
|
@@ -1,13 +1,57 @@
|
|
| 1 |
-
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
-
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
---
|
| 12 |
-
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: VoiceVerse AI
|
| 3 |
+
emoji: ποΈ
|
| 4 |
+
colorFrom: indigo
|
| 5 |
+
colorTo: purple
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: "5.23.1"
|
| 8 |
+
python_version: "3.10"
|
| 9 |
+
app_file: app.py
|
| 10 |
+
pinned: false
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# ποΈ VoiceVerse AI β Document to Audio
|
| 14 |
+
|
| 15 |
+
Transform uploaded documents into engaging, emotionally expressive podcast-style audio narrations.
|
| 16 |
+
|
| 17 |
+
## Pipeline
|
| 18 |
+
|
| 19 |
+
```
|
| 20 |
+
PDF/TXT β Text Extraction β RAG (chunk + embed + retrieve) β Script Generation (Mistral-7B) β TTS (Qwen3-TTS / Edge-TTS) β Audio Playback
|
| 21 |
+
```
|
| 22 |
+
|
| 23 |
+
## Models Used
|
| 24 |
+
|
| 25 |
+
| Component | Model | How |
|
| 26 |
+
|-----------|-------|-----|
|
| 27 |
+
| Embeddings | `all-MiniLM-L6-v2` | Local (CPU) |
|
| 28 |
+
| Script Gen | `Mistral-7B-Instruct-v0.3` | HF Inference API |
|
| 29 |
+
| TTS (primary) | `Qwen3-TTS` | HF Inference API |
|
| 30 |
+
| TTS (fallback) | `Edge-TTS (AriaNeural)` | Local (CPU) |
|
| 31 |
+
|
| 32 |
+
## Setup
|
| 33 |
+
|
| 34 |
+
```bash
|
| 35 |
+
pip install -r requirements.txt
|
| 36 |
+
export HF_TOKEN="your_huggingface_token_here"
|
| 37 |
+
python app.py
|
| 38 |
+
```
|
| 39 |
+
|
| 40 |
+
## Deployment on HF Spaces
|
| 41 |
+
|
| 42 |
+
1. Create a new Space (Gradio SDK)
|
| 43 |
+
2. Upload all project files
|
| 44 |
+
3. Set `HF_TOKEN` as a Space Secret
|
| 45 |
+
4. The app will auto-launch on port 7860
|
| 46 |
+
|
| 47 |
+
## Project Structure
|
| 48 |
+
|
| 49 |
+
```
|
| 50 |
+
app.py # Gradio UI entry point
|
| 51 |
+
ingestion.py     # URL / YouTube / pasted-text ingestion
rag.py               # Document ingestion, chunking, embedding, retrieval
|
| 52 |
+
script_gen.py # LLM script generation (Mistral-7B-Instruct)
|
| 53 |
+
tts.py # Text-to-speech (Qwen3-TTS + Edge-TTS fallback)
|
| 54 |
+
utils.py # Helpers (temp files, validation, error formatting)
|
| 55 |
+
requirements.txt # Python dependencies
|
| 56 |
+
packages.txt # System packages (ffmpeg)
|
| 57 |
+
```
|
app.py
ADDED
|
@@ -0,0 +1,393 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
VoiceVerse AI β Main Application.
|
| 3 |
+
|
| 4 |
+
Input sources (three tabs):
|
| 5 |
+
Tab 1 β Upload PDF or TXT file
|
| 6 |
+
Tab 2 β URL / YouTube link
|
| 7 |
+
Tab 3 β Paste raw text
|
| 8 |
+
|
| 9 |
+
Delivery Modes:
|
| 10 |
+
Summary / Podcast / Song / Rap / Debate
|
| 11 |
+
|
| 12 |
+
No status card shown. RAG pipeline unchanged.
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
import os
|
| 16 |
+
import gradio as gr
|
| 17 |
+
from utils import logger, validate_file, format_error
|
| 18 |
+
from rag import extract_text, RAGStore
|
| 19 |
+
from script_gen import generate_script
|
| 20 |
+
from tts import generate_audio, generate_audio_podcast, generate_audio_debate, generate_audio_rap, generate_audio_story
|
| 21 |
+
from ingestion import ingest_from_url_or_text, extract_pasted_text
|
| 22 |
+
|
| 23 |
+
# ββ Global RAG Store ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 24 |
+
rag_store = RAGStore()
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 28 |
+
# Shared RAG + Script + TTS pipeline
|
| 29 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 30 |
+
|
| 31 |
+
def _run_pipeline(
    text: str,
    delivery_mode: str,
    song_rap_sub: str,
    progress,
) -> tuple[str, str]:
    """
    RAG β script β audio. Shared by all three input tabs.

    Args:
        text: Raw document text already extracted by the caller.
        delivery_mode: One of the UI radio choices
            ("Summary", "Podcast", "Song / Rap", "Debate", "Story").
        song_rap_sub: "Song" or "Rap" β only consulted when
            delivery_mode is "Song / Rap".
        progress: Gradio progress callback; called as progress(frac, desc=...).

    Returns (script, audio_path).
    """
    # RAG: chunk & embed β note the module-level rag_store is mutated here,
    # so concurrent requests share one index (acceptable for a single-user Space).
    progress(0.30, desc="π§ Building knowledge indexβ¦")
    rag_store.add_document(text)
    chunk_count = len(rag_store.chunks)
    logger.info("RAG index: %d chunks", chunk_count)

    # RAG: retrieve. Small documents skip retrieval entirely and use every
    # chunk; larger ones are queried with a fixed "overview" question.
    progress(0.45, desc="π Retrieving relevant contentβ¦")
    if chunk_count <= 8:
        context_chunks = rag_store.get_all_chunks()
    else:
        context_chunks = rag_store.query(
            "What are the main topics, key insights, and important details?",
            top_k=6,
        )

    # Script generation (LLM) β mode/sub_mode select the prompt template.
    progress(0.60, desc=f"βοΈ Writing {_mode_label(delivery_mode, song_rap_sub)} scriptβ¦")
    script = generate_script(
        context_chunks=context_chunks,
        mode=delivery_mode,
        sub_mode=song_rap_sub,
    )
    logger.info("Script: %d chars", len(script))

    # TTS β route by mode. The string compares must match the UI radio
    # choices after strip().lower(); "song / rap" keeps the inner spaces.
    progress(0.80, desc="ποΈ Synthesising audioβ¦")
    m = delivery_mode.strip().lower()
    if m == "podcast":
        audio_path, engine = generate_audio_podcast(script)
    elif m == "debate":
        audio_path, engine = generate_audio_debate(script)
    elif m == "song / rap" and song_rap_sub.lower() == "rap":
        audio_path, engine = generate_audio_rap(script)
    elif m == "story":
        audio_path, engine = generate_audio_story(script)
    else:
        # Fallback covers "summary" and the "Song / Rap"+"Song" combination.
        audio_path, engine = generate_audio(script)
    logger.info("Audio via %s: %s", engine, audio_path)

    progress(1.00, desc="β Done!")
    return script, audio_path
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def _mode_label(mode: str, sub_mode: str) -> str:
    """Short lowercase label for the chosen delivery mode.

    Used only in progress messages. For "Song / Rap" the label comes from
    the sub-mode radio; anything unrecognised falls back to "summary".
    """
    lowered = mode.lower()
    for keyword in ("podcast", "debate", "story"):
        if keyword in lowered:
            return keyword
    if "song" in lowered or "rap" in lowered:
        return sub_mode.lower()
    return "summary"
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 99 |
+
# Per-tab handlers
|
| 100 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 101 |
+
|
| 102 |
+
def process_file(file, delivery_mode, song_rap_sub, progress=gr.Progress()):
    """Tab 1 handler: uploaded PDF/TXT file β (script, audio_path).

    Validates the upload, extracts its text, then delegates to
    _run_pipeline. All failures surface to the UI as gr.Error.
    The gr.Progress() default is the standard Gradio injection idiom.
    """
    if file is None:
        raise gr.Error("Please upload a PDF or TXT file first.")
    # gr.File may hand back a tempfile-like object or a plain path string.
    file_path = file.name if hasattr(file, "name") else str(file)
    is_valid, msg = validate_file(file_path)
    if not is_valid:
        raise gr.Error(msg)
    try:
        progress(0.10, desc="π Extracting text from documentβ¦")
        text = extract_text(file_path)
        # 50 chars is the minimum useful input for the RAG/script stages.
        if not text or len(text.strip()) < 50:
            raise gr.Error("Document has too little text. Please upload a richer file.")
        progress(0.20, desc="β Text extracted")
        return _run_pipeline(text, delivery_mode, song_rap_sub, progress)
    except gr.Error:
        # Already user-facing β re-raise untouched.
        raise
    except EnvironmentError as e:
        # e.g. missing HF_TOKEN β show the raw message.
        raise gr.Error(str(e))
    except Exception as e:
        # Anything else gets a formatted, user-friendly wrapper.
        raise gr.Error(format_error("pipeline", e))
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def process_url(url_input, delivery_mode, song_rap_sub, progress=gr.Progress()):
    """Tab 2 handler: URL / YouTube link β (script, audio_path).

    Ingestion (YouTube transcript or article scrape) happens in
    ingest_from_url_or_text; the rest is the shared _run_pipeline.
    """
    if not url_input or not url_input.strip():
        raise gr.Error("Please enter a URL or YouTube link.")
    try:
        progress(0.05, desc="π Fetching contentβ¦")
        text, source_label = ingest_from_url_or_text(url_input.strip())
        logger.info("Ingested from %s: %d chars", source_label, len(text))
        progress(0.20, desc=f"β Content fetched from {source_label}")
        return _run_pipeline(text, delivery_mode, song_rap_sub, progress)
    except gr.Error:
        # Already user-facing β re-raise untouched.
        raise
    except ValueError as e:
        # Ingestion raises ValueError with actionable, user-readable messages.
        raise gr.Error(str(e))
    except EnvironmentError as e:
        raise gr.Error(str(e))
    except Exception as e:
        raise gr.Error(format_error("pipeline", e))
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def process_paste(pasted_text, delivery_mode, song_rap_sub, progress=gr.Progress()):
    """Tab 3 handler: pasted raw text β (script, audio_path).

    extract_pasted_text normalises whitespace and rejects
    too-short input (ValueError); the rest is the shared pipeline.
    """
    if not pasted_text or not pasted_text.strip():
        raise gr.Error("Please paste some text first.")
    try:
        progress(0.10, desc="π Processing pasted textβ¦")
        text = extract_pasted_text(pasted_text)
        progress(0.20, desc="β Text ready")
        return _run_pipeline(text, delivery_mode, song_rap_sub, progress)
    except gr.Error:
        # Already user-facing β re-raise untouched.
        raise
    except ValueError as e:
        # Validation failures carry readable messages.
        raise gr.Error(str(e))
    except EnvironmentError as e:
        raise gr.Error(str(e))
    except Exception as e:
        raise gr.Error(format_error("pipeline", e))
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 162 |
+
# UI helpers
|
| 163 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 164 |
+
|
| 165 |
+
def _mode_description(mode: str) -> str:
    """Markdown blurb shown under the delivery-mode radio.

    Returns an empty string for any mode not in the known set.
    """
    blurbs = {
        "Summary": (
            "*π **Summary** β Structured narration: intro, key points, conclusion. "
            "Single voice, neutral tone.*"
        ),
        "Podcast": (
            "*ποΈ **Podcast** β Two-host conversation. Female host guides; "
            "Male host explains. Dual voices.*"
        ),
        "Song / Rap": (
            "*π΅ **Song / Rap** β Key ideas as a rhythmic track. "
            "Song = smooth flow Β· Rap = fast, punchy, bass-boosted.*"
        ),
        "Debate": (
            "*βοΈ **Debate** β Two debaters argue opposing sides. "
            "Female voice (pro, assertive) vs Male voice (con, deliberate).*"
        ),
        "Story": (
            "*π **Story** β Content retold as an immersive narrative. "
            "Slow, warm delivery with expressive pauses.*"
        ),
    }
    return blurbs.get(mode, "")
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
def _on_mode_change(mode: str):
    """Radio-change callback: toggle the Song/Rap sub-selector and refresh
    the mode description Markdown below the radio."""
    lowered = mode.lower()
    needs_sub_choice = ("song" in lowered) or ("rap" in lowered)
    return gr.update(visible=needs_sub_choice), _mode_description(mode)
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 196 |
+
# Gradio UI
|
| 197 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 198 |
+
|
| 199 |
+
def build_ui() -> gr.Blocks:
    """Construct and return the Gradio Blocks application.

    Layout: left column holds three input tabs (file / URL / paste) plus the
    delivery-mode selector; right column holds the audio player and the
    generated script. Event wiring at the bottom routes each tab's button to
    its handler and the mode radio to _on_mode_change.
    """

    # Custom CSS: gradient page header, card styling, hide the auto "Radio"
    # label, and a small muted hint style used on the URL tab.
    css = """
    .main-header { text-align: center; margin-bottom: 1rem; }
    .main-header h1 {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
        font-size: 2.5rem;
        font-weight: 800;
        margin-bottom: 0.25rem;
    }
    .main-header p { color: #6b7280; font-size: 1.1rem; }

    .mode-card {
        background: linear-gradient(135deg, #f8f7ff 0%, #f0edff 100%);
        border: 1px solid #e0d9ff;
        border-radius: 12px;
        padding: 1rem 1.25rem;
        margin-top: 0.75rem;
        margin-bottom: 0.75rem;
    }

    /* Hide the "Radio" label Gradio adds automatically */
    #delivery-mode-radio .label-wrap { display: none !important; }

    .url-hint { color: #6b7280; font-size: 0.82rem; margin-top: 0.3rem; }
    """

    with gr.Blocks(
        title="VoiceVerse AI",
        theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="purple"),
        css=css,
    ) as app:

        # ββ Header βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
        gr.HTML("""
        <div class="main-header">
            <h1>ποΈ VoiceVerse AI</h1>
            <p>Transform any content into an engaging audio experience</p>
        </div>
        """)

        with gr.Row(equal_height=False):

            # ββ LEFT COLUMN: input sources + delivery mode βββββββββββββββββββ
            with gr.Column(scale=1):

                gr.Markdown("### π₯ Choose Your Content Source")

                with gr.Tabs():

                    # ββ Tab 1: File upload ββββββββββββββββββββββββββββββββββββ
                    with gr.Tab("π File Upload"):
                        file_input = gr.File(
                            label="Upload a PDF or TXT file",
                            file_types=[".pdf", ".txt"],
                            type="filepath",
                        )
                        file_btn = gr.Button(
                            "ποΈ Generate Audio",
                            variant="primary",
                            size="lg",
                        )

                    # ββ Tab 2: URL / YouTube ββββββββββββββββββββββββββββββββββ
                    with gr.Tab("π URL"):
                        # label=None plus show_label=False hides the caption;
                        # the multi-line placeholder doubles as usage help.
                        url_input = gr.Textbox(
                            label=None,
                            placeholder=(
                                "Paste any link hereβ¦\n\n"
                                "βΆ YouTube: https://youtube.com/watch?v=...\n"
                                "π° Article: https://example.com/article\n"
                                "π Website: https://en.wikipedia.org/wiki/..."
                            ),
                            lines=5,
                            max_lines=6,
                            show_label=False,
                        )
                        gr.HTML(
                            "<p class='url-hint'>"
                            "β Works with: YouTube (with captions), news articles, "
                            "blogs, Wikipedia, most public pages.<br>"
                            "β Won't work: paywalled or login-required pages."
                            "</p>"
                        )
                        url_btn = gr.Button(
                            "ποΈ Generate Audio",
                            variant="primary",
                            size="lg",
                        )

                    # ββ Tab 3: Paste text βββββββββββββββββββββββββββββββββββββ
                    with gr.Tab("π Paste Text"):
                        paste_input = gr.Textbox(
                            label=None,
                            placeholder=(
                                "Paste any text here β article content, notes, "
                                "transcripts, research, anythingβ¦"
                            ),
                            lines=10,
                            max_lines=40,
                            show_label=False,
                        )
                        paste_btn = gr.Button(
                            "ποΈ Generate Audio",
                            variant="primary",
                            size="lg",
                        )

                # ββ Delivery Mode card ββββββββββββββββββββββββββββββββββββββββ
                gr.Markdown("### π¨ Choose Audio Experience")

                # Choice strings must match the comparisons in _run_pipeline.
                delivery_mode = gr.Radio(
                    choices=["Summary", "Podcast", "Song / Rap", "Debate", "Story"],
                    value="Summary",
                    show_label=False,  # removes the "Radio" label
                    elem_id="delivery-mode-radio",
                )

                # Song/Rap sub-option β hidden unless Song/Rap is selected
                # (toggled by _on_mode_change via the .change event below).
                with gr.Row(visible=False) as song_rap_row:
                    song_rap_sub = gr.Radio(
                        choices=["Song", "Rap"],
                        value="Rap",
                        label="Style",
                    )

                mode_description = gr.Markdown(value=_mode_description("Summary"))

            # ββ RIGHT COLUMN: outputs ββββββββββββββββββββββββββββββββββββββββ
            with gr.Column(scale=1):

                gr.Markdown("### π§ Generated Audio")
                audio_output = gr.Audio(
                    label="Audio",
                    type="filepath",
                    interactive=False,
                    show_download_button=True,
                )

                gr.Markdown("### βοΈ Generated Script")
                script_output = gr.Textbox(
                    label="Script",
                    lines=14,
                    max_lines=22,
                    interactive=False,
                    placeholder="Your generated script will appear hereβ¦",
                    show_copy_button=True,
                )

        # ββ Footer βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
        # NOTE(review): this footer credits SmolLM3-3B while the README's
        # model table credits Mistral-7B-Instruct β confirm against script_gen.py.
        gr.Markdown(
            "<center style='color:#9ca3af;margin-top:1rem;'>"
            "Built with β€οΈ using SmolLM3-3B Β· Qwen3-TTS Β· Edge-TTS Β· Gradio"
            "</center>"
        )

        # ββ Event wiring: mode toggle + one click handler per input tab ββββββ
        delivery_mode.change(
            fn=_on_mode_change,
            inputs=[delivery_mode],
            outputs=[song_rap_row, mode_description],
        )
        file_btn.click(
            fn=process_file,
            inputs=[file_input, delivery_mode, song_rap_sub],
            outputs=[script_output, audio_output],
        )
        url_btn.click(
            fn=process_url,
            inputs=[url_input, delivery_mode, song_rap_sub],
            outputs=[script_output, audio_output],
        )
        paste_btn.click(
            fn=process_paste,
            inputs=[paste_input, delivery_mode, song_rap_sub],
            outputs=[script_output, audio_output],
        )

    return app
|
| 381 |
+
|
| 382 |
+
|
| 383 |
+
# ββ Entry point βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 384 |
+
|
| 385 |
+
if __name__ == "__main__":
    # Script entry point: build the UI and serve it. 0.0.0.0:7860 is the
    # address/port Hugging Face Spaces expects for a Gradio app.
    logger.info("Starting VoiceVerse AIβ¦")
    app = build_ui()
    app.launch(
        server_name="0.0.0.0",   # bind all interfaces (required in the Space container)
        server_port=7860,
        share=False,             # no public gradio.live tunnel
        show_error=True,         # surface handler exceptions in the UI
    )
|
convert_to_word.ps1
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Convert the Markdown project report into a Word (.docx) document via the
# Word COM automation API. Windows-only; requires Microsoft Word installed.
# NOTE(review): both paths are hard-coded to one machine's profile β adjust
# before reusing this script elsewhere.

$markdownPath = "C:\Users\hp\.gemini\antigravity\brain\04f3e1c4-7b81-497c-a7c5-5d0513033dfa\project_report.md"
$wordPath = "C:\Users\hp\.gemini\antigravity\brain\04f3e1c4-7b81-497c-a7c5-5d0513033dfa\VoiceVerse_AI_Project_Report.docx"

if (-not (Test-Path $markdownPath)) {
    Write-Error "Markdown file not found at $markdownPath"
    exit 1
}

# -Raw reads the whole file as one string (split manually below).
$content = Get-Content -Path $markdownPath -Raw

# Create Word Object
try {
    $word = New-Object -ComObject Word.Application
    $word.Visible = $false                  # run Word headless
    $doc = $word.Documents.Add()
    $selection = $word.Selection

    # Basic Markdown Parsing (Simplified): line-by-line, headings mapped to
    # Word styles, everything else typed as Normal text.
    $lines = $content -split "`r?`n"
    foreach ($line in $lines) {
        if ($line -match "^# (.*)") {
            # "# "  β document Title style
            $selection.Style = "Title"
            $selection.TypeText($matches[1])
            $selection.TypeParagraph()
        } elseif ($line -match "^## (.*)") {
            # "## " β Heading 1
            $selection.Style = "Heading 1"
            $selection.TypeText($matches[1])
            $selection.TypeParagraph()
        } elseif ($line -match "^### (.*)") {
            # "### " β Heading 2
            $selection.Style = "Heading 2"
            $selection.TypeText($matches[1])
            $selection.TypeParagraph()
        } elseif ($line -match "^---") {
            # Skip horizontal rules or add a page break?
            # For now just skip
        } elseif ($line -match "^\|") {
            # Table handling is complex, for now just TypeText
            $selection.Style = "Normal"
            $selection.TypeText($line)
            $selection.TypeParagraph()
        } else {
            $selection.Style = "Normal"
            # Remove bold/italic markers for cleaner look
            $cleanLine = $line -replace "\*\*", "" -replace "\*", ""
            $selection.TypeText($cleanLine)
            $selection.TypeParagraph()
        }
    }

    $doc.SaveAs([ref]$wordPath)
    $doc.Close()
    $word.Quit()
    Write-Host "Word document created successfully at $wordPath"
} catch {
    Write-Error "Failed to create Word document: $_"
    # Best-effort cleanup so a Word process isn't left running in the background.
    if ($word) { $word.Quit() }
}
|
gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
ingestion.py
ADDED
|
@@ -0,0 +1,217 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
VoiceVerse AI β Content Ingestion Module.
|
| 3 |
+
|
| 4 |
+
Handles all input sources beyond file upload:
|
| 5 |
+
- YouTube links β transcript via youtube-transcript-api
|
| 6 |
+
- Article / website β readable text via trafilatura + BeautifulSoup fallback
|
| 7 |
+
- Pasted raw text β light cleaning and validation
|
| 8 |
+
|
| 9 |
+
Returns plain text string that feeds into RAGStore.add_document().
|
| 10 |
+
rag.py is completely unchanged.
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import re
|
| 14 |
+
import urllib.parse
|
| 15 |
+
from utils import logger
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 19 |
+
# URL type detection
|
| 20 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 21 |
+
|
| 22 |
+
def _is_youtube(url: str) -> bool:
    """Return True if *url* points at a YouTube property.

    Fixes two defects in the original:
    - ``host.replace("www.", "")`` removed the first "www." occurring
      *anywhere* in the host, not just a leading prefix.
    - Subdomain hosts such as m.youtube.com / music.youtube.com were not
      recognised; they are accepted now via the suffix check (backward
      compatible β everything previously accepted is still accepted).
    """
    host = urllib.parse.urlparse(url.strip()).netloc.lower()
    if host.startswith("www."):
        host = host[4:]
    return host in ("youtube.com", "youtu.be") or host.endswith(".youtube.com")
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def _extract_youtube_id(url: str) -> str | None:
|
| 29 |
+
patterns = [
|
| 30 |
+
r"(?:v=)([a-zA-Z0-9_-]{11})",
|
| 31 |
+
r"youtu\.be/([a-zA-Z0-9_-]{11})",
|
| 32 |
+
r"embed/([a-zA-Z0-9_-]{11})",
|
| 33 |
+
r"shorts/([a-zA-Z0-9_-]{11})",
|
| 34 |
+
]
|
| 35 |
+
for pattern in patterns:
|
| 36 |
+
match = re.search(pattern, url)
|
| 37 |
+
if match:
|
| 38 |
+
return match.group(1)
|
| 39 |
+
return None
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 43 |
+
# YouTube transcript
|
| 44 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 45 |
+
|
| 46 |
+
def extract_youtube(url: str) -> str:
    """Fetch and clean the caption transcript of a YouTube video.

    Preference order: English manual captions, then English auto-generated,
    then the first transcript of any language.

    Args:
        url: Any YouTube URL form (watch, youtu.be, embed, shorts).

    Returns:
        The transcript as a single cleaned plain-text string.

    Raises:
        ImportError: youtube-transcript-api is not installed.
        ValueError: no video ID in the URL, no transcript available,
            or the transcript is too short to be useful.
    """
    try:
        from youtube_transcript_api import (
            YouTubeTranscriptApi, NoTranscriptFound, TranscriptsDisabled
        )
    except ImportError as exc:
        # Chain the cause so the real import failure stays in the traceback.
        raise ImportError(
            "youtube-transcript-api is not installed. "
            "Add 'youtube-transcript-api' to requirements.txt and restart the Space."
        ) from exc

    video_id = _extract_youtube_id(url)
    if not video_id:
        raise ValueError(f"Could not extract a YouTube video ID from: {url}")

    logger.info("Fetching YouTube transcript: video_id=%s", video_id)

    try:
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

        # Prefer English manual captions, then English auto, then anything available
        try:
            transcript = transcript_list.find_manually_created_transcript(
                ["en", "en-US", "en-GB"]
            )
        except NoTranscriptFound:
            try:
                transcript = transcript_list.find_generated_transcript(
                    ["en", "en-US", "en-GB"]
                )
            except NoTranscriptFound:
                transcript = next(iter(transcript_list))
                logger.info("No English transcript β using: %s", transcript.language)

        entries = transcript.fetch()
        # Robustness fix: newer releases of youtube-transcript-api return
        # snippet objects exposing `.text` instead of dicts β support both
        # so the module works regardless of the pinned version.
        text = " ".join(
            entry["text"] if isinstance(entry, dict) else entry.text
            for entry in entries
        )

        # Clean YouTube caption artifacts
        text = re.sub(r"\[.*?\]", "", text)  # [Music], [Applause] etc.
        text = re.sub(r"\s{2,}", " ", text).strip()

        if len(text) < 50:
            raise ValueError("YouTube transcript is too short to process.")

        logger.info("YouTube transcript: %d chars", len(text))
        return text

    except (NoTranscriptFound, TranscriptsDisabled) as e:
        # Chain the original library error (fix: was raised without `from e`).
        raise ValueError(
            "No transcript available for this video. "
            "The video may have captions disabled or be private.\n\n"
            "Tip: Copy the article/video text manually and use the Paste Text tab instead."
        ) from e
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 102 |
+
# Article / website URL
|
| 103 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 104 |
+
|
| 105 |
+
def extract_url(url: str) -> str:
    """
    Fetch a webpage and extract readable text.

    Tries trafilatura first (best article extractor), falls back to
    requests + BeautifulSoup.

    Returns:
        The page's readable text (at least ~100 chars).

    Raises:
        ValueError: the page could not be fetched or yields too little text.
    """
    url = url.strip()
    logger.info("Fetching URL: %s", url)

    # Browser-like UA: many sites serve blocked/empty pages to default clients.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0.0.0 Safari/537.36"
        )
    }

    # ββ Attempt 1: trafilatura ββββββββββββββββββββββββββββββββββββββββββββββββ
    try:
        import trafilatura
        downloaded = trafilatura.fetch_url(url)
        if downloaded:
            text = trafilatura.extract(
                downloaded,
                include_comments=False,
                include_tables=True,
                no_fallback=False,
            )
            # Require a meaningful amount of text before accepting this path.
            if text and len(text.strip()) > 100:
                logger.info("trafilatura extracted %d chars", len(text))
                return text.strip()
    except Exception as e:
        # Best-effort: log and fall through to the BeautifulSoup attempt.
        logger.warning("trafilatura failed (%s) β trying BeautifulSoup", e)

    # ββ Attempt 2: requests + BeautifulSoup ββββββββββββββββββββββββββββββββββ
    try:
        import requests
        from bs4 import BeautifulSoup

        resp = requests.get(url, headers=headers, timeout=15)
        resp.raise_for_status()

        soup = BeautifulSoup(resp.text, "html.parser")
        # Drop boilerplate containers before extracting visible text.
        for tag in soup(["script", "style", "nav", "footer", "header",
                         "aside", "form", "noscript", "iframe"]):
            tag.decompose()

        # Prefer semantic article/main containers over the whole body.
        article = soup.find("article") or soup.find("main") or soup.find("body")
        text = (
            article.get_text(separator=" ", strip=True)
            if article
            else soup.get_text(separator=" ", strip=True)
        )
        text = re.sub(r"\s{3,}", "\n\n", text)
        text = re.sub(r" {2,}", " ", text).strip()

        if len(text) < 100:
            raise ValueError("Could not extract enough text from this page.")

        logger.info("BeautifulSoup extracted %d chars", len(text))
        return text

    except Exception as e:
        # Fix: chain the cause (`from e`) so logs keep the underlying
        # network/parse error instead of swallowing it.
        raise ValueError(
            f"Could not fetch content from: {url}\n\n"
            f"Reason: {e}\n\n"
            "The page may require a login or block bots. "
            "Try copying the article text and pasting it in the Paste Text tab."
        ) from e
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 176 |
+
# Pasted raw text
|
| 177 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 178 |
+
|
| 179 |
+
def extract_pasted_text(text: str) -> str:
    """
    Validate and normalise raw text pasted by the user.

    Converts Windows / old-Mac line endings to \\n, collapses excessive
    blank lines and runs of spaces, and enforces a minimum length.

    Raises:
        ValueError: if nothing (or too little) was pasted.
    """
    if not text or not text.strip():
        raise ValueError("No text was pasted. Please paste some content.")

    normalised = text.replace("\r\n", "\n").replace("\r", "\n")
    # Allow at most two consecutive blank lines; squeeze space runs.
    normalised = re.sub(r"\n{4,}", "\n\n\n", normalised)
    normalised = re.sub(r" {2,}", " ", normalised).strip()

    if len(normalised) < 50:
        raise ValueError(
            "Pasted text is too short. Please paste at least a paragraph of content."
        )

    logger.info("Pasted text ingested: %d chars", len(normalised))
    return normalised
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 197 |
+
# Unified entry point
|
| 198 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 199 |
+
|
| 200 |
+
def ingest_from_url_or_text(raw_input: str) -> tuple[str, str]:
    """
    Dispatch free-form user input to the right extractor.

    Detects whether the input is a YouTube link, a generic article URL,
    or plain pasted text, and routes it accordingly.

    Returns:
        (extracted_text, source_label)

    Raises:
        ValueError: if the input is empty.
    """
    cleaned = raw_input.strip()
    if not cleaned:
        raise ValueError("Please enter a URL or paste some text.")

    # Anything that doesn't start with http(s):// is treated as pasted text.
    if not re.match(r"https?://", cleaned, re.IGNORECASE):
        return extract_pasted_text(cleaned), "Pasted Text"

    if _is_youtube(cleaned):
        return extract_youtube(cleaned), "YouTube"
    return extract_url(cleaned), "Article / Website"
|
packages.txt
ADDED
|
File without changes
|
rag.py
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
VoiceVerse AI β RAG Pipeline.
|
| 3 |
+
|
| 4 |
+
Handles document ingestion, text chunking, embedding generation,
|
| 5 |
+
and semantic retrieval using an in-memory vector store.
|
| 6 |
+
|
| 7 |
+
Models used:
|
| 8 |
+
- sentence-transformers/all-MiniLM-L6-v2 for embeddings (22 MB, CPU-friendly)
|
| 9 |
+
|
| 10 |
+
Design decisions:
|
| 11 |
+
- NumPy cosine similarity instead of FAISS to avoid heavy native deps
|
| 12 |
+
- Overlapping chunks to preserve context across boundaries
|
| 13 |
+
- Single-document architecture (clear store on new upload)
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
import os
|
| 17 |
+
import numpy as np
|
| 18 |
+
from utils import logger
|
| 19 |
+
|
| 20 |
+
# ββ Text Extraction ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 21 |
+
|
| 22 |
+
def extract_text(file_path: str) -> str:
    """
    Extract the full plain-text content of a supported document.

    Dispatches on file extension: ``.pdf`` → PyMuPDF, ``.txt`` → plain read.

    Raises:
        ValueError: for any other extension.
    """
    extension = os.path.splitext(file_path)[1].lower()

    if extension == ".pdf":
        return _extract_pdf(file_path)
    if extension == ".txt":
        return _extract_txt(file_path)
    raise ValueError(f"Unsupported file type: {extension}")
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def _extract_pdf(file_path: str) -> str:
    """Extract text from every page of a PDF via PyMuPDF (fitz)."""
    import fitz  # PyMuPDF

    pages: list[str] = []
    with fitz.open(file_path) as doc:
        for index, page in enumerate(doc):
            content = page.get_text("text")
            # Skip pages with no extractable text (e.g. pure images).
            if content.strip():
                pages.append(content)
                logger.debug("Extracted page %d: %d chars", index + 1, len(content))

    full_text = "\n\n".join(pages)
    logger.info("PDF extraction complete: %d pages, %d chars total",
                len(pages), len(full_text))
    return full_text
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def _extract_txt(file_path: str) -> str:
|
| 56 |
+
"""Read plain text file with encoding fallback."""
|
| 57 |
+
for encoding in ("utf-8", "utf-8-sig", "latin-1", "cp1252"):
|
| 58 |
+
try:
|
| 59 |
+
with open(file_path, "r", encoding=encoding) as f:
|
| 60 |
+
text = f.read()
|
| 61 |
+
logger.info("TXT extraction complete (%s): %d chars", encoding, len(text))
|
| 62 |
+
return text
|
| 63 |
+
except UnicodeDecodeError:
|
| 64 |
+
continue
|
| 65 |
+
raise ValueError("Could not decode the text file with any supported encoding.")
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
# ββ Text Chunking ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 69 |
+
|
| 70 |
+
def chunk_text(text: str, chunk_size: int = 512, overlap: int = 50) -> list[str]:
    """
    Split text into overlapping, roughly ``chunk_size``-character chunks.

    Prefers to end each chunk at a sentence terminator (. ! ? followed by
    a space or end-of-text) found in the second half of the window, so
    chunks stay coherent. Consecutive chunks share ~``overlap`` characters
    so context survives the boundary.
    """
    if not text or not text.strip():
        return []

    # Collapse all whitespace runs to single spaces.
    normalised = " ".join(text.split())
    total = len(normalised)

    pieces: list[str] = []
    start = 0
    while start < total:
        end = start + chunk_size

        if end < total:
            # Scan backwards from the window end, no further back than
            # the window midpoint, looking for a sentence terminator.
            floor = max(start + chunk_size // 2, start)
            boundary = -1
            for pos in range(min(end, total) - 1, floor - 1, -1):
                is_terminator = normalised[pos] in ".!?"
                followed_by_gap = pos + 1 >= total or normalised[pos + 1] == " "
                if is_terminator and followed_by_gap:
                    boundary = pos
                    break
            if boundary > start:
                end = boundary + 1

        piece = normalised[start:end].strip()
        if piece:
            pieces.append(piece)

        # Step forward keeping `overlap` chars of context; max() guarantees
        # forward progress even when the chunk collapses to nothing.
        start = max(start + 1, end - overlap)

    logger.info("Chunking complete: %d chunks (size=%d, overlap=%d)",
                len(pieces), chunk_size, overlap)
    return pieces
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
# ββ Embedding & Vector Store βββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 115 |
+
|
| 116 |
+
class RAGStore:
    """
    Minimal in-memory vector store.

    Embeds document chunks with sentence-transformers and answers queries
    via cosine similarity computed with NumPy — vectors are normalised at
    encode time, so similarity reduces to a plain dot product.

    Usage:
        store = RAGStore()
        store.add_document("full document text here")
        results = store.query("what is this about?", top_k=5)
    """

    MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

    def __init__(self):
        # Model is loaded lazily on first access (see `model` property).
        self._model = None
        self.chunks: list[str] = []
        self.embeddings: np.ndarray | None = None

    @property
    def model(self):
        """Load and cache the embedding model on first access."""
        if self._model is None:
            logger.info("Loading embedding model: %s", self.MODEL_NAME)
            from sentence_transformers import SentenceTransformer
            self._model = SentenceTransformer(self.MODEL_NAME)
            logger.info("Embedding model loaded successfully")
        return self._model

    def clear(self):
        """Drop all chunks and embeddings so a new document can be loaded."""
        self.chunks = []
        self.embeddings = None

    def add_document(self, text: str, chunk_size: int = 512, overlap: int = 50):
        """
        Chunk ``text``, embed every chunk, and store the results.

        Any previously stored document is discarded first.

        Raises:
            ValueError: if no usable chunks could be produced.
        """
        self.clear()

        self.chunks = chunk_text(text, chunk_size=chunk_size, overlap=overlap)
        if not self.chunks:
            raise ValueError("No text chunks could be extracted from the document.")

        logger.info("Generating embeddings for %d chunks...", len(self.chunks))
        # normalize_embeddings=True lets query() use raw dot products as
        # cosine similarity without re-normalising.
        self.embeddings = self.model.encode(
            self.chunks,
            show_progress_bar=False,
            convert_to_numpy=True,
            normalize_embeddings=True,
        )
        logger.info("Embeddings generated: shape %s", self.embeddings.shape)

    def query(self, question: str, top_k: int = 5) -> list[str]:
        """
        Return the ``top_k`` chunks most similar to ``question``.

        Returns an empty list when no document has been loaded.
        """
        if self.embeddings is None or len(self.chunks) == 0:
            return []

        question_vec = self.model.encode(
            [question],
            convert_to_numpy=True,
            normalize_embeddings=True,
        )

        # Dot product == cosine similarity on pre-normalised vectors.
        scores = np.dot(self.embeddings, question_vec.T).flatten()

        k = min(top_k, len(self.chunks))
        best = np.argsort(scores)[-k:][::-1]

        hits = [self.chunks[i] for i in best]
        logger.info("Retrieved %d chunks (top similarity: %.3f)",
                    len(hits), scores[best[0]])
        return hits

    def get_all_chunks(self) -> list[str]:
        """Return a copy of every stored chunk."""
        return self.chunks.copy()
|
requirements.txt
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio>=5.23.1,<6.0
|
| 2 |
+
huggingface-hub>=0.25
|
| 3 |
+
pydantic>=2.0,<2.11
|
| 4 |
+
sentence-transformers
|
| 5 |
+
numpy
|
| 6 |
+
PyMuPDF
|
| 7 |
+
edge-tts
|
| 8 |
+
scipy
|
| 9 |
+
pydub
|
| 10 |
+
requests
|
| 11 |
+
beautifulsoup4
|
| 12 |
+
trafilatura
|
| 13 |
+
youtube-transcript-api
|
script_gen.py
ADDED
|
@@ -0,0 +1,310 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
VoiceVerse AI β Script Generation Module.
|
| 3 |
+
|
| 4 |
+
Delivery Modes:
|
| 5 |
+
Summary β single-speaker structured narration
|
| 6 |
+
Podcast β HOST_1 / HOST_2 two-host dialogue
|
| 7 |
+
Song/Rap β rhythmic retention content
|
| 8 |
+
Debate β DEBATER_A (female, for) vs DEBATER_B (male, against) structured debate
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import os
|
| 12 |
+
import re
|
| 13 |
+
from huggingface_hub import InferenceClient
|
| 14 |
+
from utils import logger
|
| 15 |
+
|
| 16 |
+
MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
|
| 17 |
+
MAX_NEW_TOKENS = 1200
|
| 18 |
+
TEMPERATURE = 0.5
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 22 |
+
# Prompts
|
| 23 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 24 |
+
|
| 25 |
+
# ββ Summary βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 26 |
+
_SUMMARY_SYSTEM = """\
|
| 27 |
+
You are a professional narrator. Produce a clear spoken summary strictly from the source material.
|
| 28 |
+
RULES:
|
| 29 |
+
1. Use ONLY facts from the source. Do NOT add outside knowledge.
|
| 30 |
+
2. Structure: short intro β key points as natural spoken sentences β concise conclusion.
|
| 31 |
+
3. Plain text only β no markdown, no bullets, no headers.
|
| 32 |
+
4. Write for the ear: short sentences, conversational tone.
|
| 33 |
+
5. Never say "the document says". Speak as the expert.
|
| 34 |
+
6. Output ONLY the narration text, nothing else."""
|
| 35 |
+
|
| 36 |
+
_SUMMARY_USER = """\
|
| 37 |
+
SOURCE MATERIAL:
|
| 38 |
+
{context}
|
| 39 |
+
|
| 40 |
+
Write a flowing spoken summary (intro, key points, conclusion) in plain sentences."""
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
# ββ Podcast βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 44 |
+
_PODCAST_SYSTEM = """\
|
| 45 |
+
You are a podcast script writer. Write a two-host conversation strictly from the source material.
|
| 46 |
+
|
| 47 |
+
STRICT FORMAT β every single line must start with a speaker tag:
|
| 48 |
+
HOST_1: <what Host 1 says>
|
| 49 |
+
HOST_2: <what Host 2 says>
|
| 50 |
+
|
| 51 |
+
RULES:
|
| 52 |
+
1. Alternate HOST_1 and HOST_2. Never same host twice in a row.
|
| 53 |
+
2. HOST_1 introduces topics and asks questions.
|
| 54 |
+
3. HOST_2 explains concepts and answers.
|
| 55 |
+
4. Use ONLY information from the source. No hallucination.
|
| 56 |
+
5. Conversational, engaging tone.
|
| 57 |
+
6. No markdown, no stage directions, no lines without a HOST tag.
|
| 58 |
+
7. Aim for 16β24 exchanges."""
|
| 59 |
+
|
| 60 |
+
_PODCAST_USER = """\
|
| 61 |
+
SOURCE MATERIAL:
|
| 62 |
+
{context}
|
| 63 |
+
|
| 64 |
+
Write the full podcast. Every line must start with HOST_1: or HOST_2:"""
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
# ββ Song / Rap ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 68 |
+
_SONG_SYSTEM = """\
|
| 69 |
+
You are a lyricist. Two steps:
|
| 70 |
+
STEP 1 β silently extract 5β7 key ideas from the source.
|
| 71 |
+
STEP 2 β write a smooth melodic SONG from those ideas.
|
| 72 |
+
|
| 73 |
+
RULES:
|
| 74 |
+
- Simple memorable language, rhyming couplets (AABB).
|
| 75 |
+
- Label sections [VERSE 1], [VERSE 2], [CHORUS].
|
| 76 |
+
- [CHORUS] repeats the main concept.
|
| 77 |
+
- Short lines (6β10 words). Use repetition.
|
| 78 |
+
- Do NOT invent facts not in the source.
|
| 79 |
+
- Output ONLY the lyrics with section labels."""
|
| 80 |
+
|
| 81 |
+
_RAP_SYSTEM = """\
|
| 82 |
+
You are a lyricist. Two steps:
|
| 83 |
+
STEP 1 β silently extract 5β7 key ideas from the source.
|
| 84 |
+
STEP 2 β write a punchy rhythmic RAP from those ideas.
|
| 85 |
+
|
| 86 |
+
RULES:
|
| 87 |
+
- Short punchy lines (5β8 words), fast-flow rhyme (AABB or ABAB).
|
| 88 |
+
- Label sections [VERSE 1], [VERSE 2], [HOOK].
|
| 89 |
+
- [HOOK] repeats the main concept.
|
| 90 |
+
- Wordplay and repetition to aid retention.
|
| 91 |
+
- Do NOT invent facts not in the source.
|
| 92 |
+
- Output ONLY the lyrics with section labels."""
|
| 93 |
+
|
| 94 |
+
_SONG_RAP_USER = """\
|
| 95 |
+
SOURCE MATERIAL:
|
| 96 |
+
{context}
|
| 97 |
+
|
| 98 |
+
Extract the key ideas, then write the full {form}."""
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
# ββ Debate ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 102 |
+
_DEBATE_SYSTEM = """\
|
| 103 |
+
You are a debate script writer. Write a structured two-person debate strictly grounded \
|
| 104 |
+
in the provided source material.
|
| 105 |
+
|
| 106 |
+
STRICT FORMAT β every single line must start with a speaker tag:
|
| 107 |
+
DEBATER_A: <what Debater A says>
|
| 108 |
+
DEBATER_B: <what Debater B says>
|
| 109 |
+
|
| 110 |
+
CHARACTER PROFILES:
|
| 111 |
+
- DEBATER_A: Takes the PRO / supporting position. Tone is confident, optimistic, forward-thinking.
|
| 112 |
+
- DEBATER_B: Takes the CON / critical position. Tone is skeptical, cautious, questioning.
|
| 113 |
+
|
| 114 |
+
DEBATE STRUCTURE:
|
| 115 |
+
1. DEBATER_A opens with a strong statement supporting the topic.
|
| 116 |
+
2. DEBATER_B immediately challenges with a counterpoint.
|
| 117 |
+
3. They alternate, each directly responding to the other's previous point.
|
| 118 |
+
4. Both use evidence and logic from the source material only.
|
| 119 |
+
5. End with each debater giving a brief closing statement.
|
| 120 |
+
|
| 121 |
+
RULES:
|
| 122 |
+
- Alternate DEBATER_A and DEBATER_B. Never same debater twice in a row.
|
| 123 |
+
- Use ONLY information from the source material. No hallucination.
|
| 124 |
+
- Each turn should be 1β3 sentences β punchy, not long speeches.
|
| 125 |
+
- No markdown, no stage directions, no narration outside the speaker tags.
|
| 126 |
+
- Aim for 16β22 exchanges total."""
|
| 127 |
+
|
| 128 |
+
_DEBATE_USER = """\
|
| 129 |
+
SOURCE MATERIAL:
|
| 130 |
+
{context}
|
| 131 |
+
|
| 132 |
+
Write the full debate on the key topics from this material. \
|
| 133 |
+
Every line must start with DEBATER_A: or DEBATER_B:"""
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
# ββ Story βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 137 |
+
_STORY_SYSTEM = """\
|
| 138 |
+
You are a master storyteller. Retell the ideas from the source material as an \
|
| 139 |
+
immersive narrative story written for slow, expressive audio delivery.
|
| 140 |
+
|
| 141 |
+
RULES:
|
| 142 |
+
1. Transform factual content into a story β use characters, scenes, a narrative arc \
|
| 143 |
+
(beginning, middle, end). Characters can be fictional stand-ins for real concepts.
|
| 144 |
+
2. Use ONLY information and ideas from the source. Do NOT invent new facts.
|
| 145 |
+
3. Warm, descriptive storytelling voice. Vivid but calm.
|
| 146 |
+
4. Short paragraphs, 1β3 sentences each, separated by blank lines.
|
| 147 |
+
5. Plain text only β no markdown, no bullets, no headers.
|
| 148 |
+
6. Begin with an evocative scene-setting sentence.
|
| 149 |
+
7. End with a closing reflection or lesson drawn from the source.
|
| 150 |
+
8. Output ONLY the story text, nothing else."""
|
| 151 |
+
|
| 152 |
+
_STORY_USER = """\
|
| 153 |
+
SOURCE MATERIAL:
|
| 154 |
+
{context}
|
| 155 |
+
|
| 156 |
+
Transform this into a rich narrative story for slow, expressive audio. \
|
| 157 |
+
Use short paragraphs with blank lines between them."""
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 161 |
+
# Post-processing
|
| 162 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 163 |
+
|
| 164 |
+
def _clean(text: str) -> str:
|
| 165 |
+
"""Remove all markdown and XML artifacts from LLM output."""
|
| 166 |
+
text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
|
| 167 |
+
text = re.sub(r"<[^>]+>", "", text)
|
| 168 |
+
text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE)
|
| 169 |
+
text = re.sub(r"\*{1,3}([^*]+)\*{1,3}", r"\1", text)
|
| 170 |
+
text = re.sub(r"_{1,3}([^_]+)_{1,3}", r"\1", text)
|
| 171 |
+
text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)
|
| 172 |
+
text = re.sub(r"```[^`]*```", "", text, flags=re.DOTALL)
|
| 173 |
+
text = re.sub(r"`([^`]+)`", r"\1", text)
|
| 174 |
+
text = re.sub(r"^[\s]*[-*+]\s+", "", text, flags=re.MULTILINE)
|
| 175 |
+
text = re.sub(r"^[\s]*\d+\.\s+", "", text, flags=re.MULTILINE)
|
| 176 |
+
text = re.sub(r"^>\s+", "", text, flags=re.MULTILINE)
|
| 177 |
+
text = re.sub(r"^[-*_]{3,}\s*$", "", text, flags=re.MULTILINE)
|
| 178 |
+
text = re.sub(r"\n{3,}", "\n\n", text)
|
| 179 |
+
text = re.sub(r" {2,}", " ", text)
|
| 180 |
+
return text.strip()
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
def _clean_dialogue(text: str, tag_a: str, tag_b: str) -> str:
    """
    Clean dialogue-mode output (podcast or debate).

    Runs the generic markdown cleaner, normalises the speaker-tag
    variants the model tends to emit, then drops every line that does
    not begin with one of the two expected tags (blank lines are kept).
    """
    text = _clean(text)

    # Map sloppy tag spellings onto the canonical tags.
    if tag_a == "HOST_1":
        text = re.sub(r"(?i)\bhost[\s_-]*1\s*:", "HOST_1:", text)
        text = re.sub(r"(?i)\bhost[\s_-]*2\s*:", "HOST_2:", text)
    elif tag_a == "DEBATER_A":
        text = re.sub(r"(?i)\bdebater[\s_-]*a\s*:", "DEBATER_A:", text)
        text = re.sub(r"(?i)\bdebater[\s_-]*b\s*:", "DEBATER_B:", text)
        # The model sometimes labels sides "Pro/Con" or "Speaker A/B".
        text = re.sub(r"(?i)\bpro\s*:", "DEBATER_A:", text)
        text = re.sub(r"(?i)\bcon\s*:", "DEBATER_B:", text)
        text = re.sub(r"(?i)\bspeaker[\s_-]*a\s*:", "DEBATER_A:", text)
        text = re.sub(r"(?i)\bspeaker[\s_-]*b\s*:", "DEBATER_B:", text)

    prefix_a, prefix_b = f"{tag_a}:", f"{tag_b}:"

    def _keep(line: str) -> bool:
        stripped = line.strip()
        return stripped == "" or stripped.startswith((prefix_a, prefix_b))

    kept = [line for line in text.splitlines() if _keep(line)]
    return "\n".join(kept).strip()
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 215 |
+
# LLM client
|
| 216 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 217 |
+
|
| 218 |
+
def _get_client() -> InferenceClient:
    """
    Build a Hugging Face inference client from the HF_TOKEN env variable.

    Raises:
        EnvironmentError: if no token is configured.
    """
    token = os.environ.get("HF_TOKEN")
    if token:
        return InferenceClient(provider="hf-inference", token=token)
    raise EnvironmentError(
        "HF_TOKEN not set. Add your Hugging Face token as a Space secret."
    )
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
def _call_llm(system: str, user: str) -> str:
    """
    Run a single chat completion against MODEL_ID and return the raw text.

    Args:
        system: system prompt (mode-specific instructions).
        user:   user prompt containing the source material.

    Raises:
        RuntimeError: if the model returns no usable text.
    """
    client = _get_client()
    response = client.chat_completion(
        model=MODEL_ID,
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
        max_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        top_p=0.9,
    )
    # `content` can legitimately be None (refusal / empty choice), which
    # would make a bare .strip() raise AttributeError — coalesce first.
    raw = (response.choices[0].message.content or "").strip()
    if not raw:
        raise RuntimeError("Model returned empty response. Please try again.")
    return raw
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 246 |
+
# Public entry point
|
| 247 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 248 |
+
|
| 249 |
+
def generate_script(
    context_chunks: list[str],
    mode: str = "Summary",
    sub_mode: str = "Rap",
    topic: str = "the key ideas from this document",
) -> str:
    """
    Generate a spoken script from RAG chunks.

    Args:
        context_chunks : chunks from RAGStore — NOT modified here.
        mode           : "Summary" | "Podcast" | "Song / Rap" | "Debate" | "Story"
                         (unknown values fall back to Summary).
        sub_mode       : "Song" | "Rap" — only consulted in Song/Rap mode.
        topic          : currently unused; kept for backward compatibility.

    Returns:
        Clean string ready for tts.generate_audio() or
        tts.generate_audio_podcast(). Podcast/Debate modes preserve the
        HOST_1/HOST_2 or DEBATER_A/DEBATER_B speaker tags.

    Raises:
        ValueError:   if no context chunks were provided.
        RuntimeError: if the model output is empty after cleaning.
    """
    if not context_chunks:
        raise ValueError("No document context. Please upload or paste content first.")

    context = "\n\n".join(context_chunks)
    # Keep the prompt within the model's comfortable context budget.
    if len(context) > 6000:
        context = context[:6000]
        logger.warning("Context truncated to 6000 chars")

    logger.info("generate_script | mode=%s sub_mode=%s ctx=%d chars", mode, sub_mode, len(context))

    m = mode.strip().lower()

    if m == "podcast":
        raw = _call_llm(_PODCAST_SYSTEM, _PODCAST_USER.format(context=context))
        script = _clean_dialogue(raw, "HOST_1", "HOST_2")

    elif "song" in m or "rap" in m:
        form = sub_mode.lower()
        sys_prompt = _SONG_SYSTEM if form == "song" else _RAP_SYSTEM
        raw = _call_llm(sys_prompt, _SONG_RAP_USER.format(context=context, form=form))
        script = _clean(raw)

    elif "debate" in m:
        raw = _call_llm(_DEBATE_SYSTEM, _DEBATE_USER.format(context=context))
        script = _clean_dialogue(raw, "DEBATER_A", "DEBATER_B")

    elif "story" in m:
        raw = _call_llm(_STORY_SYSTEM, _STORY_USER.format(context=context))
        script = _clean(raw)

    else:
        # "summary" and any unrecognised mode both land here.
        if m != "summary":
            logger.warning("Unknown mode '%s' → falling back to Summary", mode)
        raw = _call_llm(_SUMMARY_SYSTEM, _SUMMARY_USER.format(context=context))
        script = _clean(raw)

    if not script:
        raise RuntimeError("Script was empty after cleaning. Please try again.")

    logger.info("Script ready: %d chars", len(script))
    return script
|
tts.py
ADDED
|
@@ -0,0 +1,293 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
VoiceVerse AI β TTS Module.
|
| 3 |
+
|
| 4 |
+
Primary: Qwen3-TTS via HF Inference API
|
| 5 |
+
Fallback: Edge-TTS (CPU, no key needed)
|
| 6 |
+
|
| 7 |
+
Voice + audio style per mode:
|
| 8 |
+
Summary β neutral female voice, normal rate
|
| 9 |
+
Podcast β HOST_1 female (AriaNeural) / HOST_2 male (GuyNeural)
|
| 10 |
+
Rap β male voice, faster rate (+40%), bass boost via pydub
|
| 11 |
+
Song β female voice, normal rate
|
| 12 |
+
Debate β DEBATER_A female (JennyNeural, +5%) / DEBATER_B male (DavisNeural, -5%)
|
| 13 |
+
Story β female voice, slow rate (-30%), long silence gaps between sentences
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
import os
|
| 17 |
+
import re
|
| 18 |
+
import asyncio
|
| 19 |
+
from utils import logger, get_temp_filepath
|
| 20 |
+
|
| 21 |
+
# Primary TTS model id used via the HF Inference API (see _qwen_tts).
QWEN_TTS_MODEL = "Qwen/Qwen3-TTS"
# Hard cap on characters sent to any TTS backend per request.
TTS_MAX_CHARS = 3000

# ββ Voice assignments βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# Summary / Song / Story β single female voice
EDGE_VOICE_FEMALE = "en-US-AriaNeural"

# Podcast
EDGE_VOICE_HOST_FEMALE = "en-US-AriaNeural"   # HOST_1 β female
EDGE_VOICE_HOST_MALE = "en-US-GuyNeural"      # HOST_2 β male

# Rap β male voice reads the rap
EDGE_VOICE_RAP = "en-US-GuyNeural"
RAP_RATE = "+40%"   # fast delivery (SSML prosody rate delta)

# Debate
EDGE_VOICE_DEBATER_A = "en-US-JennyNeural"    # female, pro β assertive
EDGE_VOICE_DEBATER_B = "en-US-DavisNeural"    # male, con β skeptical
DEBATE_RATE_A = "+8%"   # slightly faster
DEBATE_RATE_B = "-5%"   # slightly slower, deliberate

# Story β slow, warm delivery
EDGE_VOICE_STORY = "en-US-AriaNeural"
STORY_RATE = "-30%"   # noticeably slower
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 48 |
+
# Low-level TTS helpers
|
| 49 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 50 |
+
|
| 51 |
+
def _qwen_tts(text: str) -> str | None:
    """
    Try Qwen3-TTS over the HF Inference API.

    Returns the path to a generated .wav file, or None when no HF_TOKEN
    is set, the API returns nothing, or any error occurs (best-effort).
    """
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        return None
    try:
        from huggingface_hub import InferenceClient

        client = InferenceClient(token=hf_token)
        audio_bytes = client.text_to_speech(
            text=text[:TTS_MAX_CHARS], model=QWEN_TTS_MODEL
        )
        if not audio_bytes:
            return None

        out_path = get_temp_filepath(suffix=".wav")
        with open(out_path, "wb") as fh:
            fh.write(audio_bytes)
        logger.info("Qwen TTS: %s (%d bytes)", out_path, len(audio_bytes))
        return out_path
    except Exception as exc:
        # Qwen is optional; callers fall back to Edge-TTS on None.
        logger.warning("Qwen TTS failed: %s", exc)
        return None
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def _edge_tts(text: str, voice: str = EDGE_VOICE_FEMALE, rate: str = "+0%") -> str:
    """
    Generate audio via Edge-TTS.

    Args:
        text: Text to narrate (truncated to TTS_MAX_CHARS).
        voice: Edge-TTS voice name, e.g. "en-US-AriaNeural".
        rate: SSML prosody rate string, e.g. "+40%" faster, "-30%" slower.

    Returns:
        Path to the generated MP3 file.

    Raises:
        RuntimeError: If the generated audio file is empty.
    """
    import edge_tts

    path = get_temp_filepath(suffix=".mp3")
    snippet = text[:TTS_MAX_CHARS]

    async def _run():
        communicate = edge_tts.Communicate(snippet, voice, rate=rate)
        await communicate.save(path)

    # asyncio.get_event_loop() is deprecated outside a running loop since
    # Python 3.10; detect a running loop explicitly instead.
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop running in this thread -- safe to run the coroutine directly.
        asyncio.run(_run())
    else:
        # Already inside an event loop (e.g. a Gradio worker): run on a
        # fresh loop in a helper thread to avoid "loop already running".
        import concurrent.futures

        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
            pool.submit(asyncio.run, _run()).result(timeout=120)

    if os.path.getsize(path) == 0:
        raise RuntimeError("Edge-TTS produced an empty audio file.")
    logger.info("Edge-TTS: %s (voice=%s rate=%s)", path, voice, rate)
    return path
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 102 |
+
# Audio post-processing
|
| 103 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 104 |
+
|
| 105 |
+
def _apply_rap_fx(path: str) -> str:
    """
    Apply bass boost to a rap audio file using pydub.

    Splits the audio into a low band (<200 Hz) and a high band (>200 Hz),
    boosts the bass by 6 dB, and overlays the bands back together for a
    punchier, more rap-like sound.

    Returns:
        Path to the processed file (new file), or the original path
        unchanged if processing fails (best-effort).
    """
    try:
        from pydub import AudioSegment
        from pydub.effects import high_pass_filter, low_pass_filter

        audio = AudioSegment.from_file(path)

        # BUGFIX: pydub's AudioSegment.__sub__ raises TypeError when given
        # another AudioSegment ("audio - low_pass_filter(...)" never worked,
        # so the FX was silently skipped). Use complementary filters instead.
        bass = low_pass_filter(audio, 200)    # frequencies below 200 Hz
        highs = high_pass_filter(audio, 200)  # frequencies above 200 Hz

        # Boost bass by 6 dB, keep highs as-is, combine
        boosted = (bass + 6).overlay(highs)

        out = get_temp_filepath(suffix=".mp3")
        boosted.export(out, format="mp3")
        logger.info("Rap bass boost applied β %s", out)
        return out
    except Exception as e:
        logger.warning("Rap FX failed (%s) β returning original audio", e)
        return path
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def _concat(paths: list[str], silence_ms: int = 300) -> str:
|
| 134 |
+
"""Concatenate audio files with silence between each segment."""
|
| 135 |
+
if len(paths) == 1:
|
| 136 |
+
return paths[0]
|
| 137 |
+
try:
|
| 138 |
+
from pydub import AudioSegment
|
| 139 |
+
combined = AudioSegment.empty()
|
| 140 |
+
silence = AudioSegment.silent(duration=silence_ms)
|
| 141 |
+
for p in paths:
|
| 142 |
+
combined += AudioSegment.from_file(p) + silence
|
| 143 |
+
out = get_temp_filepath(suffix=".mp3")
|
| 144 |
+
combined.export(out, format="mp3")
|
| 145 |
+
logger.info("Concatenated %d segments β %s", len(paths), out)
|
| 146 |
+
return out
|
| 147 |
+
except Exception as e:
|
| 148 |
+
logger.warning("pydub concat failed (%s) β returning first segment", e)
|
| 149 |
+
return paths[0]
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
def _add_story_gaps(path: str) -> str:
    """
    Insert longer silence gaps between sentences in story audio.
    Gives the warm, unhurried feel of a storyteller.

    Returns the path to a new processed file, or the original path
    unchanged if pydub processing fails (best-effort).
    """
    try:
        from pydub import AudioSegment
        audio = AudioSegment.from_file(path)
        gap = AudioSegment.silent(duration=600)  # 600 ms between sentences
        # NOTE(review): despite the docstring, this does NOT detect sentence
        # boundaries -- it slices the audio into fixed 5-second chunks, so a
        # gap can land mid-word. pydub.silence.split_on_silence would give
        # true pause detection; confirm before changing behavior.
        chunk_ms = 5000
        chunks = [audio[i:i + chunk_ms] for i in range(0, len(audio), chunk_ms)]
        combined = AudioSegment.empty()
        for chunk in chunks:
            combined += chunk + gap
        out = get_temp_filepath(suffix=".mp3")
        combined.export(out, format="mp3")
        logger.info("Story gaps applied β %s", out)
        return out
    except Exception as e:
        logger.warning("Story gap insertion failed (%s) β returning original", e)
        return path
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 177 |
+
# Dialogue script parser
|
| 178 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 179 |
+
|
| 180 |
+
def _parse_dialogue(script: str, tag_a: str, tag_b: str) -> list[tuple[str, str]]:
|
| 181 |
+
"""Parse a HOST_X / DEBATER_X tagged script into (speaker, text) segments."""
|
| 182 |
+
segments: list[tuple[str, str]] = []
|
| 183 |
+
prefix_a = f"{tag_a}:"
|
| 184 |
+
prefix_b = f"{tag_b}:"
|
| 185 |
+
|
| 186 |
+
for line in script.splitlines():
|
| 187 |
+
line = line.strip()
|
| 188 |
+
if line.startswith(prefix_a):
|
| 189 |
+
text = line[len(prefix_a):].strip()
|
| 190 |
+
if text:
|
| 191 |
+
if segments and segments[-1][0] == tag_a:
|
| 192 |
+
segments[-1] = (tag_a, segments[-1][1] + " " + text)
|
| 193 |
+
else:
|
| 194 |
+
segments.append((tag_a, text))
|
| 195 |
+
elif line.startswith(prefix_b):
|
| 196 |
+
text = line[len(prefix_b):].strip()
|
| 197 |
+
if text:
|
| 198 |
+
if segments and segments[-1][0] == tag_b:
|
| 199 |
+
segments[-1] = (tag_b, segments[-1][1] + " " + text)
|
| 200 |
+
else:
|
| 201 |
+
segments.append((tag_b, text))
|
| 202 |
+
return segments
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 206 |
+
# Per-mode audio generators
|
| 207 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 208 |
+
|
| 209 |
+
def generate_audio_podcast(script: str) -> tuple[str, str]:
    """
    Render a two-host podcast script to audio.

    HOST_1 = female (AriaNeural), HOST_2 = male (GuyNeural), both at the
    normal conversational rate, joined with 300 ms silence between turns.
    Falls back to the single-voice pipeline when no HOST tags are found.
    """
    turns = _parse_dialogue(script, "HOST_1", "HOST_2")
    if not turns:
        logger.warning("No HOST tags β falling back to single voice")
        return generate_audio(script)

    speaker_voices = {
        "HOST_1": (EDGE_VOICE_HOST_FEMALE, "+0%"),
        "HOST_2": (EDGE_VOICE_HOST_MALE, "+0%"),
    }
    rendered: list[str] = []
    for speaker, text in turns:
        voice, rate = speaker_voices[speaker]
        try:
            rendered.append(_edge_tts(text, voice=voice, rate=rate))
        except Exception as e:
            # Skip failed turns; the episode still plays with the rest.
            logger.warning("Podcast segment failed %s: %s", speaker, e)

    if not rendered:
        raise RuntimeError("All podcast segments failed.")
    return _concat(rendered, silence_ms=300), "Edge-TTS (Podcast)"
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
def generate_audio_debate(script: str) -> tuple[str, str]:
    """
    Render a two-debater script to audio.

    DEBATER_A = female (JennyNeural) at an assertive +8% rate,
    DEBATER_B = male (DavisNeural) at a deliberate -5% rate,
    joined with 400 ms silence between turns for debate pacing.
    Falls back to the single-voice pipeline when no DEBATER tags are found.
    """
    turns = _parse_dialogue(script, "DEBATER_A", "DEBATER_B")
    if not turns:
        logger.warning("No DEBATER tags β falling back to single voice")
        return generate_audio(script)

    speaker_voices = {
        "DEBATER_A": (EDGE_VOICE_DEBATER_A, DEBATE_RATE_A),
        "DEBATER_B": (EDGE_VOICE_DEBATER_B, DEBATE_RATE_B),
    }
    rendered: list[str] = []
    for speaker, text in turns:
        voice, rate = speaker_voices[speaker]
        try:
            rendered.append(_edge_tts(text, voice=voice, rate=rate))
        except Exception as e:
            # Skip failed turns; the debate still plays with the rest.
            logger.warning("Debate segment failed %s: %s", speaker, e)

    if not rendered:
        raise RuntimeError("All debate segments failed.")
    return _concat(rendered, silence_ms=400), "Edge-TTS (Debate)"
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
def generate_audio_rap(script: str) -> tuple[str, str]:
    """
    Render a rap script: male voice at a fast +40% rate, then a bass
    boost is applied via pydub for extra punch.
    """
    raw_path = _edge_tts(script, voice=EDGE_VOICE_RAP, rate=RAP_RATE)
    return _apply_rap_fx(raw_path), "Edge-TTS (Rap)"
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
def generate_audio_story(script: str) -> tuple[str, str]:
    """
    Render a story script: female voice at a slow -30% rate, then
    sentence gaps are widened via pydub for a storyteller feel.
    """
    raw_path = _edge_tts(script, voice=EDGE_VOICE_STORY, rate=STORY_RATE)
    return _add_story_gaps(raw_path), "Edge-TTS (Story)"
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 283 |
+
# Unified public interface
|
| 284 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 285 |
+
|
| 286 |
+
def generate_audio(text: str, voice_id: str | None = None) -> tuple[str, str]:
    """
    Single-voice TTS for Summary and Song modes.

    Tries Qwen3-TTS first; falls back to Edge-TTS when Qwen is
    unavailable. Returns (audio_path, engine_label).

    Raises:
        ValueError: If text is empty or whitespace-only.
    """
    if not (text and text.strip()):
        raise ValueError("No text provided for audio generation.")

    qwen_path = _qwen_tts(text)
    if qwen_path and os.path.exists(qwen_path):
        return qwen_path, "Qwen3-TTS"

    fallback_voice = voice_id if voice_id else EDGE_VOICE_FEMALE
    return _edge_tts(text, voice=fallback_voice), "Edge-TTS"
|
utils.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
VoiceVerse AI β Utility helpers.
|
| 3 |
+
|
| 4 |
+
Provides temp file management and error formatting
|
| 5 |
+
used across the pipeline.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import os
|
| 9 |
+
import tempfile
|
| 10 |
+
import logging
|
| 11 |
+
|
| 12 |
+
# Configure root logging once at import time; every VoiceVerse module that
# imports utils shares this handler and format.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)
# Shared application logger used across the pipeline modules.
logger = logging.getLogger("voiceverse")
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def get_temp_filepath(suffix: str = ".wav") -> str:
    """Return a path to a new temporary file that won't be auto-deleted."""
    # mkstemp creates the file and returns an open descriptor; close it
    # immediately so downstream code can (re)open the path itself.
    handle, tmp_path = tempfile.mkstemp(suffix=suffix)
    os.close(handle)
    return tmp_path
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def format_error(stage: str, error: Exception) -> str:
    """
    Return a user-friendly error string for a pipeline stage.

    The raw traceback is hidden from the user but logged in full
    for debugging.
    """
    logger.error("Error in %s: %s", stage, error, exc_info=True)

    stage_messages = {
        "upload": "Could not read the uploaded file. Please try a different PDF or TXT file.",
        "rag": "Failed to process the document text. The file may be empty or corrupted.",
        "script": "Could not generate the audio script. Please check your HF_TOKEN and try again.",
        "tts": "Audio generation failed. The system will retry with a fallback voice.",
    }
    return stage_messages.get(stage, f"An unexpected error occurred: {stage}")
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def validate_file(file_path: str) -> tuple[bool, str]:
    """
    Validate an uploaded file path.

    Checks presence, existence on disk, extension (.pdf / .txt), and
    size (non-empty, at most 20 MB).

    Returns:
        (is_valid, message) -- message explains the failure, or confirms
        validity.
    """
    if file_path is None:
        return False, "Please upload a PDF or TXT file first."

    if not os.path.exists(file_path):
        return False, "The uploaded file could not be found. Please try again."

    extension = os.path.splitext(file_path)[1].lower()
    if extension not in (".pdf", ".txt"):
        return False, f"Unsupported file format '{extension}'. Please upload a PDF or TXT file."

    file_size = os.path.getsize(file_path)
    if file_size == 0:
        return False, "The uploaded file is empty. Please upload a file with content."
    if file_size > 20 * 1024 * 1024:  # 20 MB limit
        return False, "File is too large (>20 MB). Please upload a smaller document."

    return True, "File is valid."
|