Isshi14 commited on
Commit
fbe59b1
·
verified ·
1 Parent(s): 7723b85

Upload 9 files

Browse files
Files changed (4) hide show
  1. app.py +364 -239
  2. gitattributes +35 -0
  3. script_gen.py +253 -101
  4. tts.py +245 -157
app.py CHANGED
@@ -1,239 +1,364 @@
1
- """
2
- VoiceVerse AI β€” Main Application.
3
-
4
- Gradio-based UI that orchestrates the full document-to-audio pipeline:
5
- 1. Upload PDF/TXT β†’ extract text
6
- 2. RAG: chunk, embed, retrieve relevant context
7
- 3. Generate a spoken-style script via Mistral-7B-Instruct
8
- 4. Convert script to expressive audio via Qwen TTS / Edge-TTS
9
- 5. Play audio in the browser
10
-
11
- Entry point for Hugging Face Spaces deployment.
12
- """
13
-
14
- import os
15
- import gradio as gr
16
- from utils import logger, validate_file, format_error
17
- from rag import extract_text, RAGStore
18
- from script_gen import generate_script
19
- from tts import generate_audio
20
-
21
- # ── Global RAG Store (single-user demo) ──────────────────────────────────────
22
- rag_store = RAGStore()
23
-
24
-
25
- # ── Pipeline Orchestration ───────────────────────────────────────────────────
26
-
27
- def process_document(file, progress=gr.Progress()):
28
- """
29
- Full pipeline: upload β†’ extract β†’ RAG β†’ script β†’ audio.
30
-
31
- Args:
32
- file: Gradio uploaded file object (has .name attribute)
33
-
34
- Returns:
35
- Tuple of (script_text, audio_file_path, status_message)
36
- """
37
- # ── Step 0: Validate ─────────────────────────────────────────────────
38
- if file is None:
39
- raise gr.Error("Please upload a PDF or TXT file first.")
40
-
41
- file_path = file.name if hasattr(file, "name") else str(file)
42
- is_valid, msg = validate_file(file_path)
43
- if not is_valid:
44
- raise gr.Error(msg)
45
-
46
- try:
47
- # ── Step 1: Extract Text ─────────────────────────────────────────
48
- progress(0.1, desc="πŸ“„ Extracting text from document...")
49
- logger.info("Processing file: %s", file_path)
50
-
51
- text = extract_text(file_path)
52
- if not text or len(text.strip()) < 50:
53
- raise gr.Error(
54
- "The document contains too little text to generate audio. "
55
- "Please upload a document with more content."
56
- )
57
-
58
- progress(0.2, desc="βœ… Text extracted successfully")
59
-
60
- # ── Step 2: RAG β€” Chunk & Embed ──────────────────────────────────
61
- progress(0.3, desc="🧠 Processing document with AI...")
62
- rag_store.add_document(text)
63
-
64
- chunk_count = len(rag_store.chunks)
65
- logger.info("Document processed: %d chunks created", chunk_count)
66
-
67
- # ── Step 3: Retrieve Context ─────────────────────────────────────
68
- progress(0.4, desc="πŸ” Retrieving key content...")
69
-
70
- # For short documents, use all chunks; for longer ones, retrieve smartly
71
- if chunk_count <= 8:
72
- context_chunks = rag_store.get_all_chunks()
73
- else:
74
- context_chunks = rag_store.query(
75
- "What are the main topics, key insights, and important details?",
76
- top_k=6,
77
- )
78
-
79
- progress(0.5, desc="βœ… Context retrieved")
80
-
81
- # ── Step 4: Generate Script ──────────────────────────────────────
82
- progress(0.6, desc="✍️ Writing spoken script...")
83
-
84
- script = generate_script(context_chunks)
85
- logger.info("Script generated: %d characters", len(script))
86
-
87
- progress(0.75, desc="βœ… Script ready")
88
-
89
- # ── Step 5: Generate Audio ───────────────────────────────────────
90
- progress(0.8, desc="πŸŽ™οΈ Generating expressive audio...")
91
-
92
- audio_path, engine = generate_audio(script)
93
- logger.info("Audio generated via %s: %s", engine, audio_path)
94
-
95
- progress(1.0, desc="βœ… Audio ready!")
96
-
97
- # ── Build status message ─────────────────────────────────────────
98
- status = (
99
- f"βœ… **Generation complete!**\n\n"
100
- f"- πŸ“„ Document: {os.path.basename(file_path)}\n"
101
- f"- πŸ“ Text extracted: {len(text):,} characters\n"
102
- f"- 🧩 Chunks created: {chunk_count}\n"
103
- f"- ✍️ Script length: {len(script):,} characters\n"
104
- f"- πŸŽ™οΈ Voice engine: {engine}\n"
105
- )
106
-
107
- return script, audio_path, status
108
-
109
- except gr.Error:
110
- raise # Re-raise Gradio errors as-is
111
- except EnvironmentError as e:
112
- raise gr.Error(str(e))
113
- except Exception as e:
114
- error_msg = format_error("pipeline", e)
115
- raise gr.Error(error_msg)
116
-
117
-
118
- # ── Gradio UI ────────────────────────────────────────────────────────────────
119
-
120
- def build_ui() -> gr.Blocks:
121
- """Build and return the Gradio Blocks interface."""
122
-
123
- # Custom CSS for a clean, polished look
124
- css = """
125
- .main-header {
126
- text-align: center;
127
- margin-bottom: 1rem;
128
- }
129
- .main-header h1 {
130
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
131
- -webkit-background-clip: text;
132
- -webkit-text-fill-color: transparent;
133
- font-size: 2.5rem;
134
- font-weight: 800;
135
- margin-bottom: 0.25rem;
136
- }
137
- .main-header p {
138
- color: #6b7280;
139
- font-size: 1.1rem;
140
- }
141
- .status-box {
142
- border-left: 3px solid #667eea;
143
- padding-left: 1rem;
144
- margin: 0.5rem 0;
145
- }
146
- """
147
-
148
- with gr.Blocks(
149
- title="VoiceVerse AI β€” Document to Audio",
150
- theme=gr.themes.Soft(
151
- primary_hue="indigo",
152
- secondary_hue="purple",
153
- ),
154
- css=css,
155
- ) as app:
156
-
157
- # ── Header ───────────────────────────────────────────────────────
158
- gr.HTML("""
159
- <div class="main-header">
160
- <h1>πŸŽ™οΈ VoiceVerse AI</h1>
161
- <p>Transform your documents into engaging podcast-style audio</p>
162
- </div>
163
- """)
164
-
165
- with gr.Row():
166
- # ── Left Column: Input ───────────────────────────────────────
167
- with gr.Column(scale=1):
168
- gr.Markdown("### πŸ“€ Upload Document")
169
-
170
- file_input = gr.File(
171
- label="Upload a PDF or TXT file",
172
- file_types=[".pdf", ".txt"],
173
- type="filepath",
174
- elem_id="file-upload",
175
- )
176
-
177
- generate_btn = gr.Button(
178
- "πŸŽ™οΈ Generate Audio",
179
- variant="primary",
180
- size="lg",
181
- elem_id="generate-btn",
182
- )
183
-
184
- status_output = gr.Markdown(
185
- value="*Upload a document and click Generate to start.*",
186
- elem_classes=["status-box"],
187
- )
188
-
189
- # ── Right Column: Output ─────────────────────────────────────
190
- with gr.Column(scale=1):
191
- gr.Markdown("### 🎧 Generated Audio")
192
-
193
- audio_output = gr.Audio(
194
- label="Audio Narration",
195
- type="filepath",
196
- elem_id="audio-player",
197
- interactive=False,
198
- )
199
-
200
- gr.Markdown("### ✍️ Generated Script")
201
-
202
- script_output = gr.Textbox(
203
- label="Spoken Script",
204
- lines=12,
205
- max_lines=20,
206
- interactive=False,
207
- placeholder="The generated script will appear here...",
208
- elem_id="script-display",
209
- )
210
-
211
- # ── Wire up the generate button ──────────────────────────────────
212
- generate_btn.click(
213
- fn=process_document,
214
- inputs=[file_input],
215
- outputs=[script_output, audio_output, status_output],
216
- )
217
-
218
- # ── Footer ───────────────────────────────────────────────────────
219
- gr.Markdown(
220
- "<center style='color: #9ca3af; margin-top: 1rem;'>"
221
- "Built with ❀️ using Mistral-7B-Instruct · Qwen3-TTS · Edge-TTS · Gradio"
222
- "</center>"
223
- )
224
-
225
- return app
226
-
227
-
228
- # ── Entry Point ──────────────────────────────────────────────────────────────
229
-
230
- if __name__ == "__main__":
231
- logger.info("Starting VoiceVerse AI...")
232
-
233
- app = build_ui()
234
- app.launch(
235
- server_name="0.0.0.0",
236
- server_port=7860,
237
- share=False,
238
- show_error=True,
239
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ VoiceVerse AI β€” Main Application.
3
+
4
+ Gradio-based UI that orchestrates the full document-to-audio pipeline:
5
+ 1. Upload PDF/TXT β†’ extract text
6
+ 2. RAG: chunk, embed, retrieve relevant context ← UNCHANGED
7
+ 3. Delivery Mode selector routes to mode-specific prompt ← NEW
8
+ 4. Generate a spoken/podcast/song script via SmolLM3-3B
9
+ 5. Convert script to audio via Qwen TTS / Edge-TTS
10
+ 6. Play audio in the browser
11
+
12
+ Delivery Modes:
13
+ - Summary : single-voice structured narration
14
+ - Podcast : two-host dialogue (HOST_1 / HOST_2), dual voice TTS
15
+ - Song / Rap : rhythmic retention content, single voice
16
+
17
+ Entry point for Hugging Face Spaces deployment.
18
+ """
19
+
20
+ import os
21
+ import gradio as gr
22
+ from utils import logger, validate_file, format_error
23
+ from rag import extract_text, RAGStore
24
+ from script_gen import generate_script
25
+ from tts import generate_audio, generate_audio_podcast
26
+
27
+ # ── Global RAG Store (single-user demo) ──────────────────────────────────────
28
+ rag_store = RAGStore()
29
+
30
+
31
+ # ══════════════════════════════════════════════════════════════════════════════
32
+ # Pipeline Orchestration
33
+ # ══════════════════════════════════════════════════════════════════════════════
34
+
35
def process_document(
    file,
    delivery_mode: str,
    song_rap_sub: str,
    progress=gr.Progress(),
):
    """
    Run the full document-to-audio pipeline:
    upload -> extract text -> RAG index/retrieve -> mode-specific script -> TTS.

    Args:
        file: Gradio uploaded file object (has a ``.name`` attribute) or a path.
        delivery_mode: "Summary" | "Podcast" | "Song / Rap".
        song_rap_sub: "Song" | "Rap" — only consulted in Song/Rap mode.
        progress: Gradio progress tracker (injected by Gradio at call time).

    Returns:
        Tuple of ``(script_text, audio_file_path, status_markdown)``.

    Raises:
        gr.Error: for validation failures and any wrapped pipeline error.
    """
    # Guard clauses: reject missing or invalid uploads before doing any work.
    if file is None:
        raise gr.Error("Please upload a PDF or TXT file first.")

    file_path = file.name if hasattr(file, "name") else str(file)
    ok, validation_msg = validate_file(file_path)
    if not ok:
        raise gr.Error(validation_msg)

    try:
        # Stage 1 — pull raw text out of the uploaded document.
        progress(0.10, desc="πŸ“„ Extracting text from document…")
        logger.info("Processing file: %s | mode: %s", file_path, delivery_mode)

        text = extract_text(file_path)
        # Fewer than 50 meaningful characters is not enough to narrate.
        if not text or len(text.strip()) < 50:
            raise gr.Error(
                "The document contains too little text. "
                "Please upload a document with more content."
            )
        progress(0.20, desc="βœ… Text extracted")

        # Stage 2 — chunk & embed into the shared RAG store (pipeline unchanged).
        progress(0.30, desc="🧠 Building knowledge index…")
        rag_store.add_document(text)
        num_chunks = len(rag_store.chunks)
        logger.info("RAG index built: %d chunks", num_chunks)

        # Stage 3 — retrieve context: small docs use every chunk, larger ones
        # go through semantic retrieval so the prompt stays focused.
        progress(0.40, desc="πŸ” Retrieving relevant content…")
        if num_chunks <= 8:
            context_chunks = rag_store.get_all_chunks()
        else:
            context_chunks = rag_store.query(
                "What are the main topics, key insights, and important details?",
                top_k=6,
            )
        progress(0.50, desc="βœ… Context retrieved")

        # Stage 4 — mode-aware script generation.
        progress(
            0.60,
            desc=f"✍️ Writing {_mode_progress_label(delivery_mode, song_rap_sub)} script…",
        )
        script = generate_script(
            context_chunks=context_chunks,
            mode=delivery_mode,
            sub_mode=song_rap_sub,
        )
        logger.info("Script generated: %d chars", len(script))
        progress(0.75, desc="βœ… Script ready")

        # Stage 5 — mode-aware audio synthesis (podcast uses the dual-voice path).
        progress(0.80, desc="πŸŽ™οΈ Synthesising audio…")
        if delivery_mode.strip().lower() == "podcast":
            audio_path, engine = generate_audio_podcast(script)
        else:
            audio_path, engine = generate_audio(script)
        logger.info("Audio generated via %s: %s", engine, audio_path)
        progress(1.00, desc="βœ… Done!")

        # Assemble the Markdown status card shown under the Generate button.
        lowered_mode = delivery_mode.lower()
        mode_icon = {"summary": "πŸ“‹", "podcast": "πŸŽ™οΈ", "song / rap": "🎡"}.get(
            lowered_mode, "🎧"
        )
        sub_suffix = (
            f" β€” {song_rap_sub}"
            if "song" in lowered_mode or "rap" in lowered_mode
            else ""
        )
        status = (
            f"### βœ… Generation complete!\n\n"
            f"| | |\n|---|---|\n"
            f"| {mode_icon} **Mode** | {delivery_mode}{sub_suffix} |\n"
            f"| πŸ“„ **Document** | {os.path.basename(file_path)} |\n"
            f"| 🧩 **Chunks** | {num_chunks} |\n"
            f"| ✍️ **Script length** | {len(script):,} chars |\n"
            f"| πŸ”Š **Voice engine** | {engine} |\n"
        )
        return script, audio_path, status

    except gr.Error:
        raise  # already user-facing; pass through untouched
    except EnvironmentError as e:
        # Missing tokens / misconfiguration — surface the message verbatim.
        raise gr.Error(str(e))
    except Exception as e:
        # Anything else gets normalised into a friendly pipeline error.
        raise gr.Error(format_error("pipeline", e))
143
+
144
+
145
+ def _mode_progress_label(mode: str, sub_mode: str) -> str:
146
+ m = mode.lower()
147
+ if "podcast" in m:
148
+ return "podcast"
149
+ if "song" in m or "rap" in m:
150
+ return sub_mode.lower()
151
+ return "summary"
152
+
153
+
154
+ # ══════════════════════════════════════════════════════════════════════════════
155
+ # Conditional UI visibility helpers
156
+ # ══════════════════════════════════════════════════════════════════════════════
157
+
158
def _on_mode_change(mode: str):
    """
    Visibility update for the Song/Rap sub-controls.

    Fired when the delivery-mode radio changes; the sub-style row is shown
    only while a Song/Rap mode is selected.
    """
    wants_sub_controls = any(key in mode.lower() for key in ("song", "rap"))
    return gr.update(visible=wants_sub_controls)
165
+
166
+
167
+ # ══════════════════════════════════════════════════════════════════════════════
168
+ # Gradio UI
169
+ # ══════════════════════════════════════════════════════════════════════════════
170
+
171
def build_ui() -> gr.Blocks:
    """
    Build and return the Gradio Blocks interface.

    Layout: header, then a two-column row (left: upload + delivery-mode
    selector + generate button + status; right: audio player + script),
    then a footer. Event wiring: the mode radio toggles the Song/Rap
    sub-controls and description; the generate button runs the pipeline.

    Fix vs. previous revision: the mode-selector "card" was emitted as two
    separate ``gr.HTML('<div …>')`` / ``gr.HTML('</div>')`` components.
    Gradio renders each HTML snippet as an isolated component, so the div
    never actually wrapped the radio/description and the ``.mode-card`` CSS
    never applied. A real container — ``gr.Group(elem_classes=["mode-card"])``
    used as a context manager — wraps the components correctly.
    """
    css = """
    /* ── Header ─────────────────────────────────────────────── */
    .main-header { text-align: center; margin-bottom: 1rem; }
    .main-header h1 {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
        font-size: 2.5rem;
        font-weight: 800;
        margin-bottom: 0.25rem;
    }
    .main-header p { color: #6b7280; font-size: 1.1rem; }

    /* ── Mode selector card ──────────────────────────────────── */
    .mode-card {
        background: linear-gradient(135deg, #f8f7ff 0%, #f0edff 100%);
        border: 1px solid #e0d9ff;
        border-radius: 12px;
        padding: 1rem 1.25rem;
        margin-top: 0.5rem;
    }
    .mode-card h3 { color: #4c3d99; margin-bottom: 0.5rem; }

    /* ── Status ──────────────────────────────────────────────── */
    .status-box {
        border-left: 3px solid #667eea;
        padding-left: 1rem;
        margin: 0.5rem 0;
    }

    /* ── Sub-mode row ────────────────────────────────────────── */
    .sub-mode-row { margin-top: 0.5rem; }
    """

    with gr.Blocks(
        title="VoiceVerse AI β€” Document to Audio",
        theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="purple"),
        css=css,
    ) as app:

        # ── Header ───────────────────────────────────────────────────────
        gr.HTML("""
            <div class="main-header">
                <h1>πŸŽ™οΈ VoiceVerse AI</h1>
                <p>Transform your documents into engaging audio experiences</p>
            </div>
        """)

        with gr.Row(equal_height=False):

            # ── LEFT COLUMN — upload + mode selector ─────────────────────
            with gr.Column(scale=1):

                gr.Markdown("### πŸ“€ Upload Document")
                file_input = gr.File(
                    label="Upload a PDF or TXT file",
                    file_types=[".pdf", ".txt"],
                    type="filepath",
                )

                # Real container so the .mode-card CSS wraps its children
                # (gr.HTML div fragments do not nest other components).
                with gr.Group(elem_classes=["mode-card"]):
                    gr.Markdown("### 🎨 Choose Audio Experience")

                    delivery_mode = gr.Radio(
                        choices=["Summary", "Podcast", "Song / Rap"],
                        value="Summary",
                        label=None,
                        elem_id="delivery-mode-radio",
                    )

                    # Song/Rap sub-style — hidden unless Song/Rap is selected.
                    with gr.Row(visible=False, elem_classes=["sub-mode-row"]) as song_rap_row:
                        song_rap_sub = gr.Radio(
                            choices=["Song", "Rap"],
                            value="Rap",
                            label="Style",
                            scale=1,
                        )

                    # One-line blurb describing the selected mode.
                    mode_description = gr.Markdown(
                        value=_mode_description("Summary"),
                        elem_id="mode-desc",
                    )

                generate_btn = gr.Button(
                    "πŸŽ™οΈ Generate Audio",
                    variant="primary",
                    size="lg",
                )

                status_output = gr.Markdown(
                    value="*Upload a document, choose your audio experience, then click Generate.*",
                    elem_classes=["status-box"],
                )

            # ── RIGHT COLUMN — audio + script output ─────────────────────
            with gr.Column(scale=1):

                gr.Markdown("### 🎧 Generated Audio")
                audio_output = gr.Audio(
                    label="Audio",
                    type="filepath",
                    interactive=False,
                )

                gr.Markdown("### ✍️ Generated Script")
                script_output = gr.Textbox(
                    label="Script",
                    lines=14,
                    max_lines=22,
                    interactive=False,
                    placeholder="The generated script will appear here…",
                )

        # ── Footer ───────────────────────────────────────────────────────
        gr.Markdown(
            "<center style='color:#9ca3af;margin-top:1rem;'>"
            "Built with ❀️ using SmolLM3-3B · Qwen3-TTS · Edge-TTS · Gradio"
            "</center>"
        )

        # ── Event wiring ─────────────────────────────────────────────────
        # Mode change: toggle sub-style row + refresh the description blurb.
        delivery_mode.change(
            fn=_on_mode_change_full,
            inputs=[delivery_mode],
            outputs=[song_rap_row, mode_description],
        )

        # Generate: run the full pipeline.
        generate_btn.click(
            fn=process_document,
            inputs=[file_input, delivery_mode, song_rap_sub],
            outputs=[script_output, audio_output, status_output],
        )

    return app
324
+
325
+
326
+ # ── Mode description helper ───────────────────────────────────────────────────
327
+
328
+ def _mode_description(mode: str) -> str:
329
+ descriptions = {
330
+ "Summary": (
331
+ "*πŸ“‹ **Summary** β€” A clear, structured spoken narration covering "
332
+ "the intro, key points, and conclusion. Single voice, neutral tone.*"
333
+ ),
334
+ "Podcast": (
335
+ "*πŸŽ™οΈ **Podcast** β€” A two-host conversation. Host 1 guides and "
336
+ "asks questions; Host 2 explains and elaborates. Dual voices.*"
337
+ ),
338
+ "Song / Rap": (
339
+ "*🎡 **Song / Rap** β€” Key ideas transformed into a rhythmic, "
340
+ "memorable format. Choose Song for smooth flow or Rap for punchy lines.*"
341
+ ),
342
+ }
343
+ return descriptions.get(mode, "")
344
+
345
+
346
def _on_mode_change_full(mode: str):
    """Return (Song/Rap row visibility update, mode description markdown)."""
    sub_row_visible = any(token in mode.lower() for token in ("song", "rap"))
    return gr.update(visible=sub_row_visible), _mode_description(mode)
350
+
351
+
352
+ # ══════════════════════════════════════════════════════════════════════════════
353
+ # Entry point
354
+ # ══════════════════════════════════════════════════════════════════════════════
355
+
356
if __name__ == "__main__":
    # Entry point for Hugging Face Spaces: bind all interfaces on the
    # standard Spaces port so the container's health check can reach it.
    logger.info("Starting VoiceVerse AI…")
    voiceverse = build_ui()
    voiceverse.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
    )
gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
script_gen.py CHANGED
@@ -4,13 +4,19 @@ VoiceVerse AI β€” Script Generation Module.
4
  Generates spoken-style scripts from retrieved document chunks
5
  using SmolLM3-3B via the Hugging Face Inference API.
6
 
 
 
 
 
 
 
 
 
7
  Design decisions:
8
- - Serverless HF Inference API avoids loading a large model locally
9
- - SmolLM3-3B is deployed on the free hf-inference provider
10
- - Prompt template enforces podcast/narration structure
11
- - Max 1024 new tokens keeps scripts a reasonable length for TTS
12
- - Temperature 0.4 keeps output grounded and factual
13
- - Post-processing strips markdown/XML artifacts for clean TTS
14
  """
15
 
16
  import os
@@ -18,157 +24,303 @@ import re
18
  from huggingface_hub import InferenceClient
19
  from utils import logger
20
 
21
- # β€” Configuration β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
22
- MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
23
- MAX_NEW_TOKENS = 1024
24
- TEMPERATURE = 0.4
25
 
26
- # β€” Prompt Template β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
 
 
27
 
28
- SYSTEM_PROMPT = """You are a podcast host narrating content to listeners. Convert the provided document content into a smooth, flowing spoken narration.
29
 
30
- CRITICAL RULES:
31
- 1. ONLY use facts, ideas, and information from the provided content. Do NOT add outside knowledge or invent details.
32
- 2. Write as one continuous flowing narration. Do NOT use section headings, labels, or titles like "Intro", "Conclusion", "Section 1", etc.
33
- 3. Use smooth spoken transitions between topics instead of headings. For example say "Now let's talk about..." or "Moving on to..." or "Here's where it gets interesting..."
34
- 4. Write in plain text only. No markdown, no bullet points, no asterisks, no hashtags, no HTML/XML tags.
35
- 5. Write naturally as if speaking aloud to a listener. Use short sentences and conversational language.
36
- 6. Never say "the document says" or "according to the text". Speak as the expert.
37
- 7. If the content is limited, keep the script short rather than inventing information.
38
- 8. Do NOT include any labels, headers, or structural markers. The output should read like someone is talking without breaks.
39
- 9. Output ONLY the spoken narration text, nothing else."""
40
 
41
- USER_PROMPT_TEMPLATE = """Here is the document content to convert into a spoken podcast script:
 
 
42
 
43
- --- CONTENT ---
 
 
 
 
 
 
 
 
 
 
 
44
  {context}
45
- --- END ---
46
 
47
- Topic: {topic}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
- Now write ONLY the spoken script based strictly on the content above. Do not add information that is not in the content."""
 
50
 
 
 
51
 
52
- # β€” Post-processing β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
- def _clean_script_for_tts(text: str) -> str:
 
 
 
 
 
55
  """
56
- Remove markdown, XML/HTML tags, and other artifacts that would be
57
- read aloud by TTS engines.
58
  """
59
- # Remove <think>...</think> blocks entirely (SmolLM3 reasoning traces)
60
- text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
61
 
62
- # Remove any remaining XML/HTML-style tags
63
- text = re.sub(r'<[^>]+>', '', text)
64
 
65
- # Remove markdown headers (# ## ### etc.)
66
- text = re.sub(r'^#{1,6}\s+', '', text, flags=re.MULTILINE)
67
 
68
- # Remove markdown bold/italic markers
69
- text = re.sub(r'\*{1,3}([^*]+)\*{1,3}', r'\1', text)
70
- text = re.sub(r'_{1,3}([^_]+)_{1,3}', r'\1', text)
71
 
72
- # Remove markdown links [text](url) -> text
73
- text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
74
 
75
- # Remove markdown code blocks and inline code
76
- text = re.sub(r'```[^`]*```', '', text, flags=re.DOTALL)
77
- text = re.sub(r'`([^`]+)`', r'\1', text)
78
 
79
- # Remove bullet point markers
80
- text = re.sub(r'^[\s]*[-*+]\s+', '', text, flags=re.MULTILINE)
 
81
 
82
- # Remove numbered list markers
83
- text = re.sub(r'^[\s]*\d+\.\s+', '', text, flags=re.MULTILINE)
 
84
 
85
- # Remove blockquote markers
86
- text = re.sub(r'^>\s+', '', text, flags=re.MULTILINE)
 
87
 
88
- # Remove horizontal rules
89
- text = re.sub(r'^[-*_]{3,}\s*$', '', text, flags=re.MULTILINE)
90
 
91
- # Collapse multiple newlines into one
92
- text = re.sub(r'\n{3,}', '\n\n', text)
93
 
94
- # Collapse multiple spaces
95
- text = re.sub(r' {2,}', ' ', text)
96
 
97
- return text.strip()
98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
 
100
- # β€” Script Generation β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
 
 
 
 
 
 
 
 
 
 
 
101
 
102
  def _get_client() -> InferenceClient:
103
- """Create an HF Inference client with the user's token."""
104
  token = os.environ.get("HF_TOKEN")
105
  if not token:
106
  raise EnvironmentError(
107
  "HF_TOKEN environment variable is not set. "
108
- "Please set your Hugging Face API token to use the script generation feature."
109
  )
110
- return InferenceClient(
111
- provider="hf-inference",
112
- token=token,
 
 
 
 
 
 
 
 
 
 
 
 
113
  )
 
 
 
 
114
 
115
 
 
 
 
 
116
  def generate_script(
117
  context_chunks: list[str],
 
 
118
  topic: str = "the key ideas and insights from this document",
119
  ) -> str:
120
  """
121
- Generate a spoken-style podcast script from retrieved document chunks.
122
 
123
  Args:
124
- context_chunks: List of relevant text chunks from the RAG store
125
- topic: Optional focus topic for the script
 
 
 
126
 
127
  Returns:
128
- A spoken script string ready for TTS conversion
 
129
  """
130
  if not context_chunks:
131
  raise ValueError("No document context provided. Please upload a document first.")
132
 
133
- # Combine chunks into a single context block
134
  context = "\n\n".join(context_chunks)
135
-
136
- # Truncate if too long
137
- max_context_chars = 6000
138
- if len(context) > max_context_chars:
139
- context = context[:max_context_chars]
140
- logger.warning("Context truncated to %d characters", max_context_chars)
141
-
142
- # Build the prompt
143
- user_message = USER_PROMPT_TEMPLATE.format(context=context, topic=topic)
144
-
145
- logger.info("Generating script via %s (context: %d chars, topic: '%s')",
146
- MODEL_ID, len(context), topic[:50])
147
-
148
- client = _get_client()
149
-
150
- # Call the model using chat_completion
151
- response = client.chat_completion(
152
- model=MODEL_ID,
153
- messages=[
154
- {"role": "system", "content": SYSTEM_PROMPT},
155
- {"role": "user", "content": user_message},
156
- ],
157
- max_tokens=MAX_NEW_TOKENS,
158
- temperature=TEMPERATURE,
159
- top_p=0.9,
160
  )
161
 
162
- raw_script = response.choices[0].message.content.strip()
 
163
 
164
- if not raw_script:
165
- raise RuntimeError("The model returned an empty script. Please try again.")
 
 
 
 
166
 
167
- # Clean the script for TTS (remove markdown, XML tags, etc.)
168
- script = _clean_script_for_tts(raw_script)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
  if not script:
171
  raise RuntimeError("Script was empty after cleaning. Please try again.")
172
 
173
- logger.info("Script generated: %d chars (raw: %d chars)", len(script), len(raw_script))
174
- return script
 
4
  Generates spoken-style scripts from retrieved document chunks
5
  using SmolLM3-3B via the Hugging Face Inference API.
6
 
7
+ Delivery Modes:
8
+ - Summary : Single-speaker structured narration
9
+ - Podcast : Two-host dialogue (HOST_1 / HOST_2 tags)
10
+ - Song / Rap : Rhythmic retention-style content
11
+
12
+ The core RAG pipeline (rag.py) is NOT modified.
13
+ Only this generation stage switches behaviour based on `mode`.
14
+
15
  Design decisions:
16
+ - generate_script() is the single public entry point
17
+ - Each mode has its own system + user prompt pair
18
+ - Post-processing cleans markdown / XML artifacts for TTS
19
+ - Podcast mode preserves HOST_1 / HOST_2 tags (tts.py splits on them)
 
 
20
  """
21
 
22
  import os
 
24
  from huggingface_hub import InferenceClient
25
  from utils import logger
26
 
27
# ── Configuration ────────────────────────────────────────────────────────────

# Chat model served through the HF Inference API (see _call_llm).
MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
# Generation cap per request; scripts longer than this get cut by the model.
MAX_NEW_TOKENS = 1200
# Mild creativity; output must stay grounded in the provided source chunks.
TEMPERATURE = 0.5


# ══════════════════════════════════════════════════════════════════════════════
# Mode A β€” Summary
# ══════════════════════════════════════════════════════════════════════════════

# System prompt: narrator persona + grounding rules. Output is fed straight
# to TTS, hence the "plain text only" constraints.
_SUMMARY_SYSTEM = """\
You are a professional narrator. Your task is to produce a clear, structured \
spoken summary strictly grounded in the provided source material.

RULES:
1. Use ONLY facts present in the source. Do NOT add outside knowledge.
2. Structure: a short introduction, key points spoken as natural sentences, \
then a concise conclusion.
3. Write in plain text only β€” no markdown, no bullet symbols, no headers.
4. Write for the ear: short sentences, conversational language.
5. Never say "the document says" or "according to the text". Speak as the expert.
6. Output ONLY the spoken narration text, nothing else.\
"""

# User prompt: {context} is filled with the concatenated RAG chunks.
_SUMMARY_USER = """\
SOURCE MATERIAL:
{context}

Write a spoken summary that flows naturally. Cover the introduction, the key \
points, and a short conclusion β€” all in plain spoken sentences without headings \
or labels.\
"""


# ══════════════════════════════════════════════════════════════════════════════
# Mode B β€” Podcast (Multi-Host)
# ══════════════════════════════════════════════════════════════════════════════

# System prompt: enforces the HOST_1:/HOST_2: line format that tts.py's
# podcast parser splits on. Any untagged line is dropped downstream.
_PODCAST_SYSTEM = """\
You are a podcast script writer. Produce an engaging two-host conversation \
strictly grounded in the provided source material.

STRICT OUTPUT FORMAT β€” every line must start with a speaker tag:
HOST_1: <what Host 1 says>
HOST_2: <what Host 2 says>

RULES:
1. Alternate HOST_1 and HOST_2 throughout. Never have the same host speak twice in a row.
2. HOST_1 introduces topics, asks questions, and guides the conversation.
3. HOST_2 explains concepts, provides detail, and answers HOST_1's questions.
4. Use ONLY information present in the source material. No hallucination.
5. Tone: conversational, curious, engaging β€” like a real podcast.
6. Do NOT add lines that are not prefixed with HOST_1: or HOST_2:.
7. No markdown, no stage directions, no asterisks.
8. Aim for 16–24 exchanges (lines) so the conversation feels substantial.\
"""

# User prompt for podcast mode; {context} is filled with the RAG chunks.
_PODCAST_USER = """\
SOURCE MATERIAL:
{context}

Write the full podcast conversation. Every single line must start with either \
HOST_1: or HOST_2: β€” no exceptions.\
"""


# ══════════════════════════════════════════════════════════════════════════════
# Mode C β€” Song / Rap (Retention Mode)
# ══════════════════════════════════════════════════════════════════════════════

# Song variant: section labels ([CHORUS], [VERSE n]) are kept by the cleaner
# because they aid TTS pacing when read aloud.
_SONG_SYSTEM = """\
You are a creative lyricist. Your task has two steps:

STEP 1 β€” silently extract 5 to 7 key ideas from the source material.
STEP 2 β€” turn those key ideas into a smooth, melodic SONG.

SONG RULES:
- Simple, memorable language.
- Rhyming couplets or AABB scheme.
- Include a CHORUS (label it [CHORUS]) that repeats the main concept.
- Label verses [VERSE 1], [VERSE 2], etc.
- Short lines (6–10 words each).
- Use repetition to aid retention.
- Do NOT invent facts not in the source.
- Output ONLY the song lyrics with section labels. No explanations.\
"""

# Rap variant: same two-step structure, different form/labels ([HOOK]).
_RAP_SYSTEM = """\
You are a creative lyricist. Your task has two steps:

STEP 1 β€” silently extract 5 to 7 key ideas from the source material.
STEP 2 β€” turn those key ideas into a punchy, rhythmic RAP.

RAP RULES:
- Short, punchy lines (5–8 words each).
- Fast-flow rhyme scheme (AABB or ABAB).
- Include a HOOK (label it [HOOK]) that repeats the main concept.
- Label verses [VERSE 1], [VERSE 2], etc.
- Use repetition and wordplay to aid retention.
- Do NOT invent facts not in the source.
- Output ONLY the rap lyrics with section labels. No explanations.\
"""

# Shared user prompt: {form} is the lowercase sub-mode ("song" or "rap").
_SONG_RAP_USER = """\
SOURCE MATERIAL:
{context}

Extract the key ideas, then write the full {form} based strictly on those ideas.\
"""
137
 
138
+
139
+ # ══════════════════════════════════════════════════════════════════════════════
140
+ # Post-processing
141
+ # ══════════════════════════════════════════════════════════════════════════════
142
+
143
+ def _clean_for_tts(text: str, preserve_host_tags: bool = False) -> str:
144
  """
145
+ Remove markdown and XML/HTML artifacts that TTS engines would read aloud.
146
+ When preserve_host_tags=True, HOST_1: / HOST_2: prefixes are kept intact.
147
  """
148
+ # Remove <think>…</think> reasoning traces (SmolLM3)
149
+ text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
150
 
151
+ # Remove remaining XML/HTML tags (but NOT HOST_1/HOST_2 lines)
152
+ text = re.sub(r"<[^>]+>", "", text)
153
 
154
+ # Markdown headers
155
+ text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE)
156
 
157
+ # Bold / italic
158
+ text = re.sub(r"\*{1,3}([^*]+)\*{1,3}", r"\1", text)
159
+ text = re.sub(r"_{1,3}([^_]+)_{1,3}", r"\1", text)
160
 
161
+ # Links
162
+ text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)
163
 
164
+ # Code blocks
165
+ text = re.sub(r"```[^`]*```", "", text, flags=re.DOTALL)
166
+ text = re.sub(r"`([^`]+)`", r"\1", text)
167
 
168
+ # Bullet / numbered lists
169
+ text = re.sub(r"^[\s]*[-*+]\s+", "", text, flags=re.MULTILINE)
170
+ text = re.sub(r"^[\s]*\d+\.\s+", "", text, flags=re.MULTILINE)
171
 
172
+ # Blockquotes / horizontal rules
173
+ text = re.sub(r"^>\s+", "", text, flags=re.MULTILINE)
174
+ text = re.sub(r"^[-*_]{3,}\s*$", "", text, flags=re.MULTILINE)
175
 
176
+ # Collapse whitespace
177
+ text = re.sub(r"\n{3,}", "\n\n", text)
178
+ text = re.sub(r" {2,}", " ", text)
179
 
180
+ return text.strip()
 
181
 
 
 
182
 
183
def _clean_summary(text: str) -> str:
    """Strip markdown/XML artifacts from a summary script (no speaker tags)."""
    cleaned = _clean_for_tts(text, preserve_host_tags=False)
    return cleaned
185
 
 
186
 
187
def _clean_podcast(text: str) -> str:
    """
    Clean a podcast script while keeping the HOST_1: / HOST_2: speaker
    prefixes that tts.py relies on for voice splitting.
    """
    text = _clean_for_tts(text, preserve_host_tags=True)

    # Canonicalise tag spellings: "Host 1:", "host_1:", "HOST1:" β†’ "HOST_1:"
    for n in ("1", "2"):
        text = re.sub(rf"(?i)\bhost[\s_-]*{n}\s*:", f"HOST_{n}:", text)

    # Drop any non-blank line that carries no speaker tag (stray stage
    # directions and similar clutter).
    kept = []
    for raw_line in text.splitlines():
        stripped = raw_line.strip()
        if not stripped or stripped.startswith(("HOST_1:", "HOST_2:")):
            kept.append(raw_line)
    return "\n".join(kept).strip()
205
 
206
+
207
def _clean_song_rap(text: str) -> str:
    """
    Strip artifacts from song/rap lyrics before TTS.

    Section labels such as [VERSE 1], [CHORUS] and [HOOK] survive cleaning
    (square brackets are never stripped) and help TTS pacing when read
    aloud, so they are deliberately kept.
    """
    lyrics = _clean_for_tts(text, preserve_host_tags=False)
    return lyrics
213
+
214
+
215
+ # ══════════════════════════════════════════════════════════════════════════════
216
+ # LLM Client
217
+ # ══════════════════════════════════════════════════════════════════════════════
218
 
219
def _get_client() -> InferenceClient:
    """Build an InferenceClient from the HF_TOKEN env var (Space secret)."""
    token = os.environ.get("HF_TOKEN")
    if token:
        return InferenceClient(provider="hf-inference", token=token)
    raise EnvironmentError(
        "HF_TOKEN environment variable is not set. "
        "Please add your Hugging Face API token as a Space secret."
    )
227
+
228
+
229
def _call_llm(system_prompt: str, user_prompt: str) -> str:
    """Send a chat completion request and return the raw response text."""
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    reply = _get_client().chat_completion(
        model=MODEL_ID,
        messages=messages,
        max_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        top_p=0.9,
    )
    content = reply.choices[0].message.content.strip()
    if not content:
        raise RuntimeError("The model returned an empty response. Please try again.")
    return content
246
 
247
 
248
+ # ══════════════════════════════════════════════════════════════════════════════
249
+ # Public Entry Point
250
+ # ══════════════════════════════════════════════════════════════════════════════
251
+
252
def generate_script(
    context_chunks: list[str],
    mode: str = "Summary",
    sub_mode: str = "Rap",
    topic: str = "the key ideas and insights from this document",
) -> str:
    """
    Generate a spoken script from retrieved RAG chunks.

    Args:
        context_chunks : Chunks returned by RAGStore.query() β€” NOT modified here.
        mode           : "Summary" | "Podcast" | "Song / Rap"
        sub_mode       : "Song" | "Rap" (only used when mode == "Song / Rap")
        topic          : Optional human-readable topic label (kept for API
                         compatibility; not interpolated into the prompts).

    Returns:
        A clean string ready to hand to tts.generate_audio().
        Podcast mode preserves HOST_1: / HOST_2: prefixes.

    Raises:
        ValueError   : If context_chunks is empty.
        RuntimeError : If the model output is empty after cleaning.
    """
    if not context_chunks:
        raise ValueError("No document context provided. Please upload a document first.")

    # ── Combine & truncate context ───────────────────────────────────────────
    context = "\n\n".join(context_chunks)
    max_ctx = 6000
    if len(context) > max_ctx:
        context = context[:max_ctx]
        logger.warning("Context truncated to %d chars for LLM call.", max_ctx)

    logger.info(
        "Generating script | mode=%s sub_mode=%s context=%d chars",
        mode, sub_mode, len(context),
    )

    # ── Route to the correct prompt pair ────────────────────────────────────
    mode_key = mode.strip().lower()

    if mode_key == "podcast":
        raw = _call_llm(
            _PODCAST_SYSTEM,
            _PODCAST_USER.format(context=context),
        )
        script = _clean_podcast(raw)

    elif "song" in mode_key or "rap" in mode_key:
        form = sub_mode.lower()  # "song" or "rap"
        system = _SONG_SYSTEM if form == "song" else _RAP_SYSTEM
        raw = _call_llm(
            system,
            _SONG_RAP_USER.format(context=context, form=form),
        )
        script = _clean_song_rap(raw)

    else:
        # Summary is both the explicit mode and the fallback for any unknown
        # mode string, so the pipeline never crashes on a bad value.
        if mode_key != "summary":
            logger.warning("Unknown mode '%s' β€” falling back to Summary.", mode)
        raw = _call_llm(
            _SUMMARY_SYSTEM,
            _SUMMARY_USER.format(context=context),
        )
        script = _clean_summary(raw)

    if not script:
        raise RuntimeError("Script was empty after cleaning. Please try again.")

    logger.info("Script ready: %d chars (raw %d chars)", len(script), len(raw))
    return script
tts.py CHANGED
@@ -1,157 +1,245 @@
1
- """
2
- VoiceVerse AI β€” Voice Generation Module (TTS).
3
-
4
- Converts generated scripts into emotionally expressive audio.
5
-
6
- Primary: Qwen3-TTS via HF Inference API (expressive, emotional)
7
- Fallback: Edge-TTS (Microsoft neural voices, CPU-only, reliable)
8
-
9
- Design decisions:
10
- - Qwen3-TTS is called through the Inference API (needs GPU, can't run locally on free tier)
11
- - Edge-TTS is the demo-safe fallback β€” runs on CPU, no API key needed
12
- - Architecture accepts a voice_id parameter for future multi-voice support
13
- - Audio is saved as WAV for maximum compatibility
14
- """
15
-
16
- import os
17
- import asyncio
18
- import tempfile
19
- from utils import logger, get_temp_filepath
20
-
21
- # ── Configuration ────────────────────────────────────────────────────────────
22
-
23
- QWEN_TTS_MODEL = "Qwen/Qwen3-TTS"
24
- EDGE_TTS_VOICE = "en-US-AriaNeural" # Expressive female neural voice
25
-
26
- # Chunk size for TTS (too-long text can cause issues)
27
- TTS_MAX_CHARS = 3000
28
-
29
-
30
- # ── Qwen TTS (Primary β€” via HF Inference API) ───────────────────────────────
31
-
32
- def generate_audio_qwen(text: str, voice_id: str | None = None) -> str | None:
33
- """
34
- Generate audio using Qwen3-TTS via the HF Inference API.
35
-
36
- Args:
37
- text: The script text to convert to speech
38
- voice_id: Reserved for future multi-voice support
39
-
40
- Returns:
41
- Path to the generated audio file, or None if failed
42
- """
43
- token = os.environ.get("HF_TOKEN")
44
- if not token:
45
- logger.warning("HF_TOKEN not set β€” skipping Qwen TTS")
46
- return None
47
-
48
- try:
49
- from huggingface_hub import InferenceClient
50
-
51
- client = InferenceClient(token=token)
52
- logger.info("Calling Qwen3-TTS API (%d chars)...", len(text))
53
-
54
- # Truncate if needed
55
- tts_text = text[:TTS_MAX_CHARS] if len(text) > TTS_MAX_CHARS else text
56
-
57
- # Call the TTS endpoint
58
- audio_bytes = client.text_to_speech(
59
- text=tts_text,
60
- model=QWEN_TTS_MODEL,
61
- )
62
-
63
- if audio_bytes and len(audio_bytes) > 0:
64
- output_path = get_temp_filepath(suffix=".wav")
65
- with open(output_path, "wb") as f:
66
- f.write(audio_bytes)
67
- logger.info("Qwen TTS audio saved: %s (%d bytes)", output_path, len(audio_bytes))
68
- return output_path
69
- else:
70
- logger.warning("Qwen TTS returned empty audio")
71
- return None
72
-
73
- except Exception as e:
74
- logger.warning("Qwen TTS failed: %s β€” will fall back to Edge-TTS", e)
75
- return None
76
-
77
-
78
- # ── Edge TTS (Fallback β€” CPU-only, no API key) ──────────────────────────────
79
-
80
- def generate_audio_edge(text: str, voice_id: str | None = None) -> str:
81
- """
82
- Generate audio using Edge-TTS (Microsoft neural voices).
83
- Runs entirely on CPU, no API key required.
84
-
85
- Args:
86
- text: The script text to convert to speech
87
- voice_id: Edge-TTS voice name (default: en-US-AriaNeural)
88
-
89
- Returns:
90
- Path to the generated audio file
91
- """
92
- import edge_tts
93
-
94
- voice = voice_id or EDGE_TTS_VOICE
95
- output_path = get_temp_filepath(suffix=".mp3")
96
-
97
- # Truncate if needed
98
- tts_text = text[:TTS_MAX_CHARS] if len(text) > TTS_MAX_CHARS else text
99
-
100
- logger.info("Generating audio via Edge-TTS (voice: %s, %d chars)...", voice, len(tts_text))
101
-
102
- # Edge-TTS is async, so we need to run it in an event loop
103
- async def _generate():
104
- communicate = edge_tts.Communicate(tts_text, voice)
105
- await communicate.save(output_path)
106
-
107
- # Handle event loop β€” works whether called from sync or async context
108
- try:
109
- loop = asyncio.get_event_loop()
110
- if loop.is_running():
111
- # We're inside an existing event loop (e.g., Gradio)
112
- import concurrent.futures
113
- with concurrent.futures.ThreadPoolExecutor() as executor:
114
- future = executor.submit(asyncio.run, _generate())
115
- future.result(timeout=120)
116
- else:
117
- loop.run_until_complete(_generate())
118
- except RuntimeError:
119
- asyncio.run(_generate())
120
-
121
- file_size = os.path.getsize(output_path)
122
- logger.info("Edge-TTS audio saved: %s (%d bytes)", output_path, file_size)
123
-
124
- if file_size == 0:
125
- raise RuntimeError("Edge-TTS generated an empty audio file")
126
-
127
- return output_path
128
-
129
-
130
- # ── Unified Interface ────────────────────────────────────────────────────────
131
-
132
- def generate_audio(text: str, voice_id: str | None = None) -> tuple[str, str]:
133
- """
134
- Generate audio from text, trying Qwen TTS first, falling back to Edge-TTS.
135
-
136
- Args:
137
- text: The script text to convert to speech
138
- voice_id: Optional voice identifier
139
-
140
- Returns:
141
- Tuple of (audio_file_path, engine_used)
142
- """
143
- if not text or not text.strip():
144
- raise ValueError("No text provided for audio generation.")
145
-
146
- # Try Qwen TTS first (expressive, emotional)
147
- logger.info("Attempting Qwen3-TTS (primary)...")
148
- audio_path = generate_audio_qwen(text, voice_id)
149
-
150
- if audio_path and os.path.exists(audio_path):
151
- return audio_path, "Qwen3-TTS"
152
-
153
- # Fall back to Edge-TTS (reliable, CPU-only)
154
- logger.info("Falling back to Edge-TTS...")
155
- audio_path = generate_audio_edge(text, voice_id)
156
-
157
- return audio_path, "Edge-TTS"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
VoiceVerse AI β€” Voice Generation Module (TTS).

Converts generated scripts into audio.

Primary: Qwen3-TTS via HF Inference API
Fallback: Edge-TTS (CPU-only, no API key needed)

Delivery Mode additions:
- Podcast mode : splits script on HOST_1/HOST_2 tags, generates each
  segment with a distinct voice, then concatenates.
- Summary/Song : single voice, unchanged from original behaviour.

Public API (unchanged signature):
    generate_audio(text, voice_id=None) β†’ (path, engine_name)

New internal API:
    generate_audio_podcast(script) β†’ (path, engine_name)
"""

import os
import re        # NOTE(review): appears unused in this module β€” confirm before removing
import asyncio
import tempfile  # NOTE(review): appears unused in this module β€” confirm before removing
from utils import logger, get_temp_filepath

# ── Configuration ────────────────────────────────────────────────────────────

# Model id for the primary (Inference API) TTS path.
QWEN_TTS_MODEL = "Qwen/Qwen3-TTS"

# Edge-TTS voices
EDGE_VOICE_DEFAULT = "en-US-AriaNeural"  # Host 1 / single voice
EDGE_VOICE_HOST2 = "en-US-GuyNeural"  # Host 2 (podcast)

# Hard cap per TTS call; longer scripts are truncated before synthesis.
TTS_MAX_CHARS = 3000
36
+
37
+
38
+ # ══════════════════════════════════════════════════════════════════════════════
39
+ # Low-level TTS helpers
40
+ # ══════════════════════════════════════════════════════════════════════════════
41
+
42
def _qwen_tts(text: str) -> str | None:
    """
    Call Qwen3-TTS via the HF Inference API.

    Returns:
        Path to a WAV file on success, or None on any failure (missing
        token, API error, empty payload) so the caller can fall back.
    """
    token = os.environ.get("HF_TOKEN")
    if not token:
        logger.warning("HF_TOKEN not set β€” skipping Qwen TTS")
        return None

    try:
        from huggingface_hub import InferenceClient

        snippet = text[:TTS_MAX_CHARS]
        logger.info("Calling Qwen3-TTS (%d chars)…", len(snippet))
        audio_bytes = InferenceClient(token=token).text_to_speech(
            text=snippet, model=QWEN_TTS_MODEL
        )
        if not audio_bytes:
            logger.warning("Qwen TTS returned empty bytes")
            return None

        path = get_temp_filepath(suffix=".wav")
        with open(path, "wb") as f:
            f.write(audio_bytes)
        logger.info("Qwen TTS saved: %s (%d bytes)", path, len(audio_bytes))
        return path
    except Exception as exc:
        logger.warning("Qwen TTS failed (%s) β€” will use Edge-TTS fallback", exc)
        return None
68
+
69
+
70
def _edge_tts(text: str, voice: str = EDGE_VOICE_DEFAULT) -> str:
    """
    Generate audio with Edge-TTS (CPU, no key needed).

    Args:
        text  : Script text; truncated to TTS_MAX_CHARS.
        voice : Edge-TTS neural voice name.

    Returns:
        Path to an MP3 file.

    Raises:
        RuntimeError: If the synthesised file is empty.
    """
    import edge_tts

    snippet = text[:TTS_MAX_CHARS]
    path = get_temp_filepath(suffix=".mp3")
    logger.info("Edge-TTS: voice=%s, %d chars β†’ %s", voice, len(snippet), path)

    async def _run():
        comm = edge_tts.Communicate(snippet, voice)
        await comm.save(path)

    # Edge-TTS is async. asyncio.get_event_loop() is deprecated and raises
    # in recent Pythons when no loop exists, so detect a running loop with
    # get_running_loop() instead. Inside a running loop (e.g. Gradio) we
    # cannot call asyncio.run() directly β€” hand it to a worker thread.
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No event loop in this thread β€” safe to run directly.
        asyncio.run(_run())
    else:
        import concurrent.futures
        with concurrent.futures.ThreadPoolExecutor() as pool:
            pool.submit(asyncio.run, _run()).result(timeout=120)

    size = os.path.getsize(path)
    if size == 0:
        raise RuntimeError("Edge-TTS produced an empty audio file.")
    logger.info("Edge-TTS saved: %s (%d bytes)", path, size)
    return path
102
+
103
+
104
+ # ══════════════════════════════════════════════════════════════════════════════
+ # Audio concatenation (for podcast multi-segment audio)
+ # ══════════════════════════════════════════════════════════════════════════════
107
+
108
+ def _concat_audio_files(paths: list[str]) -> str:
109
+ """
110
+ Concatenate a list of audio files (WAV or MP3) into a single MP3.
111
+ Uses pydub; ffmpeg must be available (packages.txt: ffmpeg).
112
+ Falls back to copying the first file if pydub fails.
113
+ """
114
+ if len(paths) == 1:
115
+ return paths[0]
116
+
117
+ try:
118
+ from pydub import AudioSegment
119
+
120
+ combined = AudioSegment.empty()
121
+ silence = AudioSegment.silent(duration=300) # 300 ms between speakers
122
+
123
+ for p in paths:
124
+ seg = AudioSegment.from_file(p)
125
+ combined += seg + silence
126
+
127
+ out = get_temp_filepath(suffix=".mp3")
128
+ combined.export(out, format="mp3")
129
+ logger.info("Concatenated %d segments β†’ %s", len(paths), out)
130
+ return out
131
+
132
+ except Exception as exc:
133
+ logger.warning("pydub concat failed (%s) β€” returning first segment", exc)
134
+ return paths[0]
135
+
136
+
137
+ # ══════════════════════════════════════════════════════════════════════════════
138
+ # Podcast TTS (multi-voice)
139
+ # ══════════════════════════════════════════════════════════════════════════════
140
+
141
+ def _parse_podcast_script(script: str) -> list[tuple[str, str]]:
142
+ """
143
+ Parse a podcast script into a list of (speaker, text) tuples.
144
+ Expects lines like:
145
+ HOST_1: Some text here.
146
+ HOST_2: Reply text here.
147
+ Consecutive lines from the same speaker are merged.
148
+ """
149
+ segments: list[tuple[str, str]] = []
150
+
151
+ for line in script.splitlines():
152
+ line = line.strip()
153
+ if not line:
154
+ continue
155
+ if line.startswith("HOST_1:"):
156
+ text = line[len("HOST_1:"):].strip()
157
+ if text:
158
+ if segments and segments[-1][0] == "HOST_1":
159
+ segments[-1] = ("HOST_1", segments[-1][1] + " " + text)
160
+ else:
161
+ segments.append(("HOST_1", text))
162
+ elif line.startswith("HOST_2:"):
163
+ text = line[len("HOST_2:"):].strip()
164
+ if text:
165
+ if segments and segments[-1][0] == "HOST_2":
166
+ segments[-1] = ("HOST_2", segments[-1][1] + " " + text)
167
+ else:
168
+ segments.append(("HOST_2", text))
169
+ # Lines without a valid tag are silently skipped
170
+
171
+ return segments
172
+
173
+
174
def generate_audio_podcast(script: str) -> tuple[str, str]:
    """
    Generate multi-voice audio for Podcast mode.

    Strategy:
      1. Parse the script into (speaker, text) segments.
      2. Synthesise each segment with Edge-TTS using a per-host voice.
         (Qwen TTS doesn't expose per-call voice selection, so Edge-TTS
         is used directly to guarantee two distinct voices.)
      3. Concatenate the segments with a short silence between speakers.

    Returns:
        (audio_file_path, engine_label)

    Raises:
        RuntimeError: If every segment fails to synthesise.
    """
    segments = _parse_podcast_script(script)
    if not segments:
        # No parseable speaker tags β€” degrade to the single-voice pipeline.
        logger.warning("Podcast parser found no HOST_1/HOST_2 lines β€” using single voice")
        return generate_audio(script)

    logger.info("Podcast mode: %d speaker segments to synthesise", len(segments))

    voices = {
        "HOST_1": EDGE_VOICE_DEFAULT,
        "HOST_2": EDGE_VOICE_HOST2,
    }

    clips: list[str] = []
    for speaker, line in segments:
        try:
            clips.append(_edge_tts(line, voice=voices.get(speaker, EDGE_VOICE_DEFAULT)))
        except Exception as exc:
            # Best-effort: one bad segment shouldn't sink the whole episode.
            logger.warning("Segment TTS failed for %s: %s β€” skipping", speaker, exc)

    if not clips:
        raise RuntimeError("All podcast audio segments failed to generate.")

    return _concat_audio_files(clips), "Edge-TTS (Podcast)"
215
+
216
+
217
+ # ══════════════════════════════════════════════════════════════════════════════
218
+ # Unified public interface (unchanged signature from original tts.py)
219
+ # ══════════════════════════════════════════════════════════════════════════════
220
+
221
def generate_audio(text: str, voice_id: str | None = None) -> tuple[str, str]:
    """
    Generate single-voice audio (Summary / Song / Rap modes).

    Tries Qwen3-TTS first; falls back to Edge-TTS when the primary path
    is unavailable or fails.

    Args:
        text     : Script text to synthesise.
        voice_id : Optional Edge-TTS voice override (fallback path only).

    Returns:
        (audio_file_path, engine_label)

    Raises:
        ValueError: If text is empty or whitespace-only.
    """
    if not text or not text.strip():
        raise ValueError("No text provided for audio generation.")

    # Primary engine: Qwen via the Inference API.
    primary = _qwen_tts(text)
    if primary and os.path.exists(primary):
        return primary, "Qwen3-TTS"

    # Fallback engine: Edge-TTS, always available on CPU.
    fallback = _edge_tts(text, voice=voice_id or EDGE_VOICE_DEFAULT)
    return fallback, "Edge-TTS"