Spaces:

Isshi14
/

Voiceover-ai-2

Sleeping

File size: 13,424 Bytes

3828c7d

"""
VoiceVerse AI — Script Generation Module.

Delivery Modes:
  Summary  — single-speaker structured narration
  Podcast  — HOST_1 / HOST_2 two-host dialogue
  Song/Rap — rhythmic retention content
  Debate   — DEBATER_A (female, for) vs DEBATER_B (male, against) structured debate
  Story    — narrative retelling for slow, expressive audio
"""

import os
import re
from huggingface_hub import InferenceClient
from utils import logger

# Model identifier passed as `model` to chat_completion in _call_llm.
MODEL_ID       = "HuggingFaceTB/SmolLM3-3B"
# Generation cap sent as `max_tokens` with every request.
MAX_NEW_TOKENS = 1200
# Sampling temperature sent with every request.
TEMPERATURE    = 0.5


# ══════════════════════════════════════════════════════════════════════════════
# Prompts
# ══════════════════════════════════════════════════════════════════════════════

# ── Summary ───────────────────────────────────────────────────────────────────
# System message for Summary mode; generate_script also uses it as the
# fallback when the requested mode is not recognised. The backslash after the
# opening quotes keeps the string from starting with a newline.
_SUMMARY_SYSTEM = """\
You are a professional narrator. Produce a clear spoken summary strictly from the source material.
RULES:
1. Use ONLY facts from the source. Do NOT add outside knowledge.
2. Structure: short intro → key points as natural spoken sentences → concise conclusion.
3. Plain text only — no markdown, no bullets, no headers.
4. Write for the ear: short sentences, conversational tone.
5. Never say "the document says". Speak as the expert.
6. Output ONLY the narration text, nothing else."""

# User-message template for Summary mode. `{context}` is filled in by
# generate_script via str.format with the joined context chunks.
_SUMMARY_USER = """\
SOURCE MATERIAL:
{context}

Write a flowing spoken summary (intro, key points, conclusion) in plain sentences."""


# ── Podcast ───────────────────────────────────────────────────────────────────
# System message for Podcast mode. The prompt insists on a HOST_1:/HOST_2: tag
# on every line because _clean_dialogue later discards any non-blank line that
# does not start with one of those two tags.
_PODCAST_SYSTEM = """\
You are a podcast script writer. Write a two-host conversation strictly from the source material.

STRICT FORMAT — every single line must start with a speaker tag:
HOST_1: <what Host 1 says>
HOST_2: <what Host 2 says>

RULES:
1. Alternate HOST_1 and HOST_2. Never same host twice in a row.
2. HOST_1 introduces topics and asks questions.
3. HOST_2 explains concepts and answers.
4. Use ONLY information from the source. No hallucination.
5. Conversational, engaging tone.
6. No markdown, no stage directions, no lines without a HOST tag.
7. Aim for 16–24 exchanges."""

# User-message template for Podcast mode. `{context}` is filled in by
# generate_script via str.format.
_PODCAST_USER = """\
SOURCE MATERIAL:
{context}

Write the full podcast. Every line must start with HOST_1: or HOST_2:"""


# ── Song / Rap ────────────────────────────────────────────────────────────────
# System message used in Song/Rap mode when the lower-cased sub_mode equals
# "song" (see generate_script).
_SONG_SYSTEM = """\
You are a lyricist. Two steps:
STEP 1 — silently extract 5–7 key ideas from the source.
STEP 2 — write a smooth melodic SONG from those ideas.

RULES:
- Simple memorable language, rhyming couplets (AABB).
- Label sections [VERSE 1], [VERSE 2], [CHORUS].
- [CHORUS] repeats the main concept.
- Short lines (6–10 words). Use repetition.
- Do NOT invent facts not in the source.
- Output ONLY the lyrics with section labels."""

# System message used in Song/Rap mode for every sub_mode other than "song".
_RAP_SYSTEM = """\
You are a lyricist. Two steps:
STEP 1 — silently extract 5–7 key ideas from the source.
STEP 2 — write a punchy rhythmic RAP from those ideas.

RULES:
- Short punchy lines (5–8 words), fast-flow rhyme (AABB or ABAB).
- Label sections [VERSE 1], [VERSE 2], [HOOK].
- [HOOK] repeats the main concept.
- Wordplay and repetition to aid retention.
- Do NOT invent facts not in the source.
- Output ONLY the lyrics with section labels."""

# User-message template shared by Song and Rap. generate_script fills
# `{context}` with the source text and `{form}` with the lower-cased sub_mode.
_SONG_RAP_USER = """\
SOURCE MATERIAL:
{context}

Extract the key ideas, then write the full {form}."""


# ── Debate ────────────────────────────────────────────────────────────────────
# System message for Debate mode. A trailing backslash inside the literal
# joins the physical line with the next one, so the first sentence is a single
# line in the prompt. Every line must carry a DEBATER_A:/DEBATER_B: tag because
# _clean_dialogue drops non-blank lines that do not start with one.
_DEBATE_SYSTEM = """\
You are a debate script writer. Write a structured two-person debate strictly grounded \
in the provided source material.

STRICT FORMAT — every single line must start with a speaker tag:
DEBATER_A: <what Debater A says>
DEBATER_B: <what Debater B says>

CHARACTER PROFILES:
- DEBATER_A: Takes the PRO / supporting position. Tone is confident, optimistic, forward-thinking.
- DEBATER_B: Takes the CON / critical position. Tone is skeptical, cautious, questioning.

DEBATE STRUCTURE:
1. DEBATER_A opens with a strong statement supporting the topic.
2. DEBATER_B immediately challenges with a counterpoint.
3. They alternate, each directly responding to the other's previous point.
4. Both use evidence and logic from the source material only.
5. End with each debater giving a brief closing statement.

RULES:
- Alternate DEBATER_A and DEBATER_B. Never same debater twice in a row.
- Use ONLY information from the source material. No hallucination.
- Each turn should be 1–3 sentences — punchy, not long speeches.
- No markdown, no stage directions, no narration outside the speaker tags.
- Aim for 16–22 exchanges total."""

# User-message template for Debate mode. `{context}` is filled in by
# generate_script via str.format.
_DEBATE_USER = """\
SOURCE MATERIAL:
{context}

Write the full debate on the key topics from this material. \
Every line must start with DEBATER_A: or DEBATER_B:"""


# ── Story ─────────────────────────────────────────────────────────────────────
# System message for Story mode.
# NOTE(review): in rule 1 the line continuation is followed by an indented
# line, so the three leading spaces of that line end up inside the prompt
# ("narrative arc" + four spaces + "(beginning ..."). Harmless for the model,
# but probably unintended.
_STORY_SYSTEM = """\
You are a master storyteller. Retell the ideas from the source material as an \
immersive narrative story written for slow, expressive audio delivery.

RULES:
1. Transform factual content into a story — use characters, scenes, a narrative arc \
   (beginning, middle, end). Characters can be fictional stand-ins for real concepts.
2. Use ONLY information and ideas from the source. Do NOT invent new facts.
3. Warm, descriptive storytelling voice. Vivid but calm.
4. Short paragraphs, 1–3 sentences each, separated by blank lines.
5. Plain text only — no markdown, no bullets, no headers.
6. Begin with an evocative scene-setting sentence.
7. End with a closing reflection or lesson drawn from the source.
8. Output ONLY the story text, nothing else."""

# User-message template for Story mode. `{context}` is filled in by
# generate_script via str.format.
_STORY_USER = """\
SOURCE MATERIAL:
{context}

Transform this into a rich narrative story for slow, expressive audio. \
Use short paragraphs with blank lines between them."""


# ══════════════════════════════════════════════════════════════════════════════
# Post-processing
# ══════════════════════════════════════════════════════════════════════════════

def _clean(text: str) -> str:
    """Strip reasoning blocks, XML-style tags and markdown syntax from LLM output.

    Intraword underscores (speaker tags such as ``HOST_1:``, snake_case words)
    are preserved; only underscores that delimit a word or phrase are treated
    as emphasis markers.

    Args:
        text: Raw model output.

    Returns:
        The cleaned plain text with surrounding whitespace removed. May be
        empty if nothing but markup or reasoning was present.
    """
    # Reasoning blocks. A block that was cut off before its closing tag (for
    # example by the token limit) has no answer after it, so everything from
    # the opening tag onwards is dropped rather than leaked into the script.
    text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
    text = re.sub(r"<think>.*", "", text, flags=re.DOTALL)
    text = re.sub(r"<[^>]+>", "", text)

    # Fenced code blocks are removed together with their content.
    text = re.sub(r"```[^`]*```", "", text)

    # Line-level markdown comes first so that a leading "* " bullet is never
    # mistaken for an emphasis marker below. Only spaces/tabs are consumed
    # around the markers, so blank lines (paragraph breaks) are kept.
    text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE)
    text = re.sub(r"^[-*_]{3,}[ \t]*$", "", text, flags=re.MULTILINE)
    text = re.sub(r"^[ \t]*[-*+][ \t]+", "", text, flags=re.MULTILINE)
    text = re.sub(r"^[ \t]*\d+\.[ \t]+", "", text, flags=re.MULTILINE)
    text = re.sub(r"^>\s+", "", text, flags=re.MULTILINE)

    # Inline emphasis. The span must start with a non-space character and stay
    # on one line; underscore emphasis must additionally sit on word
    # boundaries so identifiers and speaker tags survive.
    text = re.sub(r"\*{1,3}(?=\S)([^*\n]+)\*{1,3}", r"\1", text)
    text = re.sub(r"(?<!\w)_{1,3}(?=\S)([^_\n]+)_{1,3}(?!\w)", r"\1", text)

    # Links keep their label; inline code keeps its content.
    text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)
    text = re.sub(r"`([^`]+)`", r"\1", text)

    # Whitespace normalisation.
    text = re.sub(r"\n{3,}", "\n\n", text)
    text = re.sub(r" {2,}", " ", text)
    return text.strip()


def _clean_dialogue(text: str, tag_a: str, tag_b: str) -> str:
    """Clean tagged dialogue output (podcast or debate).

    Runs the generic ``_clean`` pass, rewrites loosely formatted speaker tags
    at the start of a line to their canonical form, then drops every
    non-blank line that does not start with ``tag_a:`` or ``tag_b:``.

    Args:
        text: Raw model output.
        tag_a: Canonical tag of the first speaker, e.g. ``"HOST_1"``.
        tag_b: Canonical tag of the second speaker, e.g. ``"HOST_2"``.

    Returns:
        The remaining tagged lines (blank lines are kept) joined by newlines
        and stripped. May be empty when no line carried a valid tag.
    """
    text = _clean(text)

    # Accepted spellings per canonical tag. Case and the separator between
    # the parts are flexible ("Host 1", "host-1", "HOST1").
    spellings = {tag_a: [tag_a], tag_b: [tag_b]}
    if tag_a == "DEBATER_A":
        # Debate output is sometimes labelled by stance or as "Speaker A/B".
        spellings[tag_a] += ["pro", "speaker_a"]
        spellings[tag_b] += ["con", "speaker_b"]

    for canonical, variants in spellings.items():
        for variant in variants:
            parts = [re.escape(p) for p in re.split(r"[\s_-]+", variant) if p]
            # Anchored to the line start: the filter below only keeps lines
            # that begin with a tag, so a mid-sentence "pro:" or "con:" is
            # ordinary speech and must not be rewritten into a speaker tag.
            pattern = r"^[ \t]*" + r"[ \t_-]*".join(parts) + r"[ \t]*:"
            text = re.sub(
                pattern,
                lambda _match, tag=canonical: f"{tag}:",
                text,
                flags=re.IGNORECASE | re.MULTILINE,
            )

    # Keep blank lines and lines that start with a valid speaker tag.
    prefixes = (f"{tag_a}:", f"{tag_b}:")
    kept = [
        ln for ln in text.splitlines()
        if not ln.strip() or ln.strip().startswith(prefixes)
    ]
    return "\n".join(kept).strip()


# ══════════════════════════════════════════════════════════════════════════════
# LLM client
# ══════════════════════════════════════════════════════════════════════════════

def _get_client() -> InferenceClient:
    """Build an InferenceClient authenticated with the HF_TOKEN env variable.

    Raises:
        EnvironmentError: if HF_TOKEN is unset or empty.
    """
    hf_token = os.environ.get("HF_TOKEN")
    if hf_token:
        return InferenceClient(provider="hf-inference", token=hf_token)
    raise EnvironmentError(
        "HF_TOKEN not set. Add your Hugging Face token as a Space secret."
    )


def _call_llm(system: str, user: str) -> str:
    """Send one system + user exchange to the chat model and return its reply.

    Args:
        system: Content of the system message.
        user: Content of the user message.

    Returns:
        The text of the first choice, stripped of surrounding whitespace.

    Raises:
        EnvironmentError: propagated from ``_get_client`` when HF_TOKEN is missing.
        RuntimeError: if the response carries no choices or no text.
    """
    client = _get_client()
    response = client.chat_completion(
        model=MODEL_ID,
        messages=[
            {"role": "system", "content": system},
            {"role": "user",   "content": user},
        ],
        max_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        top_p=0.9,
    )
    # The choice list can be empty and `message.content` can be None. Both are
    # treated like an empty reply so callers always get the RuntimeError below
    # instead of an IndexError / AttributeError.
    choices = response.choices or []
    content = choices[0].message.content if choices else None
    raw = (content or "").strip()
    if not raw:
        raise RuntimeError("Model returned empty response. Please try again.")
    return raw


# ══════════════════════════════════════════════════════════════════════════════
# Public entry point
# ══════════════════════════════════════════════════════════════════════════════

def generate_script(
    context_chunks: list[str],
    mode: str = "Summary",
    sub_mode: str = "Rap",
    topic: str = "the key ideas from this document",
) -> str:
    """
    Generate a spoken script from RAG chunks.

    Args:
        context_chunks : source text chunks — joined with blank lines, NOT modified here
        mode           : "Summary" | "Podcast" | "Song / Rap" | "Debate" | "Story"
                         (case-insensitive; anything else falls back to Summary)
        sub_mode       : "Song" | "Rap"  (only for Song/Rap mode; anything else
                         is treated as Rap)
        topic          : currently unused; kept so existing callers keep working

    Returns:
        Clean string ready for tts.generate_audio() or tts.generate_audio_podcast()
        Podcast/Debate modes preserve HOST_1/HOST_2 or DEBATER_A/DEBATER_B tags.

    Raises:
        ValueError   : no chunks were given, or they contain only whitespace
        RuntimeError : the script is empty after cleaning
    """
    if not context_chunks:
        raise ValueError("No document context. Please upload or paste content first.")

    context = "\n\n".join(context_chunks)
    # Chunks made only of whitespace would otherwise send an empty prompt to
    # the model, which could then only answer without source material.
    if not context.strip():
        raise ValueError("No document context. Please upload or paste content first.")

    if len(context) > 6000:
        context = context[:6000]
        logger.warning("Context truncated to 6000 chars")

    logger.info("generate_script | mode=%s sub_mode=%s ctx=%d chars", mode, sub_mode, len(context))

    m = mode.strip().lower()

    if m == "podcast":
        raw = _call_llm(_PODCAST_SYSTEM, _PODCAST_USER.format(context=context))
        script = _clean_dialogue(raw, "HOST_1", "HOST_2")

    elif "song" in m or "rap" in m:
        # Normalise like `mode`, so " Song" still selects the song prompt and
        # the system prompt and the `{form}` placeholder always agree.
        form = sub_mode.strip().lower()
        if form not in ("song", "rap"):
            logger.warning("Unknown sub_mode '%s' — falling back to Rap", sub_mode)
            form = "rap"
        sys_prompt = _SONG_SYSTEM if form == "song" else _RAP_SYSTEM
        raw = _call_llm(sys_prompt, _SONG_RAP_USER.format(context=context, form=form))
        script = _clean(raw)

    elif "debate" in m:
        raw = _call_llm(_DEBATE_SYSTEM, _DEBATE_USER.format(context=context))
        script = _clean_dialogue(raw, "DEBATER_A", "DEBATER_B")

    elif "story" in m:
        raw = _call_llm(_STORY_SYSTEM, _STORY_USER.format(context=context))
        script = _clean(raw)

    else:
        # "summary" matches none of the tests above, so it lands here together
        # with unrecognised modes; only the latter are worth a warning.
        if m != "summary":
            logger.warning("Unknown mode '%s' — falling back to Summary", mode)
        raw = _call_llm(_SUMMARY_SYSTEM, _SUMMARY_USER.format(context=context))
        script = _clean(raw)

    if not script:
        raise RuntimeError("Script was empty after cleaning. Please try again.")

    logger.info("Script ready: %d chars", len(script))
    return script