voiceverse-ai-test

Sleeping

File size: 14,553 Bytes

fd75949
 
 
fbe59b1
c6c42d9
 
 
 
fd75949
 
 
94a026e
fd75949
 
 
8c9f65d
8b3748a
8c9f65d
fd75949
 
c6c42d9
 
 
 
 
fbe59b1
8c9f65d
fbe59b1
8c9f65d
8b3748a
 
 
 
 
 
fbe59b1
 
 
fd75949
 
8b3748a
fbe59b1
 
c6c42d9
fbe59b1
8c9f65d
fbe59b1
8c9f65d
fbe59b1
 
 
 
c6c42d9
8c9f65d
 
 
 
 
 
fbe59b1
 
 
 
 
8c9f65d
fbe59b1
fd75949
c6c42d9
fbe59b1
8c9f65d
 
 
94a026e
c6c42d9
8c9f65d
 
 
 
fbe59b1
8c9f65d
fbe59b1
 
8c9f65d
 
 
fbe59b1
c6c42d9
8c9f65d
 
 
 
fbe59b1
8c9f65d
fbe59b1
 
 
 
 
8c9f65d
94a026e
fbe59b1
c6c42d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ff6ba78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c6c42d9
 
 
fbe59b1
8c9f65d
c6c42d9
fbe59b1
 
 
8b3748a
 
 
 
 
 
 
 
 
 
 
 
 
 
fbe59b1
 
 
 
 
 
 
 
 
 
 
 
94a026e
 
c6c42d9
 
 
 
 
8c9f65d
c6c42d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fbe59b1
c6c42d9
fbe59b1
 
c6c42d9
 
 
fd75949
 
 
 
 
8c9f65d
fd75949
fbe59b1
 
 
8c9f65d
fbe59b1
 
 
c6c42d9
 
 
 
fbe59b1
 
 
71b59d0
fbe59b1
 
8c9f65d
fbe59b1
fd75949
 
c6c42d9
 
 
fbe59b1
fd75949
 
fbe59b1
 
8c9f65d
fd75949
c6c42d9
 
 
 
 
 
 
 
 
 
 
 
fd75949
8c9f65d
fd75949
 
8c9f65d
 
 
fd75949
c6c42d9
fd75949
8c9f65d
fd75949
8c9f65d
 
 
 
 
 
c6c42d9
fbe59b1
8c9f65d
 
 
 
 
fbe59b1
c6c42d9
 
 
 
ff6ba78
 
 
 
fbe59b1
8c9f65d
 
 
94a026e
 
 
 
8c9f65d
fbe59b1

"""
VoiceVerse AI — Script Generation Module.

Delivery Modes:
  Summary  — single-speaker structured narration
  Podcast  — HOST_1 / HOST_2 two-host dialogue
  Song/Rap — rhythmic retention content
  Debate   — DEBATER_A (female, for) vs DEBATER_B (male, against) structured debate
  Story    — narrative retelling written for slow, expressive audio delivery
"""

import os
import re
from huggingface_hub import InferenceClient
from utils import logger

# Model identifier passed as `model=` to chat_completion() in _call_llm.
MODEL_ID       = "HuggingFaceTB/SmolLM3-3B"
# Passed as `max_tokens=` on every chat_completion() request.
MAX_NEW_TOKENS = 2048
# Sampling temperature passed to chat_completion() (top_p is fixed in _call_llm).
TEMPERATURE    = 0.5


# ══════════════════════════════════════════════════════════════════════════════
# Prompts
# ══════════════════════════════════════════════════════════════════════════════

# ── Summary ───────────────────────────────────────────────────────────────────
# System prompt for Summary mode; generate_script also uses it as the fallback
# for unrecognised modes.
_SUMMARY_SYSTEM = """\
You are a professional narrator. Produce a clear spoken summary strictly from the source material.
RULES:
1. Use ONLY facts from the source. Do NOT add outside knowledge.
2. Write as one continuous flowing narration. Do NOT use any section headings, labels, or structural markers like "Introduction", "Intro", "Key Points", "Conclusion", "Summary", "Section 1", etc.
3. Use smooth spoken transitions instead of headings. For example say "Let's start with..." or "Now moving on to..." or "To wrap things up..." instead of labeling sections.
4. Plain text only — no markdown, no bullets, no headers, no labels of any kind.
5. Write for the ear: short sentences, conversational tone.
6. Never say "the document says". Speak as the expert.
7. Output ONLY the spoken narration text, nothing else. It should read like someone is naturally talking."""

# User-message template for Summary mode; `{context}` is filled via str.format
# in generate_script.
_SUMMARY_USER = """\
SOURCE MATERIAL:
{context}

Write a flowing spoken summary in plain sentences. Do NOT include any headings or labels like Intro, Conclusion, etc. Just speak naturally as if talking to a listener."""


# ── Podcast ───────────────────────────────────────────────────────────────────
# System prompt for Podcast mode. The HOST_1 / HOST_2 tags it mandates are the
# same tags generate_script passes to _clean_dialogue for line filtering.
_PODCAST_SYSTEM = """\
You are a podcast script writer. Write a two-host conversation strictly from the source material.

STRICT FORMAT — every single line must start with a speaker tag:
HOST_1: <what Host 1 says>
HOST_2: <what Host 2 says>

RULES:
1. Alternate HOST_1 and HOST_2. Never same host twice in a row.
2. HOST_1 introduces topics and asks questions.
3. HOST_2 explains concepts and answers.
4. Use ONLY information from the source. No hallucination.
5. Conversational, engaging tone.
6. No markdown, no stage directions, no lines without a HOST tag.
7. Aim for 16–24 exchanges."""

# User-message template for Podcast mode; `{context}` is filled via str.format
# in generate_script.
_PODCAST_USER = """\
SOURCE MATERIAL:
{context}

Write the full podcast. Every line must start with HOST_1: or HOST_2:"""


# ── Song / Rap ────────────────────────────────────────────────────────────────
# System prompt chosen by generate_script when sub_mode lowercases to "song".
_SONG_SYSTEM = """\
You are a lyricist. Two steps:
STEP 1 — silently extract 5–7 key ideas from the source.
STEP 2 — write a smooth melodic SONG from those ideas.

RULES:
- Simple memorable language, rhyming couplets (AABB).
- Label sections [VERSE 1], [VERSE 2], [CHORUS].
- [CHORUS] repeats the main concept.
- Short lines (6–10 words). Use repetition.
- Do NOT invent facts not in the source.
- Output ONLY the lyrics with section labels."""

# System prompt chosen by generate_script for every sub_mode other than "song".
_RAP_SYSTEM = """\
You are a lyricist. Two steps:
STEP 1 — silently extract 5–7 key ideas from the source.
STEP 2 — write a punchy rhythmic RAP from those ideas.

RULES:
- Short punchy lines (5–8 words), fast-flow rhyme (AABB or ABAB).
- Label sections [VERSE 1], [VERSE 2], [HOOK].
- [HOOK] repeats the main concept.
- Wordplay and repetition to aid retention.
- Do NOT invent facts not in the source.
- Output ONLY the lyrics with section labels."""

# User-message template shared by both lyric forms; `{context}` and `{form}`
# are filled via str.format in generate_script (form is the lowercased sub_mode).
_SONG_RAP_USER = """\
SOURCE MATERIAL:
{context}

Extract the key ideas, then write the full {form}."""


# ── Debate ────────────────────────────────────────────────────────────────────
# System prompt for Debate mode. The DEBATER_A / DEBATER_B tags it mandates are
# the same tags generate_script passes to _clean_dialogue for line filtering.
_DEBATE_SYSTEM = """\
You are a debate script writer. Write a structured two-person debate strictly grounded \
in the provided source material.

STRICT FORMAT — every single line must start with a speaker tag:
DEBATER_A: <what Debater A says>
DEBATER_B: <what Debater B says>

CHARACTER PROFILES:
- DEBATER_A: Takes the PRO / supporting position. Tone is confident, optimistic, forward-thinking.
- DEBATER_B: Takes the CON / critical position. Tone is skeptical, cautious, questioning.

DEBATE STRUCTURE:
1. DEBATER_A opens with a strong statement supporting the topic.
2. DEBATER_B immediately challenges with a counterpoint.
3. They alternate, each directly responding to the other's previous point.
4. Both use evidence and logic from the source material only.
5. End with each debater giving a brief closing statement.

RULES:
- Alternate DEBATER_A and DEBATER_B. Never same debater twice in a row.
- Use ONLY information from the source material. No hallucination.
- Each turn should be 1–3 sentences — punchy, not long speeches.
- No markdown, no stage directions, no narration outside the speaker tags.
- Aim for 16–22 exchanges total."""

# User-message template for Debate mode; `{context}` is filled via str.format
# in generate_script.
_DEBATE_USER = """\
SOURCE MATERIAL:
{context}

Write the full debate on the key topics from this material. \
Every line must start with DEBATER_A: or DEBATER_B:"""


# ── Story ─────────────────────────────────────────────────────────────────────
# System prompt for Story mode (selected in generate_script when the mode
# string contains "story").
_STORY_SYSTEM = """\
You are a master storyteller. Retell the ideas from the source material as an \
immersive narrative story written for slow, expressive audio delivery.

RULES:
1. Transform factual content into a story — use characters, scenes, a narrative arc \
   (beginning, middle, end). Characters can be fictional stand-ins for real concepts.
2. Use ONLY information and ideas from the source. Do NOT invent new facts.
3. Warm, descriptive storytelling voice. Vivid but calm.
4. Short paragraphs, 1–3 sentences each, separated by blank lines.
5. Plain text only — no markdown, no bullets, no headers.
6. Begin with an evocative scene-setting sentence.
7. End with a closing reflection or lesson drawn from the source.
8. Output ONLY the story text, nothing else."""

# User-message template for Story mode; `{context}` is filled via str.format
# in generate_script.
_STORY_USER = """\
SOURCE MATERIAL:
{context}

Transform this into a rich narrative story for slow, expressive audio. \
Use short paragraphs with blank lines between them."""


# ══════════════════════════════════════════════════════════════════════════════
# Post-processing
# ══════════════════════════════════════════════════════════════════════════════

def _clean(text: str) -> str:
    """Remove all markdown and XML artifacts from LLM output."""
    text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
    text = re.sub(r"<[^>]+>", "", text)
    text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE)

    # Remove heading-like labels that TTS would read aloud
    # e.g. "Introduction:", "Intro:", "Conclusion:", "Key Points:", "Summary:" etc.
    text = re.sub(
        r"^(?:Introduction|Intro|Conclusion|Summary|Key\s*Points?|Overview|"
        r"Closing|Opening|Final\s*Thoughts?|In\s*Summary|To\s*Conclude)\s*[:\-—]?\s*$",
        "", text, flags=re.MULTILINE | re.IGNORECASE
    )
    # Also remove inline heading labels at the start of a line followed by content
    text = re.sub(
        r"^(?:Introduction|Intro|Conclusion|Summary|Key\s*Points?|Overview|"
        r"Closing|Opening|Final\s*Thoughts?)\s*[:\-—]\s+",
        "", text, flags=re.MULTILINE | re.IGNORECASE
    )
    text = re.sub(r"\*{1,3}([^*]+)\*{1,3}", r"\1", text)
    text = re.sub(r"_{1,3}([^_]+)_{1,3}", r"\1", text)
    text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)
    text = re.sub(r"```[^`]*```", "", text, flags=re.DOTALL)
    text = re.sub(r"`([^`]+)`", r"\1", text)
    text = re.sub(r"^[\s]*[-*+]\s+", "", text, flags=re.MULTILINE)
    text = re.sub(r"^[\s]*\d+\.\s+", "", text, flags=re.MULTILINE)
    text = re.sub(r"^>\s+", "", text, flags=re.MULTILINE)
    text = re.sub(r"^[-*_]{3,}\s*$", "", text, flags=re.MULTILINE)
    text = re.sub(r"\n{3,}", "\n\n", text)
    text = re.sub(r" {2,}", " ", text)
    return text.strip()


def _clean_dialogue(text: str, tag_a: str, tag_b: str) -> str:
    """Clean speaker-tagged output (podcast or debate).

    Runs the generic `_clean` pass, rewrites common tag spellings to the
    canonical tags, and then drops every non-blank line that does not start
    with one of the two tags.

    Args:
        text: Raw model output.
        tag_a: Canonical tag of the first speaker ("HOST_1" or "DEBATER_A").
            Alias normalisation is only defined for these two values; other
            tags are filtered on as-is.
        tag_b: Canonical tag of the second speaker.

    Returns:
        The remaining tagged lines (blank lines preserved) joined with
        newlines and stripped. May be empty if no line carried a valid tag.
    """
    text = _clean(text)

    # Spellings the model might produce, mapped to the canonical tag.
    if tag_a == "HOST_1":
        aliases = [
            (r"host[\s_-]*1", "HOST_1"),
            (r"host[\s_-]*2", "HOST_2"),
        ]
    elif tag_a == "DEBATER_A":
        aliases = [
            (r"debater[\s_-]*a", "DEBATER_A"),
            (r"debater[\s_-]*b", "DEBATER_B"),
            # Also catch "Pro:" / "Con:" / "Speaker A:" variants
            (r"pro", "DEBATER_A"),
            (r"con", "DEBATER_B"),
            (r"speaker[\s_-]*a", "DEBATER_A"),
            (r"speaker[\s_-]*b", "DEBATER_B"),
        ]
    else:
        aliases = []

    # Only rewrite an alias when it opens a line. Matching anywhere would
    # corrupt ordinary sentences ("Here's the pro: it is fast") by injecting
    # a speaker tag mid-turn. Leading whitespace is captured and kept.
    for alias, canonical in aliases:
        text = re.sub(
            rf"^(\s*){alias}\s*:",
            rf"\g<1>{canonical}:",
            text,
            flags=re.MULTILINE | re.IGNORECASE,
        )

    # Keep only blank lines and lines that start with a valid speaker tag
    valid_prefixes = (f"{tag_a}:", f"{tag_b}:")
    kept = [
        ln for ln in text.splitlines()
        if not ln.strip() or ln.strip().startswith(valid_prefixes)
    ]
    return "\n".join(kept).strip()


# ══════════════════════════════════════════════════════════════════════════════
# LLM client
# ══════════════════════════════════════════════════════════════════════════════

def _get_client() -> InferenceClient:
    """Build an InferenceClient authenticated with the HF_TOKEN env variable.

    Returns:
        An InferenceClient configured for the "hf-inference" provider.

    Raises:
        EnvironmentError: If HF_TOKEN is missing or empty.
    """
    hf_token = os.environ.get("HF_TOKEN")
    if hf_token:
        return InferenceClient(provider="hf-inference", token=hf_token)
    raise EnvironmentError(
        "HF_TOKEN not set. Add your Hugging Face token as a Space secret."
    )


def _call_llm(system: str, user: str) -> str:
    """Send one system + user exchange to the chat model and return its reply.

    Args:
        system: System prompt text.
        user: User message text (already formatted with the source context).

    Returns:
        The first choice's message content, stripped of surrounding whitespace.

    Raises:
        EnvironmentError: Propagated from `_get_client` when HF_TOKEN is unset.
        RuntimeError: If the response has no choices, or its content is
            missing or blank.
    """
    client = _get_client()
    response = client.chat_completion(
        model=MODEL_ID,
        messages=[
            {"role": "system", "content": system},
            {"role": "user",   "content": user},
        ],
        max_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        top_p=0.9,
    )

    # Guard against an empty choices list and a None content field, so that
    # both surface as the user-facing RuntimeError below instead of an
    # IndexError / AttributeError.
    # NOTE(review): content is presumably Optional[str] in huggingface_hub's
    # output type — confirm against the installed version.
    choices = response.choices
    content = choices[0].message.content if choices else None
    raw = (content or "").strip()
    if not raw:
        raise RuntimeError("Model returned empty response. Please try again.")
    return raw


# ══════════════════════════════════════════════════════════════════════════════
# Public entry point
# ══════════════════════════════════════════════════════════════════════════════

def generate_script(
    context_chunks: list[str],
    mode: str = "Summary",
    sub_mode: str = "Rap",
    topic: str = "the key ideas from this document",
) -> str:
    """
    Generate a spoken script from RAG chunks.

    Args:
        context_chunks : chunks from RAGStore — NOT modified here. Empty or
                         whitespace-only entries are ignored.
        mode           : "Summary" | "Podcast" | "Song / Rap" | "Debate" | "Story"
                         (case-insensitive; unknown values fall back to Summary)
        sub_mode       : "Song" | "Rap"  (only for Song/Rap mode; anything
                         other than "song" is treated as "rap")
        topic          : currently unused; kept for interface compatibility

    Returns:
        Clean string ready for tts.generate_audio() or tts.generate_audio_podcast()
        Podcast/Debate modes preserve HOST_1/HOST_2 or DEBATER_A/DEBATER_B tags.

    Raises:
        ValueError   : no usable (non-blank) context was supplied.
        RuntimeError : the script is empty after cleaning (also propagated
                       from _call_llm on an empty model response).
    """
    max_context_chars = 6000

    # A list of blank strings is as useless as an empty list: reject it here
    # rather than asking the model to summarise nothing.
    usable_chunks = [
        chunk for chunk in (context_chunks or [])
        if isinstance(chunk, str) and chunk.strip()
    ]
    if not usable_chunks:
        raise ValueError("No document context. Please upload or paste content first.")

    context = "\n\n".join(usable_chunks)
    if len(context) > max_context_chars:
        context = context[:max_context_chars]
        logger.warning("Context truncated to %d chars", max_context_chars)

    logger.info("generate_script | mode=%s sub_mode=%s ctx=%d chars", mode, sub_mode, len(context))

    m = mode.strip().lower()

    if m == "podcast":
        raw = _call_llm(_PODCAST_SYSTEM, _PODCAST_USER.format(context=context))
        script = _clean_dialogue(raw, "HOST_1", "HOST_2")

    elif "song" in m or "rap" in m:
        # Normalise to exactly "song" or "rap" so the system prompt and the
        # {form} placeholder in the user prompt always agree.
        form = "song" if (sub_mode or "").strip().lower() == "song" else "rap"
        sys_prompt = _SONG_SYSTEM if form == "song" else _RAP_SYSTEM
        raw = _call_llm(sys_prompt, _SONG_RAP_USER.format(context=context, form=form))
        script = _clean(raw)

    elif "debate" in m:
        raw = _call_llm(_DEBATE_SYSTEM, _DEBATE_USER.format(context=context))
        script = _clean_dialogue(raw, "DEBATER_A", "DEBATER_B")

    elif "story" in m:
        raw = _call_llm(_STORY_SYSTEM, _STORY_USER.format(context=context))
        script = _clean(raw)

    else:
        # Summary is both an explicit mode and the fallback for unknown ones.
        if m != "summary":
            logger.warning("Unknown mode '%s' — falling back to Summary", mode)
        raw = _call_llm(_SUMMARY_SYSTEM, _SUMMARY_USER.format(context=context))
        script = _clean(raw)

    if not script:
        raise RuntimeError("Script was empty after cleaning. Please try again.")

    logger.info("Script ready: %d chars", len(script))
    return script