# voiceverse-ai / script_gen.py — last updated by Isshi14 (commit 7723b85, verified)
"""
VoiceVerse AI β€” Script Generation Module.
Generates spoken-style scripts from retrieved document chunks
using SmolLM3-3B via the Hugging Face Inference API.
Design decisions:
- Serverless HF Inference API avoids loading a large model locally
- SmolLM3-3B is deployed on the free hf-inference provider
- Prompt template enforces podcast/narration structure
- Max 1024 new tokens keeps scripts a reasonable length for TTS
- Temperature 0.4 keeps output grounded and factual
- Post-processing strips markdown/XML artifacts for clean TTS
"""
import os
import re
from huggingface_hub import InferenceClient
from utils import logger
# β€” Configuration β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
MAX_NEW_TOKENS = 1024
TEMPERATURE = 0.4
# β€” Prompt Template β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
SYSTEM_PROMPT = """You are a podcast host narrating content to listeners. Convert the provided document content into a smooth, flowing spoken narration.
CRITICAL RULES:
1. ONLY use facts, ideas, and information from the provided content. Do NOT add outside knowledge or invent details.
2. Write as one continuous flowing narration. Do NOT use section headings, labels, or titles like "Intro", "Conclusion", "Section 1", etc.
3. Use smooth spoken transitions between topics instead of headings. For example say "Now let's talk about..." or "Moving on to..." or "Here's where it gets interesting..."
4. Write in plain text only. No markdown, no bullet points, no asterisks, no hashtags, no HTML/XML tags.
5. Write naturally as if speaking aloud to a listener. Use short sentences and conversational language.
6. Never say "the document says" or "according to the text". Speak as the expert.
7. If the content is limited, keep the script short rather than inventing information.
8. Do NOT include any labels, headers, or structural markers. The output should read like someone is talking without breaks.
9. Output ONLY the spoken narration text, nothing else."""
USER_PROMPT_TEMPLATE = """Here is the document content to convert into a spoken podcast script:
--- CONTENT ---
{context}
--- END ---
Topic: {topic}
Now write ONLY the spoken script based strictly on the content above. Do not add information that is not in the content."""
# β€” Post-processing β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
def _clean_script_for_tts(text: str) -> str:
"""
Remove markdown, XML/HTML tags, and other artifacts that would be
read aloud by TTS engines.
"""
# Remove <think>...</think> blocks entirely (SmolLM3 reasoning traces)
text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
# Remove any remaining XML/HTML-style tags
text = re.sub(r'<[^>]+>', '', text)
# Remove markdown headers (# ## ### etc.)
text = re.sub(r'^#{1,6}\s+', '', text, flags=re.MULTILINE)
# Remove markdown bold/italic markers
text = re.sub(r'\*{1,3}([^*]+)\*{1,3}', r'\1', text)
text = re.sub(r'_{1,3}([^_]+)_{1,3}', r'\1', text)
# Remove markdown links [text](url) -> text
text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
# Remove markdown code blocks and inline code
text = re.sub(r'```[^`]*```', '', text, flags=re.DOTALL)
text = re.sub(r'`([^`]+)`', r'\1', text)
# Remove bullet point markers
text = re.sub(r'^[\s]*[-*+]\s+', '', text, flags=re.MULTILINE)
# Remove numbered list markers
text = re.sub(r'^[\s]*\d+\.\s+', '', text, flags=re.MULTILINE)
# Remove blockquote markers
text = re.sub(r'^>\s+', '', text, flags=re.MULTILINE)
# Remove horizontal rules
text = re.sub(r'^[-*_]{3,}\s*$', '', text, flags=re.MULTILINE)
# Collapse multiple newlines into one
text = re.sub(r'\n{3,}', '\n\n', text)
# Collapse multiple spaces
text = re.sub(r' {2,}', ' ', text)
return text.strip()
# β€” Script Generation β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
def _get_client() -> InferenceClient:
    """Build an HF Inference client authenticated via the HF_TOKEN env var.

    Raises:
        EnvironmentError: When HF_TOKEN is missing from the environment.
    """
    api_token = os.environ.get("HF_TOKEN")
    if api_token:
        return InferenceClient(
            provider="hf-inference",
            token=api_token,
        )
    raise EnvironmentError(
        "HF_TOKEN environment variable is not set. "
        "Please set your Hugging Face API token to use the script generation feature."
    )
def generate_script(
    context_chunks: list[str],
    topic: str = "the key ideas and insights from this document",
) -> str:
    """
    Generate a spoken-style podcast script from retrieved document chunks.

    Args:
        context_chunks: List of relevant text chunks from the RAG store
        topic: Optional focus topic for the script

    Returns:
        A spoken script string ready for TTS conversion

    Raises:
        ValueError: If no context chunks were provided.
        EnvironmentError: If HF_TOKEN is not set (raised by _get_client).
        RuntimeError: If the model output is empty, before or after cleaning.
    """
    if not context_chunks:
        raise ValueError("No document context provided. Please upload a document first.")
    # Combine chunks into a single context block
    context = "\n\n".join(context_chunks)
    # Truncate if too long so the prompt stays within the model's budget
    max_context_chars = 6000
    if len(context) > max_context_chars:
        context = context[:max_context_chars]
        logger.warning("Context truncated to %d characters", max_context_chars)
    # Build the prompt
    user_message = USER_PROMPT_TEMPLATE.format(context=context, topic=topic)
    logger.info("Generating script via %s (context: %d chars, topic: '%s')",
                MODEL_ID, len(context), topic[:50])
    client = _get_client()
    # Call the model using chat_completion
    response = client.chat_completion(
        model=MODEL_ID,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_message},
        ],
        max_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        top_p=0.9,
    )
    # message.content is Optional in the chat-completion response schema;
    # guard with `or ""` so a None content raises the clear RuntimeError
    # below instead of an AttributeError on .strip().
    raw_script = (response.choices[0].message.content or "").strip()
    if not raw_script:
        raise RuntimeError("The model returned an empty script. Please try again.")
    # Clean the script for TTS (remove markdown, XML tags, etc.)
    script = _clean_script_for_tts(raw_script)
    if not script:
        raise RuntimeError("Script was empty after cleaning. Please try again.")
    logger.info("Script generated: %d chars (raw: %d chars)", len(script), len(raw_script))
    return script