""" VoiceVerse AI — Script Generation Module. Generates spoken-style scripts from retrieved document chunks using SmolLM3-3B via the Hugging Face Inference API. Design decisions: - Serverless HF Inference API avoids loading a large model locally - SmolLM3-3B is deployed on the free hf-inference provider - Prompt template enforces podcast/narration structure - Max 1024 new tokens keeps scripts a reasonable length for TTS - Temperature 0.4 keeps output grounded and factual - Post-processing strips markdown/XML artifacts for clean TTS """ import os import re from huggingface_hub import InferenceClient from utils import logger # — Configuration ———————————————————————————————————— MODEL_ID = "HuggingFaceTB/SmolLM3-3B" MAX_NEW_TOKENS = 1024 TEMPERATURE = 0.4 # — Prompt Template ——————————————————————————————————— SYSTEM_PROMPT = """You are a podcast host narrating content to listeners. Convert the provided document content into a smooth, flowing spoken narration. CRITICAL RULES: 1. ONLY use facts, ideas, and information from the provided content. Do NOT add outside knowledge or invent details. 2. Write as one continuous flowing narration. Do NOT use section headings, labels, or titles like "Intro", "Conclusion", "Section 1", etc. 3. Use smooth spoken transitions between topics instead of headings. For example say "Now let's talk about..." or "Moving on to..." or "Here's where it gets interesting..." 4. Write in plain text only. No markdown, no bullet points, no asterisks, no hashtags, no HTML/XML tags. 5. Write naturally as if speaking aloud to a listener. Use short sentences and conversational language. 6. Never say "the document says" or "according to the text". Speak as the expert. 7. If the content is limited, keep the script short rather than inventing information. 8. Do NOT include any labels, headers, or structural markers. The output should read like someone is talking without breaks. 9. Output ONLY the spoken narration text, nothing else.""" USER_PROMPT_TEMPLATE = """Here is the document content to convert into a spoken podcast script: --- CONTENT --- {context} --- END --- Topic: {topic} Now write ONLY the spoken script based strictly on the content above. Do not add information that is not in the content.""" # — Post-processing —————————————————————————————————— def _clean_script_for_tts(text: str) -> str: """ Remove markdown, XML/HTML tags, and other artifacts that would be read aloud by TTS engines. """ # Remove ... blocks entirely (SmolLM3 reasoning traces) text = re.sub(r'.*?', '', text, flags=re.DOTALL) # Remove any remaining XML/HTML-style tags text = re.sub(r'<[^>]+>', '', text) # Remove markdown headers (# ## ### etc.) text = re.sub(r'^#{1,6}\s+', '', text, flags=re.MULTILINE) # Remove markdown bold/italic markers text = re.sub(r'\*{1,3}([^*]+)\*{1,3}', r'\1', text) text = re.sub(r'_{1,3}([^_]+)_{1,3}', r'\1', text) # Remove markdown links [text](url) -> text text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text) # Remove markdown code blocks and inline code text = re.sub(r'```[^`]*```', '', text, flags=re.DOTALL) text = re.sub(r'`([^`]+)`', r'\1', text) # Remove bullet point markers text = re.sub(r'^[\s]*[-*+]\s+', '', text, flags=re.MULTILINE) # Remove numbered list markers text = re.sub(r'^[\s]*\d+\.\s+', '', text, flags=re.MULTILINE) # Remove blockquote markers text = re.sub(r'^>\s+', '', text, flags=re.MULTILINE) # Remove horizontal rules text = re.sub(r'^[-*_]{3,}\s*$', '', text, flags=re.MULTILINE) # Collapse multiple newlines into one text = re.sub(r'\n{3,}', '\n\n', text) # Collapse multiple spaces text = re.sub(r' {2,}', ' ', text) return text.strip() # — Script Generation ———————————————————————————————— def _get_client() -> InferenceClient: """Create an HF Inference client with the user's token.""" token = os.environ.get("HF_TOKEN") if not token: raise EnvironmentError( "HF_TOKEN environment variable is not set. " "Please set your Hugging Face API token to use the script generation feature." ) return InferenceClient( provider="hf-inference", token=token, ) def generate_script( context_chunks: list[str], topic: str = "the key ideas and insights from this document", ) -> str: """ Generate a spoken-style podcast script from retrieved document chunks. Args: context_chunks: List of relevant text chunks from the RAG store topic: Optional focus topic for the script Returns: A spoken script string ready for TTS conversion """ if not context_chunks: raise ValueError("No document context provided. Please upload a document first.") # Combine chunks into a single context block context = "\n\n".join(context_chunks) # Truncate if too long max_context_chars = 6000 if len(context) > max_context_chars: context = context[:max_context_chars] logger.warning("Context truncated to %d characters", max_context_chars) # Build the prompt user_message = USER_PROMPT_TEMPLATE.format(context=context, topic=topic) logger.info("Generating script via %s (context: %d chars, topic: '%s')", MODEL_ID, len(context), topic[:50]) client = _get_client() # Call the model using chat_completion response = client.chat_completion( model=MODEL_ID, messages=[ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": user_message}, ], max_tokens=MAX_NEW_TOKENS, temperature=TEMPERATURE, top_p=0.9, ) raw_script = response.choices[0].message.content.strip() if not raw_script: raise RuntimeError("The model returned an empty script. Please try again.") # Clean the script for TTS (remove markdown, XML tags, etc.) script = _clean_script_for_tts(raw_script) if not script: raise RuntimeError("Script was empty after cleaning. Please try again.") logger.info("Script generated: %d chars (raw: %d chars)", len(script), len(raw_script)) return script