Spaces:
Sleeping
Sleeping
"""
VoiceVerse AI — Script Generation Module.

Generates spoken-style scripts from retrieved document chunks
using SmolLM3-3B via the Hugging Face Inference API.

Design decisions:
- Serverless HF Inference API avoids loading a large model locally
- SmolLM3-3B is deployed on the free hf-inference provider
- Prompt template enforces podcast/narration structure
- Max 1024 new tokens keeps scripts a reasonable length for TTS
- Temperature 0.4 keeps output grounded and factual
- Post-processing strips markdown/XML artifacts for clean TTS
"""
| import os | |
| import re | |
| from huggingface_hub import InferenceClient | |
| from utils import logger | |
# — Configuration ————————————————————————————————————
# Model served on the free hf-inference provider (see _get_client()).
MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
# Upper bound on generated tokens — keeps scripts a reasonable length for TTS.
MAX_NEW_TOKENS = 1024
# Low temperature keeps the narration grounded in the provided content.
TEMPERATURE = 0.4
# — Prompt Template ———————————————————————————————————
# System prompt: constrains the model to grounded, plain-text spoken narration
# (no headings, markdown, or meta-references) so the output can be fed
# directly to a TTS engine without cleanup surprises.
SYSTEM_PROMPT = """You are a podcast host narrating content to listeners. Convert the provided document content into a smooth, flowing spoken narration.
CRITICAL RULES:
1. ONLY use facts, ideas, and information from the provided content. Do NOT add outside knowledge or invent details.
2. Write as one continuous flowing narration. Do NOT use section headings, labels, or titles like "Intro", "Conclusion", "Section 1", etc.
3. Use smooth spoken transitions between topics instead of headings. For example say "Now let's talk about..." or "Moving on to..." or "Here's where it gets interesting..."
4. Write in plain text only. No markdown, no bullet points, no asterisks, no hashtags, no HTML/XML tags.
5. Write naturally as if speaking aloud to a listener. Use short sentences and conversational language.
6. Never say "the document says" or "according to the text". Speak as the expert.
7. If the content is limited, keep the script short rather than inventing information.
8. Do NOT include any labels, headers, or structural markers. The output should read like someone is talking without breaks.
9. Output ONLY the spoken narration text, nothing else."""

# User prompt: carries the retrieved {context} and an optional {topic} focus;
# filled in by generate_script() via str.format().
USER_PROMPT_TEMPLATE = """Here is the document content to convert into a spoken podcast script:
--- CONTENT ---
{context}
--- END ---
Topic: {topic}
Now write ONLY the spoken script based strictly on the content above. Do not add information that is not in the content."""
| # β Post-processing ββββββββββββββββββββββββββββββββββ | |
def _clean_script_for_tts(text: str) -> str:
    """
    Remove markdown, XML/HTML tags, and other artifacts that would be
    read aloud by TTS engines.

    Args:
        text: Raw model output, possibly containing reasoning traces,
            markdown formatting, links, lists, or stray tags.

    Returns:
        Plain prose suitable for speech synthesis, with whitespace
        normalized and leading/trailing whitespace stripped.
    """
    # Remove <think>...</think> blocks entirely (SmolLM3 reasoning traces).
    # BUGFIX: also handle an UNCLOSED <think> block — if generation was cut
    # off at MAX_NEW_TOKENS before </think> was emitted, everything from
    # <think> onward is reasoning, not narration, so drop it all rather than
    # letting the generic tag-stripper below keep the reasoning text.
    text = re.sub(r'<think>.*?(?:</think>|$)', '', text, flags=re.DOTALL)
    # Remove any remaining XML/HTML-style tags
    text = re.sub(r'<[^>]+>', '', text)
    # Remove markdown headers (# ## ### etc.)
    text = re.sub(r'^#{1,6}\s+', '', text, flags=re.MULTILINE)
    # Remove markdown bold/italic markers, keeping the inner text
    text = re.sub(r'\*{1,3}([^*]+)\*{1,3}', r'\1', text)
    text = re.sub(r'_{1,3}([^_]+)_{1,3}', r'\1', text)
    # Remove markdown links [text](url) -> text
    text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
    # Remove markdown code blocks and inline code
    text = re.sub(r'```[^`]*```', '', text, flags=re.DOTALL)
    text = re.sub(r'`([^`]+)`', r'\1', text)
    # Remove bullet point markers
    text = re.sub(r'^[\s]*[-*+]\s+', '', text, flags=re.MULTILINE)
    # Remove numbered list markers
    text = re.sub(r'^[\s]*\d+\.\s+', '', text, flags=re.MULTILINE)
    # Remove blockquote markers
    text = re.sub(r'^>\s+', '', text, flags=re.MULTILINE)
    # Remove horizontal rules
    text = re.sub(r'^[-*_]{3,}\s*$', '', text, flags=re.MULTILINE)
    # Collapse multiple newlines into one
    text = re.sub(r'\n{3,}', '\n\n', text)
    # Collapse multiple spaces
    text = re.sub(r' {2,}', ' ', text)
    return text.strip()
| # β Script Generation ββββββββββββββββββββββββββββββββ | |
def _get_client() -> InferenceClient:
    """Build an InferenceClient authenticated via the HF_TOKEN env var.

    Raises:
        EnvironmentError: if HF_TOKEN is not set in the environment.
    """
    api_token = os.environ.get("HF_TOKEN")
    if api_token:
        # Serverless provider — no local model weights are loaded.
        return InferenceClient(provider="hf-inference", token=api_token)
    raise EnvironmentError(
        "HF_TOKEN environment variable is not set. "
        "Please set your Hugging Face API token to use the script generation feature."
    )
def generate_script(
    context_chunks: list[str],
    topic: str = "the key ideas and insights from this document",
) -> str:
    """
    Generate a spoken-style podcast script from retrieved document chunks.

    Args:
        context_chunks: List of relevant text chunks from the RAG store
        topic: Optional focus topic for the script

    Returns:
        A spoken script string ready for TTS conversion

    Raises:
        ValueError: if no context chunks are provided.
        RuntimeError: if the model output is empty before or after cleaning.
        EnvironmentError: if HF_TOKEN is not configured (via _get_client).
    """
    if not context_chunks:
        raise ValueError("No document context provided. Please upload a document first.")

    max_context_chars = 6000
    # Merge retrieved chunks into a single context block for the prompt.
    context = "\n\n".join(context_chunks)
    if len(context) > max_context_chars:
        # Keep the prompt within a safe size for the model's context window.
        context = context[:max_context_chars]
        logger.warning("Context truncated to %d characters", max_context_chars)

    prompt = USER_PROMPT_TEMPLATE.format(context=context, topic=topic)
    logger.info("Generating script via %s (context: %d chars, topic: '%s')",
                MODEL_ID, len(context), topic[:50])

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
    ]
    # Serverless chat-completion call against the hf-inference provider.
    completion = _get_client().chat_completion(
        model=MODEL_ID,
        messages=messages,
        max_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        top_p=0.9,
    )

    raw_script = completion.choices[0].message.content.strip()
    if not raw_script:
        raise RuntimeError("The model returned an empty script. Please try again.")

    # Strip markdown/XML artifacts so the TTS engine reads clean prose.
    script = _clean_script_for_tts(raw_script)
    if not script:
        raise RuntimeError("Script was empty after cleaning. Please try again.")

    logger.info("Script generated: %d chars (raw: %d chars)", len(script), len(raw_script))
    return script