"""
VoiceVerse AI — Script Generation Module.
Generates spoken-style scripts from retrieved document chunks
using SmolLM3-3B via the Hugging Face Inference API.
Design decisions:
- Serverless HF Inference API avoids loading a large model locally
- SmolLM3-3B is deployed on the free hf-inference provider
- Prompt template enforces podcast/narration structure
- Max 1024 new tokens keeps scripts a reasonable length for TTS
- Temperature 0.4 keeps output grounded and factual
- Post-processing strips markdown/XML artifacts for clean TTS
"""
import os
import re
from huggingface_hub import InferenceClient
from utils import logger
# ─ Configuration ─────────────────────────────────────
# Model ID served by the Hugging Face serverless Inference API.
MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
# Generation cap — keeps scripts a reasonable length for downstream TTS.
MAX_NEW_TOKENS = 1024
# Low temperature keeps output grounded in the provided document content.
TEMPERATURE = 0.4
# ─ Prompt Template ───────────────────────────────────
# System prompt: enforces one continuous spoken narration, plain text only,
# and strict grounding in the supplied content (no outside knowledge).
SYSTEM_PROMPT = """You are a podcast host narrating content to listeners. Convert the provided document content into a smooth, flowing spoken narration.
CRITICAL RULES:
1. ONLY use facts, ideas, and information from the provided content. Do NOT add outside knowledge or invent details.
2. Write as one continuous flowing narration. Do NOT use section headings, labels, or titles like "Intro", "Conclusion", "Section 1", etc.
3. Use smooth spoken transitions between topics instead of headings. For example say "Now let's talk about..." or "Moving on to..." or "Here's where it gets interesting..."
4. Write in plain text only. No markdown, no bullet points, no asterisks, no hashtags, no HTML/XML tags.
5. Write naturally as if speaking aloud to a listener. Use short sentences and conversational language.
6. Never say "the document says" or "according to the text". Speak as the expert.
7. If the content is limited, keep the script short rather than inventing information.
8. Do NOT include any labels, headers, or structural markers. The output should read like someone is talking without breaks.
9. Output ONLY the spoken narration text, nothing else."""
# User prompt: wraps the retrieved document context and the optional topic
# focus; {context} and {topic} are filled in by generate_script().
USER_PROMPT_TEMPLATE = """Here is the document content to convert into a spoken podcast script:
--- CONTENT ---
{context}
--- END ---
Topic: {topic}
Now write ONLY the spoken script based strictly on the content above. Do not add information that is not in the content."""
# ─ Post-processing ───────────────────────────────────
def _clean_script_for_tts(text: str) -> str:
"""
Remove markdown, XML/HTML tags, and other artifacts that would be
read aloud by TTS engines.
"""
# Remove <think>...</think> blocks entirely (SmolLM3 reasoning traces)
text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
# Remove any remaining XML/HTML-style tags
text = re.sub(r'<[^>]+>', '', text)
# Remove markdown headers (# ## ### etc.)
text = re.sub(r'^#{1,6}\s+', '', text, flags=re.MULTILINE)
# Remove markdown bold/italic markers
text = re.sub(r'\*{1,3}([^*]+)\*{1,3}', r'\1', text)
text = re.sub(r'_{1,3}([^_]+)_{1,3}', r'\1', text)
# Remove markdown links [text](url) -> text
text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
# Remove markdown code blocks and inline code
text = re.sub(r'```[^`]*```', '', text, flags=re.DOTALL)
text = re.sub(r'`([^`]+)`', r'\1', text)
# Remove bullet point markers
text = re.sub(r'^[\s]*[-*+]\s+', '', text, flags=re.MULTILINE)
# Remove numbered list markers
text = re.sub(r'^[\s]*\d+\.\s+', '', text, flags=re.MULTILINE)
# Remove blockquote markers
text = re.sub(r'^>\s+', '', text, flags=re.MULTILINE)
# Remove horizontal rules
text = re.sub(r'^[-*_]{3,}\s*$', '', text, flags=re.MULTILINE)
# Collapse multiple newlines into one
text = re.sub(r'\n{3,}', '\n\n', text)
# Collapse multiple spaces
text = re.sub(r' {2,}', ' ', text)
return text.strip()
# ─ Script Generation ─────────────────────────────────
def _get_client() -> InferenceClient:
    """
    Build an Inference API client authenticated with the user's token.

    Raises:
        EnvironmentError: if the HF_TOKEN environment variable is unset.
    """
    token = os.environ.get("HF_TOKEN")
    if token:
        return InferenceClient(provider="hf-inference", token=token)
    raise EnvironmentError(
        "HF_TOKEN environment variable is not set. "
        "Please set your Hugging Face API token to use the script generation feature."
    )
def generate_script(
    context_chunks: list[str],
    topic: str = "the key ideas and insights from this document",
) -> str:
    """
    Generate a spoken-style podcast script from retrieved document chunks.

    Args:
        context_chunks: List of relevant text chunks from the RAG store
        topic: Optional focus topic for the script

    Returns:
        A spoken script string ready for TTS conversion

    Raises:
        ValueError: if no context chunks were supplied.
        RuntimeError: if the model output is empty, before or after cleaning.
    """
    if not context_chunks:
        raise ValueError("No document context provided. Please upload a document first.")

    # Merge the retrieved chunks into one context block, capped so the
    # prompt stays a reasonable size for the model.
    context = "\n\n".join(context_chunks)
    limit = 6000
    if len(context) > limit:
        context = context[:limit]
        logger.warning("Context truncated to %d characters", limit)

    prompt = USER_PROMPT_TEMPLATE.format(context=context, topic=topic)
    logger.info("Generating script via %s (context: %d chars, topic: '%s')",
                MODEL_ID, len(context), topic[:50])

    # Chat-completion call against the serverless HF Inference API.
    completion = _get_client().chat_completion(
        model=MODEL_ID,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ],
        max_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        top_p=0.9,
    )

    raw_script = completion.choices[0].message.content.strip()
    if not raw_script:
        raise RuntimeError("The model returned an empty script. Please try again.")

    # Strip markdown/XML artifacts so the TTS engine reads clean prose.
    script = _clean_script_for_tts(raw_script)
    if not script:
        raise RuntimeError("Script was empty after cleaning. Please try again.")

    logger.info("Script generated: %d chars (raw: %d chars)", len(script), len(raw_script))
    return script