Spaces:
Sleeping
Sleeping
"""
VoiceVerse AI — Script Generation Module.

Generates spoken-style scripts from retrieved document chunks
using SmolLM3-3B via the Hugging Face Inference API.

Design decisions:
- Serverless HF Inference API avoids loading a large model locally
- SmolLM3-3B is deployed on the free hf-inference provider
- Prompt template enforces podcast/narration structure
- Max 1024 new tokens keeps scripts a reasonable length for TTS
- Temperature 0.4 keeps output grounded and factual
- Post-processing strips markdown/XML artifacts for clean TTS
"""
| import os | |
| import re | |
| from huggingface_hub import InferenceClient | |
| from utils import logger | |
# — Configuration ————————————————————————————————————
# Model served on the free hf-inference provider (see _get_client()).
MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
# Upper bound on generated tokens — keeps scripts a reasonable length for TTS.
MAX_NEW_TOKENS = 1024
# Low temperature keeps the narration grounded in the provided content.
TEMPERATURE = 0.4
# — Prompt Template ———————————————————————————————————
# System prompt: constrains the model to grounded, plain-text spoken narration
# (no headings, markdown, or meta-references) so the output can be fed
# directly to a TTS engine without cleanup surprises.
SYSTEM_PROMPT = """You are a podcast host narrating content to listeners. Convert the provided document content into a smooth, flowing spoken narration.
CRITICAL RULES:
1. ONLY use facts, ideas, and information from the provided content. Do NOT add outside knowledge or invent details.
2. Write as one continuous flowing narration. Do NOT use section headings, labels, or titles like "Intro", "Conclusion", "Section 1", etc.
3. Use smooth spoken transitions between topics instead of headings. For example say "Now let's talk about..." or "Moving on to..." or "Here's where it gets interesting..."
4. Write in plain text only. No markdown, no bullet points, no asterisks, no hashtags, no HTML/XML tags.
5. Write naturally as if speaking aloud to a listener. Use short sentences and conversational language.
6. Never say "the document says" or "according to the text". Speak as the expert.
7. If the content is limited, keep the script short rather than inventing information.
8. Do NOT include any labels, headers, or structural markers. The output should read like someone is talking without breaks.
9. Output ONLY the spoken narration text, nothing else."""

# User prompt: carries the retrieved {context} and an optional {topic} focus;
# filled in by generate_script() via str.format().
USER_PROMPT_TEMPLATE = """Here is the document content to convert into a spoken podcast script:
--- CONTENT ---
{context}
--- END ---
Topic: {topic}
Now write ONLY the spoken script based strictly on the content above. Do not add information that is not in the content."""
| # β Post-processing ββββββββββββββββββββββββββββββββββ | |
def _clean_script_for_tts(text: str) -> str:
    """
    Remove markdown, XML/HTML tags, and other artifacts that would be
    read aloud by TTS engines.

    Args:
        text: Raw model output, possibly containing reasoning traces,
            markdown formatting, links, lists, or stray tags.

    Returns:
        Plain prose suitable for speech synthesis, with whitespace
        normalized and leading/trailing whitespace stripped.
    """
    # Remove <think>...</think> blocks entirely (SmolLM3 reasoning traces).
    # BUGFIX: also handle an UNCLOSED <think> block — if generation was cut
    # off at MAX_NEW_TOKENS before </think> was emitted, everything from
    # <think> onward is reasoning, not narration, so drop it all rather than
    # letting the generic tag-stripper below keep the reasoning text.
    text = re.sub(r'<think>.*?(?:</think>|$)', '', text, flags=re.DOTALL)
    # Remove any remaining XML/HTML-style tags
    text = re.sub(r'<[^>]+>', '', text)
    # Remove markdown headers (# ## ### etc.)
    text = re.sub(r'^#{1,6}\s+', '', text, flags=re.MULTILINE)
    # Remove markdown bold/italic markers, keeping the inner text
    text = re.sub(r'\*{1,3}([^*]+)\*{1,3}', r'\1', text)
    text = re.sub(r'_{1,3}([^_]+)_{1,3}', r'\1', text)
    # Remove markdown links [text](url) -> text
    text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
    # Remove markdown code blocks and inline code
    text = re.sub(r'```[^`]*```', '', text, flags=re.DOTALL)
    text = re.sub(r'`([^`]+)`', r'\1', text)
    # Remove bullet point markers
    text = re.sub(r'^[\s]*[-*+]\s+', '', text, flags=re.MULTILINE)
    # Remove numbered list markers
    text = re.sub(r'^[\s]*\d+\.\s+', '', text, flags=re.MULTILINE)
    # Remove blockquote markers
    text = re.sub(r'^>\s+', '', text, flags=re.MULTILINE)
    # Remove horizontal rules
    text = re.sub(r'^[-*_]{3,}\s*$', '', text, flags=re.MULTILINE)
    # Collapse multiple newlines into one
    text = re.sub(r'\n{3,}', '\n\n', text)
    # Collapse multiple spaces
    text = re.sub(r' {2,}', ' ', text)
    return text.strip()
| # β Script Generation ββββββββββββββββββββββββββββββββ | |
def _get_client() -> InferenceClient:
    """Build an InferenceClient authenticated via the HF_TOKEN env var.

    Raises:
        EnvironmentError: if HF_TOKEN is not set in the environment.
    """
    api_token = os.environ.get("HF_TOKEN")
    if api_token:
        # Serverless provider — no local model weights are loaded.
        return InferenceClient(provider="hf-inference", token=api_token)
    raise EnvironmentError(
        "HF_TOKEN environment variable is not set. "
        "Please set your Hugging Face API token to use the script generation feature."
    )
def generate_script(
    context_chunks: list[str],
    topic: str = "the key ideas and insights from this document",
) -> str:
    """
    Generate a spoken-style podcast script from retrieved document chunks.

    Args:
        context_chunks: List of relevant text chunks from the RAG store
        topic: Optional focus topic for the script

    Returns:
        A spoken script string ready for TTS conversion

    Raises:
        ValueError: if no context chunks are provided.
        RuntimeError: if the model output is empty before or after cleaning.
        EnvironmentError: if HF_TOKEN is not configured (via _get_client).
    """
    if not context_chunks:
        raise ValueError("No document context provided. Please upload a document first.")

    max_context_chars = 6000
    # Merge retrieved chunks into a single context block for the prompt.
    context = "\n\n".join(context_chunks)
    if len(context) > max_context_chars:
        # Keep the prompt within a safe size for the model's context window.
        context = context[:max_context_chars]
        logger.warning("Context truncated to %d characters", max_context_chars)

    prompt = USER_PROMPT_TEMPLATE.format(context=context, topic=topic)
    logger.info("Generating script via %s (context: %d chars, topic: '%s')",
                MODEL_ID, len(context), topic[:50])

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
    ]
    # Serverless chat-completion call against the hf-inference provider.
    completion = _get_client().chat_completion(
        model=MODEL_ID,
        messages=messages,
        max_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        top_p=0.9,
    )

    raw_script = completion.choices[0].message.content.strip()
    if not raw_script:
        raise RuntimeError("The model returned an empty script. Please try again.")

    # Strip markdown/XML artifacts so the TTS engine reads clean prose.
    script = _clean_script_for_tts(raw_script)
    if not script:
        raise RuntimeError("Script was empty after cleaning. Please try again.")

    logger.info("Script generated: %d chars (raw: %d chars)", len(script), len(raw_script))
    return script