voiceverse-ai-test

Sleeping

App Files Files Community

Isshi14 committed on Feb 19

Commit

94a026e

verified ·

1 Parent(s): 71b59d0

Update script_gen.py

Browse files

Files changed (1) hide show

script_gen.py +76 -22

script_gen.py CHANGED Viewed

@@ -9,43 +9,91 @@ Design decisions:
   - SmolLM3-3B is deployed on the free hf-inference provider
   - Prompt template enforces podcast/narration structure
   - Max 1024 new tokens keeps scripts a reasonable length for TTS
-  - Temperature 0.7 balances creativity with factual grounding
 """
 import os
 from huggingface_hub import InferenceClient
 from utils import logger
 # — Configuration ————————————————————————————————————
 MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
 MAX_NEW_TOKENS = 1024
-TEMPERATURE = 0.7
 # — Prompt Template ———————————————————————————————————
-SYSTEM_PROMPT = """You are a professional podcast script writer. Your job is to transform document content into an engaging spoken script for a podcast-style narration.
-Rules you MUST follow:
-1. Write ONLY for listening, not reading. Use conversational, natural language.
-2. Structure the script with: a compelling hook/introduction, the main narrative, and a clear closing/takeaway.
-3. Use short sentences. Vary sentence length for rhythm.
-4. Include natural transitions like "Now, here's where it gets interesting..." or "Let's break this down..."
-5. Add brief pauses indicated by "..." where the speaker should pause for effect.
-6. NEVER include headers, bullet points, or any visual formatting.
-7. NEVER mention "the document" or "the text" — speak as if you're the expert sharing knowledge.
-8. Ground ALL content strictly in the provided context. Do NOT invent facts.
-9. If information is insufficient, acknowledge it naturally: "There's still much to explore here..."
-10. Keep the tone warm, engaging, and authoritative — like a great podcast host."""
-USER_PROMPT_TEMPLATE = """Based on the following content, create an engaging spoken script for a podcast-style narration.
---- CONTENT FROM DOCUMENT ---
 {context}
---- END OF CONTENT ---
-Topic focus: {topic}
-Write the complete spoken script now. Remember: this will be read aloud, so make it sound natural and engaging."""
 # — Script Generation ————————————————————————————————
@@ -110,10 +158,16 @@ def generate_script(
         top_p=0.9,
     )
-    script = response.choices[0].message.content.strip()
-    if not script:
         raise RuntimeError("The model returned an empty script. Please try again.")
-    logger.info("Script generated: %d characters", len(script))
     return script

   - SmolLM3-3B is deployed on the free hf-inference provider
   - Prompt template enforces podcast/narration structure
   - Max 1024 new tokens keeps scripts a reasonable length for TTS
+  - Temperature 0.4 keeps output grounded and factual
+  - Post-processing strips markdown/XML artifacts for clean TTS
 """
 import os
+import re
 from huggingface_hub import InferenceClient
 from utils import logger
 # — Configuration ————————————————————————————————————
 MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
 MAX_NEW_TOKENS = 1024
+TEMPERATURE = 0.4
 # — Prompt Template ———————————————————————————————————
+SYSTEM_PROMPT = """You are a podcast script writer. Convert the provided document content into a spoken narration script.
+CRITICAL RULES:
+1. ONLY use facts, ideas, and information that appear in the provided content. Do NOT add any outside knowledge or make up details.
+2. Write in plain text only. No markdown, no headers, no bullet points, no asterisks, no hashtags, no HTML tags, no XML tags.
+3. Write naturally as if speaking aloud. Use short sentences and conversational language.
+4. Structure: brief intro, main points from the content, brief conclusion.
+5. Never say "the document says" or "according to the text". Speak as if you are the expert.
+6. If the content is limited, keep the script short rather than inventing information.
+7. Do NOT use any special formatting or tags like <think>, **, ##, etc.
+8. Output ONLY the spoken script text, nothing else."""
+USER_PROMPT_TEMPLATE = """Here is the document content to convert into a spoken podcast script:
+--- CONTENT ---
 {context}
+--- END ---
+Topic: {topic}
+Now write ONLY the spoken script based strictly on the content above. Do not add information that is not in the content."""
+# — Post-processing ——————————————————————————————————
+def _clean_script_for_tts(text: str) -> str:
+    """
+    Remove markdown, XML/HTML tags, and other artifacts that would be
+    read aloud by TTS engines.
+    """
+    # Remove <think>...</think> blocks entirely (SmolLM3 reasoning traces)
+    text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
+    # Remove any remaining XML/HTML-style tags
+    text = re.sub(r'<[^>]+>', '', text)
+    # Remove markdown headers (# ## ### etc.)
+    text = re.sub(r'^#{1,6}\s+', '', text, flags=re.MULTILINE)
+    # Remove markdown bold/italic markers
+    text = re.sub(r'\*{1,3}([^*]+)\*{1,3}', r'\1', text)
+    text = re.sub(r'_{1,3}([^_]+)_{1,3}', r'\1', text)
+    # Remove markdown links [text](url) -> text
+    text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
+    # Remove markdown code blocks and inline code
+    text = re.sub(r'```[^`]*```', '', text, flags=re.DOTALL)
+    text = re.sub(r'`([^`]+)`', r'\1', text)
+    # Remove bullet point markers
+    text = re.sub(r'^[\s]*[-*+]\s+', '', text, flags=re.MULTILINE)
+    # Remove numbered list markers
+    text = re.sub(r'^[\s]*\d+\.\s+', '', text, flags=re.MULTILINE)
+    # Remove blockquote markers
+    text = re.sub(r'^>\s+', '', text, flags=re.MULTILINE)
+    # Remove horizontal rules
+    text = re.sub(r'^[-*_]{3,}\s*$', '', text, flags=re.MULTILINE)
+    # Collapse multiple newlines into one
+    text = re.sub(r'\n{3,}', '\n\n', text)
+    # Collapse multiple spaces
+    text = re.sub(r' {2,}', ' ', text)
+    return text.strip()
 # — Script Generation ————————————————————————————————
         top_p=0.9,
     )
+    raw_script = response.choices[0].message.content.strip()
+    if not raw_script:
         raise RuntimeError("The model returned an empty script. Please try again.")
+    # Clean the script for TTS (remove markdown, XML tags, etc.)
+    script = _clean_script_for_tts(raw_script)
+    if not script:
+        raise RuntimeError("Script was empty after cleaning. Please try again.")
+    logger.info("Script generated: %d chars (raw: %d chars)", len(script), len(raw_script))
     return script