File size: 6,563 Bytes
fd75949
 
 
 
620c87c
fd75949
 
620c87c
 
fd75949
 
94a026e
 
fd75949
 
 
94a026e
fd75949
 
 
 
620c87c
fd75949
94a026e
fd75949
 
 
7723b85
fd75949
94a026e
7723b85
 
 
 
 
 
 
 
 
fd75949
94a026e
fd75949
94a026e
fd75949
94a026e
fd75949
94a026e
fd75949
94a026e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fd75949
 
 
 
 
 
 
 
 
 
 
 
71b59d0
 
 
 
fd75949
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
620c87c
fd75949
 
 
 
 
 
 
 
 
 
 
 
 
71b59d0
 
fd75949
71b59d0
 
 
 
 
fd75949
 
 
 
94a026e
fd75949
94a026e
fd75949
 
94a026e
 
 
 
 
 
 
71b59d0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
"""
VoiceVerse AI β€” Script Generation Module.

Generates spoken-style scripts from retrieved document chunks
using SmolLM3-3B via the Hugging Face Inference API.

Design decisions:
  - Serverless HF Inference API avoids loading a large model locally
  - SmolLM3-3B is deployed on the free hf-inference provider
  - Prompt template enforces podcast/narration structure
  - Max 1024 new tokens keeps scripts a reasonable length for TTS
  - Temperature 0.4 keeps output grounded and factual
  - Post-processing strips markdown/XML artifacts for clean TTS
"""

import os
import re
from huggingface_hub import InferenceClient
from utils import logger

# β€” Configuration β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
MAX_NEW_TOKENS = 1024
TEMPERATURE = 0.4

# β€” Prompt Template β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”

SYSTEM_PROMPT = """You are a podcast host narrating content to listeners. Convert the provided document content into a smooth, flowing spoken narration.

CRITICAL RULES:
1. ONLY use facts, ideas, and information from the provided content. Do NOT add outside knowledge or invent details.
2. Write as one continuous flowing narration. Do NOT use section headings, labels, or titles like "Intro", "Conclusion", "Section 1", etc.
3. Use smooth spoken transitions between topics instead of headings. For example say "Now let's talk about..." or "Moving on to..." or "Here's where it gets interesting..."
4. Write in plain text only. No markdown, no bullet points, no asterisks, no hashtags, no HTML/XML tags.
5. Write naturally as if speaking aloud to a listener. Use short sentences and conversational language.
6. Never say "the document says" or "according to the text". Speak as the expert.
7. If the content is limited, keep the script short rather than inventing information.
8. Do NOT include any labels, headers, or structural markers. The output should read like someone is talking without breaks.
9. Output ONLY the spoken narration text, nothing else."""

USER_PROMPT_TEMPLATE = """Here is the document content to convert into a spoken podcast script:

--- CONTENT ---
{context}
--- END ---

Topic: {topic}

Now write ONLY the spoken script based strictly on the content above. Do not add information that is not in the content."""


# β€” Post-processing β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”

def _clean_script_for_tts(text: str) -> str:
    """
    Remove markdown, XML/HTML tags, and other artifacts that would be
    read aloud by TTS engines.
    """
    # Remove <think>...</think> blocks entirely (SmolLM3 reasoning traces)
    text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)

    # Remove any remaining XML/HTML-style tags
    text = re.sub(r'<[^>]+>', '', text)

    # Remove markdown headers (# ## ### etc.)
    text = re.sub(r'^#{1,6}\s+', '', text, flags=re.MULTILINE)

    # Remove markdown bold/italic markers
    text = re.sub(r'\*{1,3}([^*]+)\*{1,3}', r'\1', text)
    text = re.sub(r'_{1,3}([^_]+)_{1,3}', r'\1', text)

    # Remove markdown links [text](url) -> text
    text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)

    # Remove markdown code blocks and inline code
    text = re.sub(r'```[^`]*```', '', text, flags=re.DOTALL)
    text = re.sub(r'`([^`]+)`', r'\1', text)

    # Remove bullet point markers
    text = re.sub(r'^[\s]*[-*+]\s+', '', text, flags=re.MULTILINE)

    # Remove numbered list markers
    text = re.sub(r'^[\s]*\d+\.\s+', '', text, flags=re.MULTILINE)

    # Remove blockquote markers
    text = re.sub(r'^>\s+', '', text, flags=re.MULTILINE)

    # Remove horizontal rules
    text = re.sub(r'^[-*_]{3,}\s*$', '', text, flags=re.MULTILINE)

    # Collapse multiple newlines into one
    text = re.sub(r'\n{3,}', '\n\n', text)

    # Collapse multiple spaces
    text = re.sub(r' {2,}', ' ', text)

    return text.strip()


# β€” Script Generation β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”

def _get_client() -> InferenceClient:
    """Create an HF Inference client with the user's token.

    Reads the token from the HF_TOKEN environment variable.

    Raises:
        EnvironmentError: If HF_TOKEN is unset or empty.
    """
    token = os.environ.get("HF_TOKEN")
    if token:
        return InferenceClient(provider="hf-inference", token=token)
    raise EnvironmentError(
        "HF_TOKEN environment variable is not set. "
        "Please set your Hugging Face API token to use the script generation feature."
    )


def generate_script(
    context_chunks: list[str],
    topic: str = "the key ideas and insights from this document",
) -> str:
    """
    Generate a spoken-style podcast script from retrieved document chunks.

    Args:
        context_chunks: List of relevant text chunks from the RAG store
        topic: Optional focus topic for the script

    Returns:
        A spoken script string ready for TTS conversion

    Raises:
        ValueError: If no context chunks were supplied.
        RuntimeError: If the model output is empty before or after cleaning.
    """
    if not context_chunks:
        raise ValueError("No document context provided. Please upload a document first.")

    # Merge the retrieved chunks into one context block, capped so the
    # prompt stays within a reasonable size for the model.
    max_context_chars = 6000
    context = "\n\n".join(context_chunks)
    if len(context) > max_context_chars:
        logger.warning("Context truncated to %d characters", max_context_chars)
        context = context[:max_context_chars]

    logger.info("Generating script via %s (context: %d chars, topic: '%s')",
                MODEL_ID, len(context), topic[:50])

    # Ask the chat endpoint for a narration grounded strictly in the context.
    completion = _get_client().chat_completion(
        model=MODEL_ID,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {
                "role": "user",
                "content": USER_PROMPT_TEMPLATE.format(context=context, topic=topic),
            },
        ],
        max_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        top_p=0.9,
    )

    raw_script = completion.choices[0].message.content.strip()
    if not raw_script:
        raise RuntimeError("The model returned an empty script. Please try again.")

    # Strip markdown/XML artifacts so the TTS engine reads clean prose.
    cleaned = _clean_script_for_tts(raw_script)
    if not cleaned:
        raise RuntimeError("Script was empty after cleaning. Please try again.")

    logger.info("Script generated: %d chars (raw: %d chars)", len(cleaned), len(raw_script))
    return cleaned