File size: 14,553 Bytes
fd75949
 
 
fbe59b1
c6c42d9
 
 
 
fd75949
 
 
94a026e
fd75949
 
 
8c9f65d
8b3748a
8c9f65d
fd75949
 
c6c42d9
 
 
 
 
fbe59b1
8c9f65d
fbe59b1
8c9f65d
8b3748a
 
 
 
 
 
fbe59b1
 
 
fd75949
 
8b3748a
fbe59b1
 
c6c42d9
fbe59b1
8c9f65d
fbe59b1
8c9f65d
fbe59b1
 
 
 
c6c42d9
8c9f65d
 
 
 
 
 
fbe59b1
 
 
 
 
8c9f65d
fbe59b1
fd75949
c6c42d9
fbe59b1
8c9f65d
 
 
94a026e
c6c42d9
8c9f65d
 
 
 
fbe59b1
8c9f65d
fbe59b1
 
8c9f65d
 
 
fbe59b1
c6c42d9
8c9f65d
 
 
 
fbe59b1
8c9f65d
fbe59b1
 
 
 
 
8c9f65d
94a026e
fbe59b1
c6c42d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ff6ba78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c6c42d9
 
 
fbe59b1
8c9f65d
c6c42d9
fbe59b1
 
 
8b3748a
 
 
 
 
 
 
 
 
 
 
 
 
 
fbe59b1
 
 
 
 
 
 
 
 
 
 
 
94a026e
 
c6c42d9
 
 
 
 
8c9f65d
c6c42d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fbe59b1
c6c42d9
fbe59b1
 
c6c42d9
 
 
fd75949
 
 
 
 
8c9f65d
fd75949
fbe59b1
 
 
8c9f65d
fbe59b1
 
 
c6c42d9
 
 
 
fbe59b1
 
 
71b59d0
fbe59b1
 
8c9f65d
fbe59b1
fd75949
 
c6c42d9
 
 
fbe59b1
fd75949
 
fbe59b1
 
8c9f65d
fd75949
c6c42d9
 
 
 
 
 
 
 
 
 
 
 
fd75949
8c9f65d
fd75949
 
8c9f65d
 
 
fd75949
c6c42d9
fd75949
8c9f65d
fd75949
8c9f65d
 
 
 
 
 
c6c42d9
fbe59b1
8c9f65d
 
 
 
 
fbe59b1
c6c42d9
 
 
 
ff6ba78
 
 
 
fbe59b1
8c9f65d
 
 
94a026e
 
 
 
8c9f65d
fbe59b1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
"""
VoiceVerse AI β€” Script Generation Module.

Delivery Modes:
  Summary  β€” single-speaker structured narration
  Podcast  β€” HOST_1 / HOST_2 two-host dialogue
  Song/Rap β€” rhythmic retention content
  Debate   β€” DEBATER_A (female, for) vs DEBATER_B (male, against) structured debate
"""

import os
import re
from huggingface_hub import InferenceClient
from utils import logger

# Model identifier passed as `model=` to chat_completion() in _call_llm().
MODEL_ID       = "HuggingFaceTB/SmolLM3-3B"
# Generation cap passed as `max_tokens=` in _call_llm().
MAX_NEW_TOKENS = 2048
# Sampling temperature passed as `temperature=` in _call_llm().
TEMPERATURE    = 0.5


# ══════════════════════════════════════════════════════════════════════════════
# Prompts
# ══════════════════════════════════════════════════════════════════════════════

# ── Summary ───────────────────────────────────────────────────────────────────
_SUMMARY_SYSTEM = """\
You are a professional narrator. Produce a clear spoken summary strictly from the source material.
RULES:
1. Use ONLY facts from the source. Do NOT add outside knowledge.
2. Write as one continuous flowing narration. Do NOT use any section headings, labels, or structural markers like "Introduction", "Intro", "Key Points", "Conclusion", "Summary", "Section 1", etc.
3. Use smooth spoken transitions instead of headings. For example say "Let's start with..." or "Now moving on to..." or "To wrap things up..." instead of labeling sections.
4. Plain text only β€” no markdown, no bullets, no headers, no labels of any kind.
5. Write for the ear: short sentences, conversational tone.
6. Never say "the document says". Speak as the expert.
7. Output ONLY the spoken narration text, nothing else. It should read like someone is naturally talking."""

_SUMMARY_USER = """\
SOURCE MATERIAL:
{context}

Write a flowing spoken summary in plain sentences. Do NOT include any headings or labels like Intro, Conclusion, etc. Just speak naturally as if talking to a listener."""


# ── Podcast ───────────────────────────────────────────────────────────────────
_PODCAST_SYSTEM = """\
You are a podcast script writer. Write a two-host conversation strictly from the source material.

STRICT FORMAT β€” every single line must start with a speaker tag:
HOST_1: <what Host 1 says>
HOST_2: <what Host 2 says>

RULES:
1. Alternate HOST_1 and HOST_2. Never same host twice in a row.
2. HOST_1 introduces topics and asks questions.
3. HOST_2 explains concepts and answers.
4. Use ONLY information from the source. No hallucination.
5. Conversational, engaging tone.
6. No markdown, no stage directions, no lines without a HOST tag.
7. Aim for 16–24 exchanges."""

_PODCAST_USER = """\
SOURCE MATERIAL:
{context}

Write the full podcast. Every line must start with HOST_1: or HOST_2:"""


# ── Song / Rap ────────────────────────────────────────────────────────────────
_SONG_SYSTEM = """\
You are a lyricist. Two steps:
STEP 1 β€” silently extract 5–7 key ideas from the source.
STEP 2 β€” write a smooth melodic SONG from those ideas.

RULES:
- Simple memorable language, rhyming couplets (AABB).
- Label sections [VERSE 1], [VERSE 2], [CHORUS].
- [CHORUS] repeats the main concept.
- Short lines (6–10 words). Use repetition.
- Do NOT invent facts not in the source.
- Output ONLY the lyrics with section labels."""

_RAP_SYSTEM = """\
You are a lyricist. Two steps:
STEP 1 β€” silently extract 5–7 key ideas from the source.
STEP 2 β€” write a punchy rhythmic RAP from those ideas.

RULES:
- Short punchy lines (5–8 words), fast-flow rhyme (AABB or ABAB).
- Label sections [VERSE 1], [VERSE 2], [HOOK].
- [HOOK] repeats the main concept.
- Wordplay and repetition to aid retention.
- Do NOT invent facts not in the source.
- Output ONLY the lyrics with section labels."""

_SONG_RAP_USER = """\
SOURCE MATERIAL:
{context}

Extract the key ideas, then write the full {form}."""


# ── Debate ────────────────────────────────────────────────────────────────────
_DEBATE_SYSTEM = """\
You are a debate script writer. Write a structured two-person debate strictly grounded \
in the provided source material.

STRICT FORMAT β€” every single line must start with a speaker tag:
DEBATER_A: <what Debater A says>
DEBATER_B: <what Debater B says>

CHARACTER PROFILES:
- DEBATER_A: Takes the PRO / supporting position. Tone is confident, optimistic, forward-thinking.
- DEBATER_B: Takes the CON / critical position. Tone is skeptical, cautious, questioning.

DEBATE STRUCTURE:
1. DEBATER_A opens with a strong statement supporting the topic.
2. DEBATER_B immediately challenges with a counterpoint.
3. They alternate, each directly responding to the other's previous point.
4. Both use evidence and logic from the source material only.
5. End with each debater giving a brief closing statement.

RULES:
- Alternate DEBATER_A and DEBATER_B. Never same debater twice in a row.
- Use ONLY information from the source material. No hallucination.
- Each turn should be 1–3 sentences β€” punchy, not long speeches.
- No markdown, no stage directions, no narration outside the speaker tags.
- Aim for 16–22 exchanges total."""

_DEBATE_USER = """\
SOURCE MATERIAL:
{context}

Write the full debate on the key topics from this material. \
Every line must start with DEBATER_A: or DEBATER_B:"""


# ── Story ─────────────────────────────────────────────────────────────────────
_STORY_SYSTEM = """\
You are a master storyteller. Retell the ideas from the source material as an \
immersive narrative story written for slow, expressive audio delivery.

RULES:
1. Transform factual content into a story β€” use characters, scenes, a narrative arc \
   (beginning, middle, end). Characters can be fictional stand-ins for real concepts.
2. Use ONLY information and ideas from the source. Do NOT invent new facts.
3. Warm, descriptive storytelling voice. Vivid but calm.
4. Short paragraphs, 1–3 sentences each, separated by blank lines.
5. Plain text only β€” no markdown, no bullets, no headers.
6. Begin with an evocative scene-setting sentence.
7. End with a closing reflection or lesson drawn from the source.
8. Output ONLY the story text, nothing else."""

_STORY_USER = """\
SOURCE MATERIAL:
{context}

Transform this into a rich narrative story for slow, expressive audio. \
Use short paragraphs with blank lines between them."""


# ══════════════════════════════════════════════════════════════════════════════
# Post-processing
# ══════════════════════════════════════════════════════════════════════════════

def _clean(text: str) -> str:
    """Remove all markdown and XML artifacts from LLM output."""
    text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
    text = re.sub(r"<[^>]+>", "", text)
    text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE)

    # Remove heading-like labels that TTS would read aloud
    # e.g. "Introduction:", "Intro:", "Conclusion:", "Key Points:", "Summary:" etc.
    text = re.sub(
        r"^(?:Introduction|Intro|Conclusion|Summary|Key\s*Points?|Overview|"
        r"Closing|Opening|Final\s*Thoughts?|In\s*Summary|To\s*Conclude)\s*[:\-β€”]?\s*$",
        "", text, flags=re.MULTILINE | re.IGNORECASE
    )
    # Also remove inline heading labels at the start of a line followed by content
    text = re.sub(
        r"^(?:Introduction|Intro|Conclusion|Summary|Key\s*Points?|Overview|"
        r"Closing|Opening|Final\s*Thoughts?)\s*[:\-β€”]\s+",
        "", text, flags=re.MULTILINE | re.IGNORECASE
    )
    text = re.sub(r"\*{1,3}([^*]+)\*{1,3}", r"\1", text)
    text = re.sub(r"_{1,3}([^_]+)_{1,3}", r"\1", text)
    text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)
    text = re.sub(r"```[^`]*```", "", text, flags=re.DOTALL)
    text = re.sub(r"`([^`]+)`", r"\1", text)
    text = re.sub(r"^[\s]*[-*+]\s+", "", text, flags=re.MULTILINE)
    text = re.sub(r"^[\s]*\d+\.\s+", "", text, flags=re.MULTILINE)
    text = re.sub(r"^>\s+", "", text, flags=re.MULTILINE)
    text = re.sub(r"^[-*_]{3,}\s*$", "", text, flags=re.MULTILINE)
    text = re.sub(r"\n{3,}", "\n\n", text)
    text = re.sub(r" {2,}", " ", text)
    return text.strip()


def _clean_dialogue(text: str, tag_a: str, tag_b: str) -> str:
    """Clean model output that must consist of speaker-tagged lines.

    Runs the generic _clean() pass, rewrites common spellings of the speaker
    tags to their canonical form, then keeps only blank lines and lines that
    begin with one of the two tags.

    Args:
        text: Raw model output.
        tag_a: Canonical first speaker tag. Variant normalisation is applied
            only for "HOST_1" (podcast) and "DEBATER_A" (debate).
        tag_b: Canonical second speaker tag.

    Returns:
        The filtered dialogue, stripped; empty if no line carried a valid tag.
    """
    text = _clean(text)

    # (variant pattern, canonical tag) pairs the model might produce.
    if tag_a == "HOST_1":
        variants = (
            (r"host[\s_-]*1", "HOST_1"),
            (r"host[\s_-]*2", "HOST_2"),
        )
    elif tag_a == "DEBATER_A":
        variants = (
            (r"debater[\s_-]*a", "DEBATER_A"),
            (r"debater[\s_-]*b", "DEBATER_B"),
            # Also catch "Pro:" / "Con:" / "Speaker A:" variants
            (r"pro", "DEBATER_A"),
            (r"con", "DEBATER_B"),
            (r"speaker[\s_-]*a", "DEBATER_A"),
            (r"speaker[\s_-]*b", "DEBATER_B"),
        )
    else:
        variants = ()

    # Rewrite variants only at the start of a line. The filter below keeps
    # only line-initial tags anyway, and an unanchored match would corrupt
    # spoken content such as "there is one pro: speed".
    for pattern, tag in variants:
        text = re.sub(rf"(?im)^([ \t]*){pattern}\s*:", rf"\g<1>{tag}:", text)

    # Keep blank lines (turn spacing) and lines with a valid speaker tag.
    valid_prefixes = (f"{tag_a}:", f"{tag_b}:")
    kept = [
        ln for ln in text.splitlines()
        if not ln.strip() or ln.strip().startswith(valid_prefixes)
    ]
    return "\n".join(kept).strip()


# ══════════════════════════════════════════════════════════════════════════════
# LLM client
# ══════════════════════════════════════════════════════════════════════════════

def _get_client() -> InferenceClient:
    """Build an InferenceClient authenticated with the HF_TOKEN env variable.

    Returns:
        A client configured with provider "hf-inference".

    Raises:
        EnvironmentError: if HF_TOKEN is unset or empty.
    """
    hf_token = os.environ.get("HF_TOKEN")
    if hf_token:
        return InferenceClient(provider="hf-inference", token=hf_token)
    raise EnvironmentError(
        "HF_TOKEN not set. Add your Hugging Face token as a Space secret."
    )


def _call_llm(system: str, user: str) -> str:
    """Send one system + user chat turn to MODEL_ID and return the reply text.

    Args:
        system: Content of the system message.
        user: Content of the user message.

    Returns:
        The first choice's message content, stripped of surrounding whitespace.

    Raises:
        EnvironmentError: propagated from _get_client() when HF_TOKEN is unset.
        RuntimeError: if the response has no choices, or the first choice's
            content is None or blank.
    """
    client = _get_client()
    response = client.chat_completion(
        model=MODEL_ID,
        messages=[
            {"role": "system", "content": system},
            {"role": "user",   "content": user},
        ],
        max_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        top_p=0.9,
    )
    # The content field may be None and the choice list may be empty; treat
    # both as an empty reply so callers get RuntimeError, not a stray
    # AttributeError or IndexError.
    choices = response.choices or []
    content = choices[0].message.content if choices else None
    raw = (content or "").strip()
    if not raw:
        raise RuntimeError("Model returned empty response. Please try again.")
    return raw


# ══════════════════════════════════════════════════════════════════════════════
# Public entry point
# ══════════════════════════════════════════════════════════════════════════════

def generate_script(
    context_chunks: list[str],
    mode: str = "Summary",
    sub_mode: str = "Rap",
    topic: str = "the key ideas from this document",
) -> str:
    """
    Generate a spoken script from RAG chunks.

    Args:
        context_chunks : chunks from RAGStore — NOT modified here; blank
                         chunks are ignored
        mode           : "Summary" | "Podcast" | "Song / Rap" | "Debate" | "Story"
                         (case-insensitive; unknown values fall back to Summary)
        sub_mode       : "Song" | "Rap"  (only for Song/Rap mode; anything
                         other than "Song" is treated as "Rap")
        topic          : not used by this function; kept so existing callers
                         that pass it keep working

    Returns:
        Clean string ready for tts.generate_audio() or tts.generate_audio_podcast()
        Podcast/Debate modes preserve HOST_1/HOST_2 or DEBATER_A/DEBATER_B tags.

    Raises:
        ValueError   : no chunks were given, or every chunk is blank
        RuntimeError : the model reply was empty, or nothing was left after cleaning
    """
    # Ignore blank chunks so an all-whitespace input is rejected up front
    # instead of sending an empty source to the model.
    chunks = [c for c in (context_chunks or []) if c and c.strip()]
    if not chunks:
        raise ValueError("No document context. Please upload or paste content first.")

    context = "\n\n".join(chunks)
    if len(context) > 6000:
        context = context[:6000]
        logger.warning("Context truncated to 6000 chars")

    logger.info("generate_script | mode=%s sub_mode=%s ctx=%d chars", mode, sub_mode, len(context))

    m = (mode or "").strip().lower()

    if m == "summary":
        raw = _call_llm(_SUMMARY_SYSTEM, _SUMMARY_USER.format(context=context))
        script = _clean(raw)

    elif m == "podcast":
        raw = _call_llm(_PODCAST_SYSTEM, _PODCAST_USER.format(context=context))
        script = _clean_dialogue(raw, "HOST_1", "HOST_2")

    elif "song" in m or "rap" in m:
        # Collapse sub_mode to exactly "song" or "rap" so the system prompt
        # and the "{form}" named in the user prompt always agree.
        form = "song" if (sub_mode or "").strip().lower() == "song" else "rap"
        sys_prompt = _SONG_SYSTEM if form == "song" else _RAP_SYSTEM
        raw = _call_llm(sys_prompt, _SONG_RAP_USER.format(context=context, form=form))
        script = _clean(raw)

    elif "debate" in m:
        raw = _call_llm(_DEBATE_SYSTEM, _DEBATE_USER.format(context=context))
        script = _clean_dialogue(raw, "DEBATER_A", "DEBATER_B")

    elif "story" in m:
        raw = _call_llm(_STORY_SYSTEM, _STORY_USER.format(context=context))
        script = _clean(raw)

    else:
        logger.warning("Unknown mode '%s' — falling back to Summary", mode)
        raw = _call_llm(_SUMMARY_SYSTEM, _SUMMARY_USER.format(context=context))
        script = _clean(raw)

    # Dialogue modes drop every untagged line, so a reply that ignored the
    # format can legitimately end up empty here.
    if not script:
        raise RuntimeError("Script was empty after cleaning. Please try again.")

    logger.info("Script ready: %d chars", len(script))
    return script