File size: 14,459 Bytes
d61edf1
 
 
 
 
 
b725430
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d61edf1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b725430
 
 
d61edf1
 
 
b725430
d61edf1
 
b725430
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d61edf1
b725430
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d61edf1
 
 
 
 
 
 
b725430
 
 
 
 
 
 
 
d61edf1
 
 
 
 
 
 
 
 
b725430
 
 
 
 
d61edf1
 
 
b725430
 
 
 
 
 
 
 
d61edf1
 
 
 
 
 
 
b725430
 
 
 
 
 
 
 
 
 
d61edf1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b725430
d61edf1
 
 
b725430
d61edf1
 
b725430
 
 
 
 
 
 
 
 
 
d61edf1
b725430
d61edf1
 
 
 
 
644a5a5
 
 
b725430
644a5a5
b725430
 
d61edf1
 
 
 
 
 
 
 
 
 
644a5a5
b725430
644a5a5
 
d61edf1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
644a5a5
 
 
 
 
d61edf1
644a5a5
 
 
 
 
 
 
 
 
 
 
d61edf1
 
 
 
 
 
 
 
 
 
 
 
 
b725430
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d61edf1
 
b725430
d61edf1
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
"""
Chatterbox Turbo TTS β€” Text Processor
═══════════════════════════════════════
Sanitizes raw input text and splits it into sentence-level chunks
for streaming TTS.  Paralinguistic tags ([laugh], [cough], …) are
explicitly preserved so the model can render them.

Punctuation Philosophy (based on Resemble AI recommendations):
  βœ… PRESERVE (benefits prosody):
     β€’ Ellipsis ...    β†’ dramatic pause, trailing thought, hesitation
     β€’ Em dash β€”       β†’ abrupt transition, dramatic break
     β€’ Comma ,         β†’ short natural pause / breathing point
     β€’ Period .        β†’ full stop, pitch drop, sentence boundary
     β€’ ! and ?         β†’ exclamatory / interrogative inflection
     β€’ Semicolon ;     β†’ medium pause, clause bridge (NOT a split point)
     β€’ Colon :         β†’ medium pause, introduces explanation (NOT a split point)
     β€’ Parentheses ()  β†’ quieter/explanatory tone shift
     β€’ Quotes ""       β†’ dialogue cue
     β€’ Apostrophe '    β†’ contractions (don't, it's)
     β€’ CAPS words      β†’ emphasis / volume increase

  ❌ FILTER (harms output):
     β€’ Excessive repeated punctuation (!!!! β†’ !, ???? β†’ ?, ,,, β†’ ,)
     β€’ 4+ dots (.... β†’ ...)
     β€’ Emojis, URLs, markdown, HTML tags
     β€’ Non-standard Unicode punctuation (guillemets, etc.)
"""
import re
from typing import List

from config import Config

# ═══════════════════════════════════════════════════════════════════
# Pre-compiled regex patterns (compiled once at import β†’ zero cost)
# ═══════════════════════════════════════════════════════════════════

# β€” Paralinguistic tag protector (matches [laugh], [clear throat], etc.)
# NOTE(review): assumes Config.PARALINGUISTIC_TAGS yields bare tag names
# without brackets β€” confirm against config.py.
_TAG_NAMES = "|".join(re.escape(t) for t in Config.PARALINGUISTIC_TAGS)
_RE_PARA_TAG = re.compile(rf"\[(?:{_TAG_NAMES})\]", re.IGNORECASE)

# β€” Markdown / structural noise
# Patterns with a capture group keep their inner text (group 1) in
# sanitize(); the rest are stripped outright.  sanitize() applies the
# double-delimiter bold patterns before the single-delimiter italic ones
# so "**x**" is not half-consumed as italic.
_RE_CODE_BLOCK   = re.compile(r"```[\s\S]*?```")
_RE_INLINE_CODE  = re.compile(r"`([^`]+)`")
_RE_IMAGE        = re.compile(r"!\[([^\]]*)\]\([^)]+\)")
_RE_LINK         = re.compile(r"\[([^\]]+)\]\([^)]+\)")
_RE_BOLD_AST     = re.compile(r"\*\*(.+?)\*\*")
_RE_BOLD_UND     = re.compile(r"__(.+?)__")
_RE_STRIKE       = re.compile(r"~~(.+?)~~")
_RE_ITALIC_AST   = re.compile(r"\*(.+?)\*")
# Underscore italics require non-word neighbours so snake_case identifiers
# are left untouched.
_RE_ITALIC_UND   = re.compile(r"(?<!\w)_(.+?)_(?!\w)")
_RE_HEADER       = re.compile(r"^#{1,6}\s+", re.MULTILINE)
_RE_BLOCKQUOTE   = re.compile(r"^>+\s?", re.MULTILINE)
_RE_HR           = re.compile(r"^[-*_]{3,}$", re.MULTILINE)
_RE_BULLET       = re.compile(r"^\s*[-*+]\s+", re.MULTILINE)
_RE_ORDERED      = re.compile(r"^\s*\d+\.\s+", re.MULTILINE)

# β€” URLs, emojis, HTML entities
# NOTE: \S+ is greedy β€” a URL match runs to the next whitespace, including
# any trailing punctuation such as ")".
_RE_URL          = re.compile(r"https?://\S+")
# Emoji / pictograph code-point ranges, plus variation selectors
# (FE00-FE0F) and the zero-width joiner (200D) used in composed sequences.
_RE_EMOJI        = re.compile(
    r"["
    r"\U0001F600-\U0001F64F\U0001F300-\U0001F5FF"
    r"\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF"
    r"\U00002702-\U000027B0\U0001F900-\U0001F9FF"
    r"\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF"
    r"\U00002600-\U000026FF\U0000FE00-\U0000FE0F"
    r"\U0000200D"
    r"]+", re.UNICODE,
)
# Matches named (&amp;) and numeric (&#x27;) entities alike; sanitize()
# replaces them via _HTML_ENTITIES and drops any entity not listed there.
_RE_HTML_ENTITY  = re.compile(r"&(?:#x?[\da-fA-F]+|\w+);")

# HTML entities β†’ speakable replacements
# NOTE: &hellip; β†’ "..." (preserves dramatic pause), &mdash;/&ndash; β†’ "β€”" (preserves dramatic break)
_HTML_ENTITIES   = {
    "&amp;": " and ", "&lt;": " less than ", "&gt;": " greater than ",
    "&nbsp;": " ", "&quot;": '"', "&apos;": "'",
    "&mdash;": "β€”", "&ndash;": "β€”", "&hellip;": "...",
}

# β€” Smart/curly quote normalization β†’ ASCII equivalents
# These Unicode variants may confuse the tokenizer; normalizing ensures clean input.
# Applied via str.translate() β€” one C-level pass over the text; mapped
# values may be multi-character strings (the ellipsis maps to three dots).
_SMART_QUOTE_MAP = str.maketrans({
    "\u201c": '"',   # " left double quotation mark
    "\u201d": '"',   # " right double quotation mark
    "\u2018": "'",   # ' left single quotation mark
    "\u2019": "'",   # ' right single quotation mark
    "\u00ab": '"',   # Β« left guillemet
    "\u00bb": '"',   # Β» right guillemet
    "\u201e": '"',   # β€ž double low quotation mark
    "\u201f": '"',   # β€Ÿ double high reversed quotation mark
    "\u2032": "'",   # β€² prime
    "\u2033": '"',   # β€³ double prime
    "\u2013": "β€”",   # – en dash β†’ em dash (dramatic pause)
    "\u2014": "β€”",   # β€” em dash (keep as-is after mapping)
    "\u2026": "...", # … horizontal ellipsis β†’ three dots
})

# β€” ALL CAPS normalization
# Words entirely in caps (length >= 4) often get spelled out by the TTS engine (e.g. NOTHING).
# By converting them to Title Case, they'll be processed naturally as words.
# NOTE(review): this also title-cases 4+ letter acronyms (NASA β†’ Nasa) β€”
# presumably intended so they are read as words; confirm this is desired.
_RE_ALL_CAPS = re.compile(r"\b[A-Z]{4,}\b")

# β€” Punctuation normalization
#   Ellipsis (... / ..) is PRESERVED β€” it creates dramatic pauses in Chatterbox.
#   Only 4+ dots are excessive and get capped to standard ellipsis.
#   sanitize() applies _RE_EXCESSIVE_DOTS before _RE_NORMALIZE_DOTS so a run
#   of many dots collapses to exactly one canonical "...".
_RE_EXCESSIVE_DOTS   = re.compile(r"\.{4,}")       # ....+ β†’ ... (cap excessive)
_RE_NORMALIZE_DOTS   = re.compile(r"\.{2,3}")       # .. or ... β†’ ... (standardize)
_RE_REPEATED_EXCLAM  = re.compile(r"!{2,}")          # !! β†’ !
_RE_REPEATED_QUEST   = re.compile(r"\?{2,}")         # ?? β†’ ?
_RE_REPEATED_SEMI    = re.compile(r";{2,}")           # ;; β†’ ;
_RE_REPEATED_COLON   = re.compile(r":{2,}")           # :: β†’ :
_RE_REPEATED_COMMA   = re.compile(r",{2,}")           # ,, β†’ ,
_RE_REPEATED_DASH    = re.compile(r"-{3,}")           # --- β†’ β€” (em dash)

# β€” Abbreviation protection
# Common abbreviations ending in "." that should NOT trigger sentence splitting.
# These get a placeholder before splitting, then get restored.
# (In sanitize() the placeholder is ultimately removed together with the
# dot: "Dr." β†’ "Dr", which prevents false sentence boundaries downstream.)
# "May" is deliberately absent β€” it is a complete word, never dot-abbreviated.
_ABBREVIATIONS = (
    "Mr", "Mrs", "Ms", "Dr", "Prof", "Sr", "Jr", "St", "Ave", "Blvd",
    "vs", "etc", "approx", "dept", "est", "govt", "inc", "corp", "ltd",
    "Jan", "Feb", "Mar", "Apr", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
    "Gen", "Col", "Sgt", "Capt", "Lt", "Cmdr", "Adm",
    "Fig", "Vol", "No", "Ref", "Rev", "Ph",
)
# IGNORECASE protects lowercase forms ("etc.", "vs.") too.  NOTE(review):
# it also matches the plain word "no." at a sentence end, which would lose
# that sentence boundary β€” verify this trade-off is acceptable.
_RE_ABBREV = re.compile(
    r"\b(" + "|".join(re.escape(a) for a in _ABBREVIATIONS) + r")\.",
    re.IGNORECASE,
)

# β€” Whitespace
_RE_MULTI_SPACE      = re.compile(r"[ \t]+")
_RE_MULTI_NEWLINE    = re.compile(r"\n{3,}")
_RE_SPACE_BEFORE_PUN = re.compile(r"\s+([.!?,;:])")

# β€” Sentence boundary (split point)
# Split ONLY on true sentence-ending punctuation: . ! ?
# Semicolons and colons are clause connectors β€” they bridge related thoughts
# and should NOT be used as split points (creates choppy, unnatural fragments).
# Ellipsis (...) is also intentionally excluded from splitting: letting it split the stream
# creates a compound lag between chunks, making the pause artificially excessive.
_RE_SENTENCE_SPLIT = re.compile(
    r"""(?:(?<=[.!?])(?<!\.\.\.)|(?<=[.!?][)\]"'])(?<!\.\.\.\.))\s+"""
)

_MIN_MERGE_WORDS = 5


# ═══════════════════════════════════════════════════════════════════
# Public API
# ═══════════════════════════════════════════════════════════════════

def sanitize(text: str) -> str:
    """Clean raw input for TTS while preserving prosody-beneficial punctuation.

    Preserves: ellipsis (...), em dashes (β€”), commas, periods, !, ?, ;, :, quotes.
    Removes: emojis, URLs, markdown, HTML, excessive repeated punctuation.

    Args:
        text: Raw input (may contain markdown, emojis, URLs, paralinguistic tags).

    Returns:
        Sanitized text ready for split_for_streaming(); falsy input is
        returned unchanged.
    """
    if not text:
        return text

    # 0. Normalize smart/curly quotes and Unicode punctuation FIRST
    #    so the downstream regexes only ever see ASCII-like punctuation.
    text = text.translate(_SMART_QUOTE_MAP)

    # 1. Protect paralinguistic tags with placeholders BEFORE any other
    #    transformation β€” in particular before the ALL-CAPS normalizer,
    #    which would otherwise rewrite e.g. "[LAUGH]" to "[Laugh]" instead
    #    of preserving the tag exactly as written.
    tags_found: list[str] = []
    def _protect_tag(m: re.Match) -> str:
        tags_found.append(m.group(0))
        return f"Β§TAG{len(tags_found) - 1}Β§"
    text = _RE_PARA_TAG.sub(_protect_tag, text)

    # 2. Normalize ALL CAPS words to Title Case to prevent spelling out.
    #    (Placeholders are immune: "TAG"/"ABR" are only 3 capitals.)
    text = _RE_ALL_CAPS.sub(lambda m: m.group(0).capitalize(), text)

    # 3. Protect abbreviations from sentence-boundary splitting.
    #    "Dr. Smith" β†’ "DrΒ§ABR0Β§ Smith"; the placeholder (and with it the
    #    dot) is removed in step 9, never restored.
    abbrev_count = 0
    def _protect_abbrev(m: re.Match) -> str:
        nonlocal abbrev_count
        placeholder = f"{m.group(1)}Β§ABR{abbrev_count}Β§"
        abbrev_count += 1
        return placeholder
    text = _RE_ABBREV.sub(_protect_abbrev, text)

    # 4. Strip non-speakable structures.
    #    Order fix: markdown images/links MUST be unwrapped before bare-URL
    #    removal β€” _RE_URL's greedy \S+ would otherwise swallow the closing
    #    ")" of "[text](https://…)" and leave a dangling "[text](" behind.
    text = _RE_CODE_BLOCK.sub("", text)
    text = _RE_IMAGE.sub(lambda m: m.group(1) or "", text)
    text = _RE_LINK.sub(r"\1", text)
    text = _RE_URL.sub("", text)
    text = _RE_BOLD_AST.sub(r"\1", text)
    text = _RE_BOLD_UND.sub(r"\1", text)
    text = _RE_STRIKE.sub(r"\1", text)
    text = _RE_ITALIC_AST.sub(r"\1", text)
    text = _RE_ITALIC_UND.sub(r"\1", text)
    text = _RE_INLINE_CODE.sub(r"\1", text)
    text = _RE_HEADER.sub("", text)
    text = _RE_BLOCKQUOTE.sub("", text)
    text = _RE_HR.sub("", text)
    text = _RE_BULLET.sub("", text)
    text = _RE_ORDERED.sub("", text)

    # 5. Emojis removed; hashtags become plain words (#hello β†’ hello).
    text = _RE_EMOJI.sub("", text)
    text = re.sub(r"#(\w+)", r"\1", text)

    # 6. HTML entities β†’ speakable text (unknown entities are dropped).
    text = _RE_HTML_ENTITY.sub(lambda m: _HTML_ENTITIES.get(m.group(0), ""), text)

    # 7. Normalize punctuation (PRESERVE prosody-beneficial marks).
    #    Order matters: cap 4+ dots first, then standardize ".."/"..." to
    #    one canonical ellipsis.
    text = _RE_EXCESSIVE_DOTS.sub("...", text)       # ....+ β†’ ... (cap)
    text = _RE_NORMALIZE_DOTS.sub("...", text)        # .. or ... β†’ ... (standardize)
    text = _RE_REPEATED_EXCLAM.sub("!", text)         # !! β†’ !
    text = _RE_REPEATED_QUEST.sub("?", text)          # ?? β†’ ?
    text = _RE_REPEATED_SEMI.sub(";", text)           # ;; β†’ ;
    text = _RE_REPEATED_COLON.sub(":", text)          # :: β†’ :
    text = _RE_REPEATED_COMMA.sub(",", text)          # ,, β†’ ,
    text = _RE_REPEATED_DASH.sub("β€”", text)           # --- β†’ em dash

    # 8. Whitespace cleanup.
    text = _RE_SPACE_BEFORE_PUN.sub(r"\1", text)
    text = _RE_MULTI_SPACE.sub(" ", text)
    text = _RE_MULTI_NEWLINE.sub("\n\n", text)
    text = text.strip()

    # 9. Drop abbreviation placeholders (Mr. β†’ Mr, Dr. β†’ Dr, etc.).
    #    The dot is not needed for correct TTS pronunciation and removing it
    #    prevents false sentence-boundary splits in split_for_streaming().
    for idx in range(abbrev_count):
        text = text.replace(f"Β§ABR{idx}Β§", "")

    # 10. Restore paralinguistic tags verbatim.
    for idx, original in enumerate(tags_found):
        text = text.replace(f"Β§TAG{idx}Β§", original)

    return text


def split_for_streaming(text: str, max_chars: int = Config.MAX_CHUNK_CHARS) -> List[str]:
    """Split sanitized text into sentence-level chunks for streaming.

    Strategy:
      1. Split on sentence-ending punctuation boundaries (. ! ?)
         β€” NOT on semicolons, colons, or ellipsis (those are non-breaking boundaries)
      2. Enforce max_chars per chunk (split long sentences on commas / spaces)
      3. Merge short chunks (≀5 words) with the next β€” but only while the
         merged result still fits max_chars, so step 2's limit is never
         undone (fix: the previous unconditional merge could recreate
         over-long chunks out of pieces step 2 had just split).

    Args:
        text: Already-sanitized text (see sanitize()).
        max_chars: Upper bound on chunk length.

    Returns:
        Ordered list of non-empty chunks; empty list for empty input.
    """
    if not text:
        return []

    # Step 1: sentence split
    raw_chunks = [c.strip() for c in _RE_SENTENCE_SPLIT.split(text) if c.strip()]

    # Step 2: enforce max length per chunk
    sized: List[str] = []
    for chunk in raw_chunks:
        if len(chunk) <= max_chars:
            sized.append(chunk)
        else:
            sized.extend(_break_long_chunk(chunk, max_chars))

    # Step 3: merge short chunks forward
    if len(sized) <= 1:
        return sized

    merged: List[str] = []
    carry = ""
    last = len(sized) - 1
    for i, chunk in enumerate(sized):
        if carry:
            # Merge only if the combined chunk still respects max_chars;
            # otherwise emit the carried fragment on its own.
            if len(carry) + 1 + len(chunk) <= max_chars:
                chunk = carry + " " + chunk
            else:
                merged.append(carry)
            carry = ""
        if len(chunk.split()) <= _MIN_MERGE_WORDS and i < last:
            carry = chunk
        else:
            merged.append(chunk)
    if carry:
        # Trailing short fragment: fold backwards if it fits, else keep it.
        if merged and len(merged[-1]) + 1 + len(carry) <= max_chars:
            merged[-1] += " " + carry
        else:
            merged.append(carry)

    return merged


# ═══════════════════════════════════════════════════════════════════
# Internal helpers
# ═══════════════════════════════════════════════════════════════════

def _break_long_chunk(text: str, max_chars: int) -> List[str]:
    """Break a chunk longer than max_chars on commas or word boundaries."""
    parts: List[str] = []
    remaining = text
    while len(remaining) > max_chars:
        break_pos = -1
        include_break_char = False

        # Prefer punctuation/pauses first to keep prosody natural.
        for marker in (",", ";", ":", "β€”", "-", "!", "?"):
            pos = remaining.rfind(marker, 0, max_chars)
            if pos > break_pos:
                break_pos = pos
                include_break_char = True

        # Then prefer nearest space before limit.
        space_pos = remaining.rfind(" ", 0, max_chars)
        if space_pos > break_pos:
            break_pos = space_pos
            include_break_char = False

        # If nothing before limit, look slightly ahead to avoid mid-word cuts.
        if break_pos == -1:
            forward_limit = min(len(remaining), max_chars + 24)
            m = re.search(r"[\s,;:!?]", remaining[max_chars:forward_limit])
            if m:
                break_pos = max_chars + m.start()
                include_break_char = remaining[break_pos] in ",;:!?"
            else:
                break_pos = max_chars
                include_break_char = False

        cut_at = break_pos + (1 if include_break_char else 0)
        if cut_at <= 0:
            cut_at = min(max_chars, len(remaining))

        segment = remaining[:cut_at].strip()
        if segment:
            parts.append(segment)
        remaining = remaining[cut_at:].lstrip()
    if remaining.strip():
        parts.append(remaining.strip())
    return parts