# hello / text_processor.py
# Uploaded by ShadowHunter222 — commit 644a5a5 (verified)
"""
Chatterbox Turbo TTS β€” Text Processor
═══════════════════════════════════════
Sanitizes raw input text and splits it into sentence-level chunks
for streaming TTS. Paralinguistic tags ([laugh], [cough], …) are
explicitly preserved so the model can render them.
Punctuation Philosophy (based on Resemble AI recommendations):
βœ… PRESERVE (benefits prosody):
β€’ Ellipsis ... β†’ dramatic pause, trailing thought, hesitation
β€’ Em dash β€” β†’ abrupt transition, dramatic break
β€’ Comma , β†’ short natural pause / breathing point
β€’ Period . β†’ full stop, pitch drop, sentence boundary
β€’ ! and ? β†’ exclamatory / interrogative inflection
β€’ Semicolon ; β†’ medium pause, clause bridge (NOT a split point)
β€’ Colon : β†’ medium pause, introduces explanation (NOT a split point)
β€’ Parentheses () β†’ quieter/explanatory tone shift
β€’ Quotes "" β†’ dialogue cue
β€’ Apostrophe ' β†’ contractions (don't, it's)
β€’ CAPS words β†’ emphasis / volume increase
❌ FILTER (harms output):
β€’ Excessive repeated punctuation (!!!! β†’ !, ???? β†’ ?, ,,, β†’ ,)
β€’ 4+ dots (.... β†’ ...)
β€’ Emojis, URLs, markdown, HTML tags
β€’ Non-standard Unicode punctuation (guillemets, etc.)
"""
import re
from typing import List
from config import Config
# ═══════════════════════════════════════════════════════════════════
# Pre-compiled regex patterns (compiled once at import β†’ zero cost)
# ═══════════════════════════════════════════════════════════════════
# β€” Paralinguistic tag protector (matches [laugh], [clear throat], etc.)
_TAG_NAMES = "|".join(re.escape(t) for t in Config.PARALINGUISTIC_TAGS)
_RE_PARA_TAG = re.compile(rf"\[(?:{_TAG_NAMES})\]", re.IGNORECASE)
# β€” Markdown / structural noise
_RE_CODE_BLOCK = re.compile(r"```[\s\S]*?```")
_RE_INLINE_CODE = re.compile(r"`([^`]+)`")
_RE_IMAGE = re.compile(r"!\[([^\]]*)\]\([^)]+\)")
_RE_LINK = re.compile(r"\[([^\]]+)\]\([^)]+\)")
_RE_BOLD_AST = re.compile(r"\*\*(.+?)\*\*")
_RE_BOLD_UND = re.compile(r"__(.+?)__")
_RE_STRIKE = re.compile(r"~~(.+?)~~")
_RE_ITALIC_AST = re.compile(r"\*(.+?)\*")
_RE_ITALIC_UND = re.compile(r"(?<!\w)_(.+?)_(?!\w)")
_RE_HEADER = re.compile(r"^#{1,6}\s+", re.MULTILINE)
_RE_BLOCKQUOTE = re.compile(r"^>+\s?", re.MULTILINE)
_RE_HR = re.compile(r"^[-*_]{3,}$", re.MULTILINE)
_RE_BULLET = re.compile(r"^\s*[-*+]\s+", re.MULTILINE)
_RE_ORDERED = re.compile(r"^\s*\d+\.\s+", re.MULTILINE)
# β€” URLs, emojis, HTML entities
_RE_URL = re.compile(r"https?://\S+")
_RE_EMOJI = re.compile(
r"["
r"\U0001F600-\U0001F64F\U0001F300-\U0001F5FF"
r"\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF"
r"\U00002702-\U000027B0\U0001F900-\U0001F9FF"
r"\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF"
r"\U00002600-\U000026FF\U0000FE00-\U0000FE0F"
r"\U0000200D"
r"]+", re.UNICODE,
)
_RE_HTML_ENTITY = re.compile(r"&(?:#x?[\da-fA-F]+|\w+);")
# HTML entities β†’ speakable replacements
# NOTE: &hellip; β†’ "..." (preserves dramatic pause), &mdash;/&ndash; β†’ "β€”" (preserves dramatic break)
_HTML_ENTITIES = {
"&amp;": " and ", "&lt;": " less than ", "&gt;": " greater than ",
"&nbsp;": " ", "&quot;": '"', "&apos;": "'",
"&mdash;": "β€”", "&ndash;": "β€”", "&hellip;": "...",
}
# β€” Smart/curly quote normalization β†’ ASCII equivalents
# These Unicode variants may confuse the tokenizer; normalizing ensures clean input.
_SMART_QUOTE_MAP = str.maketrans({
"\u201c": '"', # " left double quotation mark
"\u201d": '"', # " right double quotation mark
"\u2018": "'", # ' left single quotation mark
"\u2019": "'", # ' right single quotation mark
"\u00ab": '"', # Β« left guillemet
"\u00bb": '"', # Β» right guillemet
"\u201e": '"', # β€ž double low quotation mark
"\u201f": '"', # β€Ÿ double high reversed quotation mark
"\u2032": "'", # β€² prime
"\u2033": '"', # β€³ double prime
"\u2013": "β€”", # – en dash β†’ em dash (dramatic pause)
"\u2014": "β€”", # β€” em dash (keep as-is after mapping)
"\u2026": "...", # … horizontal ellipsis β†’ three dots
})
# β€” ALL CAPS normalization
# Words entirely in caps (length >= 4) often get spelled out by the TTS engine (e.g. NOTHING).
# By converting them to Title Case, they'll be processed naturally as words.
_RE_ALL_CAPS = re.compile(r"\b[A-Z]{4,}\b")
# β€” Punctuation normalization
# Ellipsis (... / ..) is PRESERVED β€” it creates dramatic pauses in Chatterbox.
# Only 4+ dots are excessive and get capped to standard ellipsis.
_RE_EXCESSIVE_DOTS = re.compile(r"\.{4,}") # ....+ β†’ ... (cap excessive)
_RE_NORMALIZE_DOTS = re.compile(r"\.{2,3}") # .. or ... β†’ ... (standardize)
_RE_REPEATED_EXCLAM = re.compile(r"!{2,}") # !! β†’ !
_RE_REPEATED_QUEST = re.compile(r"\?{2,}") # ?? β†’ ?
_RE_REPEATED_SEMI = re.compile(r";{2,}") # ;; β†’ ;
_RE_REPEATED_COLON = re.compile(r":{2,}") # :: β†’ :
_RE_REPEATED_COMMA = re.compile(r",{2,}") # ,, β†’ ,
_RE_REPEATED_DASH = re.compile(r"-{3,}") # --- β†’ β€” (em dash)
# β€” Abbreviation protection
# Common abbreviations ending in "." that should NOT trigger sentence splitting.
# These get a placeholder before splitting, then get restored.
_ABBREVIATIONS = (
"Mr", "Mrs", "Ms", "Dr", "Prof", "Sr", "Jr", "St", "Ave", "Blvd",
"vs", "etc", "approx", "dept", "est", "govt", "inc", "corp", "ltd",
"Jan", "Feb", "Mar", "Apr", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
"Gen", "Col", "Sgt", "Capt", "Lt", "Cmdr", "Adm",
"Fig", "Vol", "No", "Ref", "Rev", "Ph",
)
_RE_ABBREV = re.compile(
r"\b(" + "|".join(re.escape(a) for a in _ABBREVIATIONS) + r")\.",
re.IGNORECASE,
)
# β€” Whitespace
_RE_MULTI_SPACE = re.compile(r"[ \t]+")
_RE_MULTI_NEWLINE = re.compile(r"\n{3,}")
_RE_SPACE_BEFORE_PUN = re.compile(r"\s+([.!?,;:])")
# β€” Sentence boundary (split point)
# Split ONLY on true sentence-ending punctuation: . ! ?
# Semicolons and colons are clause connectors β€” they bridge related thoughts
# and should NOT be used as split points (creates choppy, unnatural fragments).
# Ellipsis (...) is also intentionally excluded from splitting: letting it split the stream
# creates a compound lag between chunks, making the pause artificially excessive.
_RE_SENTENCE_SPLIT = re.compile(
r"""(?:(?<=[.!?])(?<!\.\.\.)|(?<=[.!?][)\]"'])(?<!\.\.\.\.))\s+"""
)
_MIN_MERGE_WORDS = 5
# ═══════════════════════════════════════════════════════════════════
# Public API
# ═══════════════════════════════════════════════════════════════════
def sanitize(text: str) -> str:
    """Clean raw input for TTS while preserving prosody-beneficial punctuation.

    Preserves: ellipsis (...), em dashes (—), commas, periods, !, ?, ;, :, quotes.
    Removes: emojis, URLs, markdown, HTML, excessive repeated punctuation.

    Args:
        text: Raw input text; may contain markdown, emojis, HTML entities.

    Returns:
        Cleaned text ready for split_for_streaming(). Falsy input is
        returned unchanged.
    """
    if not text:
        return text
    # 0. Normalize smart/curly quotes and Unicode punctuation FIRST
    #    so downstream regexes only see clean ASCII-like punctuation.
    text = text.translate(_SMART_QUOTE_MAP)
    # 1. Normalize ALL CAPS words (4+ letters) to Capitalized form to
    #    prevent the engine from spelling them out letter by letter.
    text = _RE_ALL_CAPS.sub(lambda m: m.group(0).capitalize(), text)
    # 2. Protect paralinguistic tags with placeholders so the markdown /
    #    punctuation passes below cannot mangle them. Only the tag text is
    #    stored; its list position doubles as the placeholder id.
    protected_tags: list[str] = []

    def _protect_tag(m):
        protected_tags.append(m.group(0))
        return f"§TAG{len(protected_tags) - 1}§"

    text = _RE_PARA_TAG.sub(_protect_tag, text)
    # 3. Protect abbreviations from sentence-boundary splitting:
    #    "Dr. Smith" → "Dr§ABR0§ Smith". The dot is consumed here and
    #    deliberately never restored (step 9), so only a running counter is
    #    needed — the matched text itself is irrelevant.
    abbrev_count = 0

    def _protect_abbrev(m):
        nonlocal abbrev_count
        marker = f"{m.group(1)}§ABR{abbrev_count}§"
        abbrev_count += 1
        return marker

    text = _RE_ABBREV.sub(_protect_abbrev, text)
    # 4. Strip non-speakable structures (URLs first, then markdown markup,
    #    keeping the human-readable inner text of links/emphasis/code).
    text = _RE_URL.sub("", text)
    text = _RE_CODE_BLOCK.sub("", text)
    text = _RE_IMAGE.sub(lambda m: m.group(1) if m.group(1) else "", text)
    text = _RE_LINK.sub(r"\1", text)
    text = _RE_BOLD_AST.sub(r"\1", text)
    text = _RE_BOLD_UND.sub(r"\1", text)
    text = _RE_STRIKE.sub(r"\1", text)
    text = _RE_ITALIC_AST.sub(r"\1", text)
    text = _RE_ITALIC_UND.sub(r"\1", text)
    text = _RE_INLINE_CODE.sub(r"\1", text)
    text = _RE_HEADER.sub("", text)
    text = _RE_BLOCKQUOTE.sub("", text)
    text = _RE_HR.sub("", text)
    text = _RE_BULLET.sub("", text)
    text = _RE_ORDERED.sub("", text)
    # 5. Emojis removed; hashtags keep their word ("#fun" → "fun").
    text = _RE_EMOJI.sub("", text)
    text = re.sub(r"#(\w+)", r"\1", text)
    # 6. HTML entities → speakable text. NOTE(review): entities not in the
    #    map (including numeric ones like &#8217;) are dropped, not decoded.
    text = _RE_HTML_ENTITY.sub(lambda m: _HTML_ENTITIES.get(m.group(0), ""), text)
    # 7. Normalize punctuation (PRESERVE prosody-beneficial marks).
    #    Order matters: cap excessive dots first, then standardize ellipsis.
    text = _RE_EXCESSIVE_DOTS.sub("...", text)  # ....+ → ... (cap)
    text = _RE_NORMALIZE_DOTS.sub("...", text)  # .. or ... → ... (standardize)
    text = _RE_REPEATED_EXCLAM.sub("!", text)   # !! → !
    text = _RE_REPEATED_QUEST.sub("?", text)    # ?? → ?
    text = _RE_REPEATED_SEMI.sub(";", text)     # ;; → ;
    text = _RE_REPEATED_COLON.sub(":", text)    # :: → :
    text = _RE_REPEATED_COMMA.sub(",", text)    # ,, → ,
    text = _RE_REPEATED_DASH.sub("—", text)     # --- → em dash
    # 8. Whitespace cleanup.
    text = _RE_SPACE_BEFORE_PUN.sub(r"\1", text)
    text = _RE_MULTI_SPACE.sub(" ", text)
    text = _RE_MULTI_NEWLINE.sub("\n\n", text)
    text = text.strip()
    # 9. Drop abbreviation placeholders (Mr. → Mr, Dr. → Dr, etc.). The dot
    #    is not needed for correct pronunciation and removing it prevents
    #    false sentence-boundary splits in split_for_streaming().
    for i in range(abbrev_count):
        text = text.replace(f"§ABR{i}§", "")
    # 10. Restore paralinguistic tags verbatim.
    for i, tag in enumerate(protected_tags):
        text = text.replace(f"§TAG{i}§", tag)
    return text
def split_for_streaming(text: str, max_chars: int = Config.MAX_CHUNK_CHARS) -> List[str]:
    """Split sanitized text into sentence-level chunks for streaming.

    The pipeline has three stages:
      1. Cut at true sentence enders (. ! ?) only — semicolons, colons and
         ellipses are non-breaking and never split.
      2. Re-cut any chunk longer than max_chars via _break_long_chunk().
      3. Fold chunks of <= 5 words into the following chunk so the stream
         never emits tiny fragments.
    """
    if not text:
        return []
    # Stage 1: sentence-boundary split, dropping empty fragments.
    sentences = [s.strip() for s in _RE_SENTENCE_SPLIT.split(text)]
    sentences = [s for s in sentences if s]
    # Stage 2: cap chunk length.
    bounded: List[str] = []
    for sentence in sentences:
        if len(sentence) > max_chars:
            bounded.extend(_break_long_chunk(sentence, max_chars))
        else:
            bounded.append(sentence)
    # Stage 3: merge short chunks forward.
    if len(bounded) <= 1:
        return bounded
    result: List[str] = []
    pending = ""
    last = len(bounded) - 1
    for pos, piece in enumerate(bounded):
        if pending:
            piece = f"{pending} {piece}"
            pending = ""
        if pos < last and len(piece.split()) <= _MIN_MERGE_WORDS:
            pending = piece
        else:
            result.append(piece)
    if pending:
        # A short trailing piece has no successor — attach it backwards.
        if result:
            result[-1] = f"{result[-1]} {pending}"
        else:
            result.append(pending)
    return result
# ═══════════════════════════════════════════════════════════════════
# Internal helpers
# ═══════════════════════════════════════════════════════════════════
def _break_long_chunk(text: str, max_chars: int) -> List[str]:
"""Break a chunk longer than max_chars on commas or word boundaries."""
parts: List[str] = []
remaining = text
while len(remaining) > max_chars:
break_pos = -1
include_break_char = False
# Prefer punctuation/pauses first to keep prosody natural.
for marker in (",", ";", ":", "β€”", "-", "!", "?"):
pos = remaining.rfind(marker, 0, max_chars)
if pos > break_pos:
break_pos = pos
include_break_char = True
# Then prefer nearest space before limit.
space_pos = remaining.rfind(" ", 0, max_chars)
if space_pos > break_pos:
break_pos = space_pos
include_break_char = False
# If nothing before limit, look slightly ahead to avoid mid-word cuts.
if break_pos == -1:
forward_limit = min(len(remaining), max_chars + 24)
m = re.search(r"[\s,;:!?]", remaining[max_chars:forward_limit])
if m:
break_pos = max_chars + m.start()
include_break_char = remaining[break_pos] in ",;:!?"
else:
break_pos = max_chars
include_break_char = False
cut_at = break_pos + (1 if include_break_char else 0)
if cut_at <= 0:
cut_at = min(max_chars, len(remaining))
segment = remaining[:cut_at].strip()
if segment:
parts.append(segment)
remaining = remaining[cut_at:].lstrip()
if remaining.strip():
parts.append(remaining.strip())
return parts