""" Chatterbox Turbo TTS — Text Processor ═══════════════════════════════════════ Sanitizes raw input text and splits it into sentence-level chunks for streaming TTS. Paralinguistic tags ([laugh], [cough], …) are explicitly preserved so the model can render them. Punctuation Philosophy (based on Resemble AI recommendations): ✅ PRESERVE (benefits prosody): • Ellipsis ... → dramatic pause, trailing thought, hesitation • Em dash — → abrupt transition, dramatic break • Comma , → short natural pause / breathing point • Period . → full stop, pitch drop, sentence boundary • ! and ? → exclamatory / interrogative inflection • Semicolon ; → medium pause, clause bridge (NOT a split point) • Colon : → medium pause, introduces explanation (NOT a split point) • Parentheses () → quieter/explanatory tone shift • Quotes "" → dialogue cue • Apostrophe ' → contractions (don't, it's) • CAPS words → emphasis / volume increase ❌ FILTER (harms output): • Excessive repeated punctuation (!!!! → !, ???? → ?, ,,, → ,) • 4+ dots (.... → ...) • Emojis, URLs, markdown, HTML tags • Non-standard Unicode punctuation (guillemets, etc.) """ import re from typing import List from config import Config # ═══════════════════════════════════════════════════════════════════ # Pre-compiled regex patterns (compiled once at import → zero cost) # ═══════════════════════════════════════════════════════════════════ # — Paralinguistic tag protector (matches [laugh], [clear throat], etc.) 
_TAG_NAMES = "|".join(re.escape(t) for t in Config.PARALINGUISTIC_TAGS)
_RE_PARA_TAG = re.compile(rf"\[(?:{_TAG_NAMES})\]", re.IGNORECASE)

# — Markdown / structural noise
_RE_CODE_BLOCK = re.compile(r"```[\s\S]*?```")
_RE_INLINE_CODE = re.compile(r"`([^`]+)`")
_RE_IMAGE = re.compile(r"!\[([^\]]*)\]\([^)]+\)")
_RE_LINK = re.compile(r"\[([^\]]+)\]\([^)]+\)")
_RE_BOLD_AST = re.compile(r"\*\*(.+?)\*\*")
_RE_BOLD_UND = re.compile(r"__(.+?)__")
_RE_STRIKE = re.compile(r"~~(.+?)~~")
_RE_ITALIC_AST = re.compile(r"\*(.+?)\*")
# NOTE(review): the three patterns below were corrupted in the extracted source
# (everything between a '<' and the next '>' was stripped, fusing them into
# `r"(?+\s?"`). Reconstructed from how sanitize() uses them: _RE_ITALIC_UND is
# substituted with r"\1" so it needs exactly one capture group; headers and
# blockquote markers are stripped line-anchored. Confirm against the original file.
_RE_ITALIC_UND = re.compile(r"(?<!\w)_(.+?)_(?!\w)")
_RE_HEADER = re.compile(r"^#{1,6}\s+", re.MULTILINE)
_RE_BLOCKQUOTE = re.compile(r"^>+\s?", re.MULTILINE)
_RE_HR = re.compile(r"^[-*_]{3,}$", re.MULTILINE)
_RE_BULLET = re.compile(r"^\s*[-*+]\s+", re.MULTILINE)
_RE_ORDERED = re.compile(r"^\s*\d+\.\s+", re.MULTILINE)

# — URLs, emojis, HTML entities
_RE_URL = re.compile(r"https?://\S+")
_RE_EMOJI = re.compile(
    r"["
    r"\U0001F600-\U0001F64F\U0001F300-\U0001F5FF"
    r"\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF"
    r"\U00002702-\U000027B0\U0001F900-\U0001F9FF"
    r"\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF"
    r"\U00002600-\U000026FF\U0000FE00-\U0000FE0F"
    r"\U0000200D"
    r"]+",
    re.UNICODE,
)
_RE_HTML_ENTITY = re.compile(r"&(?:#x?[\da-fA-F]+|\w+);")

# HTML entities → speakable replacements.
# Keys MUST be raw entity strings: _RE_HTML_ENTITY matches "&word;" / "&#39;"
# forms and sanitize() looks the WHOLE match up in this dict (unknown entities
# are dropped). The extracted source showed already-decoded keys ("&" instead
# of "&amp;"), which could never match — entity-form keys restored here.
# NOTE: &hellip; → "..." (preserves dramatic pause), &mdash;/&ndash; → "—"
# (preserves dramatic break).
_HTML_ENTITIES = {
    "&amp;": " and ",
    "&lt;": " less than ",
    "&gt;": " greater than ",
    "&nbsp;": " ",
    "&quot;": '"',
    "&#39;": "'",
    "&mdash;": "—",
    "&ndash;": "—",
    "&hellip;": "...",
}

# — Smart/curly quote normalization → ASCII equivalents
# These Unicode variants may confuse the tokenizer; normalizing ensures clean input.
_SMART_QUOTE_MAP = str.maketrans({ "\u201c": '"', # " left double quotation mark "\u201d": '"', # " right double quotation mark "\u2018": "'", # ' left single quotation mark "\u2019": "'", # ' right single quotation mark "\u00ab": '"', # « left guillemet "\u00bb": '"', # » right guillemet "\u201e": '"', # „ double low quotation mark "\u201f": '"', # ‟ double high reversed quotation mark "\u2032": "'", # ′ prime "\u2033": '"', # ″ double prime "\u2013": "—", # – en dash → em dash (dramatic pause) "\u2014": "—", # — em dash (keep as-is after mapping) "\u2026": "...", # … horizontal ellipsis → three dots }) # — ALL CAPS normalization # Words entirely in caps (length >= 4) often get spelled out by the TTS engine (e.g. NOTHING). # By converting them to Title Case, they'll be processed naturally as words. _RE_ALL_CAPS = re.compile(r"\b[A-Z]{4,}\b") # — Punctuation normalization # Ellipsis (... / ..) is PRESERVED — it creates dramatic pauses in Chatterbox. # Only 4+ dots are excessive and get capped to standard ellipsis. _RE_EXCESSIVE_DOTS = re.compile(r"\.{4,}") # ....+ → ... (cap excessive) _RE_NORMALIZE_DOTS = re.compile(r"\.{2,3}") # .. or ... → ... (standardize) _RE_REPEATED_EXCLAM = re.compile(r"!{2,}") # !! → ! _RE_REPEATED_QUEST = re.compile(r"\?{2,}") # ?? → ? _RE_REPEATED_SEMI = re.compile(r";{2,}") # ;; → ; _RE_REPEATED_COLON = re.compile(r":{2,}") # :: → : _RE_REPEATED_COMMA = re.compile(r",{2,}") # ,, → , _RE_REPEATED_DASH = re.compile(r"-{3,}") # --- → — (em dash) # — Abbreviation protection # Common abbreviations ending in "." that should NOT trigger sentence splitting. # These get a placeholder before splitting, then get restored. 
_ABBREVIATIONS = (
    "Mr", "Mrs", "Ms", "Dr", "Prof", "Sr", "Jr", "St", "Ave", "Blvd",
    "vs", "etc", "approx", "dept", "est", "govt", "inc", "corp", "ltd",
    "Jan", "Feb", "Mar", "Apr", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
    "Gen", "Col", "Sgt", "Capt", "Lt", "Cmdr", "Adm",
    "Fig", "Vol", "No", "Ref", "Rev", "Ph",
)
_RE_ABBREV = re.compile(
    r"\b(" + "|".join(re.escape(a) for a in _ABBREVIATIONS) + r")\.",
    re.IGNORECASE,
)

# — Whitespace
_RE_MULTI_SPACE = re.compile(r"[ \t]+")
_RE_MULTI_NEWLINE = re.compile(r"\n{3,}")
_RE_SPACE_BEFORE_PUN = re.compile(r"\s+([.!?,;:])")

# — Sentence boundary (split point)
# Split ONLY on true sentence-ending punctuation: . ! ?
# Semicolons and colons are clause connectors — they bridge related thoughts
# and should NOT be used as split points (creates choppy, unnatural fragments).
# Ellipsis (...) is also intentionally excluded from splitting: letting it split
# the stream creates a compound lag between chunks, making the pause artificially
# excessive.
# NOTE(review): the pattern was corrupted in the extracted source (text between
# '<' and the next '>' was stripped). Reconstructed from the comment above:
# split on whitespace that follows . ! ? unless the dot ends an ellipsis.
# Confirm against the original file.
_RE_SENTENCE_SPLIT = re.compile(r"(?<=[.!?])(?<!\.\.\.)\s+")

# Chunks of this many words or fewer are merged into the next chunk.
# NOTE(review): the definition was lost in the same corrupted span; value
# reconstructed from the "≤5 words" note in split_for_streaming's docstring.
_MIN_MERGE_WORDS = 5


def sanitize(text: str) -> str:
    """Clean raw input for TTS while preserving prosody-beneficial punctuation.

    Preserves: ellipsis (...), em dashes (—), commas, periods, !, ?, ;, :, quotes.
    Removes: emojis, URLs, markdown, HTML, excessive repeated punctuation.
    Paralinguistic tags ([laugh], ...) pass through untouched.
    """
    if not text:
        return text

    # 0. Normalize smart/curly quotes and Unicode punctuation FIRST
    #    so downstream regex works on clean ASCII-like punctuation.
    text = text.translate(_SMART_QUOTE_MAP)

    # 1. Normalize ALL CAPS words to Title Case to prevent spelling out
    text = _RE_ALL_CAPS.sub(lambda m: m.group(0).capitalize(), text)

    # 2. Protect paralinguistic tags by replacing with placeholders
    tags_found: list[tuple[int, str]] = []

    def _protect_tag(m):
        idx = len(tags_found)
        tags_found.append((idx, m.group(0)))
        return f"§TAG{idx}§"

    text = _RE_PARA_TAG.sub(_protect_tag, text)

    # 3. Protect abbreviations from sentence-boundary splitting
    #    "Dr. Smith" → "Dr§ABR0§ Smith" (placeholder stripped in step 9)
    abbrevs_found: list[tuple[int, str]] = []

    def _protect_abbrev(m):
        idx = len(abbrevs_found)
        abbrevs_found.append((idx, m.group(0)))
        return f"{m.group(1)}§ABR{idx}§"

    text = _RE_ABBREV.sub(_protect_abbrev, text)

    # 4. Strip non-speakable structures.
    #    FIX: images/links are unwrapped BEFORE bare-URL removal. Stripping
    #    URLs first turned "[text](http://x)" into "[text]()", which
    #    _RE_LINK / _RE_IMAGE (both require a non-empty "(...)") could no
    #    longer match, leaving bracket residue in the spoken text.
    text = _RE_CODE_BLOCK.sub("", text)
    text = _RE_IMAGE.sub(lambda m: m.group(1) if m.group(1) else "", text)
    text = _RE_LINK.sub(r"\1", text)
    text = _RE_URL.sub("", text)
    text = _RE_BOLD_AST.sub(r"\1", text)
    text = _RE_BOLD_UND.sub(r"\1", text)
    text = _RE_STRIKE.sub(r"\1", text)
    text = _RE_ITALIC_AST.sub(r"\1", text)
    text = _RE_ITALIC_UND.sub(r"\1", text)
    text = _RE_INLINE_CODE.sub(r"\1", text)
    text = _RE_HEADER.sub("", text)
    text = _RE_BLOCKQUOTE.sub("", text)
    text = _RE_HR.sub("", text)
    text = _RE_BULLET.sub("", text)
    text = _RE_ORDERED.sub("", text)

    # 5. Emojis, hashtags (keep the tag word, drop the '#')
    text = _RE_EMOJI.sub("", text)
    text = re.sub(r"#(\w+)", r"\1", text)

    # 6. HTML entities → speakable text (unknown entities are dropped)
    text = _RE_HTML_ENTITY.sub(lambda m: _HTML_ENTITIES.get(m.group(0), ""), text)

    # 7. Normalize punctuation (PRESERVE prosody-beneficial marks)
    #    Order matters: cap excessive dots first, then standardize ellipsis.
    text = _RE_EXCESSIVE_DOTS.sub("...", text)  # ....+ → ... (cap)
    text = _RE_NORMALIZE_DOTS.sub("...", text)  # .. or ... → ... (standardize)
    text = _RE_REPEATED_EXCLAM.sub("!", text)   # !! → !
    text = _RE_REPEATED_QUEST.sub("?", text)    # ?? → ?
    text = _RE_REPEATED_SEMI.sub(";", text)     # ;; → ;
    text = _RE_REPEATED_COLON.sub(":", text)    # :: → :
    text = _RE_REPEATED_COMMA.sub(",", text)    # ,, → ,
    text = _RE_REPEATED_DASH.sub("—", text)     # --- → em dash

    # 8. Whitespace cleanup
    text = _RE_SPACE_BEFORE_PUN.sub(r"\1", text)
    text = _RE_MULTI_SPACE.sub(" ", text)
    text = _RE_MULTI_NEWLINE.sub("\n\n", text)
    text = text.strip()

    # 9. Strip abbreviation placeholders (Mr. → Mr, Dr. → Dr, etc.)
    #    The dot is not needed for correct TTS pronunciation and removing it
    #    prevents false sentence-boundary splits in split_for_streaming().
    for idx, _original in abbrevs_found:
        text = text.replace(f"§ABR{idx}§", "")

    # 10. Restore paralinguistic tags
    for idx, original in tags_found:
        text = text.replace(f"§TAG{idx}§", original)

    return text


def split_for_streaming(text: str, max_chars: int = Config.MAX_CHUNK_CHARS) -> List[str]:
    """Split sanitized text into sentence-level chunks for streaming.

    Strategy:
      1. Split on sentence-ending punctuation boundaries (. ! ?) — NOT on
         semicolons, colons, or ellipsis (those are non-breaking boundaries)
      2. Enforce max_chars per chunk (split long sentences on commas / spaces)
      3. Merge short chunks (≤5 words) with the next to avoid tiny segments
    """
    if not text:
        return []

    # Step 1: sentence split
    raw_chunks = _RE_SENTENCE_SPLIT.split(text)
    raw_chunks = [c.strip() for c in raw_chunks if c.strip()]

    # Step 2: enforce max length per chunk
    sized: List[str] = []
    for chunk in raw_chunks:
        if len(chunk) <= max_chars:
            sized.append(chunk)
        else:
            sized.extend(_break_long_chunk(chunk, max_chars))

    # Step 3: merge short chunks
    if len(sized) <= 1:
        return sized

    merged: List[str] = []
    carry = ""
    for i, chunk in enumerate(sized):
        if carry:
            chunk = carry + " " + chunk
            carry = ""
        if len(chunk.split()) <= _MIN_MERGE_WORDS and i < len(sized) - 1:
            # Too short on its own — defer and prepend to the next chunk.
            carry = chunk
        else:
            merged.append(chunk)
    if carry:
        # A short tail with nothing after it: glue to the previous chunk.
        if merged:
            merged[-1] += " " + carry
        else:
            merged.append(carry)
    return merged


# ═══════════════════════════════════════════════════════════════════
# Internal helpers
# ═══════════════════════════════════════════════════════════════════

def _break_long_chunk(text: str, max_chars: int) -> List[str]:
    """Break a chunk longer than max_chars on commas or word boundaries."""
    parts: List[str] = []
    remaining = text
    while len(remaining) > max_chars:
        break_pos = -1
        include_break_char = False
        # Prefer punctuation/pauses first to keep prosody natural.
        for marker in (",", ";", ":", "—", "-", "!", "?"):
            pos = remaining.rfind(marker, 0, max_chars)
            if pos > break_pos:
                break_pos = pos
                include_break_char = True
        # Then prefer nearest space before limit.
        space_pos = remaining.rfind(" ", 0, max_chars)
        if space_pos > break_pos:
            break_pos = space_pos
            include_break_char = False
        # If nothing before limit, look slightly ahead to avoid mid-word cuts.
        if break_pos == -1:
            forward_limit = min(len(remaining), max_chars + 24)
            m = re.search(r"[\s,;:!?]", remaining[max_chars:forward_limit])
            if m:
                break_pos = max_chars + m.start()
                include_break_char = remaining[break_pos] in ",;:!?"
            else:
                break_pos = max_chars
                include_break_char = False
        cut_at = break_pos + (1 if include_break_char else 0)
        if cut_at <= 0:
            # Defensive: guarantee forward progress even on degenerate input.
            cut_at = min(max_chars, len(remaining))
        segment = remaining[:cut_at].strip()
        if segment:
            parts.append(segment)
        remaining = remaining[cut_at:].lstrip()
    if remaining.strip():
        parts.append(remaining.strip())
    return parts