# NOTE(review): removed non-Python page-scrape artifact lines
# ("Spaces:" / "Sleeping" / "Sleeping") that preceded the module docstring.
"""
Chatterbox Turbo TTS — Text Processor
─────────────────────────────────────
Sanitizes raw input text and splits it into sentence-level chunks
for streaming TTS. Paralinguistic tags ([laugh], [cough], …) are
explicitly preserved so the model can render them.

Punctuation Philosophy (based on Resemble AI recommendations):

PRESERVE (benefits prosody):
  • Ellipsis ...    → dramatic pause, trailing thought, hesitation
  • Em dash —       → abrupt transition, dramatic break
  • Comma ,         → short natural pause / breathing point
  • Period .        → full stop, pitch drop, sentence boundary
  • ! and ?         → exclamatory / interrogative inflection
  • Semicolon ;     → medium pause, clause bridge (NOT a split point)
  • Colon :         → medium pause, introduces explanation (NOT a split point)
  • Parentheses ()  → quieter/explanatory tone shift
  • Quotes ""       → dialogue cue
  • Apostrophe '    → contractions (don't, it's)
  • CAPS words      → emphasis / volume increase

FILTER (harms output):
  • Excessive repeated punctuation (!!!! → !, ???? → ?, ,,, → ,)
  • 4+ dots (.... → ...)
  • Emojis, URLs, markdown, HTML tags
  • Non-standard Unicode punctuation (guillemets, etc.)
"""
import re
from typing import List

from config import Config

# ───────────────────────────────────────────────────────────────────
# Pre-compiled regex patterns (compiled once at import — zero cost)
# ───────────────────────────────────────────────────────────────────

# • Paralinguistic tag protector (matches [laugh], [clear throat], etc.)
# Each tag name is re.escape()d, so metacharacters inside tag names are safe.
# NOTE(review): assumes Config.PARALINGUISTIC_TAGS is a non-empty iterable of
# literal tag strings — confirm against config.py.
_TAG_NAMES = "|".join(re.escape(t) for t in Config.PARALINGUISTIC_TAGS)
_RE_PARA_TAG = re.compile(rf"\[(?:{_TAG_NAMES})\]", re.IGNORECASE)
| # β Markdown / structural noise | |
| _RE_CODE_BLOCK = re.compile(r"```[\s\S]*?```") | |
| _RE_INLINE_CODE = re.compile(r"`([^`]+)`") | |
| _RE_IMAGE = re.compile(r"!\[([^\]]*)\]\([^)]+\)") | |
| _RE_LINK = re.compile(r"\[([^\]]+)\]\([^)]+\)") | |
| _RE_BOLD_AST = re.compile(r"\*\*(.+?)\*\*") | |
| _RE_BOLD_UND = re.compile(r"__(.+?)__") | |
| _RE_STRIKE = re.compile(r"~~(.+?)~~") | |
| _RE_ITALIC_AST = re.compile(r"\*(.+?)\*") | |
| _RE_ITALIC_UND = re.compile(r"(?<!\w)_(.+?)_(?!\w)") | |
| _RE_HEADER = re.compile(r"^#{1,6}\s+", re.MULTILINE) | |
| _RE_BLOCKQUOTE = re.compile(r"^>+\s?", re.MULTILINE) | |
| _RE_HR = re.compile(r"^[-*_]{3,}$", re.MULTILINE) | |
| _RE_BULLET = re.compile(r"^\s*[-*+]\s+", re.MULTILINE) | |
| _RE_ORDERED = re.compile(r"^\s*\d+\.\s+", re.MULTILINE) | |
| # β URLs, emojis, HTML entities | |
| _RE_URL = re.compile(r"https?://\S+") | |
| _RE_EMOJI = re.compile( | |
| r"[" | |
| r"\U0001F600-\U0001F64F\U0001F300-\U0001F5FF" | |
| r"\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF" | |
| r"\U00002702-\U000027B0\U0001F900-\U0001F9FF" | |
| r"\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF" | |
| r"\U00002600-\U000026FF\U0000FE00-\U0000FE0F" | |
| r"\U0000200D" | |
| r"]+", re.UNICODE, | |
| ) | |
| _RE_HTML_ENTITY = re.compile(r"&(?:#x?[\da-fA-F]+|\w+);") | |
| # HTML entities β speakable replacements | |
| # NOTE: … β "..." (preserves dramatic pause), —/– β "β" (preserves dramatic break) | |
| _HTML_ENTITIES = { | |
| "&": " and ", "<": " less than ", ">": " greater than ", | |
| " ": " ", """: '"', "'": "'", | |
| "—": "β", "–": "β", "…": "...", | |
| } | |
| # β Smart/curly quote normalization β ASCII equivalents | |
| # These Unicode variants may confuse the tokenizer; normalizing ensures clean input. | |
| _SMART_QUOTE_MAP = str.maketrans({ | |
| "\u201c": '"', # " left double quotation mark | |
| "\u201d": '"', # " right double quotation mark | |
| "\u2018": "'", # ' left single quotation mark | |
| "\u2019": "'", # ' right single quotation mark | |
| "\u00ab": '"', # Β« left guillemet | |
| "\u00bb": '"', # Β» right guillemet | |
| "\u201e": '"', # β double low quotation mark | |
| "\u201f": '"', # β double high reversed quotation mark | |
| "\u2032": "'", # β² prime | |
| "\u2033": '"', # β³ double prime | |
| "\u2013": "β", # β en dash β em dash (dramatic pause) | |
| "\u2014": "β", # β em dash (keep as-is after mapping) | |
| "\u2026": "...", # β¦ horizontal ellipsis β three dots | |
| }) | |
| # β ALL CAPS normalization | |
| # Words entirely in caps (length >= 4) often get spelled out by the TTS engine (e.g. NOTHING). | |
| # By converting them to Title Case, they'll be processed naturally as words. | |
| _RE_ALL_CAPS = re.compile(r"\b[A-Z]{4,}\b") | |
| # β Punctuation normalization | |
| # Ellipsis (... / ..) is PRESERVED β it creates dramatic pauses in Chatterbox. | |
| # Only 4+ dots are excessive and get capped to standard ellipsis. | |
| _RE_EXCESSIVE_DOTS = re.compile(r"\.{4,}") # ....+ β ... (cap excessive) | |
| _RE_NORMALIZE_DOTS = re.compile(r"\.{2,3}") # .. or ... β ... (standardize) | |
| _RE_REPEATED_EXCLAM = re.compile(r"!{2,}") # !! β ! | |
| _RE_REPEATED_QUEST = re.compile(r"\?{2,}") # ?? β ? | |
| _RE_REPEATED_SEMI = re.compile(r";{2,}") # ;; β ; | |
| _RE_REPEATED_COLON = re.compile(r":{2,}") # :: β : | |
| _RE_REPEATED_COMMA = re.compile(r",{2,}") # ,, β , | |
| _RE_REPEATED_DASH = re.compile(r"-{3,}") # --- β β (em dash) | |
| # β Abbreviation protection | |
| # Common abbreviations ending in "." that should NOT trigger sentence splitting. | |
| # These get a placeholder before splitting, then get restored. | |
| _ABBREVIATIONS = ( | |
| "Mr", "Mrs", "Ms", "Dr", "Prof", "Sr", "Jr", "St", "Ave", "Blvd", | |
| "vs", "etc", "approx", "dept", "est", "govt", "inc", "corp", "ltd", | |
| "Jan", "Feb", "Mar", "Apr", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", | |
| "Gen", "Col", "Sgt", "Capt", "Lt", "Cmdr", "Adm", | |
| "Fig", "Vol", "No", "Ref", "Rev", "Ph", | |
| ) | |
| _RE_ABBREV = re.compile( | |
| r"\b(" + "|".join(re.escape(a) for a in _ABBREVIATIONS) + r")\.", | |
| re.IGNORECASE, | |
| ) | |
| # β Whitespace | |
| _RE_MULTI_SPACE = re.compile(r"[ \t]+") | |
| _RE_MULTI_NEWLINE = re.compile(r"\n{3,}") | |
| _RE_SPACE_BEFORE_PUN = re.compile(r"\s+([.!?,;:])") | |
| # β Sentence boundary (split point) | |
| # Split ONLY on true sentence-ending punctuation: . ! ? | |
| # Semicolons and colons are clause connectors β they bridge related thoughts | |
| # and should NOT be used as split points (creates choppy, unnatural fragments). | |
| # Ellipsis (...) is also intentionally excluded from splitting: letting it split the stream | |
| # creates a compound lag between chunks, making the pause artificially excessive. | |
| _RE_SENTENCE_SPLIT = re.compile( | |
| r"""(?:(?<=[.!?])(?<!\.\.\.)|(?<=[.!?][)\]"'])(?<!\.\.\.\.))\s+""" | |
| ) | |
| _MIN_MERGE_WORDS = 5 | |
# ───────────────────────────────────────────────────────────────────
# Public API
# ───────────────────────────────────────────────────────────────────
def sanitize(text: str) -> str:
    """Clean raw input for TTS while preserving prosody-beneficial punctuation.

    Preserves: ellipsis (...), em dashes (—), commas, periods, !, ?, ;, :, quotes.
    Removes: emojis, URLs, markdown, HTML, excessive repeated punctuation.

    Args:
        text: Raw input text (may contain markdown, emojis, HTML entities).

    Returns:
        Cleaned text ready for split_for_streaming(). Empty/falsy input is
        returned unchanged.
    """
    if not text:
        return text

    # 0. Normalize smart/curly quotes and Unicode punctuation FIRST so every
    #    downstream regex operates on clean ASCII-like punctuation.
    text = text.translate(_SMART_QUOTE_MAP)

    # 1. Normalize ALL CAPS words to Title Case to prevent spelling out.
    text = _RE_ALL_CAPS.sub(lambda m: m.group(0).capitalize(), text)

    # 2. Protect paralinguistic tags with placeholders so the cleanup passes
    #    below cannot mangle them.
    tags_found: list[tuple[int, str]] = []

    def _protect_tag(m: "re.Match[str]") -> str:
        idx = len(tags_found)
        tags_found.append((idx, m.group(0)))
        return f"§TAG{idx}§"

    text = _RE_PARA_TAG.sub(_protect_tag, text)

    # 3. Protect abbreviations from sentence-boundary splitting:
    #    "Dr. Smith" → "Dr§ABR0§ Smith" (placeholder stripped in step 9,
    #    which permanently drops the dot).
    abbrevs_found: list[tuple[int, str]] = []

    def _protect_abbrev(m: "re.Match[str]") -> str:
        idx = len(abbrevs_found)
        abbrevs_found.append((idx, m.group(0)))
        return f"{m.group(1)}§ABR{idx}§"

    text = _RE_ABBREV.sub(_protect_abbrev, text)

    # 4. Strip non-speakable structures.
    #    FIX: markdown images/links are unwrapped BEFORE bare-URL removal.
    #    Stripping URLs first would consume the ")" of "](https://…)" and
    #    leave unmatched "[text](" fragments behind.
    text = _RE_CODE_BLOCK.sub("", text)
    text = _RE_IMAGE.sub(lambda m: m.group(1) if m.group(1) else "", text)
    text = _RE_LINK.sub(r"\1", text)
    text = _RE_URL.sub("", text)
    text = _RE_BOLD_AST.sub(r"\1", text)
    text = _RE_BOLD_UND.sub(r"\1", text)
    text = _RE_STRIKE.sub(r"\1", text)
    text = _RE_ITALIC_AST.sub(r"\1", text)
    text = _RE_ITALIC_UND.sub(r"\1", text)
    text = _RE_INLINE_CODE.sub(r"\1", text)
    text = _RE_HEADER.sub("", text)
    text = _RE_BLOCKQUOTE.sub("", text)
    text = _RE_HR.sub("", text)
    text = _RE_BULLET.sub("", text)
    text = _RE_ORDERED.sub("", text)

    # 5. Emojis, hashtags (keep the hashtag word, drop the '#').
    text = _RE_EMOJI.sub("", text)
    text = re.sub(r"#(\w+)", r"\1", text)

    # 6. HTML entities → speakable text (unknown entities are dropped).
    text = _RE_HTML_ENTITY.sub(lambda m: _HTML_ENTITIES.get(m.group(0), ""), text)

    # 7. Normalize punctuation (PRESERVE prosody-beneficial marks).
    #    Order matters: cap excessive dots first, then standardize ellipsis.
    text = _RE_EXCESSIVE_DOTS.sub("...", text)  # ....+ → ... (cap)
    text = _RE_NORMALIZE_DOTS.sub("...", text)  # .. or ... → ... (standardize)
    text = _RE_REPEATED_EXCLAM.sub("!", text)   # !! → !
    text = _RE_REPEATED_QUEST.sub("?", text)    # ?? → ?
    text = _RE_REPEATED_SEMI.sub(";", text)     # ;; → ;
    text = _RE_REPEATED_COLON.sub(":", text)    # :: → :
    text = _RE_REPEATED_COMMA.sub(",", text)    # ,, → ,
    text = _RE_REPEATED_DASH.sub("\u2014", text)  # --- → em dash

    # 8. Whitespace cleanup.
    text = _RE_SPACE_BEFORE_PUN.sub(r"\1", text)
    text = _RE_MULTI_SPACE.sub(" ", text)
    text = _RE_MULTI_NEWLINE.sub("\n\n", text)
    text = text.strip()

    # 9. Strip abbreviation placeholders (Mr. → Mr, Dr. → Dr, etc.).
    #    The dot is not needed for correct TTS pronunciation, and removing it
    #    prevents false sentence-boundary splits in split_for_streaming().
    for idx, _original in abbrevs_found:
        text = text.replace(f"§ABR{idx}§", "")

    # 10. Restore paralinguistic tags verbatim.
    for idx, original in tags_found:
        text = text.replace(f"§TAG{idx}§", original)

    return text
def split_for_streaming(text: str, max_chars: int = Config.MAX_CHUNK_CHARS) -> List[str]:
    """Split sanitized text into sentence-level chunks for streaming.

    Strategy:
      1. Split on sentence-ending punctuation boundaries (. ! ?) —
         NOT on semicolons, colons, or ellipsis (non-breaking boundaries).
      2. Enforce max_chars per chunk (long sentences break on commas / spaces).
      3. Merge short chunks (<= 5 words) into the next to avoid tiny segments.
    """
    if not text:
        return []

    # Step 1: sentence-level split, discarding empty fragments.
    sentences = [s.strip() for s in _RE_SENTENCE_SPLIT.split(text) if s.strip()]

    # Step 2: enforce the per-chunk character budget.
    sized: List[str] = []
    for sentence in sentences:
        if len(sentence) > max_chars:
            sized.extend(_break_long_chunk(sentence, max_chars))
        else:
            sized.append(sentence)

    # Step 3: fold short chunks forward into their successor.
    if len(sized) <= 1:
        return sized
    out: List[str] = []
    pending = ""
    last = len(sized) - 1
    for pos, piece in enumerate(sized):
        if pending:
            piece = pending + " " + piece
            pending = ""
        if pos < last and len(piece.split()) <= _MIN_MERGE_WORDS:
            pending = piece  # still short — keep carrying it forward
        else:
            out.append(piece)
    # A leftover carry (short final chunk) attaches to the previous chunk.
    if pending:
        if out:
            out[-1] += " " + pending
        else:
            out.append(pending)
    return out
# ───────────────────────────────────────────────────────────────────
# Internal helpers
# ───────────────────────────────────────────────────────────────────
| def _break_long_chunk(text: str, max_chars: int) -> List[str]: | |
| """Break a chunk longer than max_chars on commas or word boundaries.""" | |
| parts: List[str] = [] | |
| remaining = text | |
| while len(remaining) > max_chars: | |
| break_pos = -1 | |
| include_break_char = False | |
| # Prefer punctuation/pauses first to keep prosody natural. | |
| for marker in (",", ";", ":", "β", "-", "!", "?"): | |
| pos = remaining.rfind(marker, 0, max_chars) | |
| if pos > break_pos: | |
| break_pos = pos | |
| include_break_char = True | |
| # Then prefer nearest space before limit. | |
| space_pos = remaining.rfind(" ", 0, max_chars) | |
| if space_pos > break_pos: | |
| break_pos = space_pos | |
| include_break_char = False | |
| # If nothing before limit, look slightly ahead to avoid mid-word cuts. | |
| if break_pos == -1: | |
| forward_limit = min(len(remaining), max_chars + 24) | |
| m = re.search(r"[\s,;:!?]", remaining[max_chars:forward_limit]) | |
| if m: | |
| break_pos = max_chars + m.start() | |
| include_break_char = remaining[break_pos] in ",;:!?" | |
| else: | |
| break_pos = max_chars | |
| include_break_char = False | |
| cut_at = break_pos + (1 if include_break_char else 0) | |
| if cut_at <= 0: | |
| cut_at = min(max_chars, len(remaining)) | |
| segment = remaining[:cut_at].strip() | |
| if segment: | |
| parts.append(segment) | |
| remaining = remaining[cut_at:].lstrip() | |
| if remaining.strip(): | |
| parts.append(remaining.strip()) | |
| return parts | |