File size: 14,459 Bytes
d61edf1
 
 
 
 
 
b725430
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d61edf1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b725430
 
 
d61edf1
 
 
b725430
d61edf1
 
b725430
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d61edf1
b725430
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d61edf1
 
 
 
 
 
 
b725430
 
 
 
 
 
 
 
d61edf1
 
 
 
 
 
 
 
 
b725430
 
 
 
 
d61edf1
 
 
b725430
 
 
 
 
 
 
 
d61edf1
 
 
 
 
 
 
b725430
 
 
 
 
 
 
 
 
 
d61edf1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b725430
d61edf1
 
 
b725430
d61edf1
 
b725430
 
 
 
 
 
 
 
 
 
d61edf1
b725430
d61edf1
 
 
 
 
644a5a5
 
 
b725430
644a5a5
b725430
 
d61edf1
 
 
 
 
 
 
 
 
 
644a5a5
b725430
644a5a5
 
d61edf1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
644a5a5
 
 
 
 
d61edf1
644a5a5
 
 
 
 
 
 
 
 
 
 
d61edf1
 
 
 
 
 
 
 
 
 
 
 
 
b725430
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d61edf1
 
b725430
d61edf1
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
"""
Chatterbox Turbo TTS β€” Text Processor
═══════════════════════════════════════
Sanitizes raw input text and splits it into sentence-level chunks
for streaming TTS.  Paralinguistic tags ([laugh], [cough], …) are
explicitly preserved so the model can render them.

Punctuation Philosophy (based on Resemble AI recommendations):
  βœ… PRESERVE (benefits prosody):
     β€’ Ellipsis ...    β†’ dramatic pause, trailing thought, hesitation
     β€’ Em dash β€”       β†’ abrupt transition, dramatic break
     β€’ Comma ,         β†’ short natural pause / breathing point
     β€’ Period .        β†’ full stop, pitch drop, sentence boundary
     β€’ ! and ?         β†’ exclamatory / interrogative inflection
     β€’ Semicolon ;     β†’ medium pause, clause bridge (NOT a split point)
     β€’ Colon :         β†’ medium pause, introduces explanation (NOT a split point)
     β€’ Parentheses ()  β†’ quieter/explanatory tone shift
     β€’ Quotes ""       β†’ dialogue cue
     β€’ Apostrophe '    β†’ contractions (don't, it's)
     β€’ CAPS words      β†’ emphasis / volume increase

  ❌ FILTER (harms output):
     β€’ Excessive repeated punctuation (!!!! β†’ !, ???? β†’ ?, ,,, β†’ ,)
     β€’ 4+ dots (.... β†’ ...)
     β€’ Emojis, URLs, markdown, HTML tags
     β€’ Non-standard Unicode punctuation (guillemets, etc.)
"""
import re
from typing import List

from config import Config

# ═══════════════════════════════════════════════════════════════════
# Pre-compiled regex patterns (compiled once at import β†’ zero cost)
# ═══════════════════════════════════════════════════════════════════

# β€” Paralinguistic tag protector (matches [laugh], [clear throat], etc.)
# NOTE(review): assumes Config.PARALINGUISTIC_TAGS yields bare tag names
# without brackets β€” confirm against config.py.
_TAG_NAMES = "|".join(re.escape(t) for t in Config.PARALINGUISTIC_TAGS)
_RE_PARA_TAG = re.compile(rf"\[(?:{_TAG_NAMES})\]", re.IGNORECASE)

# β€” Markdown / structural noise
# Patterns with a capture group keep their inner text (group 1) in
# sanitize(); the rest are stripped outright.  sanitize() applies the
# double-delimiter bold patterns before the single-delimiter italic ones
# so "**x**" is not half-consumed as italic.
_RE_CODE_BLOCK   = re.compile(r"```[\s\S]*?```")
_RE_INLINE_CODE  = re.compile(r"`([^`]+)`")
_RE_IMAGE        = re.compile(r"!\[([^\]]*)\]\([^)]+\)")
_RE_LINK         = re.compile(r"\[([^\]]+)\]\([^)]+\)")
_RE_BOLD_AST     = re.compile(r"\*\*(.+?)\*\*")
_RE_BOLD_UND     = re.compile(r"__(.+?)__")
_RE_STRIKE       = re.compile(r"~~(.+?)~~")
_RE_ITALIC_AST   = re.compile(r"\*(.+?)\*")
# Underscore italics require non-word neighbours so snake_case identifiers
# are left untouched.
_RE_ITALIC_UND   = re.compile(r"(?<!\w)_(.+?)_(?!\w)")
_RE_HEADER       = re.compile(r"^#{1,6}\s+", re.MULTILINE)
_RE_BLOCKQUOTE   = re.compile(r"^>+\s?", re.MULTILINE)
_RE_HR           = re.compile(r"^[-*_]{3,}$", re.MULTILINE)
_RE_BULLET       = re.compile(r"^\s*[-*+]\s+", re.MULTILINE)
_RE_ORDERED      = re.compile(r"^\s*\d+\.\s+", re.MULTILINE)

# β€” URLs, emojis, HTML entities
# NOTE: \S+ is greedy β€” a URL match runs to the next whitespace, including
# any trailing punctuation such as ")".
_RE_URL          = re.compile(r"https?://\S+")
# Emoji / pictograph code-point ranges, plus variation selectors
# (FE00-FE0F) and the zero-width joiner (200D) used in composed sequences.
_RE_EMOJI        = re.compile(
    r"["
    r"\U0001F600-\U0001F64F\U0001F300-\U0001F5FF"
    r"\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF"
    r"\U00002702-\U000027B0\U0001F900-\U0001F9FF"
    r"\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF"
    r"\U00002600-\U000026FF\U0000FE00-\U0000FE0F"
    r"\U0000200D"
    r"]+", re.UNICODE,
)
# Matches named (&amp;) and numeric (&#x27;) entities alike; sanitize()
# replaces them via _HTML_ENTITIES and drops any entity not listed there.
_RE_HTML_ENTITY  = re.compile(r"&(?:#x?[\da-fA-F]+|\w+);")

# HTML entities β†’ speakable replacements
# NOTE: &hellip; β†’ "..." (preserves dramatic pause), &mdash;/&ndash; β†’ "β€”" (preserves dramatic break)
_HTML_ENTITIES   = {
    "&amp;": " and ", "&lt;": " less than ", "&gt;": " greater than ",
    "&nbsp;": " ", "&quot;": '"', "&apos;": "'",
    "&mdash;": "β€”", "&ndash;": "β€”", "&hellip;": "...",
}

# β€” Smart/curly quote normalization β†’ ASCII equivalents
# These Unicode variants may confuse the tokenizer; normalizing ensures clean input.
# Applied via str.translate() β€” one C-level pass over the text; mapped
# values may be multi-character strings (the ellipsis maps to three dots).
_SMART_QUOTE_MAP = str.maketrans({
    "\u201c": '"',   # " left double quotation mark
    "\u201d": '"',   # " right double quotation mark
    "\u2018": "'",   # ' left single quotation mark
    "\u2019": "'",   # ' right single quotation mark
    "\u00ab": '"',   # Β« left guillemet
    "\u00bb": '"',   # Β» right guillemet
    "\u201e": '"',   # β€ž double low quotation mark
    "\u201f": '"',   # β€Ÿ double high reversed quotation mark
    "\u2032": "'",   # β€² prime
    "\u2033": '"',   # β€³ double prime
    "\u2013": "β€”",   # – en dash β†’ em dash (dramatic pause)
    "\u2014": "β€”",   # β€” em dash (keep as-is after mapping)
    "\u2026": "...", # … horizontal ellipsis β†’ three dots
})

# β€” ALL CAPS normalization
# Words entirely in caps (length >= 4) often get spelled out by the TTS engine (e.g. NOTHING).
# By converting them to Title Case, they'll be processed naturally as words.
# NOTE(review): this also title-cases 4+ letter acronyms (NASA β†’ Nasa) β€”
# presumably intended so they are read as words; confirm this is desired.
_RE_ALL_CAPS = re.compile(r"\b[A-Z]{4,}\b")

# β€” Punctuation normalization
#   Ellipsis (... / ..) is PRESERVED β€” it creates dramatic pauses in Chatterbox.
#   Only 4+ dots are excessive and get capped to standard ellipsis.
#   sanitize() applies _RE_EXCESSIVE_DOTS before _RE_NORMALIZE_DOTS so a run
#   of many dots collapses to exactly one canonical "...".
_RE_EXCESSIVE_DOTS   = re.compile(r"\.{4,}")       # ....+ β†’ ... (cap excessive)
_RE_NORMALIZE_DOTS   = re.compile(r"\.{2,3}")       # .. or ... β†’ ... (standardize)
_RE_REPEATED_EXCLAM  = re.compile(r"!{2,}")          # !! β†’ !
_RE_REPEATED_QUEST   = re.compile(r"\?{2,}")         # ?? β†’ ?
_RE_REPEATED_SEMI    = re.compile(r";{2,}")           # ;; β†’ ;
_RE_REPEATED_COLON   = re.compile(r":{2,}")           # :: β†’ :
_RE_REPEATED_COMMA   = re.compile(r",{2,}")           # ,, β†’ ,
_RE_REPEATED_DASH    = re.compile(r"-{3,}")           # --- β†’ β€” (em dash)

# β€” Abbreviation protection
# Common abbreviations ending in "." that should NOT trigger sentence splitting.
# These get a placeholder before splitting, then get restored.
# (In sanitize() the placeholder is ultimately removed together with the
# dot: "Dr." β†’ "Dr", which prevents false sentence boundaries downstream.)
# "May" is deliberately absent β€” it is a complete word, never dot-abbreviated.
_ABBREVIATIONS = (
    "Mr", "Mrs", "Ms", "Dr", "Prof", "Sr", "Jr", "St", "Ave", "Blvd",
    "vs", "etc", "approx", "dept", "est", "govt", "inc", "corp", "ltd",
    "Jan", "Feb", "Mar", "Apr", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
    "Gen", "Col", "Sgt", "Capt", "Lt", "Cmdr", "Adm",
    "Fig", "Vol", "No", "Ref", "Rev", "Ph",
)
# IGNORECASE protects lowercase forms ("etc.", "vs.") too.  NOTE(review):
# it also matches the plain word "no." at a sentence end, which would lose
# that sentence boundary β€” verify this trade-off is acceptable.
_RE_ABBREV = re.compile(
    r"\b(" + "|".join(re.escape(a) for a in _ABBREVIATIONS) + r")\.",
    re.IGNORECASE,
)

# β€” Whitespace
_RE_MULTI_SPACE      = re.compile(r"[ \t]+")
_RE_MULTI_NEWLINE    = re.compile(r"\n{3,}")
_RE_SPACE_BEFORE_PUN = re.compile(r"\s+([.!?,;:])")

# β€” Sentence boundary (split point)
# Split ONLY on true sentence-ending punctuation: . ! ?
# Semicolons and colons are clause connectors β€” they bridge related thoughts
# and should NOT be used as split points (creates choppy, unnatural fragments).
# Ellipsis (...) is also intentionally excluded from splitting: letting it split the stream
# creates a compound lag between chunks, making the pause artificially excessive.
_RE_SENTENCE_SPLIT = re.compile(
    r"""(?:(?<=[.!?])(?<!\.\.\.)|(?<=[.!?][)\]"'])(?<!\.\.\.\.))\s+"""
)

_MIN_MERGE_WORDS = 5


# ═══════════════════════════════════════════════════════════════════
# Public API
# ═══════════════════════════════════════════════════════════════════

def sanitize(text: str) -> str:
    """Clean raw input for TTS while preserving prosody-beneficial punctuation.

    Preserves: ellipsis (...), em dashes (β€”), commas, periods, !, ?, ;, :, quotes.
    Removes: emojis, URLs, markdown, HTML, excessive repeated punctuation.

    Args:
        text: Raw input (may contain markdown, emojis, URLs, paralinguistic tags).

    Returns:
        Sanitized text ready for split_for_streaming(); falsy input is
        returned unchanged.
    """
    if not text:
        return text

    # 0. Normalize smart/curly quotes and Unicode punctuation FIRST
    #    so the downstream regexes only ever see ASCII-like punctuation.
    text = text.translate(_SMART_QUOTE_MAP)

    # 1. Protect paralinguistic tags with placeholders BEFORE any other
    #    transformation β€” in particular before the ALL-CAPS normalizer,
    #    which would otherwise rewrite e.g. "[LAUGH]" to "[Laugh]" instead
    #    of preserving the tag exactly as written.
    tags_found: list[str] = []
    def _protect_tag(m: re.Match) -> str:
        tags_found.append(m.group(0))
        return f"Β§TAG{len(tags_found) - 1}Β§"
    text = _RE_PARA_TAG.sub(_protect_tag, text)

    # 2. Normalize ALL CAPS words to Title Case to prevent spelling out.
    #    (Placeholders are immune: "TAG"/"ABR" are only 3 capitals.)
    text = _RE_ALL_CAPS.sub(lambda m: m.group(0).capitalize(), text)

    # 3. Protect abbreviations from sentence-boundary splitting.
    #    "Dr. Smith" β†’ "DrΒ§ABR0Β§ Smith"; the placeholder (and with it the
    #    dot) is removed in step 9, never restored.
    abbrev_count = 0
    def _protect_abbrev(m: re.Match) -> str:
        nonlocal abbrev_count
        placeholder = f"{m.group(1)}Β§ABR{abbrev_count}Β§"
        abbrev_count += 1
        return placeholder
    text = _RE_ABBREV.sub(_protect_abbrev, text)

    # 4. Strip non-speakable structures.
    #    Order fix: markdown images/links MUST be unwrapped before bare-URL
    #    removal β€” _RE_URL's greedy \S+ would otherwise swallow the closing
    #    ")" of "[text](https://…)" and leave a dangling "[text](" behind.
    text = _RE_CODE_BLOCK.sub("", text)
    text = _RE_IMAGE.sub(lambda m: m.group(1) or "", text)
    text = _RE_LINK.sub(r"\1", text)
    text = _RE_URL.sub("", text)
    text = _RE_BOLD_AST.sub(r"\1", text)
    text = _RE_BOLD_UND.sub(r"\1", text)
    text = _RE_STRIKE.sub(r"\1", text)
    text = _RE_ITALIC_AST.sub(r"\1", text)
    text = _RE_ITALIC_UND.sub(r"\1", text)
    text = _RE_INLINE_CODE.sub(r"\1", text)
    text = _RE_HEADER.sub("", text)
    text = _RE_BLOCKQUOTE.sub("", text)
    text = _RE_HR.sub("", text)
    text = _RE_BULLET.sub("", text)
    text = _RE_ORDERED.sub("", text)

    # 5. Emojis removed; hashtags become plain words (#hello β†’ hello).
    text = _RE_EMOJI.sub("", text)
    text = re.sub(r"#(\w+)", r"\1", text)

    # 6. HTML entities β†’ speakable text (unknown entities are dropped).
    text = _RE_HTML_ENTITY.sub(lambda m: _HTML_ENTITIES.get(m.group(0), ""), text)

    # 7. Normalize punctuation (PRESERVE prosody-beneficial marks).
    #    Order matters: cap 4+ dots first, then standardize ".."/"..." to
    #    one canonical ellipsis.
    text = _RE_EXCESSIVE_DOTS.sub("...", text)       # ....+ β†’ ... (cap)
    text = _RE_NORMALIZE_DOTS.sub("...", text)        # .. or ... β†’ ... (standardize)
    text = _RE_REPEATED_EXCLAM.sub("!", text)         # !! β†’ !
    text = _RE_REPEATED_QUEST.sub("?", text)          # ?? β†’ ?
    text = _RE_REPEATED_SEMI.sub(";", text)           # ;; β†’ ;
    text = _RE_REPEATED_COLON.sub(":", text)          # :: β†’ :
    text = _RE_REPEATED_COMMA.sub(",", text)          # ,, β†’ ,
    text = _RE_REPEATED_DASH.sub("β€”", text)           # --- β†’ em dash

    # 8. Whitespace cleanup.
    text = _RE_SPACE_BEFORE_PUN.sub(r"\1", text)
    text = _RE_MULTI_SPACE.sub(" ", text)
    text = _RE_MULTI_NEWLINE.sub("\n\n", text)
    text = text.strip()

    # 9. Drop abbreviation placeholders (Mr. β†’ Mr, Dr. β†’ Dr, etc.).
    #    The dot is not needed for correct TTS pronunciation and removing it
    #    prevents false sentence-boundary splits in split_for_streaming().
    for idx in range(abbrev_count):
        text = text.replace(f"Β§ABR{idx}Β§", "")

    # 10. Restore paralinguistic tags verbatim.
    for idx, original in enumerate(tags_found):
        text = text.replace(f"Β§TAG{idx}Β§", original)

    return text


def split_for_streaming(text: str, max_chars: int = Config.MAX_CHUNK_CHARS) -> List[str]:
    """Split sanitized text into sentence-level chunks for streaming.

    Strategy:
      1. Split on sentence-ending punctuation boundaries (. ! ?)
         β€” NOT on semicolons, colons, or ellipsis (those are non-breaking boundaries)
      2. Enforce max_chars per chunk (split long sentences on commas / spaces)
      3. Merge short chunks (≀5 words) with the next β€” but only while the
         merged result still fits max_chars, so step 2's limit is never
         undone (fix: the previous unconditional merge could recreate
         over-long chunks out of pieces step 2 had just split).

    Args:
        text: Already-sanitized text (see sanitize()).
        max_chars: Upper bound on chunk length.

    Returns:
        Ordered list of non-empty chunks; empty list for empty input.
    """
    if not text:
        return []

    # Step 1: sentence split
    raw_chunks = [c.strip() for c in _RE_SENTENCE_SPLIT.split(text) if c.strip()]

    # Step 2: enforce max length per chunk
    sized: List[str] = []
    for chunk in raw_chunks:
        if len(chunk) <= max_chars:
            sized.append(chunk)
        else:
            sized.extend(_break_long_chunk(chunk, max_chars))

    # Step 3: merge short chunks forward
    if len(sized) <= 1:
        return sized

    merged: List[str] = []
    carry = ""
    last = len(sized) - 1
    for i, chunk in enumerate(sized):
        if carry:
            # Merge only if the combined chunk still respects max_chars;
            # otherwise emit the carried fragment on its own.
            if len(carry) + 1 + len(chunk) <= max_chars:
                chunk = carry + " " + chunk
            else:
                merged.append(carry)
            carry = ""
        if len(chunk.split()) <= _MIN_MERGE_WORDS and i < last:
            carry = chunk
        else:
            merged.append(chunk)
    if carry:
        # Trailing short fragment: fold backwards if it fits, else keep it.
        if merged and len(merged[-1]) + 1 + len(carry) <= max_chars:
            merged[-1] += " " + carry
        else:
            merged.append(carry)

    return merged


# ═══════════════════════════════════════════════════════════════════
# Internal helpers
# ═══════════════════════════════════════════════════════════════════

def _break_long_chunk(text: str, max_chars: int) -> List[str]:
    """Break a chunk longer than max_chars on commas or word boundaries."""
    parts: List[str] = []
    remaining = text
    while len(remaining) > max_chars:
        break_pos = -1
        include_break_char = False

        # Prefer punctuation/pauses first to keep prosody natural.
        for marker in (",", ";", ":", "β€”", "-", "!", "?"):
            pos = remaining.rfind(marker, 0, max_chars)
            if pos > break_pos:
                break_pos = pos
                include_break_char = True

        # Then prefer nearest space before limit.
        space_pos = remaining.rfind(" ", 0, max_chars)
        if space_pos > break_pos:
            break_pos = space_pos
            include_break_char = False

        # If nothing before limit, look slightly ahead to avoid mid-word cuts.
        if break_pos == -1:
            forward_limit = min(len(remaining), max_chars + 24)
            m = re.search(r"[\s,;:!?]", remaining[max_chars:forward_limit])
            if m:
                break_pos = max_chars + m.start()
                include_break_char = remaining[break_pos] in ",;:!?"
            else:
                break_pos = max_chars
                include_break_char = False

        cut_at = break_pos + (1 if include_break_char else 0)
        if cut_at <= 0:
            cut_at = min(max_chars, len(remaining))

        segment = remaining[:cut_at].strip()
        if segment:
            parts.append(segment)
        remaining = remaining[cut_at:].lstrip()
    if remaining.strip():
        parts.append(remaining.strip())
    return parts