"""Pure-Python speech-duration estimator for DramaBox prompts. Originally lived in ``inference.py`` but pulled out so chunkers / tooling / unit tests can import it without dragging torch + the LTX pipeline through sys.path. ``inference.py`` and ``inference_server.py`` continue to import ``estimate_speech_duration`` from here. """ from __future__ import annotations import re _LAUGH_VERBS = { # base seconds per occurrence; gets scaled by the modifier found nearby. # Verb regex covers inflections: laugh/laughs/laughed/laughing. r"\blaugh(?:s|ed|ing)?\b": 1.5, r"\bcackl(?:e|es|ed|ing)\b": 1.5, r"\bchuckl(?:e|es|ed|ing)\b": 1.0, r"\bgiggl(?:e|es|ed|ing)\b": 1.0, r"\bsnicker(?:s|ed|ing)?\b": 0.8, r"\bcru?el laugh\b": 1.5, } def _contextual_laugh_duration(text: str) -> float: """Context-aware laugh budget. For each laugh verb in the prompt, look at the adjective/adverb that modifies it and scale the base duration: - short modifiers (briefly, softly, once) -> 0.4x base - long modifiers (maniacally, heartily, ...) -> 1.2x base - default (no mod / neutral) -> 1.0x base Also reward phonetic repetition inside quotes -- 'Hahahahahaha' buys more time than 'Haha' -- at ~0.2s per extra repeated syllable. """ short_mod = re.compile( r"^\s*(?:[a-z]+ly )?(?:briefly|shortly|once|quickly)", re.IGNORECASE) long_mod = re.compile( r"^\s*(?:[a-z]+ly )?(?:maniacally|heartily|uproariously|uncontrollably|" r"hysterically|darkly|wickedly|evilly|loudly|long)" r"|^\s*between phrases", re.IGNORECASE) total = 0.0 for pat, base_dur in _LAUGH_VERBS.items(): for m in re.finditer(pat, text, re.IGNORECASE): ctx = text[m.end(): m.end() + 40] if short_mod.match(ctx): total += base_dur * 0.4 elif long_mod.match(ctx): total += base_dur * 1.2 else: total += base_dur # Phonetic laugh repetition inside quotes. for q in re.findall(r'"([^"]+)"', text) + re.findall(r"'((?:[^']|'(?![\s.,!?)\]]))+)'", text): for run in re.findall(r"(?:h[ae]){3,}|(?:h[ae][ \-]?){3,}", q, re.IGNORECASE): syls = len(re.findall(r"h[ae]", run, re.IGNORECASE)) total += 0.2 * max(syls - 2, 0) return total def _estimate_nonverbal_duration(text: str) -> float: """Estimate extra duration for non-verbal sounds and actions in the prompt. Laugh-verb handling lives in ``_contextual_laugh_duration`` so cackle / chuckle / laugh budgets scale with the adjective ("maniacally" vs "briefly") and with the repetition length of 'Ha'/'He' tokens inside quotes. """ PATTERNS = { r'\bsighs?\b': 0.8, r'\bshaky breath\b': 1.0, r'\bbreathing deeply\b': 1.0, r'\bgasps?\b': 0.5, r'\bburps?\b': 0.5, r'\byawns?\b': 1.0, r'\bpants?\b': 0.8, r'\bwheezes?\b': 0.8, r'\bcoughs?\b': 0.8, r'\bsniffles?\b': 0.5, r'\bsnorts?\b': 0.3, r'\bgroans?\b': 0.8, r'\blong pause\b': 1.0, r'\bpauses? briefly\b': 0.3, r'\bpauses?\b': 0.5, r'\bsilence\b': 1.0, r'\blets? the .{1,20} hang\b': 1.0, r'\blets? .{1,20} sink in\b': 1.0, r'\bslams?\b': 0.5, r'\bclaps?\b': 0.3, r'\bdraws? (?:his|her|a) sword\b': 0.5, r'\btakes? a (?:drag|swig|sip|drink)\b': 0.5, r'\bwhistles?\b': 1.0, r'\bhums?\b': 0.8, r'\bmutters?\b': 1.5, r'\bmumbles?\b': 1.0, r'\bwhispers?\b': 0.0, r'\bclears? (?:his|her) throat\b': 0.5, r'\bgulps?\b': 0.5, r'\bswallows?\b': 0.5, r'\bvoice (?:breaks?|cracks?|trembles?|drops?|rises?)\b': 0.5, r'\bsteadies? (?:him|her)self\b': 1.0, r'\bcatches? (?:his|her) breath\b': 1.0, r'\bcomposes? (?:him|her)self\b': 0.8, r'\bdemeanor shifts?\b': 0.5, r'\bsettles? in\b': 0.5, r'\bleans? in\b': 0.3, r'\bwipes? (?:his|her) eyes\b': 0.5, } extra = 0.0 for pattern, dur in PATTERNS.items(): extra += dur * len(re.findall(pattern, text, re.IGNORECASE)) extra += _contextual_laugh_duration(text) return extra def estimate_speech_duration(text: str, speed: float = 1.0) -> float: """Estimate speech duration from spoken content + non-verbal actions. Extracts spoken text by priority: 1. Quoted text ('...' or "...") -- official prompt guide format 2. Text after colon -- simple "Speaker: dialogue" format 3. Full text -- fallback Also scans the full prompt for non-verbal cues (laughs, pauses, sighs, gasps, etc.) and adds estimated duration for each. """ quotes = re.findall(r'"([^"]+)"', text) if not quotes: quotes = re.findall(r"'((?:[^']|'(?![\s.,!?)\]]))+)'", text) quotes = [q for q in quotes if len(q.split()) > 3] if quotes: spoken = " ".join(quotes) elif ":" in text: spoken = text.split(":", 1)[1].strip() else: spoken = text CHARS_PER_SEC = 14.0 text_len = len(spoken) if text_len < 40: chars_per_sec = CHARS_PER_SEC * 0.6 elif text_len < 80: chars_per_sec = CHARS_PER_SEC * 0.8 else: chars_per_sec = CHARS_PER_SEC chars_per_sec *= speed duration = text_len / chars_per_sec sentence_count = spoken.count(".") + spoken.count("!") + spoken.count("?") duration += sentence_count * 0.3 duration += _estimate_nonverbal_duration(text) return max(3.0, round(duration + 2.0, 1))