Dramabox / src /duration_estimator.py
Manmay's picture
Long-form chunking + RE-USE on reference
7e0eb32
"""Pure-Python speech-duration estimator for DramaBox prompts.
Originally lived in ``inference.py`` but pulled out so chunkers / tooling /
unit tests can import it without dragging torch + the LTX pipeline through
sys.path. ``inference.py`` and ``inference_server.py`` continue to import
``estimate_speech_duration`` from here.
"""
from __future__ import annotations
import re
# Laugh-verb table: case-insensitive regex -> base seconds budgeted for one
# occurrence. Each regex also accepts the verb's inflections (laugh / laughs /
# laughed / laughing, ...). The base value is later scaled by the modifier
# found next to the verb.
# NOTE: the phrase "cruel laugh" is matched by its own entry *and* by the
# plain ``laugh`` entry, so both base durations are added for that phrase.
_LAUGH_VERBS = dict(
    (
        (r"\blaugh(?:s|ed|ing)?\b", 1.5),
        (r"\bcackl(?:e|es|ed|ing)\b", 1.5),
        (r"\bchuckl(?:e|es|ed|ing)\b", 1.0),
        (r"\bgiggl(?:e|es|ed|ing)\b", 1.0),
        (r"\bsnicker(?:s|ed|ing)?\b", 0.8),
        (r"\bcru?el laugh\b", 1.5),
    )
)
def _quoted_laugh_runs(text: str) -> list:
    """Return the syllable count of every phonetic laugh run inside quotes.

    A run is three or more consecutive ``ha`` / ``he`` syllables (any case),
    each optionally followed by a single space or hyphen, located inside a
    double-quoted or single-quoted span of *text*. One integer is returned
    per run, in the order the runs are found (double-quoted spans first).
    """
    quoted = re.findall(r'"([^"]+)"', text)
    # Single quotes: an apostrophe that is NOT followed by whitespace or
    # closing punctuation is kept as part of the quote (e.g. "don't").
    quoted += re.findall(r"'((?:[^']|'(?![\s.,!?)\]]))+)'", text)

    counts = []
    for span in quoted:
        # A single pattern with an optional separator. The previous form
        # tried a "no separator" alternative first, which cut mixed runs
        # such as "Hahaha ha ha" after the contiguous part and dropped the
        # remaining syllables.
        for run in re.findall(r"(?:h[ae][ \-]?){3,}", span, re.IGNORECASE):
            counts.append(len(re.findall(r"h[ae]", run, re.IGNORECASE)))
    return counts


def _contextual_laugh_duration(text: str) -> float:
    """Return the seconds budgeted for laughter described in *text*.

    Two contributions are summed:

    1. Laugh verbs. Every match of every pattern in ``_LAUGH_VERBS`` adds its
       base duration, scaled by the modifier that directly FOLLOWS the verb
       (only the 40 characters after the match are inspected, and the
       modifier must start that window, optionally preceded by whitespace
       and one ``-ly`` adverb):

       - short modifiers (briefly, shortly, once, quickly) -> 0.4x base
       - long modifiers (maniacally, heartily, ..., "between phrases")
         -> 1.2x base
       - anything else -> 1.0x base

       Patterns are scanned independently, so text matched by two table
       entries contributes twice.

    2. Phonetic repetition inside quotes. Each run of three or more
       ``ha``/``he`` syllables adds 0.2s per syllable beyond the second, so
       'Hahahahahaha' buys more time than 'Hahaha'.

    Args:
        text: The full prompt.

    Returns:
        Total laugh budget in seconds (0.0 when nothing matches).
    """
    short_mod = re.compile(
        r"^\s*(?:[a-z]+ly )?(?:briefly|shortly|once|quickly)",
        re.IGNORECASE)
    long_mod = re.compile(
        r"^\s*(?:[a-z]+ly )?(?:maniacally|heartily|uproariously|uncontrollably|"
        r"hysterically|darkly|wickedly|evilly|loudly|long)"
        r"|^\s*between phrases", re.IGNORECASE)

    total = 0.0
    for pat, base_dur in _LAUGH_VERBS.items():
        for m in re.finditer(pat, text, re.IGNORECASE):
            # Only the text right after the verb is checked for a modifier.
            ctx = text[m.end(): m.end() + 40]
            if short_mod.match(ctx):
                total += base_dur * 0.4
            elif long_mod.match(ctx):
                total += base_dur * 1.2
            else:
                total += base_dur

    # The first two syllables of a run are free; each extra one adds 0.2s.
    for syls in _quoted_laugh_runs(text):
        total += 0.2 * max(syls - 2, 0)
    return total
# Non-verbal cue table: case-insensitive regex -> seconds added per match.
# Built once at import time instead of on every call.
_NONVERBAL_CUES = {
    r'\bsighs?\b': 0.8, r'\bshaky breath\b': 1.0, r'\bbreathing deeply\b': 1.0,
    r'\bgasps?\b': 0.5, r'\bburps?\b': 0.5, r'\byawns?\b': 1.0,
    r'\bpants?\b': 0.8, r'\bwheezes?\b': 0.8, r'\bcoughs?\b': 0.8,
    r'\bsniffles?\b': 0.5, r'\bsnorts?\b': 0.3, r'\bgroans?\b': 0.8,
    r'\blong pause\b': 1.0, r'\bpauses? briefly\b': 0.3,
    # Generic pause. The lookarounds skip occurrences already priced by the
    # "long pause" / "pauses briefly" entries above; without them those
    # phrases were counted twice, making a brief pause (0.3 + 0.5) cost more
    # than a plain one (0.5). "long pauses" (plural) is not matched by the
    # specific entry, so it is still counted here.
    r'\b(?:(?<!\blong )pause|pauses)\b(?! briefly\b)': 0.5,
    r'\bsilence\b': 1.0,
    r'\blets? the .{1,20} hang\b': 1.0, r'\blets? .{1,20} sink in\b': 1.0,
    r'\bslams?\b': 0.5, r'\bclaps?\b': 0.3,
    r'\bdraws? (?:his|her|a) sword\b': 0.5,
    r'\btakes? a (?:drag|swig|sip|drink)\b': 0.5,
    r'\bwhistles?\b': 1.0, r'\bhums?\b': 0.8,
    # "whispers" is listed with a zero weight, so it adds no time.
    r'\bmutters?\b': 1.5, r'\bmumbles?\b': 1.0, r'\bwhispers?\b': 0.0,
    r'\bclears? (?:his|her) throat\b': 0.5, r'\bgulps?\b': 0.5,
    r'\bswallows?\b': 0.5,
    r'\bvoice (?:breaks?|cracks?|trembles?|drops?|rises?)\b': 0.5,
    r'\bsteadies? (?:him|her)self\b': 1.0,
    r'\bcatches? (?:his|her) breath\b': 1.0,
    r'\bcomposes? (?:him|her)self\b': 0.8,
    r'\bdemeanor shifts?\b': 0.5, r'\bsettles? in\b': 0.5,
    r'\bleans? in\b': 0.3, r'\bwipes? (?:his|her) eyes\b': 0.5,
}


def _cue_duration(text: str) -> float:
    """Return the seconds contributed by the ``_NONVERBAL_CUES`` table alone."""
    total = 0.0
    for pattern, seconds in _NONVERBAL_CUES.items():
        total += seconds * len(re.findall(pattern, text, re.IGNORECASE))
    return total


def _estimate_nonverbal_duration(text: str) -> float:
    """Estimate extra seconds for non-verbal sounds and actions in the prompt.

    Every match of every cue pattern (sighs, pauses, gasps, ...) adds that
    cue's duration. Laugh verbs are not in the cue table: they are budgeted
    by ``_contextual_laugh_duration``, whose result is added on top.

    Args:
        text: The full prompt (spoken and descriptive parts alike).

    Returns:
        Extra duration in seconds; 0.0 when no cue or laugh is found.
    """
    return _cue_duration(text) + _contextual_laugh_duration(text)
def _extract_spoken_text(text: str) -> str:
    """Return the part of *text* that is treated as spoken dialogue.

    Priority order:
      1. Double-quoted spans longer than three words, joined by spaces.
      2. Single-quoted spans longer than three words, joined by spaces.
      3. Everything after the first colon ("Speaker: dialogue"), stripped.
      4. The whole text.
    """
    quote_patterns = (
        r'"([^"]+)"',
        # An apostrophe not followed by whitespace / closing punctuation is
        # kept inside the quote, so contractions do not end it early.
        r"'((?:[^']|'(?![\s.,!?)\]]))+)'",
    )
    for quote_re in quote_patterns:
        # Filter BEFORE deciding whether to fall through: a stray short
        # double-quoted word (e.g. "hmm") must not stop single-quoted
        # dialogue from being considered.
        dialogue = [q for q in re.findall(quote_re, text) if len(q.split()) > 3]
        if dialogue:
            return " ".join(dialogue)
    if ":" in text:
        return text.split(":", 1)[1].strip()
    return text


def estimate_speech_duration(text: str, speed: float = 1.0) -> float:
    """Estimate speech duration from spoken content + non-verbal actions.

    The spoken part of the prompt is extracted by priority:
      1. Quoted text ("..." first, then '...') -- only quotes longer than
         three words count as dialogue
      2. Text after colon -- simple "Speaker: dialogue" format
      3. Full text -- fallback

    Its length is converted to seconds at 14 characters per second, slowed
    to 60% of that rate for fewer than 40 characters and 80% for fewer than
    80, then scaled by *speed*. Each '.', '!' and '?' in the spoken text
    adds 0.3s. The full prompt is also scanned for non-verbal cues (laughs,
    pauses, sighs, gasps, etc.) and their estimated duration is added.

    Args:
        text: The prompt.
        speed: Speaking-rate multiplier; must be greater than zero.

    Returns:
        Estimated duration in seconds, including 2.0s of padding, rounded
        to one decimal place and never less than 3.0.

    Raises:
        ValueError: If *speed* is zero or negative.
    """
    # A zero speed used to surface as a bare ZeroDivisionError and a negative
    # one as a silently clamped, meaningless estimate.
    if speed <= 0:
        raise ValueError(f"speed must be greater than zero, got {speed!r}")

    spoken = _extract_spoken_text(text)

    CHARS_PER_SEC = 14.0
    text_len = len(spoken)
    # Short utterances are assumed to be delivered more slowly per character.
    if text_len < 40:
        chars_per_sec = CHARS_PER_SEC * 0.6
    elif text_len < 80:
        chars_per_sec = CHARS_PER_SEC * 0.8
    else:
        chars_per_sec = CHARS_PER_SEC
    chars_per_sec *= speed

    duration = text_len / chars_per_sec
    sentence_count = spoken.count(".") + spoken.count("!") + spoken.count("?")
    duration += sentence_count * 0.3
    duration += _estimate_nonverbal_duration(text)
    return max(3.0, round(duration + 2.0, 1))