Spaces:
Running on Zero
Running on Zero
File size: 5,518 Bytes
"""Pure-Python speech-duration estimator for DramaBox prompts.

Originally lived in ``inference.py``; it was pulled out so that chunkers,
tooling, and unit tests can import it without dragging torch and the LTX
pipeline in through sys.path. ``inference.py`` and ``inference_server.py``
continue to import ``estimate_speech_duration`` from here.
"""
from __future__ import annotations
import re
_LAUGH_VERBS = {
# base seconds per occurrence; gets scaled by the modifier found nearby.
# Verb regex covers inflections: laugh/laughs/laughed/laughing.
r"\blaugh(?:s|ed|ing)?\b": 1.5,
r"\bcackl(?:e|es|ed|ing)\b": 1.5,
r"\bchuckl(?:e|es|ed|ing)\b": 1.0,
r"\bgiggl(?:e|es|ed|ing)\b": 1.0,
r"\bsnicker(?:s|ed|ing)?\b": 0.8,
r"\bcru?el laugh\b": 1.5,
}
def _contextual_laugh_duration(text: str) -> float:
"""Context-aware laugh budget.
For each laugh verb in the prompt, look at the adjective/adverb that
modifies it and scale the base duration:
- short modifiers (briefly, softly, once) -> 0.4x base
- long modifiers (maniacally, heartily, ...) -> 1.2x base
- default (no mod / neutral) -> 1.0x base
Also reward phonetic repetition inside quotes -- 'Hahahahahaha' buys more
time than 'Haha' -- at ~0.2s per extra repeated syllable.
"""
short_mod = re.compile(
r"^\s*(?:[a-z]+ly )?(?:briefly|shortly|once|quickly)",
re.IGNORECASE)
long_mod = re.compile(
r"^\s*(?:[a-z]+ly )?(?:maniacally|heartily|uproariously|uncontrollably|"
r"hysterically|darkly|wickedly|evilly|loudly|long)"
r"|^\s*between phrases", re.IGNORECASE)
total = 0.0
for pat, base_dur in _LAUGH_VERBS.items():
for m in re.finditer(pat, text, re.IGNORECASE):
ctx = text[m.end(): m.end() + 40]
if short_mod.match(ctx):
total += base_dur * 0.4
elif long_mod.match(ctx):
total += base_dur * 1.2
else:
total += base_dur
# Phonetic laugh repetition inside quotes.
for q in re.findall(r'"([^"]+)"', text) + re.findall(r"'((?:[^']|'(?![\s.,!?)\]]))+)'", text):
for run in re.findall(r"(?:h[ae]){3,}|(?:h[ae][ \-]?){3,}", q, re.IGNORECASE):
syls = len(re.findall(r"h[ae]", run, re.IGNORECASE))
total += 0.2 * max(syls - 2, 0)
return total
# Seconds added per occurrence of each non-verbal cue (case-insensitive regex
# -> seconds). Every pattern is counted independently, so patterns describing
# the same cue must be mutually exclusive: the generic pause entry therefore
# refuses to match inside "long pause(s)" or "pause(s) briefly", which have
# their own budgets. ``whispers`` is listed with 0.0 and so adds no time.
_NONVERBAL_CUES = {
    r'\bsighs?\b': 0.8,
    r'\bshaky breath\b': 1.0,
    r'\bbreathing deeply\b': 1.0,
    r'\bgasps?\b': 0.5,
    r'\bburps?\b': 0.5,
    r'\byawns?\b': 1.0,
    r'\bpants?\b': 0.8,
    r'\bwheezes?\b': 0.8,
    r'\bcoughs?\b': 0.8,
    r'\bsniffles?\b': 0.5,
    r'\bsnorts?\b': 0.3,
    r'\bgroans?\b': 0.8,
    r'\blong pauses?\b': 1.0,
    r'\bpauses? briefly\b': 0.3,
    r'(?<!\blong )\bpauses?\b(?! briefly\b)': 0.5,
    r'\bsilence\b': 1.0,
    r'\blets? the .{1,20} hang\b': 1.0,
    r'\blets? .{1,20} sink in\b': 1.0,
    r'\bslams?\b': 0.5,
    r'\bclaps?\b': 0.3,
    r'\bdraws? (?:his|her|a) sword\b': 0.5,
    r'\btakes? a (?:drag|swig|sip|drink)\b': 0.5,
    r'\bwhistles?\b': 1.0,
    r'\bhums?\b': 0.8,
    r'\bmutters?\b': 1.5,
    r'\bmumbles?\b': 1.0,
    r'\bwhispers?\b': 0.0,
    r'\bclears? (?:his|her) throat\b': 0.5,
    r'\bgulps?\b': 0.5,
    r'\bswallows?\b': 0.5,
    r'\bvoice (?:breaks?|cracks?|trembles?|drops?|rises?)\b': 0.5,
    # "stead(?:y|ies)" / "catch(?:es)?" so the base verb forms match too.
    r'\bstead(?:y|ies) (?:him|her)self\b': 1.0,
    r'\bcatch(?:es)? (?:his|her) breath\b': 1.0,
    r'\bcomposes? (?:him|her)self\b': 0.8,
    r'\bdemeanor shifts?\b': 0.5,
    r'\bsettles? in\b': 0.5,
    r'\bleans? in\b': 0.3,
    r'\bwipes? (?:his|her) eyes\b': 0.5,
}


def _estimate_nonverbal_duration(text: str) -> float:
    """Estimate extra duration for non-verbal sounds and actions in the prompt.

    Each ``_NONVERBAL_CUES`` pattern contributes its duration once per match
    in ``text``. Laughter is not part of that table: it is delegated to
    ``_contextual_laugh_duration`` and added on top.

    Args:
        text: The full prompt.

    Returns:
        Seconds to add to the spoken-text estimate.
    """
    cue_seconds = sum(
        (
            seconds * len(re.findall(pattern, text, re.IGNORECASE))
            for pattern, seconds in _NONVERBAL_CUES.items()
        ),
        0.0,
    )
    return cue_seconds + _contextual_laugh_duration(text)
def estimate_speech_duration(text: str, speed: float = 1.0) -> float:
    """Estimate how long a prompt takes to perform, in seconds.

    The spoken portion of ``text`` is chosen by priority:
      1. Double-quoted text; if there is none, single-quoted text
      2. Everything after the first colon ("Speaker: dialogue")
      3. The whole text

    Its length is converted to time with a characters-per-second rate that is
    lower for short lines, 0.3s is added per '.', '!' and '?' in the spoken
    part, and the non-verbal budget for the *full* prompt (laughs, pauses,
    sighs, ...) is added on top.

    Args:
        text: The full prompt.
        speed: Multiplier applied to the speaking rate.

    Returns:
        The estimate plus a fixed 2.0s, rounded to one decimal and never
        below 3.0.
    """
    base_rate = 14.0  # characters per second for lines of 80+ characters

    spoken_parts = re.findall(r'"([^"]+)"', text)
    if not spoken_parts:
        single_quoted = re.findall(r"'((?:[^']|'(?![\s.,!?)\]]))+)'", text)
        # Keep only single-quoted matches longer than three words.
        # NOTE(review): this filter is taken to apply to the single-quote
        # fallback only -- confirm against the upstream file.
        spoken_parts = [part for part in single_quoted if len(part.split()) > 3]

    if spoken_parts:
        spoken = " ".join(spoken_parts)
    elif ":" in text:
        spoken = text.partition(":")[2].strip()
    else:
        spoken = text

    n_chars = len(spoken)
    # Shorter lines are assigned a slower effective rate.
    if n_chars >= 80:
        rate = base_rate
    elif n_chars >= 40:
        rate = base_rate * 0.8
    else:
        rate = base_rate * 0.6
    rate *= speed

    terminators = sum(spoken.count(mark) for mark in (".", "!", "?"))
    duration = n_chars / rate
    duration += terminators * 0.3
    duration += _estimate_nonverbal_duration(text)

    padded = round(duration + 2.0, 1)
    return max(3.0, padded)
|