File size: 5,518 Bytes
7e0eb32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
"""Pure-Python speech-duration estimator for DramaBox prompts.

Originally lived in ``inference.py`` but pulled out so chunkers / tooling /
unit tests can import it without dragging torch + the LTX pipeline through
sys.path. ``inference.py`` and ``inference_server.py`` continue to import
``estimate_speech_duration`` from here.
"""
from __future__ import annotations

import re


# Laugh-verb regex -> base seconds budgeted per occurrence. The base is scaled
# by the modifier that directly follows the verb in the prompt.
# Each verb regex covers its inflections: laugh/laughs/laughed/laughing.
_LAUGH_VERBS = {
    r"\blaugh(?:s|ed|ing)?\b": 1.5,
    r"\bcackl(?:e|es|ed|ing)\b": 1.5,
    r"\bchuckl(?:e|es|ed|ing)\b": 1.0,
    r"\bgiggl(?:e|es|ed|ing)\b": 1.0,
    r"\bsnicker(?:s|ed|ing)?\b": 0.8,
    # Also matched by the plain "laugh" entry above, so both budgets apply.
    r"\bcru?el laugh\b": 1.5,
}

# Modifier patterns are anchored at the start of the text that follows the
# verb; one leading "-ly " adverb (e.g. "really loudly") is tolerated.
_SHORT_LAUGH_MOD = re.compile(
    r"^\s*(?:[a-z]+ly )?(?:briefly|shortly|once|quickly)",
    re.IGNORECASE)
_LONG_LAUGH_MOD = re.compile(
    r"^\s*(?:[a-z]+ly )?(?:maniacally|heartily|uproariously|uncontrollably|"
    r"hysterically|darkly|wickedly|evilly|loudly|long)"
    r"|^\s*between phrases", re.IGNORECASE)

_DOUBLE_QUOTED = re.compile(r'"([^"]+)"')
# An apostrophe only closes a single-quoted span when it is followed by
# whitespace, closing punctuation or end of text; otherwise it is treated as
# part of the quoted content (contractions such as "don't").
_SINGLE_QUOTED = re.compile(r"'((?:[^']|'(?![\s.,!?)\]]))+)'")
# Three or more 'ha'/'he' syllables, each optionally followed by one space or
# hyphen: matches "Hahaha", "ha ha ha", "he-he-he" and mixes of those.
_LAUGH_RUN = re.compile(r"(?:h[ae][ \-]?){3,}", re.IGNORECASE)
_LAUGH_SYLLABLE = re.compile(r"h[ae]", re.IGNORECASE)


def _contextual_laugh_duration(text: str) -> float:
    """Return the seconds to budget for laughter described in ``text``.

    Two contributions are summed:

    * Laugh verbs. Every match of a ``_LAUGH_VERBS`` pattern adds its base
      duration, scaled by the modifier that immediately follows the verb
      (only the next 40 characters are inspected):
        - short modifiers (briefly, shortly, once, quickly)  -> 0.4x base
        - long modifiers  (maniacally, heartily, loudly, ...,
          or "between phrases")                               -> 1.2x base
        - anything else                                       -> 1.0x base
    * Phonetic laughter inside quotes. Each run of three or more 'ha'/'he'
      syllables adds 0.2s per syllable beyond the second, so 'Hahahahahaha'
      buys more time than 'Hahaha'.

    Double-quoted spans are scanned first; single-quoted spans are only
    looked for in the text left over once the double-quoted spans are masked
    out. This keeps apostrophes inside double-quoted dialogue ("Don't ...
    it's") from forming a bogus single-quoted span that would count the same
    laugh run a second time.
    """
    total = 0.0
    for pat, base_dur in _LAUGH_VERBS.items():
        for m in re.finditer(pat, text, re.IGNORECASE):
            ctx = text[m.end(): m.end() + 40]
            if _SHORT_LAUGH_MOD.match(ctx):
                scale = 0.4
            elif _LONG_LAUGH_MOD.match(ctx):
                scale = 1.2
            else:
                scale = 1.0
            total += base_dur * scale

    # Phonetic laugh repetition inside quotes.
    double_quoted = _DOUBLE_QUOTED.findall(text)
    # Replace each double-quoted span with a lone '"' so its content cannot be
    # re-read as single-quoted text and the surrounding text is not glued
    # together into a new laugh run.
    remainder = _DOUBLE_QUOTED.sub('"', text)
    single_quoted = _SINGLE_QUOTED.findall(remainder)
    for quoted in double_quoted + single_quoted:
        for run in _LAUGH_RUN.findall(quoted):
            syllables = len(_LAUGH_SYLLABLE.findall(run))
            total += 0.2 * max(syllables - 2, 0)
    return total


# (compiled cue regex, seconds added per match). Each regex is matched
# case-insensitively against the whole prompt and every hit is counted.
_NONVERBAL_CUES = tuple(
    (re.compile(cue, re.IGNORECASE), seconds)
    for cue, seconds in (
        (r'\bsighs?\b', 0.8),
        (r'\bshaky breath\b', 1.0),
        (r'\bbreathing deeply\b', 1.0),
        (r'\bgasps?\b', 0.5),
        (r'\bburps?\b', 0.5),
        (r'\byawns?\b', 1.0),
        (r'\bpants?\b', 0.8),
        (r'\bwheezes?\b', 0.8),
        (r'\bcoughs?\b', 0.8),
        (r'\bsniffles?\b', 0.5),
        (r'\bsnorts?\b', 0.3),
        (r'\bgroans?\b', 0.8),
        (r'\bsilence\b', 1.0),
        (r'\blets? the .{1,20} hang\b', 1.0),
        (r'\blets? .{1,20} sink in\b', 1.0),
        (r'\bslams?\b', 0.5),
        (r'\bclaps?\b', 0.3),
        (r'\bdraws? (?:his|her|a) sword\b', 0.5),
        (r'\btakes? a (?:drag|swig|sip|drink)\b', 0.5),
        (r'\bwhistles?\b', 1.0),
        (r'\bhums?\b', 0.8),
        (r'\bmutters?\b', 1.5),
        (r'\bmumbles?\b', 1.0),
        # Zero weight: recognised as a cue but adds no time.
        (r'\bwhispers?\b', 0.0),
        (r'\bclears? (?:his|her) throat\b', 0.5),
        (r'\bgulps?\b', 0.5),
        (r'\bswallows?\b', 0.5),
        (r'\bvoice (?:breaks?|cracks?|trembles?|drops?|rises?)\b', 0.5),
        (r'\bsteadies? (?:him|her)self\b', 1.0),
        (r'\bcatches? (?:his|her) breath\b', 1.0),
        (r'\bcomposes? (?:him|her)self\b', 0.8),
        (r'\bdemeanor shifts?\b', 0.5),
        (r'\bsettles? in\b', 0.5),
        (r'\bleans? in\b', 0.3),
        (r'\bwipes? (?:his|her) eyes\b', 0.5),
    )
)

# Pause cues are matched with ONE ordered alternation so that each pause is
# classified exactly once, most specific phrase first. Matching the three
# phrases independently would let the plain "pause" pattern fire again inside
# "long pause" / "pauses briefly", making a brief pause cost more than a
# plain one.
_PAUSE_CUES = re.compile(
    r'\b(?P<long>long pause)\b'
    r'|\b(?P<brief>pauses? briefly)\b'
    r'|\b(?P<plain>pauses?)\b',
    re.IGNORECASE)
_PAUSE_SECONDS = {'long': 1.0, 'brief': 0.3, 'plain': 0.5}


def _estimate_nonverbal_duration(text: str) -> float:
    """Estimate extra duration for non-verbal sounds and actions in the prompt.

    Sums three contributions, all in seconds:

    * every match of a ``_NONVERBAL_CUES`` regex, weighted by that cue's
      per-occurrence duration;
    * every pause cue, counted once at the weight of the most specific
      phrase that matches it ("long pause" 1.0, "pause(s) briefly" 0.3,
      plain "pause(s)" 0.5);
    * the laugh budget returned by ``_contextual_laugh_duration``, which
      handles laugh verbs and 'Ha'/'He' repetition inside quotes.

    Args:
        text: The full prompt, including stage directions.

    Returns:
        The total number of extra seconds to add to the speech estimate.
    """
    extra = 0.0
    for cue, seconds in _NONVERBAL_CUES:
        extra += seconds * len(cue.findall(text))
    for pause in _PAUSE_CUES.finditer(text):
        # lastgroup names the alternative that matched this pause.
        extra += _PAUSE_SECONDS[pause.lastgroup]
    extra += _contextual_laugh_duration(text)
    return extra


def estimate_speech_duration(text: str, speed: float = 1.0) -> float:
    """Estimate speech duration from spoken content + non-verbal actions.

    Extracts spoken text by priority:
    1. Quoted text -- official prompt guide format. Double-quoted spans win;
       single-quoted spans are only considered when there are no double
       quotes, and then only spans longer than three words are kept.
    2. Text after the first colon -- simple "Speaker: dialogue" format
    3. Full text -- fallback

    The spoken text is timed at 14 characters per second, slowed to 0.6x of
    that for texts under 40 characters and 0.8x for texts under 80, then
    multiplied by ``speed``. Each '.', '!' and '?' in the spoken text adds
    0.3s. The full prompt is also scanned for non-verbal cues (laughs,
    pauses, sighs, gasps, etc.) and their estimated durations are added.

    Args:
        text: The prompt to estimate.
        speed: Speaking-rate multiplier; values above 1.0 shorten the spoken
            portion of the estimate. Must be greater than zero.

    Returns:
        The estimate in seconds, padded by 2.0s, rounded to one decimal and
        never less than 3.0.

    Raises:
        ValueError: If ``speed`` is zero or negative.
    """
    # A zero speed would divide by zero below and a negative one would yield
    # a negative speech time that the 3.0s floor silently hides.
    if speed <= 0:
        raise ValueError(f"speed must be a positive number, got {speed!r}")

    quotes = re.findall(r'"([^"]+)"', text)
    if not quotes:
        quotes = re.findall(r"'((?:[^']|'(?![\s.,!?)\]]))+)'", text)
        # Short single-quoted spans are dropped from the spoken text.
        quotes = [q for q in quotes if len(q.split()) > 3]
    if quotes:
        spoken = " ".join(quotes)
    elif ":" in text:
        spoken = text.split(":", 1)[1].strip()
    else:
        spoken = text

    CHARS_PER_SEC = 14.0
    text_len = len(spoken)

    # Shorter texts are timed at a slower character rate.
    if text_len < 40:
        chars_per_sec = CHARS_PER_SEC * 0.6
    elif text_len < 80:
        chars_per_sec = CHARS_PER_SEC * 0.8
    else:
        chars_per_sec = CHARS_PER_SEC

    chars_per_sec *= speed
    duration = text_len / chars_per_sec

    # Every sentence-ending mark adds a fixed 0.3s.
    sentence_count = spoken.count(".") + spoken.count("!") + spoken.count("?")
    duration += sentence_count * 0.3

    # Non-verbal cues are searched in the whole prompt, not just the dialogue.
    duration += _estimate_nonverbal_duration(text)

    return max(3.0, round(duration + 2.0, 1))