Spaces:

ResembleAI
/

Dramabox

Running on Zero

Manmay committed 2 days ago

Commit

7e0eb32

1 Parent(s): e53641f

Long-form chunking + RE-USE on reference

Port long-form generation from the upstream DramaBox repo (commit 382a37c)
to the HF Space:

- Auto chunk-and-stitch when the prompt's estimated (or explicit) duration
exceeds the max_chunk_duration cap; quote-aware sentence splitter (new
src/text_chunker.py) and shared duration estimator (new
src/duration_estimator.py). Chunks are stitched with an equal-power
crossfade so independently-generated joins are inaudible.

- RE-USE input-side voice-reference denoise (new src/super_resolution.py).
Applied to the *reference* before VAE encoding so the model conditions on
a clean speaker / style anchor; the generated paralinguistic content
(laughs, breaths, sighs) stays untouched. Cached per session so chunked
runs don't re-denoise the same reference per chunk. Silently falls back
if the mamba_ssm / causal-conv1d kernels can't be loaded.

- /generate_audio API gains denoise_ref + chunking knobs with sensible
defaults, so the existing index.html client (which sends only the
original kwargs) keeps working.

- @spaces.GPU duration bumped 60s → 600s for multi-chunk runs.

- requirements.txt: add resampy / mamba-ssm / causal-conv1d (Linux only;
optional — failures fall back to skipping the denoise).

Files changed (8) hide show

app.py +16 -4
requirements.txt +12 -0
src/duration_estimator.py +140 -0
src/inference.py +8 -145
src/inference_server.py +216 -9
src/model_downloader.py +38 -4
src/super_resolution.py +232 -0
src/text_chunker.py +198 -0

app.py CHANGED Viewed

@@ -183,7 +183,7 @@ async def homepage():
 @app.api()
-@spaces.GPU(duration=60)
 def generate_audio(
     prompt: str,
     audio_ref: FileData | None,
@@ -192,11 +192,15 @@ def generate_audio(
     dur_mult: float,
     gen_dur: float,
     ref_dur: float,
-    seed: int
 ) -> FileData:
     if not prompt or not prompt.strip():
         raise gr.Error("Prompt is empty.")
     t0 = time.time()
     ref_path = None
     if audio_ref:
@@ -206,8 +210,12 @@ def generate_audio(
             ref_path = audio_ref.path
     if ref_path and not os.path.exists(ref_path):
         ref_path = None
     output = tempfile.mktemp(suffix=".wav", prefix="dramabox_")
     tts.generate_to_file(
         prompt=prompt,
         output=output,
@@ -218,6 +226,10 @@ def generate_audio(
         seed=int(seed),
         gen_duration=float(gen_dur),
         ref_duration=float(ref_dur),
     )
     elapsed = time.time() - t0
     logging.info(f"Generated in {elapsed:.2f}s -> {output}")

 @app.api()
+@spaces.GPU(duration=600)
 def generate_audio(
     prompt: str,
     audio_ref: FileData | None,
     dur_mult: float,
     gen_dur: float,
     ref_dur: float,
+    seed: int,
+    denoise_ref: bool = True,
+    max_chunk_duration: float = 45.0,
+    target_chunk_duration: float = 37.0,
+    crossfade_ms: float = 50.0,
 ) -> FileData:
     if not prompt or not prompt.strip():
         raise gr.Error("Prompt is empty.")
     t0 = time.time()
     ref_path = None
     if audio_ref:
             ref_path = audio_ref.path
     if ref_path and not os.path.exists(ref_path):
         ref_path = None
     output = tempfile.mktemp(suffix=".wav", prefix="dramabox_")
+    # Long-form: generate_to_file auto-routes to the chunk-and-stitch path when
+    # the estimated (or explicit gen_dur) duration exceeds max_chunk_duration.
+    # denoise_ref runs RE-USE on the voice reference before VAE encoding so the
+    # model conditions on a cleaner speaker / style anchor.
     tts.generate_to_file(
         prompt=prompt,
         output=output,
         seed=int(seed),
         gen_duration=float(gen_dur),
         ref_duration=float(ref_dur),
+        denoise_ref=bool(denoise_ref),
+        max_chunk_duration=float(max_chunk_duration),
+        target_chunk_duration=float(target_chunk_duration),
+        crossfade_ms=float(crossfade_ms),
     )
     elapsed = time.time() - t0
     logging.info(f"Generated in {elapsed:.2f}s -> {output}")

requirements.txt CHANGED Viewed

@@ -24,3 +24,15 @@ gradio==6.14.0
 spaces>=0.30.0
 soundfile>=0.12.0
 resemble-perth @ git+https://github.com/resemble-ai/Perth.git@master

 spaces>=0.30.0
 soundfile>=0.12.0
 resemble-perth @ git+https://github.com/resemble-ai/Perth.git@master
+# ── Optional: NVIDIA RE-USE speech enhancement (input-side voice-ref denoise) ─
+# RE-USE is applied to the uploaded voice reference before VAE encoding so the
+# model conditions on a clean speaker / style anchor (generated paralinguistic
+# events — laughs, breaths, sighs — stay untouched because the denoiser only
+# touches the reference). resampy is used for its pre-resample step; mamba_ssm
+# + causal-conv1d power the bi-Mamba kernels. Pre-built wheels are not
+# available for every CUDA toolkit — if the kernels cannot be loaded, the TTS
+# server logs one warning and skips the reference denoise for the rest of the session.
+resampy>=0.4.0
+mamba-ssm>=2.2.0 ; platform_system == "Linux"
+causal-conv1d>=1.4.0 ; platform_system == "Linux"

src/duration_estimator.py ADDED Viewed

	@@ -0,0 +1,140 @@

+"""Pure-Python speech-duration estimator for DramaBox prompts.
+Originally lived in ``inference.py`` but pulled out so chunkers / tooling /
+unit tests can import it without dragging torch + the LTX pipeline through
+sys.path. ``inference.py`` and ``inference_server.py`` continue to import
+``estimate_speech_duration`` from here.
+"""
+from __future__ import annotations
+import re
+# Laugh-verb regex -> base seconds per occurrence. The base is scaled by the
+# modifier that directly follows the match (see _contextual_laugh_duration).
+# Verb regexes cover inflections: laugh/laughs/laughed/laughing.
+# NOTE(review): "cruel laugh" is matched by the generic laugh entry *and* by
+# its own entry, so that phrase is budgeted twice -- confirm this is intended.
+_LAUGH_VERBS = {
+    r"\blaugh(?:s|ed|ing)?\b": 1.5,
+    r"\bcackl(?:e|es|ed|ing)\b": 1.5,
+    r"\bchuckl(?:e|es|ed|ing)\b": 1.0,
+    r"\bgiggl(?:e|es|ed|ing)\b": 1.0,
+    r"\bsnicker(?:s|ed|ing)?\b": 0.8,
+    r"\bcru?el laugh\b": 1.5,
+}
+def _contextual_laugh_duration(text: str) -> float:
+    """Context-aware laugh budget, in seconds.
+    For each laugh verb in the prompt, look at the modifier that directly
+    follows it (optionally preceded by one other ``-ly`` adverb) and scale
+    the base duration from ``_LAUGH_VERBS``:
+      - short modifiers  (briefly, shortly, once, quickly) -> 0.4x base
+      - long modifiers   (maniacally, heartily, ...)       -> 1.2x base
+      - default (no mod / neutral)                         -> 1.0x base
+    "softly" / "quietly" describe volume, not length, so they are deliberately
+    absent from the short list and stay at the default 1.0x.
+    Also reward phonetic repetition inside quotes -- 'Hahahahahaha' buys more
+    time than 'Haha' -- at 0.2s per repeated syllable beyond the second.
+    Args:
+        text: the full prompt (stage directions and quoted speech).
+    Returns:
+        Total extra seconds to budget for laughter; 0.0 when none is found.
+    """
+    short_mod = re.compile(
+        r"^\s*(?:[a-z]+ly )?(?:briefly|shortly|once|quickly)",
+        re.IGNORECASE)
+    long_mod = re.compile(
+        r"^\s*(?:[a-z]+ly )?(?:maniacally|heartily|uproariously|uncontrollably|"
+        r"hysterically|darkly|wickedly|evilly|loudly|long)"
+        r"|^\s*between phrases", re.IGNORECASE)
+    total = 0.0
+    for pat, base_dur in _LAUGH_VERBS.items():
+        for m in re.finditer(pat, text, re.IGNORECASE):
+            # Only the 40 characters after the verb are inspected, and both
+            # modifier patterns are anchored to the start of that window.
+            ctx = text[m.end(): m.end() + 40]
+            if short_mod.match(ctx):
+                total += base_dur * 0.4
+            elif long_mod.match(ctx):
+                total += base_dur * 1.2
+            else:
+                total += base_dur
+    # Phonetic laugh repetition inside quotes:
+    #   'Haha'     = 2 syllables (fewer than 3 repeats: no bonus)
+    #   'Hahahaha' = 4 syllables (+0.4s)
+    for q in re.findall(r'"([^"]+)"', text) + re.findall(r"'((?:[^']|'(?![\s.,!?)\]]))+)'", text):
+        for run in re.findall(r"(?:h[ae]){3,}|(?:h[ae][ \-]?){3,}", q, re.IGNORECASE):
+            syls = len(re.findall(r"h[ae]", run, re.IGNORECASE))
+            total += 0.2 * max(syls - 2, 0)
+    return total
+def _estimate_nonverbal_duration(text: str) -> float:
+    """Budget extra seconds for non-verbal sounds and actions named in the prompt.
+    Every case-insensitive match of a cue regex below adds that cue's flat
+    duration. Laugh verbs are not in the table: they are delegated to
+    ``_contextual_laugh_duration`` so cackle / chuckle / laugh budgets scale
+    with the adjective ("maniacally" vs "briefly") and with the repetition
+    length of 'Ha'/'He' tokens inside quotes.
+    Args:
+        text: the full prompt.
+    Returns:
+        Flat cue budget plus the contextual laugh budget, in seconds.
+    """
+    cue_seconds = (
+        # Breathing / sighs
+        (r'\bsighs?\b', 0.8),
+        (r'\bshaky breath\b', 1.0),
+        (r'\bbreathing deeply\b', 1.0),
+        (r'\bgasps?\b', 0.5),
+        (r'\bburps?\b', 0.5),
+        (r'\byawns?\b', 1.0),
+        (r'\bpants?\b', 0.8),
+        (r'\bwheezes?\b', 0.8),
+        (r'\bcoughs?\b', 0.8),
+        (r'\bsniffles?\b', 0.5),
+        (r'\bsnorts?\b', 0.3),
+        (r'\bgroans?\b', 0.8),
+        # Pauses
+        (r'\blong pause\b', 1.0),
+        (r'\bpauses? briefly\b', 0.3),
+        (r'\bpauses?\b', 0.5),
+        (r'\bsilence\b', 1.0),
+        (r'\blets? the .{1,20} hang\b', 1.0),
+        (r'\blets? .{1,20} sink in\b', 1.0),
+        # Physical actions that produce sound
+        (r'\bslams?\b', 0.5),
+        (r'\bclaps?\b', 0.3),
+        (r'\bdraws? (?:his|her|a) sword\b', 0.5),
+        (r'\btakes? a (?:drag|swig|sip|drink)\b', 0.5),
+        (r'\bwhistles?\b', 1.0),
+        (r'\bhums?\b', 0.8),
+        # Vocal actions (not in quotes but take time)
+        (r'\bmutters?\b', 1.5),
+        (r'\bmumbles?\b', 1.0),
+        (r'\bwhispers?\b', 0.0),
+        (r'\bclears? (?:his|her) throat\b', 0.5),
+        (r'\bgulps?\b', 0.5),
+        (r'\bswallows?\b', 0.5),
+        # Emotional transitions
+        (r'\bvoice (?:breaks?|cracks?|trembles?|drops?|rises?)\b', 0.5),
+        (r'\bsteadies? (?:him|her)self\b', 1.0),
+        (r'\bcatches? (?:his|her) breath\b', 1.0),
+        (r'\bcomposes? (?:him|her)self\b', 0.8),
+        # Scene transitions that imply time
+        (r'\bdemeanor shifts?\b', 0.5),
+        (r'\bsettles? in\b', 0.5),
+        (r'\bleans? in\b', 0.3),
+        (r'\bwipes? (?:his|her) eyes\b', 0.5),
+    )
+    flat_budget = 0.0
+    for cue_regex, seconds in cue_seconds:
+        hits = re.findall(cue_regex, text, re.IGNORECASE)
+        flat_budget += seconds * len(hits)
+    flat_budget += _contextual_laugh_duration(text)
+    return flat_budget
+def estimate_speech_duration(text: str, speed: float = 1.0) -> float:
+    """Estimate speech duration (seconds) from spoken content + non-verbal actions.
+    Extracts spoken text by priority:
+    1. Quoted text ('...' or "...") -- official prompt guide format
+    2. Text after colon -- simple "Speaker: dialogue" format
+    3. Full text -- fallback
+    Also scans the full prompt for non-verbal cues (laughs, pauses, sighs,
+    gasps, etc.) and adds estimated duration for each.
+    Args:
+        text: the full prompt.
+        speed: speaking-rate multiplier applied to the characters-per-second
+            rate; must be > 0.
+    Returns:
+        Estimated duration rounded to 0.1 s, never less than 3.0 s.
+    Raises:
+        ValueError: if ``speed`` is not positive (it is used as a divisor).
+    """
+    if speed <= 0:
+        raise ValueError(f"speed must be positive, got {speed!r}")
+    # Try double quotes first (clean, no contraction issues).
+    quotes = re.findall(r'"([^"]+)"', text)
+    if not quotes:
+        # Single quotes: an apostrophe NOT followed by whitespace/punctuation
+        # is kept inside the match, so contractions (don't, can't, it's)
+        # don't end the quote early.
+        quotes = re.findall(r"'((?:[^']|'(?![\s.,!?)\]]))+)'", text)
+        # Drop fragments of three words or fewer.
+        quotes = [q for q in quotes if len(q.split()) > 3]
+    if quotes:
+        spoken = " ".join(quotes)
+    elif ":" in text:
+        spoken = text.split(":", 1)[1].strip()
+    else:
+        spoken = text
+    CHARS_PER_SEC = 14.0
+    text_len = len(spoken)
+    # Shorter texts are budgeted at a slower rate (0.6x / 0.8x of the base).
+    if text_len < 40:
+        chars_per_sec = CHARS_PER_SEC * 0.6
+    elif text_len < 80:
+        chars_per_sec = CHARS_PER_SEC * 0.8
+    else:
+        chars_per_sec = CHARS_PER_SEC
+    chars_per_sec *= speed
+    duration = text_len / chars_per_sec
+    # 0.3 s per sentence-ending punctuation mark in the spoken text.
+    sentence_count = spoken.count(".") + spoken.count("!") + spoken.count("?")
+    duration += sentence_count * 0.3
+    # Add time for non-verbal sounds/actions found anywhere in the full prompt.
+    duration += _estimate_nonverbal_duration(text)
+    # A fixed 2.0 s is added on top; the result is floored at 3.0 s.
+    return max(3.0, round(duration + 2.0, 1))

src/inference.py CHANGED Viewed

@@ -74,151 +74,14 @@ def detect_model_type(checkpoint_path: str) -> str:
     return "distilled"
-_LAUGH_VERBS = {
-    # base seconds per occurrence; gets scaled by the modifier found nearby.
-    # Verb regex covers inflections: laugh/laughs/laughed/laughing.
-    r"\blaugh(?:s|ed|ing)?\b": 1.5,
-    r"\bcackl(?:e|es|ed|ing)\b": 1.5,
-    r"\bchuckl(?:e|es|ed|ing)\b": 1.0,
-    r"\bgiggl(?:e|es|ed|ing)\b": 1.0,
-    r"\bsnicker(?:s|ed|ing)?\b": 0.8,
-    r"\bcru?el laugh\b": 1.5,
-}
-def _contextual_laugh_duration(text: str) -> float:
-    """Context-aware laugh budget.
-    For each laugh verb in the prompt, look at the adjective/adverb that
-    modifies it and scale the base duration:
-      - short modifiers  (briefly, softly, once)     -> 0.4x base
-      - long modifiers   (maniacally, heartily, ...) -> 1.2x base
-      - default (no mod / neutral)                   -> 1.0x base
-    Also reward phonetic repetition inside quotes -- 'Hahahahahaha' buys more
-    time than 'Haha' -- at ~0.2s per extra repeated syllable.
-    """
-    # "softly" / "quietly" describe volume not length, so keep at default 1.0x.
-    short_mod = re.compile(
-        r"^\s*(?:[a-z]+ly )?(?:briefly|shortly|once|quickly)",
-        re.IGNORECASE)
-    long_mod = re.compile(
-        r"^\s*(?:[a-z]+ly )?(?:maniacally|heartily|uproariously|uncontrollably|"
-        r"hysterically|darkly|wickedly|evilly|loudly|long)"
-        r"|^\s*between phrases", re.IGNORECASE)
-    total = 0.0
-    for pat, base_dur in _LAUGH_VERBS.items():
-        for m in re.finditer(pat, text, re.IGNORECASE):
-            ctx = text[m.end(): m.end() + 40]
-            if short_mod.match(ctx):
-                total += base_dur * 0.4
-            elif long_mod.match(ctx):
-                total += base_dur * 1.2
-            else:
-                total += base_dur
-    # Phonetic laugh repetition inside quotes:
-    #   'Haha' = 2 syllables (base, no bonus)
-    #   'Hahahaha' = 4 syllables (+0.4s)
-    #   'Hehehehahahahahahahaha' ~ 10 syllables (+1.6s)
-    for q in re.findall(r'"([^"]+)"', text) + re.findall(r"'((?:[^']|'(?![\s.,!?)\]]))+)'", text):
-        for run in re.findall(r"(?:h[ae]){3,}|(?:h[ae][ \-]?){3,}", q, re.IGNORECASE):
-            syls = len(re.findall(r"h[ae]", run, re.IGNORECASE))
-            total += 0.2 * max(syls - 2, 0)
-    return total
-def _estimate_nonverbal_duration(text: str) -> float:
-    """Estimate extra duration for non-verbal sounds and actions in the prompt.
-    Laugh-verb handling lives in ``_contextual_laugh_duration`` so cackle /
-    chuckle / laugh budgets scale with the adjective ("maniacally" vs
-    "briefly") and with the repetition length of 'Ha'/'He' tokens inside
-    quotes.
-    """
-    PATTERNS = {
-        # Breathing / sighs
-        r'\bsighs?\b': 0.8, r'\bshaky breath\b': 1.0, r'\bbreathing deeply\b': 1.0,
-        r'\bgasps?\b': 0.5, r'\bburps?\b': 0.5, r'\byawns?\b': 1.0,
-        r'\bpants?\b': 0.8, r'\bwheezes?\b': 0.8, r'\bcoughs?\b': 0.8,
-        r'\bsniffles?\b': 0.5, r'\bsnorts?\b': 0.3, r'\bgroans?\b': 0.8,
-        # Pauses (trimmed; earlier values over-budgeted silence)
-        r'\blong pause\b': 1.0, r'\bpauses? briefly\b': 0.3,
-        r'\bpauses?\b': 0.5, r'\bsilence\b': 1.0,
-        r'\blets? the .{1,20} hang\b': 1.0, r'\blets? .{1,20} sink in\b': 1.0,
-        # Physical actions that produce sound
-        r'\bslams?\b': 0.5, r'\bclaps?\b': 0.3,
-        r'\bdraws? (?:his|her|a) sword\b': 0.5,
-        r'\btakes? a (?:drag|swig|sip|drink)\b': 0.5,
-        r'\bwhistles?\b': 1.0, r'\bhums?\b': 0.8,
-        # Vocal actions (not in quotes but take time)
-        r'\bmutters?\b': 1.5, r'\bmumbles?\b': 1.0, r'\bwhispers?\b': 0.0,
-        r'\bclears? (?:his|her) throat\b': 0.5, r'\bgulps?\b': 0.5,
-        r'\bswallows?\b': 0.5,
-        # (laugh / chuckle / cackle / giggle / snicker handled by
-        # _contextual_laugh_duration below -- modifier-aware, not flat.)
-        # Emotional transitions
-        r'\bvoice (?:breaks?|cracks?|trembles?|drops?|rises?)\b': 0.5,
-        r'\bsteadies? (?:him|her)self\b': 1.0,
-        r'\bcatches? (?:his|her) breath\b': 1.0,
-        r'\bcomposes? (?:him|her)self\b': 0.8,
-        # Scene transitions that imply time
-        r'\bdemeanor shifts?\b': 0.5, r'\bsettles? in\b': 0.5,
-        r'\bleans? in\b': 0.3, r'\bwipes? (?:his|her) eyes\b': 0.5,
-    }
-    extra = 0.0
-    for pattern, dur in PATTERNS.items():
-        extra += dur * len(re.findall(pattern, text, re.IGNORECASE))
-    extra += _contextual_laugh_duration(text)
-    return extra
-def estimate_speech_duration(text: str, speed: float = 1.0) -> float:
-    """Estimate speech duration from spoken content + non-verbal actions.
-    Extracts spoken text by priority:
-    1. Quoted text ('...' or "...") -- official prompt guide format
-    2. Text after colon -- simple "Speaker: dialogue" format
-    3. Full text -- fallback
-    Also scans the full prompt for non-verbal cues (laughs, pauses, sighs,
-    gasps, etc.) and adds estimated duration for each.
-    """
-    # Try double quotes first (clean, no contraction issues)
-    quotes = re.findall(r'"([^"]+)"', text)
-    if not quotes:
-        # Single quotes: allow apostrophes in contractions (don't, can't, it's)
-        # Match ' to ' but apostrophes NOT followed by space/punctuation are kept inside
-        quotes = re.findall(r"'((?:[^']|'(?![\s.,!?)\]]))+)'", text)
-        # Filter out short fragments (scene directions like "He pauses")
-        quotes = [q for q in quotes if len(q.split()) > 3]
-    if quotes:
-        spoken = " ".join(quotes)
-    elif ":" in text:
-        spoken = text.split(":", 1)[1].strip()
-    else:
-        spoken = text
-    CHARS_PER_SEC = 14.0
-    text_len = len(spoken)
-    if text_len < 40:
-        chars_per_sec = CHARS_PER_SEC * 0.6
-    elif text_len < 80:
-        chars_per_sec = CHARS_PER_SEC * 0.8
-    else:
-        chars_per_sec = CHARS_PER_SEC
-    chars_per_sec *= speed
-    duration = text_len / chars_per_sec
-    sentence_count = spoken.count(".") + spoken.count("!") + spoken.count("?")
-    duration += sentence_count * 0.3
-    # Add time for non-verbal sounds/actions in the full prompt
-    duration += _estimate_nonverbal_duration(text)
-    return max(3.0, round(duration + 2.0, 1))
 def parse_args():

     return "distilled"
+# Duration estimator lives in duration_estimator.py so that text_chunker and
+# other tooling can import it without dragging the torch / LTX pipeline.
+from duration_estimator import (  # noqa: E402,F401
+    estimate_speech_duration,
+    _contextual_laugh_duration,
+    _estimate_nonverbal_duration,
+    _LAUGH_VERBS,
+)
 def parse_args():

src/inference_server.py CHANGED Viewed

@@ -14,6 +14,7 @@ import re
 import sys
 import time
 from pathlib import Path
 import torch
 import torchaudio
@@ -53,13 +54,40 @@ DEFAULT_NEG = "worst quality, inconsistent, robotic, distorted, noise, static, m
 def estimate_duration(prompt, multiplier=1.1):
-    """Defer to the richer CLI estimator (sentence-aware + non-verbal action
-    budget) so warm-server outputs match the lengths of the per-call CLI runs."""
-    from inference import estimate_speech_duration
     base = estimate_speech_duration(prompt)
     return max(3.0, round(base * multiplier, 1))
 def auto_rescale_for_cfg(cfg: float) -> float:
     """CFG-aware std-rescale schedule that prevents output clipping at high cfg.
@@ -110,6 +138,11 @@ class TTSServer:
         self._velocity_model = None
         self._audio_conditioner = None
         self._audio_decoder = None
         logging.info(f"TTSServer loading on {device}...")
         t0 = time.time()
@@ -203,10 +236,78 @@ class TTSServer:
         )
         logging.info(f"  AudioDecoder (warm): {time.time()-t0:.1f}s")
     @torch.inference_mode()
     def generate(self, prompt, voice_ref=None, cfg_scale=2.5, stg_scale=1.5,
                  duration_multiplier=1.1, seed=42, ref_duration=10.0,
-                 rescale_scale="auto", gen_duration: float = 0.0):
         """Generate audio. Returns (waveform_path, duration_seconds).
         rescale_scale: latent-side CFG std-rescale that prevents clipping at
@@ -214,6 +315,10 @@ class TTSServer:
             float in [0, 1] for a fixed override, or 0 to disable.
         gen_duration: explicit target duration in seconds. 0 (default) → auto
             from prompt + duration_multiplier; >0 overrides everything else.
         """
         t_total = time.time()
@@ -236,6 +341,8 @@ class TTSServer:
         if voice_ref and os.path.exists(voice_ref):
             t0 = time.time()
             voice = decode_audio_from_file(voice_ref, self.device, 0.0, ref_duration)
             w = voice.waveform
             if w.dim() == 2:
                 if w.shape[0] == 1:
@@ -323,15 +430,115 @@ class TTSServer:
         t0 = time.time()
         decoded = self._audio_decoder(latent)
-        logging.info(f"Decode: {time.time()-t0:.2f}s")
         total = time.time() - t_total
-        dur = decoded.waveform.shape[-1] / decoded.sampling_rate
         logging.info(f"Total: {total:.2f}s for {dur:.1f}s audio")
-        return decoded.waveform, decoded.sampling_rate
-    def generate_to_file(self, prompt, output, watermark: bool = True, **kwargs):
-        waveform, sr = self.generate(prompt, **kwargs)
         wav_cpu = waveform.cpu().float()
         if watermark:
             try:

 import sys
 import time
 from pathlib import Path
+from typing import Optional
 import torch
 import torchaudio
 def estimate_duration(prompt, multiplier=1.1):
+    """Defer to the shared sentence-aware + non-verbal action budget estimator
+    so warm-server outputs match the lengths of the per-call CLI runs."""
+    # Imported at call time rather than at module import.
+    from duration_estimator import estimate_speech_duration
     base = estimate_speech_duration(prompt)
+    # Scale by the multiplier, round to 0.1 s, and never return less than 3.0 s.
     return max(3.0, round(base * multiplier, 1))
+def _equal_power_crossfade(prev: torch.Tensor, nxt: torch.Tensor,
+                           sample_rate: int, fade_ms: float = 50.0) -> torch.Tensor:
+    """Join ``prev`` and ``nxt`` along the last axis with an equal-power overlap.
+    Both tensors are (C, T). Returns (C, T_prev + T_nxt - T_fade).
+    The tail of ``prev`` is weighted by a cosine ramp and the head of ``nxt``
+    by the matching sine ramp. Equal-power envelopes keep perceived loudness
+    constant through the join — unlike a linear fade, which dips by ~3 dB in
+    the middle when the two sources are uncorrelated. The 50 ms default is
+    short enough to be inaudible on speech while still masking any
+    waveform-level discontinuity between independently-generated chunks.
+    The overlap is clamped to the shorter input; when it would be a single
+    sample or less, the inputs are concatenated without any fade.
+    """
+    requested = int(round(fade_ms * 1e-3 * sample_rate))
+    overlap = max(1, min(requested, prev.shape[-1], nxt.shape[-1]))
+    if overlap <= 1:
+        return torch.cat([prev, nxt], dim=-1)
+    ramp = torch.linspace(0.0, 1.0, overlap, device=prev.device, dtype=prev.dtype)
+    gain_out = torch.cos(ramp * torch.pi / 2)   # 1.0 -> 0.0
+    gain_in = torch.sin(ramp * torch.pi / 2)    # 0.0 -> 1.0
+    blended = prev[..., -overlap:] * gain_out + nxt[..., :overlap] * gain_in
+    head = prev[..., :-overlap]
+    tail = nxt[..., overlap:]
+    return torch.cat([head, blended, tail], dim=-1)
 def auto_rescale_for_cfg(cfg: float) -> float:
     """CFG-aware std-rescale schedule that prevents output clipping at high cfg.
         self._velocity_model = None
         self._audio_conditioner = None
         self._audio_decoder = None
+        # RE-USE denoiser for the voice reference (input-side denoise).
+        # Lazy-loaded on first use; the cleaned-waveform cache below keeps
+        # chunked generations from re-denoising the same 10 s clip per chunk.
+        self._ref_denoiser = None
+        self._ref_denoise_cache: dict[tuple, "torch.Tensor"] = {}
         logging.info(f"TTSServer loading on {device}...")
         t0 = time.time()
         )
         logging.info(f"  AudioDecoder (warm): {time.time()-t0:.1f}s")
+    def _denoise_voice_ref(self, voice, voice_ref_path: str, ref_duration: float):
+        """Run RE-USE on the loaded voice reference and replace its waveform
+        with a cleaned mono signal.
+        Why pre-condition rather than post-generate: applying RE-USE to the
+        *output* suppresses paralinguistic events the model generates (laughs,
+        gasps, breaths, sighs) because they're broadband, non-tonal — exactly
+        what universal speech enhancement targets as "noise". Running it on
+        the *reference* instead gives the model a clean speaker / style
+        anchor, which it generalises from at inference time, while leaving
+        the generated paralinguistic content untouched.
+        Cached by ``(path, ref_duration, sampling_rate)`` so chunked
+        generations don't re-denoise the same clip per chunk. The cache is
+        capped and evicts its oldest entry first, so a long-lived server does
+        not accumulate one device tensor per distinct reference forever.
+        Args:
+            voice: decoded reference audio exposing ``waveform`` and
+                ``sampling_rate``.
+            voice_ref_path: path the reference was loaded from (cache key part).
+            ref_duration: seconds of reference that were decoded (cache key part).
+        Returns:
+            An ``Audio`` holding the cleaned waveform, or ``voice`` unchanged
+            when the denoiser is unavailable.
+        """
+        # Chunks of one run share a key, so a small cap keeps the cache hit
+        # while bounding memory. Reference paths are presumably unique per
+        # request — verify against the caller.
+        max_cached_refs = 8
+        cache_key = (voice_ref_path, float(ref_duration), int(voice.sampling_rate))
+        cached = self._ref_denoise_cache.get(cache_key)
+        if cached is not None:
+            return Audio(waveform=cached, sampling_rate=voice.sampling_rate)
+        # Lazy-load the denoiser. target_sr = input sr → no librosa resample
+        # round-trip; RE-USE does pure denoise. (The 48 kHz BWE that
+        # REUSEUpsampler can do is irrelevant here — the VAE conditioner
+        # resamples internally to whatever the audio branch expects.)
+        if self._ref_denoiser is None:
+            try:
+                # The import lives inside the try so that a missing module or
+                # missing mamba_ssm / causal-conv1d kernels take the same
+                # fallback as a failed construction instead of raising.
+                from super_resolution import REUSEUpsampler
+                self._ref_denoiser = REUSEUpsampler(
+                    target_sr=int(voice.sampling_rate),
+                    device=self.device,
+                    chunk_size_s=1.0,
+                )
+            except Exception as e:
+                # Mamba kernels / weights missing → skip the denoise rather
+                # than blocking generation. Warns once per session.
+                logging.warning(f"Voice-ref denoise disabled (RE-USE unavailable: {e})")
+                self._ref_denoiser = False  # sentinel: don't retry this session
+                return voice
+        if self._ref_denoiser is False:
+            return voice
+        w = voice.waveform
+        # Collapse to mono — voice cloning is speaker-as-mono-source; we'll
+        # re-broadcast back to stereo after the conditioner.
+        if w.dim() == 3:
+            mono = w[0].mean(dim=0)
+        elif w.dim() == 2:
+            mono = w.mean(dim=0)
+        else:
+            mono = w
+        mono = mono.contiguous()
+        t0 = time.time()
+        cleaned, _ = self._ref_denoiser(mono, in_sr=int(voice.sampling_rate))
+        if cleaned.dim() == 2 and cleaned.shape[0] == 1:
+            cleaned = cleaned[0]
+        # Restore the (1, C=1, T) shape that the rest of the pipeline expects
+        # to consume — downstream code re-expands channels via repeat().
+        cleaned = cleaned.unsqueeze(0).unsqueeze(0).to(self.device, dtype=w.dtype)
+        logging.info(f"Voice-ref denoise (RE-USE): {time.time() - t0:.2f}s")
+        # dicts iterate in insertion order, so the first key is the oldest.
+        while len(self._ref_denoise_cache) >= max_cached_refs:
+            self._ref_denoise_cache.pop(next(iter(self._ref_denoise_cache)))
+        self._ref_denoise_cache[cache_key] = cleaned
+        return Audio(waveform=cleaned, sampling_rate=voice.sampling_rate)
     @torch.inference_mode()
     def generate(self, prompt, voice_ref=None, cfg_scale=2.5, stg_scale=1.5,
                  duration_multiplier=1.1, seed=42, ref_duration=10.0,
+                 rescale_scale="auto", gen_duration: float = 0.0,
+                 denoise_ref: bool = True):
         """Generate audio. Returns (waveform_path, duration_seconds).
         rescale_scale: latent-side CFG std-rescale that prevents clipping at
             float in [0, 1] for a fixed override, or 0 to disable.
         gen_duration: explicit target duration in seconds. 0 (default) → auto
             from prompt + duration_multiplier; >0 overrides everything else.
+        denoise_ref: when True (default) and a voice reference is provided,
+            RE-USE is applied to the *reference* before VAE encoding so the
+            model conditions on a clean speaker / style anchor. Generated
+            output (24→48 kHz) always goes through the LTX BigVGAN BWE.
         """
         t_total = time.time()
         if voice_ref and os.path.exists(voice_ref):
             t0 = time.time()
             voice = decode_audio_from_file(voice_ref, self.device, 0.0, ref_duration)
+            if denoise_ref:
+                voice = self._denoise_voice_ref(voice, voice_ref, ref_duration)
             w = voice.waveform
             if w.dim() == 2:
                 if w.shape[0] == 1:
         t0 = time.time()
         decoded = self._audio_decoder(latent)
+        out_waveform, out_sr = decoded.waveform, decoded.sampling_rate
+        logging.info(f"Decode (LTX BWE): {time.time()-t0:.2f}s")
         total = time.time() - t_total
+        dur = out_waveform.shape[-1] / out_sr
         logging.info(f"Total: {total:.2f}s for {dur:.1f}s audio")
+        return out_waveform, out_sr
+    @torch.inference_mode()
+    def generate_long(self, prompt, max_chunk_duration: float = 45.0,
+                      target_chunk_duration: float = 37.0,
+                      crossfade_ms: float = 50.0,
+                      progress_callback=None,
+                      **kwargs):
+        """Chunk-and-stitch generation for prompts whose estimated duration
+        exceeds ``max_chunk_duration``.
+        Splits ``prompt`` into <= ``max_chunk_duration`` chunks via
+        :func:`text_chunker.chunk_prompt_for_duration`, generates each one
+        through :meth:`generate` (the same remaining kwargs — voice reference,
+        seed, ... — are forwarded for every chunk, so speaker identity stays
+        coherent across joins), and concatenates the waveforms with an
+        equal-power crossfade.
+        Args:
+            prompt: full prompt text to split.
+            max_chunk_duration: per-chunk duration cap handed to the chunker.
+            target_chunk_duration: preferred per-chunk duration handed to the
+                chunker.
+            crossfade_ms: overlap length used at every join.
+            progress_callback: optional callable invoked before each chunk as
+                ``(chunk_index, chunk_count, est_duration_s)``; exceptions it
+                raises are logged and ignored.
+            **kwargs: forwarded to :meth:`generate`. ``duration_multiplier`` is
+                applied per chunk; ``gen_duration`` is discarded because chunk
+                durations are derived per chunk.
+        Returns:
+            ``(waveform, sample_rate)`` matching :meth:`generate`; the
+            waveform is a float CPU tensor.
+        Raises:
+            ValueError: if the chunker returns no chunks for ``prompt``.
+            RuntimeError: if two chunks come back at different sample rates.
+        """
+        from text_chunker import chunk_prompt_for_duration
+        # gen_duration / duration_multiplier are per-chunk; pop them out so we
+        # control sizing here and forward only the per-chunk values.
+        per_chunk_mul = float(kwargs.pop("duration_multiplier", 1.1))
+        # gen_duration coming in as a global target only makes sense for the
+        # single-shot path; chunked generation derives durations per chunk.
+        kwargs.pop("gen_duration", None)
+        chunks = chunk_prompt_for_duration(
+            prompt,
+            max_duration_s=max_chunk_duration,
+            target_duration_s=target_chunk_duration,
+            duration_multiplier=per_chunk_mul,
+        )
+        if not chunks:
+            # Fail with a clear message instead of an AttributeError on None
+            # once the (never-entered) loop below falls through.
+            raise ValueError("Long-form generation: prompt produced no chunks to synthesise.")
+        logging.info(f"Long-form: {len(chunks)} chunks (target {target_chunk_duration:.0f}s, "
+                     f"max {max_chunk_duration:.0f}s)")
+        out_waveform: Optional[torch.Tensor] = None
+        out_sr: Optional[int] = None
+        t_total = time.time()
+        for idx, chunk in enumerate(chunks):
+            logging.info(f"  Chunk {idx + 1}/{len(chunks)}: est {chunk.est_duration_s:.1f}s, "
+                         f"{len(chunk.text)} chars")
+            if progress_callback is not None:
+                try:
+                    progress_callback(idx, len(chunks), chunk.est_duration_s)
+                except Exception as e:
+                    # Progress reporting is best-effort; never fail a run on it.
+                    logging.warning(f"progress_callback raised, ignoring: {e}")
+            wav, sr = self.generate(
+                chunk.text,
+                duration_multiplier=per_chunk_mul,
+                **kwargs,
+            )
+            wav = wav.cpu().float()
+            if out_waveform is None:
+                out_waveform, out_sr = wav, sr
+            else:
+                if sr != out_sr:
+                    raise RuntimeError(f"Sample-rate mismatch between chunks: {out_sr} vs {sr}")
+                # Align channel counts before the crossfade: whichever side is
+                # mono is repeated along dim 0 to match the other side.
+                if wav.shape[0] != out_waveform.shape[0]:
+                    if wav.shape[0] == 1:
+                        wav = wav.repeat(out_waveform.shape[0], 1)
+                    elif out_waveform.shape[0] == 1:
+                        out_waveform = out_waveform.repeat(wav.shape[0], 1)
+                out_waveform = _equal_power_crossfade(out_waveform, wav, out_sr,
+                                                      fade_ms=crossfade_ms)
+        total_dur = out_waveform.shape[-1] / out_sr
+        logging.info(f"Long-form total: {time.time() - t_total:.2f}s wall, {total_dur:.1f}s audio")
+        return out_waveform, out_sr
+    def generate_to_file(self, prompt, output, watermark: bool = True,
+                         max_chunk_duration: float = 45.0,
+                         target_chunk_duration: float = 37.0,
+                         crossfade_ms: float = 50.0,
+                         progress_callback=None,
+                         **kwargs):
+        # Auto-route to generate_long when the requested duration (explicit
+        # gen_duration if set, otherwise prompt-estimated) exceeds the chunk
+        # cap. Single-shot path otherwise — same as before, no regression for
+        # short prompts.
+        explicit_dur = float(kwargs.get("gen_duration") or 0.0)
+        est_dur = explicit_dur if explicit_dur > 0 else estimate_duration(
+            prompt, kwargs.get("duration_multiplier", 1.1))
+        if est_dur > max_chunk_duration:
+            waveform, sr = self.generate_long(
+                prompt,
+                max_chunk_duration=max_chunk_duration,
+                target_chunk_duration=target_chunk_duration,
+                crossfade_ms=crossfade_ms,
+                progress_callback=progress_callback,
+                **kwargs,
+            )
+        else:
+            if progress_callback is not None:
+                try:
+                    progress_callback(0, 1, est_dur)
+                except Exception:
+                    pass
+            waveform, sr = self.generate(prompt, **kwargs)
         wav_cpu = waveform.cpu().float()
         if watermark:
             try:

src/model_downloader.py CHANGED Viewed

@@ -15,12 +15,10 @@ logger = logging.getLogger(__name__)
 DRAMABOX_REPO = "ResembleAI/Dramabox"
 GEMMA_REPO = "unsloth/gemma-3-12b-it-bnb-4bit"
 # Default cache directory
-DEFAULT_CACHE = os.environ.get(
-    "DRAMABOX_CACHE",
-    os.path.join(os.path.expanduser("~"), ".cache", "dramabox"),
-)
 # Model files in the HF repo (flat structure)
 MODEL_FILES = {
@@ -75,6 +73,42 @@ def get_gemma_path(cache_dir: str = None) -> str:
     return local_dir
 def get_all_paths(cache_dir: str = None) -> dict:
     """Download all required models and return paths dict.

 DRAMABOX_REPO = "ResembleAI/Dramabox"
 GEMMA_REPO = "unsloth/gemma-3-12b-it-bnb-4bit"
+REUSE_REPO = "nvidia/RE-USE"
 # Default cache directory
+DEFAULT_CACHE = os.path.join(os.environ.get("HF_HOME", os.path.expanduser("~")), ".cache", "dramabox")
 # Model files in the HF repo (flat structure)
 MODEL_FILES = {
     return local_dir
def get_reuse_code_path(cache_dir: str = None) -> str:
    """Return a local directory holding the nvidia/RE-USE code and configs.

    Resolution order:

    1. ``$REUSE_DIR`` — when set and pointing at an existing directory it is
       returned as-is, without touching the network.
    2. ``<repo root>/third_party/RE-USE`` — used when it already contains
       ``models/generator_SEMamba_time_d4.py``.
    3. A snapshot download of ``REUSE_REPO`` restricted to code / config
       patterns (``*.py``, ``*.yaml``, ``*.json``, ``recipes/*``, ...). The
       pattern list does not match ``model.safetensors``, so the weights are
       intentionally not vendored here; they are expected to arrive through
       the regular HF cache when the model is instantiated.

    Args:
        cache_dir: Download cache root for step 3. Falls back to
            ``DEFAULT_CACHE`` when falsy.

    Returns:
        Path of the resolved directory, as a string.
    """
    override = os.environ.get("REUSE_DIR")
    if override and Path(override).is_dir():
        return override

    # A vendored checkout is recognised by the presence of the model source.
    vendored = Path(__file__).resolve().parents[1] / "third_party" / "RE-USE"
    marker = vendored / "models" / "generator_SEMamba_time_d4.py"
    if marker.is_file():
        return str(vendored)

    wanted = ["*.py", "*.yaml", "*.json",
              "recipes/*", "models/*.py", "utils/*.py"]
    download_root = cache_dir if cache_dir else DEFAULT_CACHE
    logger.info(f"Fetching RE-USE code/configs from {REUSE_REPO}...")
    snapshot = snapshot_download(
        repo_id=REUSE_REPO,
        cache_dir=download_root,
        token=os.environ.get("HF_TOKEN"),
        allow_patterns=wanted,
    )
    logger.info(f"  -> {snapshot}")
    return snapshot
 def get_all_paths(cache_dir: str = None) -> dict:
     """Download all required models and return paths dict.

src/super_resolution.py ADDED Viewed

	@@ -0,0 +1,232 @@

+"""RE-USE (nvidia/RE-USE) speech-enhancement wrapper.
+Used by ``TTSServer._denoise_voice_ref`` to denoise the input voice reference
+before VAE conditioning. Lazy-loads weights + code on first call so importing
+this module is cheap.
+    up = REUSEUpsampler(target_sr=48000, device="cuda")
+    clean, sr = up(wav, in_sr=24000)        # wav: (C, T) or (T,) float
+"""
+from __future__ import annotations
+import logging
+import sys
+from pathlib import Path
+from typing import Optional, Tuple
+import torch
# Process-wide cache of the RE-USE code directory. It stays ``None`` until
# REUSEUpsampler first needs the code tree, so importing this module never
# triggers path resolution (or a download) by itself.
_REUSE_DIR: Optional[Path] = None


def _resolve_reuse_dir() -> Path:
    """Return the RE-USE code directory, resolving it once per process.

    The first call asks ``model_downloader.get_reuse_code_path`` for the
    location and memoizes the answer in ``_REUSE_DIR``; later calls return
    the cached path without calling the downloader again.
    """
    global _REUSE_DIR
    if _REUSE_DIR is not None:
        return _REUSE_DIR
    # Imported here rather than at module scope to keep module import cheap.
    from model_downloader import get_reuse_code_path
    _REUSE_DIR = Path(get_reuse_code_path())
    return _REUSE_DIR
+class REUSEUpsampler:
+    """Universal speech enhancement with optional bandwidth extension.
+    nvidia/RE-USE is a 9.6 M-param bidirectional-Mamba model that operates on
+    STFT amplitude+phase. With ``target_sr`` set it both denoises *and* extends
+    the bandwidth to that rate via librosa kaiser-best resample + restoration.
+    License: NSCLv1 (noncommercial). The base ``SEMamba`` class lives in the
+    HF repo under ``models/generator_SEMamba_time_d4.py`` and pulls in the
+    ``mamba_ssm`` / ``causal-conv1d`` CUDA kernels.
+    """
+    def __init__(
+        self,
+        target_sr: int = 48000,
+        config_path: Optional[str] = None,
+        chunk_size_s: float = 1.0,
+        hop_portion: float = 0.5,
+        device: str | torch.device = "cuda",
+    ) -> None:
+        # chunk_size_s: peak VRAM scales linearly with chunk length.
+        #   5.0s -> 2.95 GB | 2.5s -> 1.52 GB | 1.0s -> 0.67 GB (default).
+        # 1.0s is chosen as default so RE-USE fits comfortably on top of the
+        # rest of the DramaBox pipeline on any 24 GB-class GPU.
+        self.device = torch.device(device)
+        self.target_sr = int(target_sr)
+        self.chunk_size_s = float(chunk_size_s)
+        self.hop_portion = float(hop_portion)
+        # Config path is resolved lazily on first use (alongside the code tree)
+        # so importing this module never triggers a download.
+        self._config_path_override = Path(config_path) if config_path else None
+        self.config_path: Optional[Path] = None
+        self._model = None
+        self._cfg = None
+        self._stft_fns = None  # (mag_phase_stft, mag_phase_istft, compress_factor, pad_or_trim)
+    @staticmethod
+    def _ensure_mamba_ssm_importable() -> None:
+        """Import ``mamba_ssm`` cleanly, with a kernel-free fallback if needed.
+        Normal path (kernels present): just import — fast path uses
+        ``selective_scan_cuda`` natively.
+        Fallback (kernels missing): the official package does an unconditional
+        ``import selective_scan_cuda`` at module load. We stub it into
+        ``sys.modules`` before importing, then redirect ``selective_scan_fn``
+        to the pure-PyTorch ``selective_scan_ref`` so the model still runs
+        (~5-10x slower).
+        """
+        try:
+            import selective_scan_cuda  # noqa: F401
+            import mamba_ssm  # noqa: F401
+            return  # Fast path: kernel present.
+        except ImportError:
+            pass
+        import types
+        if "selective_scan_cuda" not in sys.modules:
+            stub = types.ModuleType("selective_scan_cuda")
+            def _missing(*a, **kw):  # pragma: no cover - safety net only
+                raise NotImplementedError(
+                    "selective_scan_cuda kernel missing; the call should have "
+                    "been routed to selective_scan_ref via the runtime patch."
+                )
+            stub.fwd = _missing
+            stub.bwd = _missing
+            sys.modules["selective_scan_cuda"] = stub
+        from mamba_ssm.ops import selective_scan_interface as ssi
+        from mamba_ssm.modules import mamba_simple
+        if getattr(ssi, "_dramabox_kernel_free_patch_applied", False):
+            return
+        ssi.selective_scan_fn = ssi.selective_scan_ref
+        ssi.mamba_inner_fn = ssi.mamba_inner_ref
+        # mamba_simple imported these names by reference at module load -
+        # rebind there too, otherwise Mamba.forward keeps the original handles.
+        mamba_simple.selective_scan_fn = ssi.selective_scan_ref
+        mamba_simple.mamba_inner_fn = ssi.mamba_inner_ref
+        ssi._dramabox_kernel_free_patch_applied = True
+        logging.info(
+            "mamba_ssm kernel missing - using kernel-free fallback "
+            "(selective_scan_fn -> selective_scan_ref). Expect ~5-10x slowdown."
+        )
+    def _lazy_load(self) -> None:
+        if self._model is not None:
+            return
+        # Prefer real CUDA kernels; gracefully fall back to pure-PyTorch impl.
+        self._ensure_mamba_ssm_importable()
+        # The RE-USE module imports `from models...` and `from utils...` —
+        # both relative to the repo root. Add to path during load.
+        reuse_dir = _resolve_reuse_dir()
+        if str(reuse_dir) not in sys.path:
+            sys.path.insert(0, str(reuse_dir))
+        if self.config_path is None:
+            self.config_path = self._config_path_override or (
+                reuse_dir / "recipes" /
+                "USEMamba_30x1_lr_00002_norm_05_vq_065_nfft_320_hop_40_NRIR_012_pha_0005_com_04_early_001.yaml"
+            )
+        from models.generator_SEMamba_time_d4 import SEMamba  # type: ignore
+        from models.stfts import mag_phase_stft, mag_phase_istft  # type: ignore
+        from utils.util import load_config, pad_or_trim_to_match  # type: ignore
+        self._cfg = load_config(str(self.config_path))
+        compress_factor = self._cfg["model_cfg"]["compress_factor"]
+        self._stft_fns = (mag_phase_stft, mag_phase_istft, compress_factor, pad_or_trim_to_match)
+        # SEMamba is a PyTorchModelHubMixin; from_pretrained pulls weights from HF.
+        model = SEMamba.from_pretrained("nvidia/RE-USE", cfg=self._cfg).to(self.device)
+        model.train(False)
+        self._model = model
+        n_params = sum(p.numel() for p in model.parameters())
+        logging.info(f"RE-USE loaded: SEMamba ({n_params / 1e6:.1f}M params) -> {self.target_sr} Hz")
+    @staticmethod
+    def _make_even(v: float) -> int:
+        v = int(round(v))
+        return v if v % 2 == 0 else v + 1
+    @torch.inference_mode()
+    def __call__(self, waveform: torch.Tensor, in_sr: int = 16000) -> Tuple[torch.Tensor, int]:
+        """Chunked overlap-add denoise / BWE (ports nvidia/RE-USE inference_chunk.py).
+        Peak VRAM is bounded by ``chunk_size_s * target_sr`` rather than the
+        whole clip, so a 60 s clip costs the same as a 5 s one. Crossfade is
+        a Hann-window normalized overlap-add with default 50% hop.
+        """
+        import math
+        self._lazy_load()
+        import librosa
+        mag_phase_stft, mag_phase_istft, compress_factor, pad_or_trim_to_match = self._stft_fns
+        # STFT params are scaled relative to the config's training rate (8000).
+        base_n_fft = self._cfg["stft_cfg"]["n_fft"]
+        base_hop = self._cfg["stft_cfg"]["hop_size"]
+        base_win = self._cfg["stft_cfg"]["win_size"]
+        base_sr = self._cfg["stft_cfg"]["sampling_rate"]
+        if waveform.dim() == 1:
+            waveform = waveform.unsqueeze(0)
+        # 1. Resample to target rate first (skips if target_sr == in_sr).
+        if self.target_sr != in_sr:
+            wav_np = waveform.cpu().float().numpy()
+            wav_np = librosa.resample(
+                wav_np, orig_sr=in_sr, target_sr=self.target_sr, res_type="kaiser_best"
+            )
+            wav = torch.from_numpy(wav_np).to(self.device, dtype=torch.float32)
+        else:
+            wav = waveform.to(self.device, dtype=torch.float32)
+        op_sr = self.target_sr
+        n_fft = self._make_even(base_n_fft * op_sr // base_sr)
+        hop = self._make_even(base_hop * op_sr // base_sr)
+        win = self._make_even(base_win * op_sr // base_sr)
+        # 2. Chunked OLA with Hann analysis window. Mirrors inference_chunk.py.
+        chunk_size = int(self.chunk_size_s * op_sr)
+        hop_length = int(self.hop_portion * chunk_size)
+        window = torch.hann_window(chunk_size, device=self.device)
+        n_ch, total = wav.shape
+        enhanced = torch.zeros_like(wav)
+        window_sum = torch.zeros_like(wav)
+        n_chunks = max(1, math.ceil((total - chunk_size) / hop_length) + 1) if total > chunk_size else 1
+        for c in range(n_ch):
+            ch_in = wav[c : c + 1]                              # (1, T)
+            for i in range(n_chunks):
+                start = i * hop_length
+                end = min(start + chunk_size, total)
+                chunk = ch_in[:, start:end]
+                if chunk.shape[-1] < 2:                          # skip degenerate tail
+                    continue
+                noisy_mag, noisy_pha, _ = mag_phase_stft(
+                    chunk, n_fft=n_fft, hop_size=hop, win_size=win,
+                    compress_factor=compress_factor, center=True, addeps=False,
+                )
+                amp_g, pha_g, _ = self._model(noisy_mag, noisy_pha)
+                # "Sweep artifact" filter — match the official inference.
+                mag = torch.expm1(torch.relu(amp_g))
+                zero_portion = (mag == 0).sum(dim=1) / mag.shape[1]
+                amp_g[:, :, (zero_portion > 0.5)[0]] = 0
+                audio_g = mag_phase_istft(amp_g, pha_g, n_fft, hop, win, compress_factor)
+                audio_g = pad_or_trim_to_match(chunk.detach(), audio_g, pad_value=1e-8)
+                w_slice = window[: audio_g.shape[-1]]
+                enhanced[c : c + 1, start : start + audio_g.shape[-1]] += audio_g * w_slice
+                window_sum[c : c + 1, start : start + audio_g.shape[-1]] += w_slice
+        # 3. Normalize where windows overlap. Avoid divide-by-zero at clip tails.
+        mask = window_sum > 1e-8
+        enhanced[mask] = enhanced[mask] / window_sum[mask]
+        return enhanced.clamp(-1.0, 1.0).cpu().float(), op_sr

src/text_chunker.py ADDED Viewed

	@@ -0,0 +1,198 @@

+"""Prompt chunking for long-form DramaBox generation.
+The base LTX-2.3 audio DiT was trained on clips <= ~20 s. The silence-prior
+patch in ``inference_server.py`` keeps generations sane up to ~45 s, but the
+prior re-emerges past that boundary. For arbitrary-length prompts we split the
+text into < 45 s chunks, generate each conditioned on the same voice reference,
+and crossfade them back together.
+Chunking is quote-aware (sentence terminators inside ``"..."`` don't count)
+and preserves the speaker-description prefix on every chunk so the model keeps
+the same persona / delivery style across joins.
+"""
+from __future__ import annotations
+import re
+from dataclasses import dataclass
+from typing import List, Optional
# Captures the leading speaker description of a prompt. The description is the
# shortest run of at least three quote-free characters that is followed by a
# comma, optional whitespace, and then an opening quote (single or double).
# That run is treated as persona/style metadata and re-attached to every chunk.
#   "A shadowy villain speaks with cold menace, \"You have entered...\""
#    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
_PREFIX_RE = re.compile(
    r'^([^"\']{3,}?)'   # group 1: the persona text (lazy, no quote characters)
    r'(,\s*)'           # group 2: the separating comma plus any whitespace
    r'(?=["\'])',       # lookahead: dialogue must start with a quote
    re.DOTALL,
)
@dataclass
class PromptChunk:
    """One independently generated slice of a long prompt."""

    # Full prompt text for this chunk.
    text: str
    # Estimated duration of the chunk in seconds, as supplied by whoever
    # built the chunk.
    est_duration_s: float
def extract_speaker_prefix(prompt: str) -> tuple[Optional[str], str]:
    """Split a prompt into its speaker description and the remaining body.

    When ``_PREFIX_RE`` matches at the start of ``prompt`` (a persona followed
    by a comma and an opening quote), the whitespace-stripped persona is
    returned together with everything after the separating comma and its
    trailing whitespace. When it does not match, there is no prefix to
    propagate and the prompt is returned unchanged as the body.

    Returns:
        ``(prefix, body)``; ``prefix`` is ``None`` when no persona was found.
    """
    match = _PREFIX_RE.match(prompt)
    if match is None:
        return None, prompt
    persona = match.group(1).strip()
    remainder = prompt[match.end():]
    return persona, remainder
def split_sentences_outside_quotes(text: str) -> List[str]:
    """Split ``text`` into sentences, ignoring terminators inside quotes.

    A "sentence" here is a span ending in ``.``/``!``/``?`` (optionally followed
    by a closing quote) at the top level — i.e. not inside an open ``"..."`` or
    ``'...'`` pair. A terminator only counts as a boundary when it (plus any
    closing punctuation glued to it) is followed by whitespace or the end of
    the string, so ``3.14`` and ``Well...okay`` are left intact.
    Empty / whitespace-only fragments are dropped.

    Args:
        text: The text to split.

    Returns:
        The sentences in order, each stripped of surrounding whitespace. Text
        after the last boundary is returned as a final element.

    Examples:
        >>> split_sentences_outside_quotes('He says, "Hi, how are you?" Then leaves.')
        ['He says, "Hi, how are you?"', 'Then leaves.']
        >>> split_sentences_outside_quotes('Well...okay then. Bye.')
        ['Well...okay then.', 'Bye.']
    """
    sentences: List[str] = []
    buf: List[str] = []
    in_double = False
    in_single = False
    i = 0
    n = len(text)

    def _flush() -> None:
        # Emit the buffered sentence (if non-blank) and start a new one.
        sentence = "".join(buf).strip()
        if sentence:
            sentences.append(sentence)
        buf.clear()

    while i < n:
        ch = text[i]
        buf.append(ch)
        if ch == '"' and not in_single:
            was_inside = in_double
            in_double = not in_double
            # Treat the *closing* quote as a sentence boundary if the last
            # meaningful char inside it was a terminator: ``...how are you?"``.
            if was_inside and len(buf) >= 2 and buf[-2] in ".!?":
                # Boundary requires whitespace / end-of-string after.
                if i + 1 >= n or text[i + 1].isspace():
                    _flush()
                    i += 1
                    continue
        elif ch == "'" and not in_double:
            # Apostrophes inside a word (don't, it's) are not quote toggles.
            prev = text[i - 1] if i > 0 else " "
            nxt = text[i + 1] if i + 1 < n else " "
            if not (prev.isalpha() and nxt.isalpha()):
                in_single = not in_single
        elif ch in ".!?" and not in_double and not in_single:
            # Look ahead past trailing closing quotes / punctuation WITHOUT
            # consuming them yet: they are only swallowed into this sentence
            # if a real boundary follows. Consuming them eagerly and then
            # falling through would make the main loop visit (and append)
            # the same characters a second time.
            j = i + 1
            while j < n and text[j] in '."\')]':
                j += 1
            if j >= n or text[j].isspace():
                trailing = text[i + 1:j]
                buf.append(trailing)
                # Each swallowed double quote flips the quote state, exactly
                # as it would had the main loop consumed it.
                if trailing.count('"') % 2:
                    in_double = not in_double
                _flush()
                i = j
                continue
        i += 1

    tail = "".join(buf).strip()
    if tail:
        sentences.append(tail)
    return sentences
def _assemble(prefix: Optional[str], sentences: List[str]) -> str:
    """Join ``sentences`` into one chunk body and re-attach the speaker prefix.

    Blank sentences are dropped and the rest are joined with single spaces.
    Without a prefix the joined body is returned on its own. With a prefix the
    result is ``"<prefix>, <body>"`` when the body opens with a quote (the
    canonical persona-then-dialogue form) and ``"<prefix>. <body>"`` otherwise,
    so a body that starts with a stage direction still reads naturally.
    """
    cleaned = [piece.strip() for piece in sentences]
    body = " ".join(piece for piece in cleaned if piece)
    if prefix:
        opens_with_quote = body.startswith(("'", '"'))
        separator = ", " if opens_with_quote else ". "
        return prefix + separator + body
    return body
def chunk_prompt_for_duration(
    prompt: str,
    max_duration_s: float = 45.0,
    target_duration_s: float = 37.0,
    duration_multiplier: float = 1.1,
) -> List[PromptChunk]:
    """Split ``prompt`` into <= ``max_duration_s`` chunks.

    Sentences (as produced by ``split_sentences_outside_quotes``) are packed
    greedily: a chunk is closed as soon as adding the next sentence would push
    its estimated duration past the soft cap. The speaker prefix, if any, is
    re-attached to every chunk.

    Args:
        prompt: Full scene prompt (DramaBox format or plain text).
        max_duration_s: Hard cap per chunk. The only chunks that may exceed
            it are single sentences that are already too long on their own
            (see below).
        target_duration_s: Soft cap; we close the current chunk when adding
            the next sentence would push it past this. Values above
            ``max_duration_s`` are clamped to it so the soft cap can never
            defeat the hard cap. Leaving 5-10 s of headroom below
            ``max_duration_s`` keeps us safe against the estimator
            under-shooting by ~10-15% on action-heavy prompts.
        duration_multiplier: Multiplier applied to every
            ``estimate_speech_duration`` result before it is compared with
            the caps.

    Returns:
        List of :class:`PromptChunk`. Single-chunk prompts return a 1-element
        list with the original prompt unchanged.
    """
    from duration_estimator import estimate_speech_duration

    def _est(t: str) -> float:
        return estimate_speech_duration(t) * duration_multiplier

    total = _est(prompt)
    if total <= max_duration_s:
        return [PromptChunk(text=prompt, est_duration_s=total)]

    # A soft cap above the hard cap would let multi-sentence chunks grow past
    # max_duration_s, so never let it exceed the hard cap.
    soft_cap = min(target_duration_s, max_duration_s)

    prefix, body = extract_speaker_prefix(prompt)
    sentences = split_sentences_outside_quotes(body)
    if not sentences:
        # Safety net for a body the splitter returned nothing for (it only
        # does so for empty / whitespace-only text): fall back to
        # whitespace tokens.
        sentences = body.split()

    chunks: List[PromptChunk] = []
    current: List[str] = []
    # Invariant: current_dur is the estimate for _assemble(prefix, current)
    # whenever ``current`` is non-empty, so it can be reused when the chunk
    # is closed instead of re-running the estimator.
    current_dur = 0.0
    for sent in sentences:
        cand_dur = _est(_assemble(prefix, current + [sent]))
        if current and cand_dur > soft_cap:
            # Close the current chunk before adding this sentence.
            chunks.append(PromptChunk(text=_assemble(prefix, current),
                                      est_duration_s=current_dur))
            current = [sent]
            current_dur = _est(_assemble(prefix, current))
        else:
            current.append(sent)
            current_dur = cand_dur
        # Pathological case: a single sentence whose estimator output is
        # already past max_duration_s. Emit it on its own and let downstream
        # generate() truncate the request at the model's hard limit; the user
        # gets a degraded but non-crashing result instead of an exception.
        if len(current) == 1 and current_dur > max_duration_s:
            chunks.append(PromptChunk(text=_assemble(prefix, current),
                                      est_duration_s=current_dur))
            current = []
            current_dur = 0.0

    if current:
        chunks.append(PromptChunk(text=_assemble(prefix, current),
                                  est_duration_s=current_dur))
    return chunks