File size: 14,219 Bytes
08c5e28
fdc2b0b
08c5e28
 
 
 
 
 
 
 
 
 
 
96dc22c
e53641f
 
 
 
96dc22c
08c5e28
e53641f
08c5e28
 
 
8cd4942
08c5e28
 
 
fdc2b0b
ac99a44
b8b67ad
ac99a44
 
 
 
fdc2b0b
ac99a44
 
 
 
 
 
 
 
 
 
08c5e28
 
f1c4065
 
 
 
 
 
08c5e28
 
f1c4065
08c5e28
 
f1c4065
08c5e28
 
 
f1c4065
08c5e28
 
f1c4065
08c5e28
 
 
f1c4065
08c5e28
 
f1c4065
08c5e28
 
 
f1c4065
08c5e28
 
f1c4065
08c5e28
 
 
f1c4065
08c5e28
 
f1c4065
08c5e28
 
 
f1c4065
08c5e28
 
f1c4065
08c5e28
 
 
f1c4065
08c5e28
 
 
f1c4065
08c5e28
 
 
f1c4065
08c5e28
f1c4065
08c5e28
36a9e0f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
08c5e28
 
 
e53641f
 
 
 
 
 
 
 
 
 
 
 
 
48d32ab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e53641f
48d32ab
e53641f
 
 
 
 
 
 
 
7e0eb32
 
 
 
 
e53641f
08c5e28
 
7e0eb32
08c5e28
e53641f
 
 
 
 
 
 
 
7e0eb32
fdc2b0b
7e0eb32
 
 
 
08c5e28
 
 
 
e53641f
 
 
 
5cc51a5
433ac9f
7e0eb32
 
 
 
08c5e28
 
 
e53641f
b2203ed
08c5e28
 
9165469
e53641f
 
 
 
08c5e28
e53641f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
#!/usr/bin/env python3
"""DramaBox β€” Gradio demo (warm server).

Loads the warm TTSServer once, then handles requests at ~2.5 s each. All
generated audio is invisibly watermarked with Resemble Perth before being
returned to the user.
"""
import logging
import os
import sys
import tempfile
import time

import gradio as gr
from fastapi.responses import HTMLResponse
from fastapi.staticfiles import StaticFiles
from gradio import Server
from gradio.data_classes import FileData
import spaces


# Local src import.
sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "src"))
from inference_server import TTSServer  # noqa: E402
from model_downloader import get_all_paths  # noqa: E402


# Configure root logging first so every message emitted during the (slow)
# import-time setup below is timestamped and visible.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logging.info("Fetching DramaBox checkpoints from HuggingFace (cached after first run)...")
# Checkpoint locations returned by the downloader; the keys read below are
# "transformer", "audio_components" and "gemma_root".
PATHS = get_all_paths()

# Module-level warm load (same pattern as IndexTTS-2-Demo on ZeroGPU). The
# `spaces` package patches torch so that .to("cuda") at import time pins the
# weights into ZeroGPU's shared memory; each @spaces.GPU call then maps them
# onto the actual GPU instantly. First user request is ~2.5 s instead of ~30 s.
logging.info("Loading DramaBox warm server (Gemma + DiT + VAE + Decoder)...")
# Single shared server instance, created once at import time and reused by
# every request handled in this module.
tts = TTSServer(
    checkpoint=PATHS["transformer"],
    full_checkpoint=PATHS["audio_components"],
    gemma_root=PATHS["gemma_root"],
    device="cuda",
    # Precision can be overridden with the LTX_DTYPE environment variable;
    # "bf16" is used when it is unset.
    dtype=os.environ.get("LTX_DTYPE", "bf16"),
    compile_model=False,                  # torch.compile breaks under ZeroGPU's brief GPU windows
    bnb_4bit=True,                        # unsloth Gemma is pre-quantized
)
logging.info("TTSServer ready.")


# ── Example prompts shipped with a matching voice reference ──────────────────
# Files live under assets/voices/ so users can click a row and generate
# without uploading anything.
_VOICES_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "voices")

# Each entry is a (label, voice reference path, prompt text) triple. The five
# long-form labels carry a "30s • " prefix; the separator is a real bullet
# character (it had been stored as mis-decoded bytes).
EXAMPLES: list[tuple[str, str, str]] = [
    (
        "Villain monologue",
        os.path.join(_VOICES_DIR, "male_harvey_keitel.mp3"),
        'A shadowy villain speaks with cold menace, "You have entered my domain, mortal." '
        'He chuckles darkly, "Such arrogance will be your undoing." '
        'His voice rises with fury, "Kneel, or be destroyed where you stand!"',
    ),
    (
        "Talk-show host wheeze-laugh",
        os.path.join(_VOICES_DIR, "male_conan.mp3"),
        'A talk show host gasps with shock, "No! You did NOT just say that!" '
        'He bursts into uncontrollable laughter, "Hahaha! Oh my god, oh my god!" '
        'He wheezes, "I cannot, I literally cannot breathe right now!"',
    ),
    (
        "Tender goodnight whisper",
        os.path.join(_VOICES_DIR, "female_shadowheart.wav"),
        'A woman speaks tenderly, "It has been a long day, my love." '
        'She whispers, "Close your eyes. I am right here." '
        'She hums quietly, "Mmmm-mmm. Sleep now."',
    ),
    (
        "Old-school radio anchor",
        os.path.join(_VOICES_DIR, "male_old_movie.wav"),
        'A radio host clears his throat, "Excuse me, pardon that." '
        'He settles into a warm, professional tone, "Good evening everyone, '
        'and welcome back to the show. We have got a wonderful lineup tonight."',
    ),
    (
        "Catgirl uncontrollable giggling",
        os.path.join(_VOICES_DIR, "female_american.wav"),
        'A playful girl already mid-giggle, "Hehehe, oh my gosh you should see your face!" '
        'She gasps for air between giggles, "Oh my, hehe, oh my, I cannot stop!" '
        'She tries to compose herself, "Ahhhhh okay okay okay, I will stop, I promise."',
    ),
    (
        "Hero stammering courage",
        os.path.join(_VOICES_DIR, "male_arnie.mp3"),
        'A young warrior speaks with a trembling voice, "I... I do not know if I can do this." '
        'He takes a shaky breath, "But someone has to try." '
        'His voice steadies with growing fire, "No more running. I WILL fight!"',
    ),
    (
        "Exhausted dad, fraying patience",
        os.path.join(_VOICES_DIR, "male_petergriffin.wav"),
        'An exhausted father speaks with fraying patience, "Sweetie, daddy is asking very nicely." '
        'He sighs deeply, "Ohhhh my goodness." '
        'He puts on an overly cheerful voice, "Hey buddy! Look at the shiny thing!" '
        'Then he laughs helplessly, "Hahaha, I am losing my mind."',
    ),
    (
        "Smug-confident announcer",
        os.path.join(_VOICES_DIR, "male_samuel_j.mp3"),
        'A confident announcer speaks proudly, "And now, the moment you have all been waiting for." '
        'He chuckles knowingly, "Heheh, trust me, this one is going to blow you away."',
    ),
    # ── Long-form examples (~30 s each) ───────────────────────────────────────
    # These pair a richer multi-beat scene with gen_duration = 30 s in the
    # Examples row below so the model is asked for a full half-minute clip.
    (
        "30s • Villain soliloquy",
        os.path.join(_VOICES_DIR, "male_harvey_keitel.mp3"),
        'A shadowy villain stands at the edge of his throne room, gazing into the dark. '
        'He speaks with slow, measured menace, "So, the little hero has come to finish me, has he?" '
        'He chuckles low and humourless, "Hehe, oh how delightfully predictable you mortals are." '
        'His voice hardens into ice, "I have lived ten thousand years. I have seen empires rise and fall like the tide." '
        'He scoffs, "And you think you, with your borrowed sword and your trembling hands, will be the one to end me?" '
        'A long pause. He whispers, almost tenderly, "I will give you a single chance to turn around and walk away." '
        'Then his voice rises with crushing finality, "Choose, child. The door behind you, or the grave at your feet."',
    ),
    (
        "30s • Late-night radio monologue",
        os.path.join(_VOICES_DIR, "male_old_movie.wav"),
        'A radio host clears his throat softly into the microphone in the late hours of the night. '
        'He settles into a warm, smoky tone, "Good evening, dear listeners, and welcome back to the After Hours Hour." '
        'He sighs contentedly, "Mmm, what a night it has been. The rain is tapping at my window like an old friend." '
        'He chuckles softly, "Heheh, you know the kind of friend, the one that always shows up unannounced." '
        'His voice drops, intimate, "I want you to lean back, wherever you are. Pour yourself something warm." '
        'He pauses, breath audible, "Tonight we are going to talk about love, and loss, and the songs that hold us together." '
        'A smile in his voice, "And I have got the perfect record cued up to start us off, so stay right where you are."',
    ),
    (
        "30s • Stand-up wheeze-laugh",
        os.path.join(_VOICES_DIR, "male_conan.mp3"),
        'A talk show host walks out and the crowd is already roaring. He gasps in mock outrage, "No! No no no!" '
        'He bursts into uncontrollable laughter, "Hahahaha, oh my god, oh my god, you cannot do that to me already!" '
        'He wheezes, gasping for air, "I have not even, hahaha, I have not even said hello yet!" '
        'He tries to compose himself, "Okay, okay, just give me a second here, give me a second." '
        'He clears his throat dramatically, "Ahem. Good evening, ladies and gentlemen." '
        'Then he loses it again, "Hahaha! No, sorry, sorry, I just remembered what happened in the green room." '
        'He pants, "Oh man, oh man, this is going to be one of those nights, I can already tell."',
    ),
    (
        "30s • Bedtime story",
        os.path.join(_VOICES_DIR, "female_shadowheart.wav"),
        'A mother sits at the edge of her child\'s bed in the dim glow of a single lamp. '
        'She speaks softly, "Once upon a time, in a kingdom by the sea, there lived a small dragon named Pip." '
        'She lowers her voice playfully, "Now Pip was not like the other dragons. Pip was afraid of fire." '
        'She smiles warmly, "Mmm, can you imagine? A dragon who was afraid of his own breath?" '
        'A gentle pause, "But Pip had something the other dragons did not have. Pip had courage in his heart." '
        'She hums softly, "Mmmmm. And one cold winter night, when the village below ran out of warmth..." '
        'Her voice drops to a whisper, "Pip closed his eyes, took a deep, deep breath, and remembered who he was."',
    ),
    (
        "30s • Sports commentary",
        os.path.join(_VOICES_DIR, "male_samuel_j.mp3"),
        'A sports commentator leans into the microphone with the crowd roaring around him. '
        'He shouts with rising energy, "Oh, this is it! This is the moment we have been waiting for all season!" '
        'He pants between phrases, "She has the ball at midfield, she is dribbling past one, past two!" '
        'A sudden gasp, "Oh my, what a move! Did you see that footwork, ladies and gentlemen?" '
        'His voice climbs, "She is twenty yards out, fifteen yards out, she winds back, and she SHOOTS!" '
        'A massive pause, then, "GOAAAAAAL! What a strike! What an absolute thunderbolt of a goal!" '
        'He laughs in disbelief, "Hahaha! Unbelievable! Forty thousand fans on their feet, and so am I!"',
    ),
]


app = Server()

# Serve static voice files and images. The directory is resolved against this
# file's location (the same way _VOICES_DIR and index.html are located) rather
# than the current working directory, so the mount does not break when the
# process is started from somewhere other than the project root.
app.mount(
    "/assets",
    StaticFiles(directory=os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets")),
    name="assets",
)


@app.get("/", response_class=HTMLResponse)
async def homepage():
    """Return the contents of the ``index.html`` that sits next to this file.

    The file is re-read on every request and returned as a string, which the
    route's ``HTMLResponse`` response class sends back as the page body.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    with open(os.path.join(here, "index.html"), encoding="utf-8") as page:
        markup = page.read()
    return markup


# ZeroGPU window sizing knobs, all expressed in seconds.
# Bare-minimum window, requested even for a single sentence.
_GPU_BASE_S = 10
# Extra time added for every sentence after the first.
_GPU_PER_SENTENCE_S = 1
# Upper bound; leaves 10 s of headroom under ZeroGPU's 120 s ceiling.
_GPU_CAP_S = 110


def _count_sentences(prompt: str) -> int:
    """Return the number of TTS sentences in ``prompt`` (never less than 1).

    The count comes from ``split_sentences_outside_quotes`` in
    ``src/text_chunker`` — the same quote-aware splitter the long-form chunker
    uses — so terminators inside ``"..."`` dialogue are **not** counted and the
    GPU window calculation agrees with what the chunker sees. An empty or
    whitespace-only prompt counts as one sentence so that even a bare fragment
    gets a real window.
    """
    if not (prompt and prompt.strip()):
        return 1
    try:
        from text_chunker import split_sentences_outside_quotes as _split
        count = len(_split(prompt))
    except Exception:
        # Deliberate best-effort fallback: if the chunker cannot be imported or
        # fails for any reason, count every sentence terminator instead, so GPU
        # windows can still be sized on a broken import path.
        count = sum(prompt.count(mark) for mark in ".!?")
    return count if count > 1 else 1


def _gpu_duration(
    prompt: str,
    audio_ref: FileData | None,
    cfg: float,
    stg: float,
    dur_mult: float,
    gen_dur: float,
    ref_dur: float,
    seed: int,
    denoise_ref: bool = True,
    max_chunk_duration: float = 45.0,
    target_chunk_duration: float = 37.0,
    crossfade_ms: float = 50.0,
) -> int:
    """Return the ZeroGPU window, in seconds, to request for one call.

    This function is passed as ``duration=`` to ``spaces.GPU``, which accepts
    a callable evaluated per request. The parameter list mirrors
    ``generate_audio``, but only ``prompt`` is read here:

        window = _GPU_BASE_S + (num_sentences - 1) × _GPU_PER_SENTENCE_S

    The result is clamped to the range ``[_GPU_BASE_S, _GPU_CAP_S]``, i.e.
    10 s base + 1 s per extra sentence, capped at 110 s (a 10 s safety margin
    under the 120 s ZeroGPU per-call ceiling on PRO). The numbers were tuned
    to observed runtime on this Space's hardware.

    Why not always request the cap: shorter allocations improve queue
    priority (per HF docs). Why not request less: a call that outlives its
    window is killed by ZeroGPU — the user sees a generation failure — and
    the time spent still counts against the daily quota, so under-allocating
    is the worse mistake.
    """
    extra_sentences = _count_sentences(prompt) - 1
    window = _GPU_BASE_S + extra_sentences * _GPU_PER_SENTENCE_S
    # Clamp from above first, then from below, matching max(base, min(x, cap)).
    if window > _GPU_CAP_S:
        window = _GPU_CAP_S
    if window < _GPU_BASE_S:
        window = _GPU_BASE_S
    return window


@app.api()
@spaces.GPU(duration=_gpu_duration)
def generate_audio(
    prompt: str,
    audio_ref: FileData | None,
    cfg: float,
    stg: float,
    dur_mult: float,
    gen_dur: float,
    ref_dur: float,
    seed: int,
    denoise_ref: bool = True,
    max_chunk_duration: float = 45.0,
    target_chunk_duration: float = 37.0,
    crossfade_ms: float = 50.0,
) -> FileData:
    """Generate speech for ``prompt`` with the warm ``tts`` server.

    Args:
        prompt: Text to synthesize; must contain non-whitespace characters.
        audio_ref: Optional voice reference, either a dict with a ``"path"``
            key or an object with a ``path`` attribute. It is ignored when
            absent or when the path does not exist on disk.
        cfg: Forwarded to ``generate_to_file`` as ``cfg_scale``.
        stg: Forwarded as ``stg_scale``.
        dur_mult: Forwarded as ``duration_multiplier``.
        gen_dur: Forwarded as ``gen_duration`` (coerced to ``float``).
        ref_dur: Forwarded as ``ref_duration`` (coerced to ``float``).
        seed: Forwarded as ``seed`` (coerced to ``int``).
        denoise_ref: Forwarded as ``denoise_ref`` (coerced to ``bool``).
        max_chunk_duration: Forwarded unchanged in meaning, as ``float``.
        target_chunk_duration: Forwarded unchanged in meaning, as ``float``.
        crossfade_ms: Forwarded unchanged in meaning, as ``float``.

    Returns:
        A ``FileData`` pointing at the generated ``.wav`` in the temp dir.

    Raises:
        gr.Error: If ``prompt`` is empty or whitespace-only.
    """
    if not prompt or not prompt.strip():
        raise gr.Error("Prompt is empty.")

    # Monotonic clock: the value is only used to measure elapsed time.
    started = time.perf_counter()

    ref_path = None
    if audio_ref:
        if isinstance(audio_ref, dict):
            ref_path = audio_ref.get("path")
        else:
            ref_path = getattr(audio_ref, "path", None)
    if ref_path and not os.path.exists(ref_path):
        # Keep the original best-effort behaviour (generate without a
        # reference) but leave a trace instead of dropping it silently.
        logging.warning("Voice reference %s not found; generating without it.", ref_path)
        ref_path = None

    # mkstemp creates the file atomically with a unique name, unlike the
    # deprecated, race-prone mktemp. Only the path is needed, so the
    # descriptor is closed right away and the file is overwritten below.
    fd, output = tempfile.mkstemp(suffix=".wav", prefix="dramabox_")
    os.close(fd)
    try:
        # Long-form: generate_to_file auto-routes to the chunk-and-stitch path
        # when the estimated (or explicit gen_dur) duration exceeds
        # max_chunk_duration. denoise_ref runs RE-USE on the voice reference
        # before VAE encoding so the model conditions on a cleaner speaker /
        # style anchor.
        tts.generate_to_file(
            prompt=prompt,
            output=output,
            voice_ref=ref_path,
            cfg_scale=cfg,
            stg_scale=stg,
            duration_multiplier=dur_mult,
            seed=int(seed),
            gen_duration=float(gen_dur),
            ref_duration=float(ref_dur),
            denoise_ref=bool(denoise_ref),
            max_chunk_duration=float(max_chunk_duration),
            target_chunk_duration=float(target_chunk_duration),
            crossfade_ms=float(crossfade_ms),
        )
    except BaseException:
        # Do not leave an empty placeholder behind when generation fails.
        try:
            os.remove(output)
        except OSError:
            pass
        raise
    elapsed = time.perf_counter() - started
    logging.info("Generated in %.2fs -> %s", elapsed, output)
    return FileData(path=output)


if __name__ == "__main__":
    # Listen on all interfaces; the port comes from GRADIO_SERVER_PORT and
    # falls back to 7860 when the variable is unset.
    app.launch(
        server_name="0.0.0.0",
        server_port=int(os.environ.get("GRADIO_SERVER_PORT", "7860")),
        show_error=True,
    )