github-actions[bot]
deploy: switch to dramabox requirements @ f090cde
d79393d
"""
Dramabox Space entrypoint — pure Gradio 5.x for ZeroGPU compat.
Why no FastAPI mount:
- ZeroGPU only allocates a GPU for @spaces.GPU functions wired into Gradio
events (button.click / Interface inputs). FastAPI-mounted endpoints
don't trigger HF's ZeroGPU scheduler, and the mounting pattern was
also causing HF's runtime to kill the container after startup.
- This file mirrors the upstream ResembleAI/Dramabox Space's app.py.
- The React frontend (DramaboxTool.tsx) calls the named API endpoint
via `@gradio/client` instead of fetch().
Dramabox checkpoints are lazy-loaded on the first request so the Space
boots even before `dramabox_src/` is vendored. While the source is missing,
each call surfaces the import error to the caller; once loading succeeds,
subsequent calls reuse the warm server.
"""
from __future__ import annotations
import logging
import os
import sys
import tempfile
import threading
import time
from pathlib import Path
import gradio as gr
import spaces
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)

# Vendored Dramabox source. TTSServer is not published on PyPI, so the Space
# carries a copy under `dramabox_src/`, laid out like the upstream Space:
# `src/` (inference glue) next to `ltx2/` (LTX-2 core packages).
# `inference_server.py` adds `ltx2/` to sys.path on its own (it inserts
# APP_DIR/'ltx2' with APP_DIR = parent.parent), so only `dramabox_src/src/`
# has to be made importable here.
_VENDORED_SRC = Path(__file__).parent.joinpath("dramabox_src", "src")
if _VENDORED_SRC.exists():
    # Guard against a duplicate entry if this module is imported twice.
    if str(_VENDORED_SRC) not in sys.path:
        sys.path.insert(0, str(_VENDORED_SRC))

# Process-wide TTSServer cache and the lock that serialises its creation.
_tts_lock = threading.Lock()
_tts = None  # stays None until the first on_generate() call loads the server
def _get_tts():
    """Return the shared TTSServer, building it on the first call.

    The instance is stored in the module-level ``_tts`` and handed back
    unchanged on later calls. Creation happens under ``_tts_lock``.

    Raises:
        gr.Error: if ``inference_server`` or ``model_downloader`` cannot be
            imported (e.g. `dramabox_src/` is not vendored); the original
            ImportError is chained as the cause.
    """
    global _tts
    if _tts is None:
        with _tts_lock:
            # Re-check under the lock: another thread may have finished
            # loading while this one was waiting.
            if _tts is None:
                try:
                    from inference_server import TTSServer  # type: ignore[import-not-found]
                    from model_downloader import get_all_paths  # type: ignore[import-not-found]
                except ImportError as import_err:
                    raise gr.Error(
                        "Dramabox source not vendored on this Space. Copy "
                        "ResembleAI/Dramabox's src/ into the repo as dramabox_src/."
                    ) from import_err

                logging.info("Fetching Dramabox checkpoints (cached after first run)...")
                checkpoint_paths = get_all_paths()

                logging.info("Loading Dramabox warm server (Gemma + DiT + VAE + Decoder)...")
                server_options = dict(
                    checkpoint=checkpoint_paths["transformer"],
                    full_checkpoint=checkpoint_paths["audio_components"],
                    gemma_root=checkpoint_paths["gemma_root"],
                    device="cuda",
                    dtype=os.environ.get("LTX_DTYPE", "bf16"),
                    # torch.compile breaks under ZeroGPU's brief GPU windows
                    compile_model=False,
                    # unsloth Gemma is pre-quantized
                    bnb_4bit=True,
                )
                _tts = TTSServer(**server_options)
                logging.info("Dramabox TTSServer ready.")
    return _tts
@spaces.GPU(duration=60)
def on_generate(prompt, audio_ref, cfg, stg, dur_mult, gen_dur, ref_dur, seed):
    """Main generation endpoint: synthesise audio for a scene prompt.

    Wired to the Generate button below so HF's ZeroGPU scheduler detects it
    at import time.

    Args:
        prompt: Scene prompt text; must contain non-whitespace characters.
        audio_ref: Optional path to a voice-reference audio file. Ignored
            (treated as no reference) when empty or when the path does not
            exist on disk.
        cfg: Forwarded as ``cfg_scale`` after conversion to float.
        stg: Forwarded as ``stg_scale`` after conversion to float.
        dur_mult: Forwarded as ``duration_multiplier`` (float).
        gen_dur: Forwarded as ``gen_duration`` (float).
        ref_dur: Forwarded as ``ref_duration`` (float).
        seed: Forwarded as ``seed`` after conversion to int.

    Returns:
        Path (str) of the generated ``.wav`` file in the temp directory.

    Raises:
        gr.Error: if the prompt is empty, or if the TTS server cannot be
            loaded (raised by ``_get_tts``).
    """
    if not prompt or not prompt.strip():
        raise gr.Error("Prompt is empty.")
    tts = _get_tts()
    t0 = time.time()
    ref_path = audio_ref if audio_ref and os.path.exists(str(audio_ref)) else None

    # tempfile.mktemp() is deprecated: it only returns a name, which another
    # process can claim before the file is written. mkstemp() creates the
    # file atomically with owner-only permissions; we only need the path, so
    # the descriptor is closed right away.
    # NOTE(review): assumes generate_to_file() overwrites an existing file at
    # `output` — confirm against the vendored TTSServer.
    fd, output = tempfile.mkstemp(suffix=".wav", prefix="dramabox_")
    os.close(fd)

    try:
        tts.generate_to_file(
            prompt=prompt,
            output=output,
            voice_ref=ref_path,
            cfg_scale=float(cfg),
            stg_scale=float(stg),
            duration_multiplier=float(dur_mult),
            seed=int(seed),
            gen_duration=float(gen_dur),
            ref_duration=float(ref_dur),
        )
    except Exception:
        # Best-effort cleanup so failed requests don't leave empty
        # placeholder files behind; the original error is re-raised.
        try:
            os.remove(output)
        except OSError:
            pass
        raise

    elapsed = time.time() - t0
    logging.info("Dramabox generated in %.2fs -> %s", elapsed, output)
    return output
# UI definition. Components are created in the same order as the layout
# contexts they belong to; the Generate button is bound to on_generate and
# published as the named `dramabox` API endpoint.
with gr.Blocks(title="VideoVoice Dramabox") as demo:
    gr.Markdown(
        value="""
# VideoVoice — Dramabox
Resemble AI's directable speech engine ("scene prompts" with quoted
dialogue and stage directions). The React frontend at
[videovoice.app/app/dramabox](https://videovoice.app/app/dramabox)
is the primary UI; this Space exposes the model via the named
`/dramabox` Gradio API endpoint, called from the React app through
`@gradio/client`.
"""
    )

    with gr.Row():
        # Left column: prompt, optional voice reference, and the trigger.
        with gr.Column(scale=3):
            prompt_in = gr.Textbox(
                label="Scene prompt",
                placeholder='A weary detective, "I told you it was him." He sighs. "Every time."',
                lines=6,
            )
            audio_ref_in = gr.Audio(
                label="Voice reference (optional, 10+ seconds)",
                type="filepath",
            )
            gen_btn = gr.Button(value="Generate", variant="primary", size="lg")

        # Right column: inference knobs and the result player.
        with gr.Column(scale=2):
            with gr.Accordion(label="Inference settings", open=True):
                cfg_in = gr.Slider(
                    minimum=1.0,
                    maximum=10.0,
                    value=2.5,
                    step=0.5,
                    label="CFG scale",
                )
                stg_in = gr.Slider(
                    minimum=0.0,
                    maximum=5.0,
                    value=1.5,
                    step=0.5,
                    label="STG scale",
                )
                dur_mult_in = gr.Slider(
                    minimum=0.8,
                    maximum=2.0,
                    value=1.1,
                    step=0.05,
                    label="Duration × (only used when target duration = 0)",
                )
                gen_dur_in = gr.Slider(
                    minimum=0.0,
                    maximum=60.0,
                    value=0.0,
                    step=1.0,
                    label="Target duration (s) — 0 = auto",
                )
                ref_dur_in = gr.Slider(
                    minimum=3.0,
                    maximum=30.0,
                    value=10.0,
                    step=1.0,
                    label="Reference duration (s)",
                )
                seed_in = gr.Number(value=42, label="Seed", precision=0)
            audio_out = gr.Audio(label="Generated audio", type="filepath")

    # Input order must match on_generate's positional parameters.
    gen_btn.click(
        fn=on_generate,
        inputs=[
            prompt_in,
            audio_ref_in,
            cfg_in,
            stg_in,
            dur_mult_in,
            gen_dur_in,
            ref_dur_in,
            seed_in,
        ],
        outputs=[audio_out],
        api_name="dramabox",
    )

if __name__ == "__main__":
    demo.queue().launch()