"""Dramabox Space entrypoint — pure Gradio 5.x for ZeroGPU compat.

Why no FastAPI mount:

- ZeroGPU only allocates a GPU for @spaces.GPU functions wired into Gradio
  events (button.click / Interface inputs). FastAPI-mounted endpoints don't
  trigger HF's ZeroGPU scheduler, and the mounting pattern was also causing
  HF's runtime to kill the container after startup.
- This file mirrors the upstream ResembleAI/Dramabox Space's app.py.
- The React frontend (DramaboxTool.tsx) calls the named API endpoint via
  `@gradio/client` instead of fetch().

Dramabox checkpoints are lazy-loaded on the first request so the Space boots
even before `dramabox_src/` is vendored — first call will surface the import
error to the caller, subsequent calls reuse the warm server.
"""

from __future__ import annotations

import logging
import os
import sys
import tempfile
import threading
import time
from pathlib import Path

import gradio as gr
import spaces

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

# Vendored Dramabox source. Resemble doesn't publish TTSServer to PyPI;
# `dramabox_src/` mirrors the upstream Space layout: `src/` (inference glue)
# alongside `ltx2/` (LTX-2 core packages). `inference_server.py` itself does
# `sys.path.insert(0, APP_DIR/'ltx2')` where APP_DIR = parent.parent, so we
# only need to put `dramabox_src/src/` on sys.path here.
_VENDORED_SRC = Path(__file__).parent / "dramabox_src" / "src"
if _VENDORED_SRC.exists() and str(_VENDORED_SRC) not in sys.path:
    sys.path.insert(0, str(_VENDORED_SRC))

# Seed shown in the UI; also used when the seed field arrives empty (None).
_DEFAULT_SEED = 42

_tts_lock = threading.Lock()
_tts = None  # populated lazily on first on_generate() call


def _get_tts():
    """Load TTSServer once, reuse across calls.

    Surfaces a clean error if `dramabox_src/` isn't vendored — caller sees a
    gr.Error toast.

    Returns:
        The module-level TTSServer instance, constructed on first use.

    Raises:
        gr.Error: if `inference_server` / `model_downloader` cannot be imported.
    """
    global _tts
    # Fast path: skip the lock entirely once the server has been built.
    if _tts is not None:
        return _tts
    with _tts_lock:
        # Re-check under the lock: another thread may have finished loading
        # while this one was waiting.
        if _tts is not None:
            return _tts
        try:
            from inference_server import TTSServer  # type: ignore[import-not-found]
            from model_downloader import get_all_paths  # type: ignore[import-not-found]
        except ImportError as e:
            raise gr.Error(
                "Dramabox source not vendored on this Space. Copy "
                "ResembleAI/Dramabox's src/ into the repo as dramabox_src/."
            ) from e
        logging.info("Fetching Dramabox checkpoints (cached after first run)...")
        paths = get_all_paths()
        logging.info("Loading Dramabox warm server (Gemma + DiT + VAE + Decoder)...")
        _tts = TTSServer(
            checkpoint=paths["transformer"],
            full_checkpoint=paths["audio_components"],
            gemma_root=paths["gemma_root"],
            device="cuda",
            dtype=os.environ.get("LTX_DTYPE", "bf16"),
            compile_model=False,  # torch.compile breaks under ZeroGPU's brief GPU windows
            bnb_4bit=True,  # unsloth Gemma is pre-quantized
        )
        logging.info("Dramabox TTSServer ready.")
    return _tts


@spaces.GPU(duration=60)
def on_generate(prompt, audio_ref, cfg, stg, dur_mult, gen_dur, ref_dur, seed):
    """Main generation endpoint — wired to the Generate button below so HF's
    ZeroGPU scheduler detects it at import time.

    Args:
        prompt: Scene prompt text; must contain non-whitespace characters.
        audio_ref: Filepath of the optional voice reference, or None. A path
            that does not exist on disk is treated as "no reference".
        cfg: CFG scale, forwarded as ``cfg_scale``.
        stg: STG scale, forwarded as ``stg_scale``.
        dur_mult: Forwarded as ``duration_multiplier``.
        gen_dur: Forwarded as ``gen_duration``.
        ref_dur: Forwarded as ``ref_duration``.
        seed: Integer seed; None (cleared field / null from an API client)
            falls back to the UI default.

    Returns:
        Path of the generated ``.wav`` file.

    Raises:
        gr.Error: if the prompt is empty, or the Dramabox source is missing.
    """
    if not prompt or not prompt.strip():
        raise gr.Error("Prompt is empty.")

    tts = _get_tts()
    t0 = time.perf_counter()

    ref_path = audio_ref if audio_ref and os.path.exists(str(audio_ref)) else None

    # tempfile.mktemp() is deprecated: the name it returns can be claimed by
    # another process before the file is written. A freshly created private
    # directory gives a unique path with no such race, and the output file
    # still does not exist beforehand.
    output = os.path.join(tempfile.mkdtemp(prefix="dramabox_"), "dramabox.wav")

    # int(None) would raise a bare TypeError when the seed box is emptied.
    seed_value = _DEFAULT_SEED if seed is None else int(seed)

    tts.generate_to_file(
        prompt=prompt,
        output=output,
        voice_ref=ref_path,
        cfg_scale=float(cfg),
        stg_scale=float(stg),
        duration_multiplier=float(dur_mult),
        seed=seed_value,
        gen_duration=float(gen_dur),
        ref_duration=float(ref_dur),
    )
    elapsed = time.perf_counter() - t0
    logging.info("Dramabox generated in %.2fs -> %s", elapsed, output)
    return output


with gr.Blocks(title="VideoVoice Dramabox") as demo:
    gr.Markdown(
        """
        # VideoVoice — Dramabox

        Resemble AI's directable speech engine ("scene prompts" with quoted
        dialogue and stage directions). The React frontend at
        [videovoice.app/app/dramabox](https://videovoice.app/app/dramabox) is
        the primary UI; this Space exposes the model via the named `/dramabox`
        Gradio API endpoint, called from the React app through `@gradio/client`.
        """
    )
    with gr.Row():
        with gr.Column(scale=3):
            prompt_in = gr.Textbox(
                label="Scene prompt",
                placeholder='A weary detective, "I told you it was him." He sighs. "Every time."',
                lines=6,
            )
            audio_ref_in = gr.Audio(
                label="Voice reference (optional, 10+ seconds)",
                type="filepath",
            )
            gen_btn = gr.Button("Generate", variant="primary", size="lg")
        with gr.Column(scale=2):
            with gr.Accordion("Inference settings", open=True):
                cfg_in = gr.Slider(1.0, 10.0, value=2.5, step=0.5, label="CFG scale")
                stg_in = gr.Slider(0.0, 5.0, value=1.5, step=0.5, label="STG scale")
                dur_mult_in = gr.Slider(
                    0.8,
                    2.0,
                    value=1.1,
                    step=0.05,
                    label="Duration × (only used when target duration = 0)",
                )
                gen_dur_in = gr.Slider(
                    0.0,
                    60.0,
                    value=0.0,
                    step=1.0,
                    label="Target duration (s) — 0 = auto",
                )
                ref_dur_in = gr.Slider(
                    3.0,
                    30.0,
                    value=10.0,
                    step=1.0,
                    label="Reference duration (s)",
                )
                seed_in = gr.Number(value=_DEFAULT_SEED, label="Seed", precision=0)
            # NOTE(review): the nesting level of audio_out is reconstructed
            # (assumed to sit in the right column, below the accordion) —
            # confirm against the intended layout.
            audio_out = gr.Audio(label="Generated audio", type="filepath")

    # api_name exposes this event as the named `/dramabox` API endpoint.
    gen_btn.click(
        on_generate,
        inputs=[
            prompt_in,
            audio_ref_in,
            cfg_in,
            stg_in,
            dur_mult_in,
            gen_dur_in,
            ref_dur_in,
            seed_in,
        ],
        outputs=[audio_out],
        api_name="dramabox",
    )


if __name__ == "__main__":
    demo.queue().launch()