Spaces:
Running on Zero
Running on Zero
| """ | |
| Dramabox Space entrypoint — pure Gradio 5.x for ZeroGPU compat. | |
| Why no FastAPI mount: | |
| - ZeroGPU only allocates a GPU for @spaces.GPU functions wired into Gradio | |
| events (button.click / Interface inputs). FastAPI-mounted endpoints | |
| don't trigger HF's ZeroGPU scheduler, and the mounting pattern was | |
| also causing HF's runtime to kill the container after startup. | |
| - This file mirrors the upstream ResembleAI/Dramabox Space's app.py. | |
| - The React frontend (DramaboxTool.tsx) calls the named API endpoint | |
| via `@gradio/client` instead of fetch(). | |
| Dramabox checkpoints are lazy-loaded on the first request so the Space | |
| boots even before `dramabox_src/` is vendored — first call will surface | |
| the import error to the caller, subsequent calls reuse the warm server. | |
| """ | |
| from __future__ import annotations | |
| import logging | |
| import os | |
| import sys | |
| import tempfile | |
| import threading | |
| import time | |
| from pathlib import Path | |
| import gradio as gr | |
| import spaces | |
| logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") | |
| # Vendored Dramabox source. Resemble doesn't publish TTSServer to PyPI; | |
| # `dramabox_src/` mirrors the upstream Space layout: `src/` (inference glue) | |
| # alongside `ltx2/` (LTX-2 core packages). `inference_server.py` itself does | |
| # `sys.path.insert(0, APP_DIR/'ltx2')` where APP_DIR = parent.parent, so we | |
| # only need to put `dramabox_src/src/` on sys.path here. | |
| _VENDORED_SRC = Path(__file__).parent / "dramabox_src" / "src" | |
| if _VENDORED_SRC.exists() and str(_VENDORED_SRC) not in sys.path: | |
| sys.path.insert(0, str(_VENDORED_SRC)) | |
| _tts_lock = threading.Lock() | |
| _tts = None # populated lazily on first on_generate() call | |
| def _get_tts(): | |
| """Load TTSServer once, reuse across calls. Surfaces a clean error | |
| if `dramabox_src/` isn't vendored — caller sees a gr.Error toast.""" | |
| global _tts | |
| if _tts is not None: | |
| return _tts | |
| with _tts_lock: | |
| if _tts is not None: | |
| return _tts | |
| try: | |
| from inference_server import TTSServer # type: ignore[import-not-found] | |
| from model_downloader import get_all_paths # type: ignore[import-not-found] | |
| except ImportError as e: | |
| raise gr.Error( | |
| "Dramabox source not vendored on this Space. Copy " | |
| "ResembleAI/Dramabox's src/ into the repo as dramabox_src/." | |
| ) from e | |
| logging.info("Fetching Dramabox checkpoints (cached after first run)...") | |
| paths = get_all_paths() | |
| logging.info("Loading Dramabox warm server (Gemma + DiT + VAE + Decoder)...") | |
| _tts = TTSServer( | |
| checkpoint=paths["transformer"], | |
| full_checkpoint=paths["audio_components"], | |
| gemma_root=paths["gemma_root"], | |
| device="cuda", | |
| dtype=os.environ.get("LTX_DTYPE", "bf16"), | |
| compile_model=False, # torch.compile breaks under ZeroGPU's brief GPU windows | |
| bnb_4bit=True, # unsloth Gemma is pre-quantized | |
| ) | |
| logging.info("Dramabox TTSServer ready.") | |
| return _tts | |
| def on_generate(prompt, audio_ref, cfg, stg, dur_mult, gen_dur, ref_dur, seed): | |
| """Main generation endpoint — wired to the Generate button below so | |
| HF's ZeroGPU scheduler detects it at import time.""" | |
| if not prompt or not prompt.strip(): | |
| raise gr.Error("Prompt is empty.") | |
| tts = _get_tts() | |
| t0 = time.time() | |
| ref_path = audio_ref if audio_ref and os.path.exists(str(audio_ref)) else None | |
| output = tempfile.mktemp(suffix=".wav", prefix="dramabox_") | |
| tts.generate_to_file( | |
| prompt=prompt, | |
| output=output, | |
| voice_ref=ref_path, | |
| cfg_scale=float(cfg), | |
| stg_scale=float(stg), | |
| duration_multiplier=float(dur_mult), | |
| seed=int(seed), | |
| gen_duration=float(gen_dur), | |
| ref_duration=float(ref_dur), | |
| ) | |
| elapsed = time.time() - t0 | |
| logging.info(f"Dramabox generated in {elapsed:.2f}s -> {output}") | |
| return output | |
| with gr.Blocks(title="VideoVoice Dramabox") as demo: | |
| gr.Markdown( | |
| """ | |
| # VideoVoice — Dramabox | |
| Resemble AI's directable speech engine ("scene prompts" with quoted | |
| dialogue and stage directions). The React frontend at | |
| [videovoice.app/app/dramabox](https://videovoice.app/app/dramabox) | |
| is the primary UI; this Space exposes the model via the named | |
| `/dramabox` Gradio API endpoint, called from the React app through | |
| `@gradio/client`. | |
| """ | |
| ) | |
| with gr.Row(): | |
| with gr.Column(scale=3): | |
| prompt_in = gr.Textbox( | |
| label="Scene prompt", | |
| placeholder='A weary detective, "I told you it was him." He sighs. "Every time."', | |
| lines=6, | |
| ) | |
| audio_ref_in = gr.Audio( | |
| label="Voice reference (optional, 10+ seconds)", | |
| type="filepath", | |
| ) | |
| gen_btn = gr.Button("Generate", variant="primary", size="lg") | |
| with gr.Column(scale=2): | |
| with gr.Accordion("Inference settings", open=True): | |
| cfg_in = gr.Slider(1.0, 10.0, value=2.5, step=0.5, label="CFG scale") | |
| stg_in = gr.Slider(0.0, 5.0, value=1.5, step=0.5, label="STG scale") | |
| dur_mult_in = gr.Slider( | |
| 0.8, 2.0, value=1.1, step=0.05, | |
| label="Duration × (only used when target duration = 0)", | |
| ) | |
| gen_dur_in = gr.Slider( | |
| 0.0, 60.0, value=0.0, step=1.0, | |
| label="Target duration (s) — 0 = auto", | |
| ) | |
| ref_dur_in = gr.Slider( | |
| 3.0, 30.0, value=10.0, step=1.0, | |
| label="Reference duration (s)", | |
| ) | |
| seed_in = gr.Number(value=42, label="Seed", precision=0) | |
| audio_out = gr.Audio(label="Generated audio", type="filepath") | |
| gen_btn.click( | |
| on_generate, | |
| inputs=[prompt_in, audio_ref_in, cfg_in, stg_in, | |
| dur_mult_in, gen_dur_in, ref_dur_in, seed_in], | |
| outputs=[audio_out], | |
| api_name="dramabox", | |
| ) | |
| if __name__ == "__main__": | |
| demo.queue().launch() | |