Spaces:
Running on Zero
Running on Zero
| """ | |
| Dramabox — Resemble AI directable speech engine. | |
| Single-Space tool: generates a 48 kHz WAV "performance" from a scene prompt | |
| (quoted dialogue + stage directions) and an optional voice reference. Mirrors | |
| the official ResembleAI/Dramabox Space's on_generate(): same parameter order, | |
| same defaults, same model invocation. | |
| This module only runs on the videovoice-dramabox Space, which must vendor the | |
| Dramabox `src/` directory (inference_server.py + model_downloader.py) and the | |
| requirements-dramabox.txt deps. On any other Space the lazy import below | |
| raises a clean RuntimeError rather than crashing app startup. | |
| The module loads the TTSServer once on first request (warm-load pattern from | |
| the upstream Space) and reuses it across calls. | |
| """ | |
| from __future__ import annotations | |
| import logging | |
| import os | |
| import threading | |
| import time | |
| from pathlib import Path | |
| import spaces | |
# Module logger. The name is a fixed string rather than derived from __name__.
logger = logging.getLogger("tools_api.dramabox")

# Precision knob, read from the environment once at import time. The variable
# name and the "bf16" fallback are kept compatible with the upstream Space.
_LTX_DTYPE = os.getenv("LTX_DTYPE", "bf16")

# Warm-load state shared by every request. The lock makes sure a flurry of
# concurrent first requests only triggers one load; once the server exists,
# subsequent calls reuse it (~2.5s per call on a warm GPU).
_tts_server = None  # populated lazily by the first _ensure_server() call
_tts_lock = threading.Lock()
def _ensure_server():
    """Return the shared Dramabox ``TTSServer``, creating it on first use.

    The first caller imports the vendored Dramabox modules, resolves the
    checkpoint paths and constructs the server while holding ``_tts_lock``.
    Every later caller gets the cached module-level instance back.

    Raises:
        RuntimeError: if importing ``inference_server`` or
            ``model_downloader`` fails with ``ImportError`` (typically a
            Space that does not ship the Dramabox ``src/`` vendoring).
    """
    global _tts_server

    if _tts_server is not None:
        return _tts_server

    with _tts_lock:
        # Re-check under the lock: another thread may have finished the load
        # while this one was waiting to acquire it.
        if _tts_server is not None:
            return _tts_server

        import sys

        # Match the upstream layout: src/ holds inference_server.py, which
        # then puts the sibling ltx2/ on sys.path itself. Inserting src/ here
        # means app.py does not have to do it.
        src_dir = Path(__file__).parent.parent / "dramabox_src" / "src"
        src_entry = str(src_dir)
        if src_dir.exists() and src_entry not in sys.path:
            sys.path.insert(0, src_entry)

        try:
            from inference_server import TTSServer  # type: ignore[import-not-found]
            from model_downloader import get_all_paths  # type: ignore[import-not-found]
        except ImportError as import_err:
            raise RuntimeError(
                "Dramabox is not installed on this Space. Vendor "
                "ResembleAI/Dramabox's src/ directory at "
                "VideoVoice-be/dramabox_src/ and install requirements-dramabox.txt."
            ) from import_err

        logger.info("Fetching Dramabox checkpoints (cached after first run)...")
        checkpoint_paths = get_all_paths()

        logger.info("Loading Dramabox warm server (Gemma + DiT + VAE + Decoder)...")
        server = TTSServer(
            checkpoint=checkpoint_paths["transformer"],
            full_checkpoint=checkpoint_paths["audio_components"],
            gemma_root=checkpoint_paths["gemma_root"],
            device="cuda",
            dtype=_LTX_DTYPE,
            compile_model=False,  # torch.compile breaks under ZeroGPU's brief GPU windows
            bnb_4bit=True,  # unsloth Gemma is pre-quantized
        )
        _tts_server = server
        logger.info("Dramabox TTSServer ready.")
        return server
def _generate_scene_gpu(
    *,
    prompt: str,
    out_dir: Path,
    audio_ref: Path | None,
    cfg: float,
    stg: float,
    dur_mult: float,
    gen_dur: float,
    ref_dur: float,
    seed: int,
) -> dict:
    """Module-scope ZeroGPU entry point that forwards to ``_generate_impl``.

    Kept at top level so HF can detect Dramabox GPU usage at startup. Every
    keyword argument is passed through unchanged and the dict produced by
    ``_generate_impl`` is returned as-is.

    NOTE(review): no ``@spaces.GPU`` decorator is visible on this function in
    this file -- confirm the ZeroGPU wrapping is applied where it is needed.
    """
    forwarded = {
        "prompt": prompt,
        "out_dir": out_dir,
        "audio_ref": audio_ref,
        "cfg": cfg,
        "stg": stg,
        "dur_mult": dur_mult,
        "gen_dur": gen_dur,
        "ref_dur": ref_dur,
        "seed": seed,
    }
    return _generate_impl(**forwarded)
def generate_scene(
    *,
    prompt: str,
    out_dir: Path,
    audio_ref: Path | None = None,
    cfg: float = 2.5,
    stg: float = 1.5,
    dur_mult: float = 1.1,
    gen_dur: float = 0.0,
    ref_dur: float = 10.0,
    seed: int = 42,
) -> dict:
    """
    Render `prompt` with Dramabox and save the resulting WAV under `out_dir`.

    Surrounding whitespace is stripped from the prompt before it is handed to
    the GPU entry point; all other arguments are forwarded untouched.

    Returns:
        {
            "filename": "dramabox_<timestamp>.wav",
            "elapsed": <seconds>,
            "settings": {...echo of inputs used...},
        }

    Raises:
        ValueError: if `prompt` is missing, empty or whitespace-only.
    """
    scene_text = prompt.strip() if prompt else ""
    if scene_text == "":
        raise ValueError("Prompt is empty.")

    request = {
        "prompt": scene_text,
        "out_dir": out_dir,
        "audio_ref": audio_ref,
        "cfg": cfg,
        "stg": stg,
        "dur_mult": dur_mult,
        "gen_dur": gen_dur,
        "ref_dur": ref_dur,
        "seed": seed,
    }
    return _generate_scene_gpu(**request)
def _generate_impl(
    *,
    prompt: str,
    out_dir: Path,
    audio_ref: Path | None,
    cfg: float,
    stg: float,
    dur_mult: float,
    gen_dur: float,
    ref_dur: float,
    seed: int,
) -> dict:
    """Run one Dramabox generation and write the WAV into ``out_dir``.

    Obtains the shared server from ``_ensure_server()``, creates ``out_dir``
    if it does not exist, and asks the server to write
    ``dramabox_<epoch-milliseconds>.wav`` there. The voice reference is only
    forwarded when ``audio_ref`` points at an existing path; if one was given
    but is missing, a warning is logged and generation proceeds without it.

    Returns:
        A dict with:
          * ``"filename"``: name (not full path) of the WAV that was written,
          * ``"elapsed"``: seconds spent inside ``generate_to_file``,
          * ``"settings"``: echo of the numeric inputs plus
            ``"had_voice_ref"``, which is True only when a reference file
            was actually passed to the server.
    """
    tts = _ensure_server()

    # Accept plain string paths as well as Path objects.
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    output = out_dir / f"dramabox_{int(time.time() * 1000)}.wav"

    ref_path: str | None = None
    if audio_ref is not None:
        if Path(audio_ref).exists():
            ref_path = str(audio_ref)
        else:
            # Best-effort fallback is kept, but no longer silent.
            logger.warning(
                "Dramabox voice reference not found, generating without it: %s",
                audio_ref,
            )

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # wall-clock adjustments the way time.time() differences can.
    started = time.perf_counter()
    tts.generate_to_file(
        prompt=prompt,
        output=str(output),
        voice_ref=ref_path,
        cfg_scale=float(cfg),
        stg_scale=float(stg),
        duration_multiplier=float(dur_mult),
        seed=int(seed),
        gen_duration=float(gen_dur),
        ref_duration=float(ref_dur),
    )
    elapsed = time.perf_counter() - started

    logger.info("Dramabox generated in %.2fs -> %s", elapsed, output)
    return {
        "filename": output.name,
        "elapsed": elapsed,
        "settings": {
            "cfg": cfg,
            "stg": stg,
            "dur_mult": dur_mult,
            "gen_dur": gen_dur,
            "ref_dur": ref_dur,
            "seed": seed,
            "had_voice_ref": ref_path is not None,
        },
    }