""" Dramabox — Resemble AI directable speech engine. Single-Space tool: generates a 48 kHz WAV "performance" from a scene prompt (quoted dialogue + stage directions) and an optional voice reference. Mirrors the official ResembleAI/Dramabox Space's on_generate(): same parameter order, same defaults, same model invocation. This module only runs on the videovoice-dramabox Space, which must vendor the Dramabox `src/` directory (inference_server.py + model_downloader.py) and the requirements-dramabox.txt deps. On any other Space the lazy import below raises a clean RuntimeError rather than crashing app startup. The module loads the TTSServer once on first request (warm-load pattern from the upstream Space) and reuses it across calls. """ from __future__ import annotations import logging import os import threading import time from pathlib import Path import spaces # Backend env knobs — kept compatible with the upstream Space. _LTX_DTYPE = os.environ.get("LTX_DTYPE", "bf16") # Module-level warm load, guarded by a lock so a flurry of concurrent first # requests only triggers one load. Subsequent calls are ~2.5s on warm GPU. _tts_lock = threading.Lock() _tts_server = None # populated lazily on first generate() call logger = logging.getLogger("tools_api.dramabox") def _ensure_server(): """Lazy-import the Dramabox model + load checkpoints once. Raises a clean RuntimeError on Spaces that don't ship the Dramabox `src/` vendoring. """ global _tts_server if _tts_server is not None: return _tts_server with _tts_lock: if _tts_server is not None: return _tts_server try: # Vendored from ResembleAI/Dramabox; the Space's `src/` must be on # sys.path. We add it here so this module doesn't require app.py # to do the insert itself. import sys # Match upstream layout: src/ holds inference_server.py which # then puts the sibling ltx2/ on sys.path itself. vendored_src = Path(__file__).parent.parent / "dramabox_src" / "src" if vendored_src.exists() and str(vendored_src) not in sys.path: sys.path.insert(0, str(vendored_src)) from inference_server import TTSServer # type: ignore[import-not-found] from model_downloader import get_all_paths # type: ignore[import-not-found] except ImportError as e: raise RuntimeError( "Dramabox is not installed on this Space. Vendor " "ResembleAI/Dramabox's src/ directory at " "VideoVoice-be/dramabox_src/ and install requirements-dramabox.txt." ) from e logger.info("Fetching Dramabox checkpoints (cached after first run)...") paths = get_all_paths() logger.info("Loading Dramabox warm server (Gemma + DiT + VAE + Decoder)...") _tts_server = TTSServer( checkpoint=paths["transformer"], full_checkpoint=paths["audio_components"], gemma_root=paths["gemma_root"], device="cuda", dtype=_LTX_DTYPE, compile_model=False, # torch.compile breaks under ZeroGPU's brief GPU windows bnb_4bit=True, # unsloth Gemma is pre-quantized ) logger.info("Dramabox TTSServer ready.") return _tts_server @spaces.GPU(duration=60) def _generate_scene_gpu( *, prompt: str, out_dir: Path, audio_ref: Path | None, cfg: float, stg: float, dur_mult: float, gen_dur: float, ref_dur: float, seed: int, ) -> dict: """Top-level ZeroGPU wrapper so HF detects Dramabox GPU usage at startup.""" return _generate_impl( prompt=prompt, out_dir=out_dir, audio_ref=audio_ref, cfg=cfg, stg=stg, dur_mult=dur_mult, gen_dur=gen_dur, ref_dur=ref_dur, seed=seed, ) def generate_scene( *, prompt: str, out_dir: Path, audio_ref: Path | None = None, cfg: float = 2.5, stg: float = 1.5, dur_mult: float = 1.1, gen_dur: float = 0.0, ref_dur: float = 10.0, seed: int = 42, ) -> dict: """ Run Dramabox on `prompt` and write the resulting WAV under `out_dir`. Returns: { "filename": "dramabox_.wav", "elapsed": , "settings": {...echo of inputs used...}, } """ prompt = (prompt or "").strip() if not prompt: raise ValueError("Prompt is empty.") return _generate_scene_gpu( prompt=prompt, out_dir=out_dir, audio_ref=audio_ref, cfg=cfg, stg=stg, dur_mult=dur_mult, gen_dur=gen_dur, ref_dur=ref_dur, seed=seed, ) def _generate_impl( *, prompt: str, out_dir: Path, audio_ref: Path | None, cfg: float, stg: float, dur_mult: float, gen_dur: float, ref_dur: float, seed: int, ) -> dict: tts = _ensure_server() out_dir.mkdir(parents=True, exist_ok=True) output = out_dir / f"dramabox_{int(time.time() * 1000)}.wav" ref_path: str | None = None if audio_ref is not None and Path(audio_ref).exists(): ref_path = str(audio_ref) t0 = time.time() tts.generate_to_file( prompt=prompt, output=str(output), voice_ref=ref_path, cfg_scale=float(cfg), stg_scale=float(stg), duration_multiplier=float(dur_mult), seed=int(seed), gen_duration=float(gen_dur), ref_duration=float(ref_dur), ) elapsed = time.time() - t0 logger.info(f"Dramabox generated in {elapsed:.2f}s -> {output}") return { "filename": output.name, "elapsed": elapsed, "settings": { "cfg": cfg, "stg": stg, "dur_mult": dur_mult, "gen_dur": gen_dur, "ref_dur": ref_dur, "seed": seed, "had_voice_ref": ref_path is not None, }, }