Spaces:

Rafii
/

videovoice-dramabox

Running on Zero

github-actions[bot]

deploy: switch to dramabox requirements @ f090cde

d79393d 7 days ago

6.23 kB

	"""
	Dramabox Space entrypoint — pure Gradio 5.x for ZeroGPU compat.

	Why no FastAPI mount:
	- ZeroGPU only allocates a GPU for @spaces.GPU functions wired into Gradio
	events (button.click / Interface inputs). FastAPI-mounted endpoints
	don't trigger HF's ZeroGPU scheduler, and the mounting pattern was
	also causing HF's runtime to kill the container after startup.
	- This file mirrors the upstream ResembleAI/Dramabox Space's app.py.
	- The React frontend (DramaboxTool.tsx) calls the named API endpoint
	via `@gradio/client` instead of fetch().

	Dramabox checkpoints are lazy-loaded on the first request so the Space
	boots even before `dramabox_src/` is vendored — first call will surface
	the import error to the caller, subsequent calls reuse the warm server.
	"""
	from __future__ import annotations

	import logging
	import os
	import sys
	import tempfile
	import threading
	import time
	from pathlib import Path

	import gradio as gr
	import spaces

	# Timestamped, INFO-level logging for everything this module reports.
	logging.basicConfig(
	    format="%(asctime)s %(levelname)s %(message)s",
	    level=logging.INFO,
	)

	# Dramabox is consumed as vendored source because Resemble doesn't publish
	# TTSServer to PyPI. `dramabox_src/` follows the upstream Space layout, with
	# `src/` (inference glue) sitting next to `ltx2/` (LTX-2 core packages).
	# `inference_server.py` adds `ltx2/` to sys.path on its own (it inserts
	# APP_DIR/'ltx2' where APP_DIR = parent.parent), so the only entry needed
	# from this file is `dramabox_src/src/`.
	_VENDORED_SRC = Path(__file__).parent.joinpath("dramabox_src", "src")
	if str(_VENDORED_SRC) not in sys.path and _VENDORED_SRC.exists():
	    sys.path.insert(0, str(_VENDORED_SRC))

	# Lazily-created TTS server and the lock guarding its one-time creation;
	# `_tts` stays None until the first on_generate() call populates it.
	_tts = None
	_tts_lock = threading.Lock()


	def _get_tts():
	    """Return the shared TTSServer, constructing it on the first call.

	    The server is stored in the module-level `_tts` and reused afterwards.
	    Construction happens under `_tts_lock`, and `_tts` is checked again
	    once the lock is held so a thread that waited on the lock does not
	    build a second server.

	    Raises:
	        gr.Error: if `inference_server` / `model_downloader` cannot be
	            imported, i.e. `dramabox_src/` isn't vendored; the original
	            ImportError is chained as the cause.
	    """
	    global _tts
	    if _tts is None:
	        with _tts_lock:
	            # Re-check now that we own the lock: another caller may have
	            # finished loading while we were waiting.
	            if _tts is None:
	                try:
	                    from inference_server import TTSServer  # type: ignore[import-not-found]
	                    from model_downloader import get_all_paths  # type: ignore[import-not-found]
	                except ImportError as import_failure:
	                    raise gr.Error(
	                        "Dramabox source not vendored on this Space. Copy "
	                        "ResembleAI/Dramabox's src/ into the repo as dramabox_src/."
	                    ) from import_failure

	                logging.info("Fetching Dramabox checkpoints (cached after first run)...")
	                checkpoint_paths = get_all_paths()

	                logging.info("Loading Dramabox warm server (Gemma + DiT + VAE + Decoder)...")
	                server_options = dict(
	                    checkpoint=checkpoint_paths["transformer"],
	                    full_checkpoint=checkpoint_paths["audio_components"],
	                    gemma_root=checkpoint_paths["gemma_root"],
	                    device="cuda",
	                    dtype=os.environ.get("LTX_DTYPE", "bf16"),
	                    # torch.compile breaks under ZeroGPU's brief GPU windows
	                    compile_model=False,
	                    # unsloth Gemma is pre-quantized
	                    bnb_4bit=True,
	                )
	                _tts = TTSServer(**server_options)
	                logging.info("Dramabox TTSServer ready.")
	    return _tts


	@spaces.GPU(duration=60)
	def on_generate(prompt, audio_ref, cfg, stg, dur_mult, gen_dur, ref_dur, seed):
	    """Generate speech for a scene prompt and return the path of the WAV file.

	    Main generation endpoint — wired to the Generate button so HF's
	    ZeroGPU scheduler detects it at import time.

	    Args:
	        prompt: Scene prompt text; must contain non-whitespace characters.
	        audio_ref: Path of an optional voice-reference file. It is ignored
	            (treated as no reference) when falsy or when the path does not
	            exist on disk.
	        cfg: Forwarded as `cfg_scale` after conversion to float.
	        stg: Forwarded as `stg_scale` after conversion to float.
	        dur_mult: Forwarded as `duration_multiplier` after conversion to float.
	        gen_dur: Forwarded as `gen_duration` after conversion to float.
	        ref_dur: Forwarded as `ref_duration` after conversion to float.
	        seed: Forwarded as `seed` after conversion to int.

	    Returns:
	        Filesystem path of the generated `.wav` file.

	    Raises:
	        gr.Error: if the prompt is empty or whitespace-only, or (from
	            `_get_tts`) if the Dramabox source cannot be imported.
	    """
	    if not prompt or not prompt.strip():
	        raise gr.Error("Prompt is empty.")
	    tts = _get_tts()
	    ref_path = audio_ref if audio_ref and os.path.exists(str(audio_ref)) else None

	    # tempfile.mktemp() is deprecated: it only returns a name, so another
	    # process can claim that path before we write to it. mkstemp() creates
	    # the file atomically; we close the descriptor right away because the
	    # TTS server writes through the path, not the handle.
	    # NOTE(review): assumes generate_to_file overwrites an existing file —
	    # confirm against the vendored TTSServer.
	    fd, output = tempfile.mkstemp(suffix=".wav", prefix="dramabox_")
	    os.close(fd)

	    # perf_counter is monotonic, so the logged duration is immune to
	    # wall-clock adjustments.
	    t0 = time.perf_counter()
	    try:
	        tts.generate_to_file(
	            prompt=prompt,
	            output=output,
	            voice_ref=ref_path,
	            cfg_scale=float(cfg),
	            stg_scale=float(stg),
	            duration_multiplier=float(dur_mult),
	            seed=int(seed),
	            gen_duration=float(gen_dur),
	            ref_duration=float(ref_dur),
	        )
	    except Exception:
	        # Don't leave the placeholder file behind when generation fails;
	        # cleanup is best-effort and must not mask the original error.
	        try:
	            os.remove(output)
	        except OSError:
	            pass
	        raise
	    elapsed = time.perf_counter() - t0
	    logging.info("Dramabox generated in %.2fs -> %s", elapsed, output)
	    return output


	# Gradio UI: prompt + optional voice reference + Generate button on the left,
	# inference settings + audio output on the right. The button's click event
	# is also what publishes the named `dramabox` API endpoint.
	with gr.Blocks(title="VideoVoice Dramabox") as demo:
	    gr.Markdown(
	        """
	# VideoVoice — Dramabox

	Resemble AI's directable speech engine ("scene prompts" with quoted
	dialogue and stage directions). The React frontend at
	[videovoice.app/app/dramabox](https://videovoice.app/app/dramabox)
	is the primary UI; this Space exposes the model via the named
	`/dramabox` Gradio API endpoint, called from the React app through
	`@gradio/client`.
	"""
	    )

	    with gr.Row():
	        # Left column: what to say and (optionally) whose voice to use.
	        with gr.Column(scale=3):
	            prompt_in = gr.Textbox(
	                label="Scene prompt",
	                placeholder='A weary detective, "I told you it was him." He sighs. "Every time."',
	                lines=6,
	            )
	            audio_ref_in = gr.Audio(
	                label="Voice reference (optional, 10+ seconds)",
	                type="filepath",
	            )
	            gen_btn = gr.Button("Generate", variant="primary", size="lg")

	        # Right column: tunable inference parameters and the result player.
	        with gr.Column(scale=2):
	            with gr.Accordion("Inference settings", open=True):
	                cfg_in = gr.Slider(
	                    minimum=1.0, maximum=10.0, value=2.5, step=0.5,
	                    label="CFG scale",
	                )
	                stg_in = gr.Slider(
	                    minimum=0.0, maximum=5.0, value=1.5, step=0.5,
	                    label="STG scale",
	                )
	                dur_mult_in = gr.Slider(
	                    minimum=0.8, maximum=2.0, value=1.1, step=0.05,
	                    label="Duration × (only used when target duration = 0)",
	                )
	                gen_dur_in = gr.Slider(
	                    minimum=0.0, maximum=60.0, value=0.0, step=1.0,
	                    label="Target duration (s) — 0 = auto",
	                )
	                ref_dur_in = gr.Slider(
	                    minimum=3.0, maximum=30.0, value=10.0, step=1.0,
	                    label="Reference duration (s)",
	                )
	                seed_in = gr.Number(value=42, label="Seed", precision=0)
	            audio_out = gr.Audio(label="Generated audio", type="filepath")

	    # Input order must match on_generate's positional parameters.
	    gen_btn.click(
	        fn=on_generate,
	        inputs=[
	            prompt_in,
	            audio_ref_in,
	            cfg_in,
	            stg_in,
	            dur_mult_in,
	            gen_dur_in,
	            ref_dur_in,
	            seed_in,
	        ],
	        outputs=[audio_out],
	        api_name="dramabox",
	    )


	if __name__ == "__main__":
	    # Enable Gradio's request queue on the app, then start serving it.
	    queued_demo = demo.queue()
	    queued_demo.launch()