Spaces:

ResembleAI
/

Dramabox

Running on Zero

Manmay Nakhashi committed 27 days ago

Commit

ac99a44

1 Parent(s): f1c4065

Warm-load TTSServer at module level (IndexTTS pattern)

Move the TTSServer instantiation out of the lazy _ensure_tts() helper
and into module scope, mirroring how IndexTeam/IndexTTS-2-Demo wires
its model. The 'spaces' package patches torch so that passing device='cuda'
at import time pins the weights into ZeroGPU's shared memory; each
@spaces.GPU call then maps them onto the live GPU instantly.

First-request latency drops from ~30 s (full cold load) to ~2.5 s.

Files changed (1) hide show

app.py +17 -22

app.py CHANGED Viewed

@@ -22,27 +22,23 @@ from model_downloader import get_all_paths  # noqa: E402
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
 logging.info("Fetching DramaBox checkpoints from HuggingFace (cached after first run)...")
-PATHS = get_all_paths()  # CPU-side download is fine outside the GPU window
-# Lazy-loaded inside the @spaces.GPU function (no GPU available at import time on ZeroGPU).
-_TTS: TTSServer | None = None
-def _ensure_tts() -> TTSServer:
-    global _TTS
-    if _TTS is None:
-        logging.info("Loading DramaBox warm server (Gemma + DiT + VAE + Decoder)...")
-        _TTS = TTSServer(
-            checkpoint=PATHS["transformer"],
-            full_checkpoint=PATHS["audio_components"],
-            gemma_root=PATHS["gemma_root"],
-            device="cuda",
-            dtype=os.environ.get("LTX_DTYPE", "bf16"),
-            compile_model=False,                  # torch.compile breaks under ZeroGPU's brief GPU windows
-            bnb_4bit=True,                        # unsloth Gemma is pre-quantized
-        )
-        logging.info("TTSServer ready.")
-    return _TTS
 # ── Example prompts shipped with a matching voice reference ──────────────────
@@ -115,7 +111,6 @@ def on_generate(prompt: str, audio_ref, cfg: float, stg: float, dur_mult: float,
     if not prompt or not prompt.strip():
         raise gr.Error("Prompt is empty.")
     t0 = time.time()
-    tts = _ensure_tts()
     ref_path = audio_ref if audio_ref and os.path.exists(str(audio_ref)) else None
     output = tempfile.mktemp(suffix=".wav", prefix="dramabox_")
     tts.generate_to_file(

 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
 logging.info("Fetching DramaBox checkpoints from HuggingFace (cached after first run)...")
+PATHS = get_all_paths()
+# Module-level warm load (same pattern as IndexTTS-2-Demo on ZeroGPU). The
+# `spaces` package patches torch so that .to("cuda") at import time pins the
+# weights into ZeroGPU's shared memory; each @spaces.GPU call then maps them
+# onto the actual GPU instantly. First user request is ~2.5 s instead of ~30 s.
+logging.info("Loading DramaBox warm server (Gemma + DiT + VAE + Decoder)...")
+tts = TTSServer(
+    checkpoint=PATHS["transformer"],
+    full_checkpoint=PATHS["audio_components"],
+    gemma_root=PATHS["gemma_root"],
+    device="cuda",
+    dtype=os.environ.get("LTX_DTYPE", "bf16"),
+    compile_model=False,                  # torch.compile breaks under ZeroGPU's brief GPU windows
+    bnb_4bit=True,                        # unsloth Gemma is pre-quantized
+)
+logging.info("TTSServer ready.")
 # ── Example prompts shipped with a matching voice reference ──────────────────
     if not prompt or not prompt.strip():
         raise gr.Error("Prompt is empty.")
     t0 = time.time()
     ref_path = audio_ref if audio_ref and os.path.exists(str(audio_ref)) else None
     output = tempfile.mktemp(suffix=".wav", prefix="dramabox_")
     tts.generate_to_file(