Spaces:

techfreakworm
/

LTX2.3-Studio

Running on Zero

App Files Files Community

techfreakworm committed 16 days ago

Commit

14dcc06

unverified ·

1 Parent(s): d3e4302

feat(spaces): dynamic ZeroGPU duration + auto-retry on timeout

Browse files

Tier 1 — dynamic duration estimator:
@spaces.GPU(duration=callable) where the callable receives the same args
as _execute_workflow and returns int seconds. Estimate from
(mode, preset, frames) with per-mode base (t2v 90s, lipsync 240s,
style 360s, etc.), preset multiplier (fast 1×, balanced 1.5×, quality 3×),
+60s cold-cache buffer, +0.3s/frame VAE decode. Clamped to [60s, 900s].

Effect: light T2V calls reserve ~190s (fast preset, default 121 frames) and
jump the ZeroGPU queue; heavy lipsync/style calls reserve real headroom only
when needed. No more one-size-fits-all 300s/600s wall.

Tier 2 — auto-retry on timeout:
ErrorEvent.category 'gpu_timeout' is now distinguished from generic
execution errors via _classify (matches "GPU task aborted", or any message
containing both "gpu" and "aborted", case-insensitively). The _on_generate
handler catches this on the first attempt only, shows a friendly
"Retrying with extended GPU budget" banner, then re-submits with
duration_multiplier=2.0 (the doubled estimate is still clamped to the same
900s ceiling). One retry only — if that also times out, the error is
surfaced to the user.

submit() signature: now takes keyword-only preset and duration_multiplier
arguments; the old gpu_duration argument is still accepted but ignored.

Files changed (2) hide show

app.py +74 -31
backend.py +94 -22

app.py CHANGED Viewed

@@ -796,48 +796,91 @@ async def _on_generate(mode_name: str, **inputs: Any):
         wf_module.set_input(workflow, *patch)
     backend = _get_backend()
-    duration = PRESET_DURATION.get(str(inputs.get("preset", "Balanced")), 120)
-    started = time.time()
-    async for event in backend.submit(mode_name, workflow, gpu_duration=duration):
-        elapsed = time.time() - started
         if isinstance(event, backend_module.DownloadEvent):
-            status = ui.render_status(
-                stage_index=0,
-                stage_label=f"Downloading {event.filename}",
-                step=int(event.mb_done),
-                total_steps=int(max(event.mb_total, 1)),
-                elapsed_s=elapsed,
-                eta_s=0,
             )
-            yield status, gr.update()
-        elif isinstance(event, backend_module.ProgressEvent):
-            # Each sampler in the workflow gets its own stage label "Diffusion (n)".
-            # The static `mode.stage_map` describes the full pipeline (encode →
-            # diffusion → upscale → diffusion → decode) but our progress hook
-            # only fires inside samplers, so we label by sampler index instead.
             label = f"Diffusion (Stage {event.stage})"
             eta = (elapsed / max(event.step, 1)) * (event.total_steps - event.step)
-            status = ui.render_status(
-                stage_index=event.stage,
-                stage_label=label,
-                step=event.step,
-                total_steps=event.total_steps,
-                elapsed_s=elapsed,
-                eta_s=eta,
             )
-            yield status, gr.update()
-        elif isinstance(event, backend_module.OutputEvent):
             video_update = event.video_path if event.video_path else gr.update()
-            yield ui._render_idle(), video_update
-        elif isinstance(event, backend_module.ErrorEvent):
-            error_html = (
                 f'<div class="status-card status-error">'
                 f'  <div class="status-row"><span class="status-stage">Error · {event.category}</span></div>'
                 f"  <div>{event.message}</div>"
-                f"</div>"
             )
-            yield error_html, gr.update()
 def _input_keys_for_mode(mode_name: str, h: dict) -> list[str]:

         wf_module.set_input(workflow, *patch)
     backend = _get_backend()
+    preset = params["preset"]  # already lowercased above
+    async def _translate(event, started_at):
+        """Translate one backend event into a Gradio ``(status_html, video)`` pair.
+
+        Args:
+            event: An event object yielded by ``backend.submit``.
+            started_at: ``time.time()`` timestamp of the current attempt; the
+                elapsed and ETA figures are measured from it.
+
+        Returns:
+            The ``(status_html, video_update)`` tuple to yield, or ``None`` when
+            the event is of a type this helper does not render.
+        """
+        # Function-scope import so the helper does not rely on a module-level one.
+        import html
+
+        elapsed = time.time() - started_at
         if isinstance(event, backend_module.DownloadEvent):
+            return (
+                ui.render_status(
+                    stage_index=0,
+                    stage_label=f"Downloading {event.filename}",
+                    step=int(event.mb_done),
+                    total_steps=int(max(event.mb_total, 1)),
+                    elapsed_s=elapsed,
+                    eta_s=0,
+                ),
+                gr.update(),
             )
+        if isinstance(event, backend_module.ProgressEvent):
             label = f"Diffusion (Stage {event.stage})"
+            # Clamp the remaining-step count so an event reporting
+            # step > total_steps cannot produce a negative ETA.
+            remaining = max(event.total_steps - event.step, 0)
+            eta = (elapsed / max(event.step, 1)) * remaining
+            return (
+                ui.render_status(
+                    stage_index=event.stage,
+                    stage_label=label,
+                    step=event.step,
+                    total_steps=event.total_steps,
+                    elapsed_s=elapsed,
+                    eta_s=eta,
+                ),
+                gr.update(),
             )
+        if isinstance(event, backend_module.OutputEvent):
             video_update = event.video_path if event.video_path else gr.update()
+            return (ui._render_idle(), video_update)
+        if isinstance(event, backend_module.ErrorEvent):
+            # Error text is arbitrary (exception messages, paths, reprs), so
+            # escape it before embedding it in the status-card markup.
+            category = html.escape(str(event.category))
+            message = html.escape(str(event.message))
+            return (
                 f'<div class="status-card status-error">'
+                f'  <div class="status-row"><span class="status-stage">Error · {category}</span></div>'
+                f"  <div>{message}</div>"
+                f"</div>",
+                gr.update(),
             )
+        return None
+    # Tier 1 + Tier 2: make one normal attempt; if it aborts on the ZeroGPU
+    # duration cap, retry once with a 2× duration multiplier. The backend
+    # clamps every estimate to 900s, so the retry never reserves more than that.
+    started = time.time()
+    multiplier = 1.0
+    timed_out = False
+    for attempt in (0, 1):
+        if attempt == 1:
+            # Show a friendly retry banner before the second submit
+            yield (
+                '<div class="status-card status-error">'
+                '  <div class="status-row"><span class="status-stage">'
+                "Retrying with extended GPU budget</span></div>"
+                "  <div>First attempt hit the per-call duration cap "
+                "(usually a cold model cache or a heavier mode than estimated). "
+                "Reserving 2× the budget and trying once more.</div>"
+                "</div>",
+                gr.update(),
+            )
+            multiplier = 2.0
+            started = time.time()  # reset so progress ETAs are sensible
+        timed_out = False
+        async for event in backend.submit(
+            mode_name, workflow, preset=preset, duration_multiplier=multiplier
+        ):
+            if (
+                isinstance(event, backend_module.ErrorEvent)
+                and event.category == "gpu_timeout"
+                and attempt == 0
+            ):
+                timed_out = True
+                break  # don't yield the timeout error — auto-retry instead
+            translated = await _translate(event, started)
+            if translated is not None:
+                yield translated
+        if not timed_out:
+            return
 def _input_keys_for_mode(mode_name: str, h: dict) -> list[str]:

backend.py CHANGED Viewed

@@ -63,25 +63,78 @@ def _identity(fn):
     return fn
-# ZeroGPU's startup detector scans loaded modules for spaces.GPU-wrapped
-# functions. The decorator must be applied at module load time — runtime
-# wrapping inside a request handler isn't detected. `duration` is the per-call
-# timeout, NOT a billing cap (HF bills actual usage). Setting it generously
-# (10 min) so heavy modes like lipsync (audio encoder + extra LoRAs + VAE
-# decode + ffmpeg mux) don't hit the 300s wall mid-mux. Light modes return
-# in ~30-60s and free the GPU back into the pool.
-_GPU = spaces.GPU(duration=600) if (spaces is not None and _on_spaces()) else _identity
 @_GPU
-def _execute_workflow(executor: Any, workflow: dict, output_ids: list[str]) -> str:
     """Run the workflow on GPU and return the path of the first video output.
     Returns just the video path (a plain string, picklable across the
-    @spaces.GPU subprocess boundary). Returning the full history_result dict
-    was unreliable on Spaces — under ZeroGPU's GPU-context wrapping, the
-    parent process didn't see the executor's mutated state, so video_path
-    came back empty even when the file was on disk.
     """
     executor.execute(
         workflow,
@@ -291,9 +344,20 @@ class ComfyUILibraryBackend:
         return f"ComfyUILibraryBackend(comfy_dir={self._comfy_dir!r})"
     async def submit(
-        self, mode: str, workflow: dict, gpu_duration: int = 300
     ) -> AsyncIterator[Any]:
-        """Run a workflow end-to-end. Yields Download/Progress/Output/Error events."""
         # Pre-flight: ensure all model files exist.
         try:
             needed = models.walk_workflow_for_models(workflow)
@@ -361,12 +425,14 @@ class ComfyUILibraryBackend:
                 # Use the public setter; it writes the same global the
                 # ProgressBar class reads, but is the documented API.
                 comfy.utils.set_progress_bar_global_hook(_hook)
-                # _execute_workflow is module-level and decorated with
-                # @spaces.GPU(duration=600) on Spaces — that's what makes the
-                # heavy compute run on a borrowed H200. Off-Spaces it's a
-                # plain call. Returns the video path directly (computed
-                # inside the GPU context so the executor's history is fresh).
-                video_path = _execute_workflow(self._executor, workflow, output_ids)
                 # Fallback: if history_result didn't surface a path (rare on
                 # Spaces — happens when ZeroGPU's subprocess boundary drops
                 # mutated state), scan the output dir for the newest mp4
@@ -415,8 +481,14 @@ class ComfyUILibraryBackend:
 def _classify(exc: Exception) -> str:
     name = type(exc).__name__.lower()
-    if "outofmemory" in name or "cuda out of memory" in str(exc).lower():
         return "oom"
     if "interrupt" in name:
         return "interrupt"
     return "execution"

     return fn
+# --- Per-call ZeroGPU duration estimator -----------------------------------
+# `duration` is a per-call timeout. Shorter declared duration → faster queue
+# priority on the shared ZeroGPU pool. Estimating from (mode, preset, frames)
+# instead of using a one-size-fits-all 600s cap means light T2V calls jump
+# the queue while heavy modes (lipsync, style) reserve real headroom.
+_BASE_DURATION_S: dict[str, int] = {
+    # Base estimate in seconds for each mode, looked up by mode name.
+    # Rough sampler+decode time at ~120 frames, balanced preset, warm cache.
+    # NOTE(review): _PRESET_MULT scales "balanced" by 1.5 and leaves "fast" at
+    # 1.0, so these values are applied unscaled to the fast preset — confirm
+    # which preset they were actually measured on.
+    "t2v": 90,
+    "i2v": 90,
+    "a2v": 120,
+    "lipsync": 240,   # extra: audio encoder + audio VAE + extra LoRAs
+    "keyframe": 180,
+    "style": 360,     # extra: preprocessor (canny/dwpose/depth) + IC-LoRAs
+}
+# Factor applied to the per-mode base duration, keyed by lowercase preset name.
+_PRESET_MULT: dict[str, float] = {"fast": 1.0, "balanced": 1.5, "quality": 3.0}
+def _frames_from_workflow(workflow: dict) -> int:
+    """Return the ``length`` input of the first EmptyLTXVLatentVideo node.
+
+    Falls back to 121 when the workflow has no such node, when the node has
+    no ``length`` input, or when the value cannot be converted to ``int``.
+    """
+    default_frames = 121
+    latent_node = next(
+        (
+            candidate
+            for candidate in workflow.values()
+            if isinstance(candidate, dict)
+            and candidate.get("class_type") == "EmptyLTXVLatentVideo"
+        ),
+        None,
+    )
+    if latent_node is None:
+        return default_frames
+    try:
+        node_inputs = latent_node.get("inputs") or {}
+        return int(node_inputs.get("length", default_frames))
+    except (TypeError, ValueError):
+        return default_frames
+def _duration_for(
+    executor: Any,
+    workflow: dict,
+    output_ids: list[str],
+    mode: str,
+    preset: str,
+    multiplier: float = 1.0,
+) -> int:
+    """Estimate the ZeroGPU duration, in seconds, to reserve for one call.
+
+    Takes the same parameters as ``_execute_workflow`` because it is passed
+    as the ``duration`` callable of ``spaces.GPU``; ``executor`` and
+    ``output_ids`` are accepted only to keep the signatures aligned.
+
+    Estimate = (base × preset multiplier + cold-cache buffer + per-frame VAE
+    decode time) × retry multiplier, clamped to [60s, 900s]. The 900s ceiling
+    keeps a single failed call from torching the daily quota.
+
+    Args:
+        executor: Unused here.
+        workflow: Workflow dict; only its frame count is read.
+        output_ids: Unused here.
+        mode: Mode name; unknown modes fall back to a 180s base.
+        preset: Preset name, matched case-insensitively; unknown or
+            non-string presets fall back to the 1.5× multiplier.
+        multiplier: Retry multiplier applied to the whole estimate.
+
+    Returns:
+        Whole seconds in the range 60..900.
+    """
+    base = _BASE_DURATION_S.get(mode, 180)
+    # A non-string preset (e.g. None) must not raise here: an exception in the
+    # estimator would fail the request before the workflow ever runs.
+    preset_key = preset.lower() if isinstance(preset, str) else ""
+    mult = _PRESET_MULT.get(preset_key, 1.5)
+    # A negative frame count must not shrink the estimate.
+    frames = max(_frames_from_workflow(workflow), 0)
+    est = int((base * mult + 60 + frames * 0.3) * multiplier)
+    return max(60, min(est, 900))
+# Decorate at module load time so ZeroGPU's startup analyzer detects it.
+# On Spaces, wrap with spaces.GPU and let _duration_for size each call's slot;
+# anywhere else (or when `spaces` is unavailable) fall back to _identity.
+if spaces is not None and _on_spaces():
+    _GPU = spaces.GPU(duration=_duration_for)
+else:
+    _GPU = _identity
 @_GPU
+def _execute_workflow(
+    executor: Any,
+    workflow: dict,
+    output_ids: list[str],
+    mode: str,
+    preset: str,
+    multiplier: float = 1.0,
+) -> str:
     """Run the workflow on GPU and return the path of the first video output.
     Returns just the video path (a plain string, picklable across the
+    @spaces.GPU subprocess boundary). The `mode`, `preset`, and `multiplier`
+    args are consumed by `_duration_for` to estimate the GPU slot to reserve.
     """
     executor.execute(
         workflow,
         return f"ComfyUILibraryBackend(comfy_dir={self._comfy_dir!r})"
     async def submit(
+        self,
+        mode: str,
+        workflow: dict,
+        *,
+        preset: str = "balanced",
+        duration_multiplier: float = 1.0,
+        gpu_duration: int = 0,  # legacy, ignored (now derived from preset+frames)
     ) -> AsyncIterator[Any]:
+        """Run a workflow end-to-end. Yields Download/Progress/Output/Error events.
+        `preset` and `duration_multiplier` flow through to the @spaces.GPU
+        duration estimator. The handler can re-call submit() with
+        duration_multiplier=2.0 if the first attempt aborts on timeout.
+        """
         # Pre-flight: ensure all model files exist.
         try:
             needed = models.walk_workflow_for_models(workflow)
                 # Use the public setter; it writes the same global the
                 # ProgressBar class reads, but is the documented API.
                 comfy.utils.set_progress_bar_global_hook(_hook)
+                # _execute_workflow is module-level and decorated with a
+                # @spaces.GPU(duration=callable) on Spaces — the callable
+                # estimates per-call timeout from (mode, preset, frames) so
+                # light calls get fast queue priority while heavy ones reserve
+                # real headroom. Off-Spaces it's a plain call.
+                video_path = _execute_workflow(
+                    self._executor, workflow, output_ids, mode, preset, duration_multiplier,
+                )
                 # Fallback: if history_result didn't surface a path (rare on
                 # Spaces — happens when ZeroGPU's subprocess boundary drops
                 # mutated state), scan the output dir for the newest mp4
 def _classify(exc: Exception) -> str:
+    """Map an exception to an error category string.
+
+    Returns "oom", "gpu_timeout", "interrupt" or "execution". The checks run
+    in that order and the first match wins.
+    """
+    type_name = type(exc).__name__.lower()
+    text = str(exc).lower()
+    if "outofmemory" in type_name or "cuda out of memory" in text:
         return "oom"
+    # ZeroGPU enforces the @spaces.GPU(duration=N) cap and re-raises as
+    # gradio.exceptions.Error('GPU task aborted'). Surface a distinct
+    # category so the handler can offer a retry with a bigger budget.
+    # Any message containing both words matches, which also covers the
+    # exact phrase "gpu task aborted".
+    if "gpu" in text and "aborted" in text:
+        return "gpu_timeout"
+    if "interrupt" in type_name:
         return "interrupt"
     return "execution"