Spaces:

multimodalart
/

pid

Running on Zero

App Files Files Community

apolinario committed 1 day ago

Commit

afb0b5a

1 Parent(s): 6cd8e25

Stream taef1 previews from Z-Image (thread+queue) into a flash gr.Image; swap to gr.Gallery only when all PiD steps done

Browse files

Files changed (1) hide show

app.py +86 -29

app.py CHANGED Viewed

@@ -100,6 +100,11 @@ _gm.Gemma2Model.forward = _patched_gemma2_forward
 pipeline, pipe_cfg = load_pipeline(BACKBONE, dtype=DTYPE)
 pipeline.to("cuda")
 print("[pid] loading PiD decoder...", flush=True)
 pid_meta = get_pid_checkpoint(BACKBONE, CKPT_TYPE)
 pid_model, _pid_cfg = load_model_from_checkpoint(
@@ -121,6 +126,19 @@ def _latent_to_pil(tensor: torch.Tensor) -> Image.Image:
     return Image.fromarray(arr)
 def _pid_decode(latent: torch.Tensor, baseline_01: torch.Tensor, sigma: float, caption: str) -> Image.Image:
     baseline_neg1_1 = baseline_01 * 2.0 - 1.0
     lq_h, lq_w = baseline_01.shape[-2], baseline_01.shape[-1]
@@ -150,6 +168,10 @@ def _evenly_spaced_capture_steps(total_steps: int, num_captures: int) -> list[in
     return sorted({int(round(x)) for x in raw})
 @spaces.GPU(duration=240)
 def generate(
     prompt: str,
@@ -158,41 +180,73 @@ def generate(
     guidance_scale: float = 5.0,
     seed: int = 0,
     resolution: int = 512,
-    progress=gr.Progress(track_tqdm=True),
 ):
     if not prompt or not prompt.strip():
         raise gr.Error("Please enter a prompt.")
     num_inference_steps = int(num_inference_steps)
     num_captures = int(num_captures)
-    resolution = int(resolution)
-    H = W = resolution
-    capture_ks = set(_evenly_spaced_capture_steps(num_inference_steps, num_captures))
-    progress(0.05, desc="Running Z-Image latent diffusion…")
     xt_cb = XtCaptureCallback(capture_ks) if capture_ks else None
-    generator = torch.Generator(device="cuda").manual_seed(int(seed))
-    gen_kwargs = dict(
-        prompt=prompt,
-        height=H,
-        width=W,
-        num_inference_steps=num_inference_steps,
-        guidance_scale=float(guidance_scale),
-        num_images_per_prompt=1,
-        output_type="latent",
-        generator=generator,
-    )
-    gen_kwargs.update(pipe_cfg.extra_generate_kwargs)
-    if xt_cb is not None:
-        gen_kwargs["callback_on_step_end"] = xt_cb
-        gen_kwargs["callback_on_step_end_tensor_inputs"] = ["latents"]
-    with torch.no_grad():
-        raw_output = pipeline(**gen_kwargs)
     final_latent = extract_latent(pipeline, raw_output, pipe_cfg, H, W)
-    progress(0.5, desc="Decoding each captured step with PiD…")
     steps_iter = []
     if xt_cb is not None:
         for K in sorted(xt_cb.captured.keys()):
@@ -204,14 +258,16 @@ def generate(
     steps_iter.append(("final x₀", final_latent, final_sigma))
     outputs: list[tuple[Image.Image, str]] = []
-    total = len(steps_iter)
-    for i, (label, latent, sigma) in enumerate(steps_iter):
-        progress(0.5 + 0.5 * (i / total), desc=f"PiD decoding {label}")
         with torch.no_grad():
             baseline_01 = decode_with_pipeline_vae(pipeline, latent, pipe_cfg)
             pid_img = _pid_decode(latent, baseline_01, sigma, prompt)
         outputs.append((pid_img, f"{label}  (σ={sigma:.3f})"))
-        yield outputs
 DESCRIPTION = """
@@ -246,12 +302,13 @@ with gr.Blocks(theme=gr.themes.Citrus(), css=CSS) as demo:
             seed = gr.Number(label="Seed", value=0, precision=0)
             run = gr.Button("Run", variant="primary")
         with gr.Column(scale=2):
-            gallery = gr.Gallery(label="PiD-decoded denoising trajectory", columns=2, object_fit="contain")
     run.click(
         fn=generate,
         inputs=[prompt, num_inference_steps, num_captures, guidance_scale, seed, resolution],
-        outputs=[gallery],
     )
 if __name__ == "__main__":

 pipeline, pipe_cfg = load_pipeline(BACKBONE, dtype=DTYPE)
 pipeline.to("cuda")
+print("[pid] loading TAEF1 (fast preview decoder)...", flush=True)
+from diffusers import AutoencoderTiny
+taef1 = AutoencoderTiny.from_pretrained("madebyollin/taef1", torch_dtype=DTYPE).to("cuda")
+taef1.eval()
 print("[pid] loading PiD decoder...", flush=True)
 pid_meta = get_pid_checkpoint(BACKBONE, CKPT_TYPE)
 pid_model, _pid_cfg = load_model_from_checkpoint(
     return Image.fromarray(arr)
+def _taef1_preview(packed_latent: torch.Tensor, H: int, W: int) -> Image.Image:
+    """Fast low-res decode of a Z-Image latent using TAEF1 (FLUX-1 compatible)."""
+    with torch.no_grad():
+        unpacked = extract_latent(pipeline, SimpleNamespace(images=packed_latent), pipe_cfg, H, W)
+        scale = pipeline.vae.config.scaling_factor
+        shift = getattr(pipeline.vae.config, "shift_factor", None) or 0.0
+        denorm = unpacked.to(dtype=DTYPE) / scale + shift
+        img = taef1.decode(denorm).sample
+        img = (img.float().clamp(-1, 1) + 1) / 2
+        arr = (img[0].permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8)
+    return Image.fromarray(arr)
 def _pid_decode(latent: torch.Tensor, baseline_01: torch.Tensor, sigma: float, caption: str) -> Image.Image:
     baseline_neg1_1 = baseline_01 * 2.0 - 1.0
     lq_h, lq_w = baseline_01.shape[-2], baseline_01.shape[-1]
     return sorted({int(round(x)) for x in raw})
+import threading
+import queue as _queue
 @spaces.GPU(duration=240)
 def generate(
     prompt: str,
     guidance_scale: float = 5.0,
     seed: int = 0,
     resolution: int = 512,
 ):
     if not prompt or not prompt.strip():
         raise gr.Error("Please enter a prompt.")
     num_inference_steps = int(num_inference_steps)
     num_captures = int(num_captures)
+    H = W = int(resolution)
+    # initial: show the live-preview image, hide the final gallery
+    yield gr.update(visible=True, value=None), gr.update(visible=False, value=None)
+    capture_ks = set(_evenly_spaced_capture_steps(num_inference_steps, num_captures))
     xt_cb = XtCaptureCallback(capture_ks) if capture_ks else None
+    # ---- Run Z-Image in a thread; stream taef1 previews via a queue ----
+    preview_q: "_queue.Queue" = _queue.Queue()
+    _DONE = object()
+    def streaming_cb(pipe, step_index, timestep, callback_kwargs):
+        if xt_cb is not None:
+            xt_cb(pipe, step_index, timestep, callback_kwargs)
+        try:
+            preview = _taef1_preview(callback_kwargs["latents"], H, W)
+            preview_q.put((step_index, preview))
+        except Exception as e:
+            print(f"[pid] taef1 preview failed at step {step_index}: {e}", flush=True)
+        return callback_kwargs
+    def run_pipeline():
+        gen_torch = torch.Generator(device="cuda").manual_seed(int(seed))
+        gen_kwargs = dict(
+            prompt=prompt,
+            height=H,
+            width=W,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=float(guidance_scale),
+            num_images_per_prompt=1,
+            output_type="latent",
+            generator=gen_torch,
+            callback_on_step_end=streaming_cb,
+            callback_on_step_end_tensor_inputs=["latents"],
+        )
+        gen_kwargs.update(pipe_cfg.extra_generate_kwargs)
+        try:
+            with torch.no_grad():
+                out = pipeline(**gen_kwargs)
+            preview_q.put((_DONE, out))
+        except Exception as e:
+            preview_q.put((_DONE, e))
+    thread = threading.Thread(target=run_pipeline, daemon=True)
+    thread.start()
+    raw_output = None
+    while True:
+        step_index, payload = preview_q.get()
+        if step_index is _DONE:
+            if isinstance(payload, Exception):
+                raise payload
+            raw_output = payload
+            break
+        yield gr.update(visible=True, value=payload), gr.update(visible=False)
+    thread.join()
     final_latent = extract_latent(pipeline, raw_output, pipe_cfg, H, W)
+    # ---- PiD per-step decode (sequentially) ----
     steps_iter = []
     if xt_cb is not None:
         for K in sorted(xt_cb.captured.keys()):
     steps_iter.append(("final x₀", final_latent, final_sigma))
     outputs: list[tuple[Image.Image, str]] = []
+    for label, latent, sigma in steps_iter:
         with torch.no_grad():
             baseline_01 = decode_with_pipeline_vae(pipeline, latent, pipe_cfg)
             pid_img = _pid_decode(latent, baseline_01, sigma, prompt)
         outputs.append((pid_img, f"{label}  (σ={sigma:.3f})"))
+        # Flash the latest PiD output in the live-preview image during PiD decoding too
+        yield gr.update(visible=True, value=pid_img), gr.update(visible=False)
+    # ---- Done: hide live preview, show the final gallery ----
+    yield gr.update(visible=False, value=None), gr.update(visible=True, value=outputs)
 DESCRIPTION = """
             seed = gr.Number(label="Seed", value=0, precision=0)
             run = gr.Button("Run", variant="primary")
         with gr.Column(scale=2):
+            live_preview = gr.Image(label="Live preview", visible=True, show_label=True, type="pil")
+            gallery = gr.Gallery(label="PiD-decoded denoising trajectory", visible=False, columns=2, object_fit="contain")
     run.click(
         fn=generate,
         inputs=[prompt, num_inference_steps, num_captures, guidance_scale, seed, resolution],
+        outputs=[live_preview, gallery],
     )
 if __name__ == "__main__":