Spaces:

multimodalart
/

pid

Running on Zero

App Files Files Community

apolinario committed 1 day ago

Commit

622f4d0

1 Parent(s): e2f50b1

Stream PiD's 4 internal student-sampler steps (yield after each); 'Upscaling with PiD — step K/4' label

Browse files

Files changed (1) hide show

app.py +74 -20

app.py CHANGED Viewed

@@ -139,24 +139,74 @@ def _taef1_preview(packed_latent: torch.Tensor, H: int, W: int) -> Image.Image:
     return Image.fromarray(arr)
-def _pid_decode(latent: torch.Tensor, baseline_01: torch.Tensor, sigma: float, caption: str) -> Image.Image:
-    baseline_neg1_1 = baseline_01 * 2.0 - 1.0
     lq_h, lq_w = baseline_01.shape[-2], baseline_01.shape[-1]
-    data_batch = {
-        pid_model.config.input_caption_key: [caption],
-        "LQ_video_or_image": baseline_neg1_1.to(dtype=DTYPE, device="cuda"),
-        "LQ_latent": latent.to(dtype=DTYPE, device="cuda"),
-        "degrade_sigma": torch.tensor([sigma], device="cuda", dtype=torch.float32),
-    }
-    samples = pid_model.generate_samples_from_batch(
-        data_batch,
-        cfg_scale=1.0,
-        num_steps=PID_INFERENCE_STEPS,
-        seed=0,
-        shift=None,
-        image_size=(lq_h * SR_SCALE, lq_w * SR_SCALE),
     )
-    return _latent_to_pil(samples[0])
 def _evenly_spaced_capture_steps(total_steps: int, num_captures: int) -> list[int]:
@@ -248,11 +298,15 @@ def generate(
         (baseline_01[0].clamp(0, 1).permute(1, 2, 0).float().cpu().numpy() * 255).astype(np.uint8)
     )
-    # ---- PiD upscaling on the final latent ----
-    yield gr.update(visible=True, value=zimage_img, label="Upscaling with PiD (4× super-resolution, 4 steps)…"), gr.update(visible=False)
     final_sigma = float(pipeline.scheduler.sigmas[-1].item())
-    with torch.no_grad():
-        pid_img = _pid_decode(final_latent, baseline_01, final_sigma, prompt)
     # ---- Done: hide live preview, show the A/B slider ----
     yield (

     return Image.fromarray(arr)
def _pid_pixel_to_pil(x: torch.Tensor) -> Image.Image:
    """Convert the first image of a PiD pixel-space batch into a PIL image.

    Args:
        x: Pixel-space tensor laid out as (B, 3, H, W) with values nominally
            in [-1, 1]. Only ``x[0]`` is converted; out-of-range values are
            clamped to [-1, 1] first.

    Returns:
        The image produced by ``Image.fromarray`` from an (H, W, 3) uint8
        array.
    """
    first = x[0].float().clamp(-1, 1)
    # Affine map [-1, 1] -> [0, 255]; the uint8 cast below truncates.
    scaled = (first + 1) * 127.5
    # Channels-first -> channels-last before handing the data to NumPy.
    channels_last = scaled.permute(1, 2, 0)
    pixels = channels_last.cpu().numpy().astype(np.uint8)
    return Image.fromarray(pixels)
def _pid_stream(latent: torch.Tensor, baseline_01: torch.Tensor, sigma: float, caption: str, num_steps: int = PID_INFERENCE_STEPS):
    """Stream PiD's student-sampler iterations, one pixel-space tensor per step.

    Reimplementation of ``PiDDistillModel.generate_samples_from_batch`` written
    as a generator so the caller can display every intermediate result.

    Gradient tracking is disabled for the text encoding and for every sampler
    step, but neither ``torch.no_grad()`` nor the autocast context is held
    across a ``yield``: grad/autocast state is thread-local, so keeping the
    contexts open while the generator is suspended would leak that state into
    whatever code consumes the stream.

    Args:
        latent: Low-quality latent; moved to CUDA as ``DTYPE`` and passed to
            the network as ``lq_latent``.
        baseline_01: Low-quality image tensor whose last two dimensions are
            read as (height, width). It is mapped with ``x * 2 - 1`` before
            being passed to the network, so it is presumably in [0, 1]
            (confirm against the caller).
        sigma: Degradation sigma, forwarded as a one-element float32 tensor.
        caption: Text prompt, encoded with ``pid_model._encode_text_raw``.
        num_steps: Step count handed to ``pid_model._get_t_list``.

    Yields:
        ``(step, total, x)`` where ``step`` counts from 1, ``total`` is
        ``len(t_list) - 1`` and ``x`` is a clone of the current pixel-space
        tensor of shape ``(1, 3, H * SR_SCALE, W * SR_SCALE)``. The last
        yield is the clean output.
    """
    from contextlib import nullcontext

    B = 1  # the noise tensor and timestep expansion assume a single image
    lq_h, lq_w = baseline_01.shape[-2], baseline_01.shape[-1]
    img_h, img_w = lq_h * SR_SCALE, lq_w * SR_SCALE

    # The caller this function replaced ran the whole PiD call under
    # torch.no_grad(); keep the text encoder from recording an autograd graph.
    with torch.no_grad():
        caption_embs, _ = pid_model._encode_text_raw([caption])
        caption_embs = caption_embs.to(**pid_model.tensor_kwargs)

    lq_video_or_image = (baseline_01 * 2.0 - 1.0).to(dtype=DTYPE, device="cuda")
    lq_latent = latent.to(dtype=DTYPE, device="cuda")
    degrade_sigma_tensor = torch.tensor([sigma], device="cuda", dtype=torch.float32)

    # Fixed seed: the initial noise (and any re-noising below) is reproducible.
    gen = torch.Generator(device="cuda").manual_seed(0)
    noise = torch.randn(B, 3, img_h, img_w, device="cuda", generator=gen)

    t_list = pid_model._get_t_list(device=torch.device("cuda"), num_steps=num_steps)
    steps_total = len(t_list) - 1

    net = pid_model.net
    net.eval()
    timescale = pid_model.fm_trainer.timescale
    student_sample_type = pid_model.config.student_sample_type
    prediction_type = pid_model.config.prediction_type
    autocast_dtype = pid_model.autocast_dtype

    x = noise
    for step_idx, (t_cur, t_next) in enumerate(zip(t_list[:-1], t_list[1:])):
        # A fresh context per step, closed before yielding (see docstring).
        autocast_ctx = (
            torch.autocast("cuda", dtype=autocast_dtype)
            if autocast_dtype
            else nullcontext()
        )
        with torch.no_grad(), autocast_ctx:
            t_cur_batch = t_cur.expand(B)
            t_cur_scaled = t_cur_batch * timescale
            v_pred = net(
                x,
                t_cur_scaled,
                caption_embs,
                lq_video_or_image=lq_video_or_image,
                lq_latent=lq_latent,
                degrade_sigma=degrade_sigma_tensor,
            )
            if t_next.item() > 0:
                if student_sample_type == "ode":
                    # Euler step along the predicted velocity.
                    v_for_step = pid_model._net_output_to_velocity(x, v_pred, t_cur_batch, prediction_type)
                    dt = t_next - t_cur
                    x = x + dt * v_for_step
                else:
                    # Predict the clean image, then re-noise it to level t_next.
                    x0_pred = pid_model._velocity_to_x0(x, v_pred, t_cur_batch)
                    eps_infer = torch.randn(
                        x0_pred.shape, device=x0_pred.device, dtype=x0_pred.dtype, generator=gen
                    )
                    s = [B] + [1] * (x.ndim - 1)
                    t_next_bcast = t_next.reshape(1).expand(s)
                    x = (1.0 - t_next_bcast) * x0_pred + t_next_bcast * eps_infer
            else:
                # Final step: return the clean prediction directly.
                x = pid_model._velocity_to_x0(x, v_pred, t_cur_batch)
            snapshot = x.clone()
        yield step_idx + 1, steps_total, snapshot
 def _evenly_spaced_capture_steps(total_steps: int, num_captures: int) -> list[int]:
         (baseline_01[0].clamp(0, 1).permute(1, 2, 0).float().cpu().numpy() * 255).astype(np.uint8)
     )
+    # ---- PiD upscaling on the final latent, streaming the 4 internal steps ----
     final_sigma = float(pipeline.scheduler.sigmas[-1].item())
+    pid_img = None
+    for k, total, x in _pid_stream(final_latent, baseline_01, final_sigma, prompt):
+        pid_img = _pid_pixel_to_pil(x)
+        yield (
+            gr.update(visible=True, value=pid_img, label=f"Upscaling with PiD — step {k}/{total}"),
+            gr.update(visible=False),
+        )
     # ---- Done: hide live preview, show the A/B slider ----
     yield (