Spaces:

techfreakworm
/

z-image-studio

Running on Zero

App Files Community

techfreakworm committed 7 days ago

Commit

9514256

unverified ·

1 Parent(s): dc32ce0

fix(upscale): crop upscaled image to multiple-of-16 dims before refinement

Browse files

DiffSynth allocates the noise tensor at ceil-multiple-of-16 but its VAE
encodes at floor-multiple-of-16, so a 1240x728 ESRGAN output produced a
156-wide noise latent against a 154-wide input latent and add_noise
crashed with 'tensor a (154) must match tensor b (156) at dim 3'.
Floor-crop to mod 16 in modes.call_upscale so both paths agree.

Files changed (2) hide show

modes.py +10 -3
tests/test_modes.py +33 -0

modes.py CHANGED Viewed

@@ -153,6 +153,16 @@ def call_upscale(pipe: Any, params: dict[str, Any]) -> tuple[Image.Image, dict[s
     upscaled = upscale.realesrgan_2x(input_image, model_path=params["esrgan_model_path"])
     _swap_transformer(pipe, "Turbo")
     kwargs: dict[str, Any] = dict(
@@ -162,9 +172,6 @@ def call_upscale(pipe: Any, params: dict[str, Any]) -> tuple[Image.Image, dict[s
         sigma_shift=3.0,
         input_image=upscaled,
         denoising_strength=float(params.get("refine_denoise", 0.33)),
-        # Track the upscaled image's dims so the noise initializer builds latents of
-        # the same shape as the VAE-encoded input_image. Otherwise DiffSynth defaults
-        # height/width to 1024 and add_noise crashes on a shape mismatch.
         height=upscaled.size[1],
         width=upscaled.size[0],
         seed=int(params.get("seed", 0)),

     upscaled = upscale.realesrgan_2x(input_image, model_path=params["esrgan_model_path"])
+    # DiffSynth rounds height/width *up* to multiples of 16 when allocating noise,
+    # but its VAE rounds the encoded image *down* to the same modulus. If we hand it
+    # an upscaled PIL whose dims aren't already aligned, the two latents come back
+    # at different shapes and add_noise crashes (RuntimeError: tensor a vs b on dim 3).
+    # Crop to the floor-multiple-of-16 here so both paths land on the same shape.
+    w, h = upscaled.size
+    aligned_w, aligned_h = (w // 16) * 16, (h // 16) * 16
+    if (aligned_w, aligned_h) != (w, h):
+        upscaled = upscaled.crop((0, 0, aligned_w, aligned_h))
     _swap_transformer(pipe, "Turbo")
     kwargs: dict[str, Any] = dict(
         sigma_shift=3.0,
         input_image=upscaled,
         denoising_strength=float(params.get("refine_denoise", 0.33)),
         height=upscaled.size[1],
         width=upscaled.size[0],
         seed=int(params.get("seed", 0)),

tests/test_modes.py CHANGED Viewed

@@ -201,6 +201,39 @@ def test_upscale_runs_realesrgan_then_pipeline(fake_pipe, monkeypatch):
     assert meta["mode"] == "upscale"
 def test_upscale_rejects_missing_image(fake_pipe):
     with pytest.raises(ValueError):
         modes.call_upscale(

     assert meta["mode"] == "upscale"
+def test_upscale_crops_to_multiple_of_16(fake_pipe, monkeypatch):
+    """Regression: an upscaled image with non-aligned dims used to crash the pipeline
+    in add_noise because DiffSynth rounds height/width *up* to mod 16 for the noise
+    tensor while its VAE rounds *down* for the encoded latents. We crop to mod 16
+    before passing in, so both shapes agree."""
+    def fake_2x(img, model_path):
+        return Image.new("RGB", (1240, 728))  # 1240, 728 are NOT multiples of 16
+    monkeypatch.setattr(modes, "upscale", type("U", (), {"realesrgan_2x": staticmethod(fake_2x)}))
+    _out, meta = modes.call_upscale(
+        fake_pipe,
+        params=dict(
+            prompt="masterpiece, 8k",
+            input_image=Image.new("RGB", (620, 364)),
+            refine_steps=5,
+            refine_denoise=0.33,
+            seed=0,
+            lora_path=None,
+            lora_strength=0.0,
+            esrgan_model_path="/fake/path/RealESRGAN_x4plus.pth",
+        ),
+    )
+    kwargs = fake_pipe.call_args.kwargs
+    assert kwargs["width"] == 1232  # 1240 // 16 * 16
+    assert kwargs["height"] == 720  # 728 // 16 * 16
+    assert kwargs["input_image"].size == (1232, 720)
+    assert meta["width"] == 1232
+    assert meta["height"] == 720
 def test_upscale_rejects_missing_image(fake_pipe):
     with pytest.raises(ValueError):
         modes.call_upscale(