Spaces:
Running on Zero
Running on Zero
fix(upscale): crop upscaled image to multiple-of-16 dims before refinement
Browse files
DiffSynth allocates the noise tensor at ceil-multiple-of-16 but its VAE
encodes at floor-multiple-of-16, so a 1240x728 ESRGAN output produced a
156-wide noise latent against a 154-wide input latent and add_noise
crashed with 'tensor a (154) must match tensor b (156) at dim 3'.
Floor-crop to mod 16 in modes.call_upscale so both paths agree.
- modes.py +10 -3
- tests/test_modes.py +33 -0
modes.py
CHANGED
|
@@ -153,6 +153,16 @@ def call_upscale(pipe: Any, params: dict[str, Any]) -> tuple[Image.Image, dict[s
|
|
| 153 |
|
| 154 |
upscaled = upscale.realesrgan_2x(input_image, model_path=params["esrgan_model_path"])
|
| 155 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
_swap_transformer(pipe, "Turbo")
|
| 157 |
|
| 158 |
kwargs: dict[str, Any] = dict(
|
|
@@ -162,9 +172,6 @@ def call_upscale(pipe: Any, params: dict[str, Any]) -> tuple[Image.Image, dict[s
|
|
| 162 |
sigma_shift=3.0,
|
| 163 |
input_image=upscaled,
|
| 164 |
denoising_strength=float(params.get("refine_denoise", 0.33)),
|
| 165 |
-
# Track the upscaled image's dims so the noise initializer builds latents of
|
| 166 |
-
# the same shape as the VAE-encoded input_image. Otherwise DiffSynth defaults
|
| 167 |
-
# height/width to 1024 and add_noise crashes on a shape mismatch.
|
| 168 |
height=upscaled.size[1],
|
| 169 |
width=upscaled.size[0],
|
| 170 |
seed=int(params.get("seed", 0)),
|
|
|
|
| 153 |
|
| 154 |
upscaled = upscale.realesrgan_2x(input_image, model_path=params["esrgan_model_path"])
|
| 155 |
|
| 156 |
+
# DiffSynth rounds height/width *up* to multiples of 16 when allocating noise,
|
| 157 |
+
# but its VAE rounds the encoded image *down* to the same modulus. If we hand it
|
| 158 |
+
# an upscaled PIL whose dims aren't already aligned, the two latents come back
|
| 159 |
+
# at different shapes and add_noise crashes (RuntimeError: tensor a vs b on dim 3).
|
| 160 |
+
# Crop to the floor-multiple-of-16 here so both paths land on the same shape.
|
| 161 |
+
w, h = upscaled.size
|
| 162 |
+
aligned_w, aligned_h = (w // 16) * 16, (h // 16) * 16
|
| 163 |
+
if (aligned_w, aligned_h) != (w, h):
|
| 164 |
+
upscaled = upscaled.crop((0, 0, aligned_w, aligned_h))
|
| 165 |
+
|
| 166 |
_swap_transformer(pipe, "Turbo")
|
| 167 |
|
| 168 |
kwargs: dict[str, Any] = dict(
|
|
|
|
| 172 |
sigma_shift=3.0,
|
| 173 |
input_image=upscaled,
|
| 174 |
denoising_strength=float(params.get("refine_denoise", 0.33)),
|
|
|
|
|
|
|
|
|
|
| 175 |
height=upscaled.size[1],
|
| 176 |
width=upscaled.size[0],
|
| 177 |
seed=int(params.get("seed", 0)),
|
tests/test_modes.py
CHANGED
|
@@ -201,6 +201,39 @@ def test_upscale_runs_realesrgan_then_pipeline(fake_pipe, monkeypatch):
|
|
| 201 |
assert meta["mode"] == "upscale"
|
| 202 |
|
| 203 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
def test_upscale_rejects_missing_image(fake_pipe):
|
| 205 |
with pytest.raises(ValueError):
|
| 206 |
modes.call_upscale(
|
|
|
|
| 201 |
assert meta["mode"] == "upscale"
|
| 202 |
|
| 203 |
|
| 204 |
+
def test_upscale_crops_to_multiple_of_16(fake_pipe, monkeypatch):
|
| 205 |
+
"""Regression: an upscaled image with non-aligned dims used to crash the pipeline
|
| 206 |
+
in add_noise because DiffSynth rounds height/width *up* to mod 16 for the noise
|
| 207 |
+
tensor while its VAE rounds *down* for the encoded latents. We crop to mod 16
|
| 208 |
+
before passing in, so both shapes agree."""
|
| 209 |
+
|
| 210 |
+
def fake_2x(img, model_path):
|
| 211 |
+
return Image.new("RGB", (1240, 728)) # 1240, 728 are NOT multiples of 16
|
| 212 |
+
|
| 213 |
+
monkeypatch.setattr(modes, "upscale", type("U", (), {"realesrgan_2x": staticmethod(fake_2x)}))
|
| 214 |
+
|
| 215 |
+
_out, meta = modes.call_upscale(
|
| 216 |
+
fake_pipe,
|
| 217 |
+
params=dict(
|
| 218 |
+
prompt="masterpiece, 8k",
|
| 219 |
+
input_image=Image.new("RGB", (620, 364)),
|
| 220 |
+
refine_steps=5,
|
| 221 |
+
refine_denoise=0.33,
|
| 222 |
+
seed=0,
|
| 223 |
+
lora_path=None,
|
| 224 |
+
lora_strength=0.0,
|
| 225 |
+
esrgan_model_path="/fake/path/RealESRGAN_x4plus.pth",
|
| 226 |
+
),
|
| 227 |
+
)
|
| 228 |
+
|
| 229 |
+
kwargs = fake_pipe.call_args.kwargs
|
| 230 |
+
assert kwargs["width"] == 1232 # 1240 // 16 * 16
|
| 231 |
+
assert kwargs["height"] == 720 # 728 // 16 * 16
|
| 232 |
+
assert kwargs["input_image"].size == (1232, 720)
|
| 233 |
+
assert meta["width"] == 1232
|
| 234 |
+
assert meta["height"] == 720
|
| 235 |
+
|
| 236 |
+
|
| 237 |
def test_upscale_rejects_missing_image(fake_pipe):
|
| 238 |
with pytest.raises(ValueError):
|
| 239 |
modes.call_upscale(
|