techfreakworm committed on
Commit
9514256
·
unverified ·
1 Parent(s): dc32ce0

fix(upscale): crop upscaled image to multiple-of-16 dims before refinement

Browse files

DiffSynth allocates the noise tensor at ceil-multiple-of-16 but its VAE
encodes at floor-multiple-of-16, so a 1240x728 ESRGAN output produced a
156-wide noise latent against a 154-wide input latent and add_noise
crashed with 'tensor a (154) must match tensor b (156) at dim 3'.
Floor-crop to mod 16 in modes.call_upscale so both paths agree.

Files changed (2) hide show
  1. modes.py +10 -3
  2. tests/test_modes.py +33 -0
modes.py CHANGED
@@ -153,6 +153,16 @@ def call_upscale(pipe: Any, params: dict[str, Any]) -> tuple[Image.Image, dict[s
153
 
154
  upscaled = upscale.realesrgan_2x(input_image, model_path=params["esrgan_model_path"])
155
 
 
 
 
 
 
 
 
 
 
 
156
  _swap_transformer(pipe, "Turbo")
157
 
158
  kwargs: dict[str, Any] = dict(
@@ -162,9 +172,6 @@ def call_upscale(pipe: Any, params: dict[str, Any]) -> tuple[Image.Image, dict[s
162
  sigma_shift=3.0,
163
  input_image=upscaled,
164
  denoising_strength=float(params.get("refine_denoise", 0.33)),
165
- # Track the upscaled image's dims so the noise initializer builds latents of
166
- # the same shape as the VAE-encoded input_image. Otherwise DiffSynth defaults
167
- # height/width to 1024 and add_noise crashes on a shape mismatch.
168
  height=upscaled.size[1],
169
  width=upscaled.size[0],
170
  seed=int(params.get("seed", 0)),
 
153
 
154
  upscaled = upscale.realesrgan_2x(input_image, model_path=params["esrgan_model_path"])
155
 
156
+ # DiffSynth rounds height/width *up* to multiples of 16 when allocating noise,
157
+ # but its VAE rounds the encoded image *down* to the same modulus. If we hand it
158
+ # an upscaled PIL whose dims aren't already aligned, the two latents come back
159
+ # at different shapes and add_noise crashes (RuntimeError: tensor a vs b on dim 3).
160
+ # Crop to the floor-multiple-of-16 here so both paths land on the same shape.
161
+ w, h = upscaled.size
162
+ aligned_w, aligned_h = (w // 16) * 16, (h // 16) * 16
163
+ if (aligned_w, aligned_h) != (w, h):
164
+ upscaled = upscaled.crop((0, 0, aligned_w, aligned_h))
165
+
166
  _swap_transformer(pipe, "Turbo")
167
 
168
  kwargs: dict[str, Any] = dict(
 
172
  sigma_shift=3.0,
173
  input_image=upscaled,
174
  denoising_strength=float(params.get("refine_denoise", 0.33)),
 
 
 
175
  height=upscaled.size[1],
176
  width=upscaled.size[0],
177
  seed=int(params.get("seed", 0)),
tests/test_modes.py CHANGED
@@ -201,6 +201,39 @@ def test_upscale_runs_realesrgan_then_pipeline(fake_pipe, monkeypatch):
201
  assert meta["mode"] == "upscale"
202
 
203
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
  def test_upscale_rejects_missing_image(fake_pipe):
205
  with pytest.raises(ValueError):
206
  modes.call_upscale(
 
201
  assert meta["mode"] == "upscale"
202
 
203
 
204
+ def test_upscale_crops_to_multiple_of_16(fake_pipe, monkeypatch):
205
+ """Regression: an upscaled image with non-aligned dims used to crash the pipeline
206
+ in add_noise because DiffSynth rounds height/width *up* to mod 16 for the noise
207
+ tensor while its VAE rounds *down* for the encoded latents. We crop to mod 16
208
+ before passing in, so both shapes agree."""
209
+
210
+ def fake_2x(img, model_path):
211
+ return Image.new("RGB", (1240, 728)) # 1240, 728 are NOT multiples of 16
212
+
213
+ monkeypatch.setattr(modes, "upscale", type("U", (), {"realesrgan_2x": staticmethod(fake_2x)}))
214
+
215
+ _out, meta = modes.call_upscale(
216
+ fake_pipe,
217
+ params=dict(
218
+ prompt="masterpiece, 8k",
219
+ input_image=Image.new("RGB", (620, 364)),
220
+ refine_steps=5,
221
+ refine_denoise=0.33,
222
+ seed=0,
223
+ lora_path=None,
224
+ lora_strength=0.0,
225
+ esrgan_model_path="/fake/path/RealESRGAN_x4plus.pth",
226
+ ),
227
+ )
228
+
229
+ kwargs = fake_pipe.call_args.kwargs
230
+ assert kwargs["width"] == 1232 # 1240 // 16 * 16
231
+ assert kwargs["height"] == 720 # 728 // 16 * 16
232
+ assert kwargs["input_image"].size == (1232, 720)
233
+ assert meta["width"] == 1232
234
+ assert meta["height"] == 720
235
+
236
+
237
  def test_upscale_rejects_missing_image(fake_pipe):
238
  with pytest.raises(ValueError):
239
  modes.call_upscale(