dippoo Claude Opus 4.5 committed on
Commit
f723987
·
1 Parent(s): 21323c4

Add multi-reference image support + fix missing python-dotenv

Browse files

- Add python-dotenv to requirements.txt (was missing, causing import error)
- Add multi-reference image support for img2img (character + pose reference)
- Add pose/style drop zone to UI with side-by-side layout
- Add multi-ref models: SeeDream Sequential, Kling O1, Qwen Multi-Angle
- Update wavespeed_provider to handle multiple image uploads

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

requirements.txt CHANGED
@@ -12,4 +12,5 @@ apscheduler>=3.10.0
12
  httpx>=0.26.0
13
  pyyaml>=6.0
14
  python-multipart>=0.0.6
 
15
  higgsfield-client>=0.1.0
 
12
  httpx>=0.26.0
13
  pyyaml>=6.0
14
  python-multipart>=0.0.6
15
+ python-dotenv>=1.0.0
16
  higgsfield-client>=0.1.0
src/content_engine/api/routes_generation.py CHANGED
@@ -289,6 +289,7 @@ async def cancel_job(job_id: str):
289
  @router.post("/generate/img2img", response_model=GenerationResponse)
290
  async def generate_img2img(
291
  image: UploadFile = File(...),
 
292
  positive_prompt: str = Form(""),
293
  negative_prompt: str = Form(""),
294
  character_id: str | None = Form(None),
@@ -309,12 +310,19 @@ async def generate_img2img(
309
  Supports both local (ComfyUI) and cloud (WaveSpeed edit) backends.
310
  - Local: denoise-based img2img via ComfyUI
311
  - Cloud: prompt-guided editing via SeeDream/NanoBanana Edit APIs
 
 
312
  """
313
  import json as json_module
314
 
315
  job_id = str(uuid.uuid4())
316
  image_bytes = await image.read()
317
 
 
 
 
 
 
318
  # Parse template variables
319
  try:
320
  variables = json_module.loads(variables_json) if variables_json else {}
@@ -330,6 +338,7 @@ async def generate_img2img(
330
  _run_cloud_img2img(
331
  job_id=job_id,
332
  image_bytes=image_bytes,
 
333
  positive_prompt=positive_prompt,
334
  model=checkpoint,
335
  content_rating=content_rating,
@@ -491,6 +500,7 @@ async def _run_cloud_img2img(
491
  *,
492
  job_id: str,
493
  image_bytes: bytes,
 
494
  positive_prompt: str,
495
  model: str | None,
496
  content_rating: str,
@@ -545,6 +555,7 @@ async def _run_cloud_img2img(
545
  result = await _wavespeed_provider.edit_image(
546
  prompt=final_prompt,
547
  image_bytes=image_bytes,
 
548
  model=model,
549
  size=size,
550
  )
 
289
  @router.post("/generate/img2img", response_model=GenerationResponse)
290
  async def generate_img2img(
291
  image: UploadFile = File(...),
292
+ image2: UploadFile | None = File(default=None),
293
  positive_prompt: str = Form(""),
294
  negative_prompt: str = Form(""),
295
  character_id: str | None = Form(None),
 
310
  Supports both local (ComfyUI) and cloud (WaveSpeed edit) backends.
311
  - Local: denoise-based img2img via ComfyUI
312
  - Cloud: prompt-guided editing via SeeDream/NanoBanana Edit APIs
313
+
314
+ Multi-reference: Pass a second image (pose/style reference) for models that support it.
315
  """
316
  import json as json_module
317
 
318
  job_id = str(uuid.uuid4())
319
  image_bytes = await image.read()
320
 
321
+ # Read second reference image if provided (for multi-ref models)
322
+ image_bytes_2 = None
323
+ if image2 is not None:
324
+ image_bytes_2 = await image2.read()
325
+
326
  # Parse template variables
327
  try:
328
  variables = json_module.loads(variables_json) if variables_json else {}
 
338
  _run_cloud_img2img(
339
  job_id=job_id,
340
  image_bytes=image_bytes,
341
+ image_bytes_2=image_bytes_2,
342
  positive_prompt=positive_prompt,
343
  model=checkpoint,
344
  content_rating=content_rating,
 
500
  *,
501
  job_id: str,
502
  image_bytes: bytes,
503
+ image_bytes_2: bytes | None,
504
  positive_prompt: str,
505
  model: str | None,
506
  content_rating: str,
 
555
  result = await _wavespeed_provider.edit_image(
556
  prompt=final_prompt,
557
  image_bytes=image_bytes,
558
+ image_bytes_2=image_bytes_2,
559
  model=model,
560
  size=size,
561
  )
src/content_engine/api/ui.html CHANGED
@@ -909,6 +909,12 @@ select { cursor: pointer; }
909
  <option value="higgsfield-soul">Higgsfield Soul (Faces)</option>
910
  <option value="gpt-image-1.5-edit">GPT Image 1.5 Edit</option>
911
  </optgroup>
 
 
 
 
 
 
912
  <optgroup label="NSFW Friendly">
913
  <option value="seedream-4-edit">SeeDream v4 Edit</option>
914
  <option value="wan-2.6-edit">WAN 2.6 Edit</option>
@@ -927,7 +933,7 @@ select { cursor: pointer; }
927
  </optgroup>
928
  </select>
929
  <div style="font-size:11px;color:var(--text-secondary);margin-top:4px">
930
- Upload a reference image and describe changes. The model preserves faces and poses.
931
  </div>
932
  </div>
933
 
@@ -1005,14 +1011,30 @@ select { cursor: pointer; }
1005
 
1006
  <!-- Reference image upload for img2img -->
1007
  <div id="img2img-section" style="display:none">
1008
- <div class="section-title">Reference Image</div>
1009
- <div class="drop-zone" id="ref-drop-zone" onclick="document.getElementById('ref-file-input').click()">
1010
- <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1.5" style="width:32px;height:32px;opacity:0.5;margin-bottom:8px"><path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/><polyline points="17 8 12 3 7 8"/><line x1="12" y1="3" x2="12" y2="15"/></svg>
1011
- <div>Drop image here or click to browse</div>
1012
- <div style="font-size:11px;margin-top:4px">PNG, JPG supported</div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1013
  </div>
1014
- <input type="file" id="ref-file-input" accept="image/*" style="display:none" onchange="handleRefImage(this)">
1015
- <label>Denoise Strength (0 = keep original, 1 = ignore reference)</label>
 
 
1016
  <div class="slider-row">
1017
  <input type="range" id="gen-denoise" min="0" max="1" step="0.05" value="0.65" oninput="this.nextElementSibling.textContent=this.value">
1018
  <span class="value">0.65</span>
@@ -1511,6 +1533,7 @@ let currentBatchId = null;
1511
  let batchPollInterval = null;
1512
  let trainingPollInterval = null;
1513
  let refImageFile = null;
 
1514
  let videoImageFile = null;
1515
  let trainImageFiles = [];
1516
  let trainCaptions = {}; // filename -> caption text
@@ -1663,7 +1686,7 @@ document.addEventListener('DOMContentLoaded', async () => {
1663
 
1664
  // --- Drop zone setup ---
1665
  function setupDropZones() {
1666
- ['ref-drop-zone', 'train-drop-zone', 'video-drop-zone'].forEach(id => {
1667
  const zone = document.getElementById(id);
1668
  if (!zone) return;
1669
  zone.addEventListener('dragover', e => { e.preventDefault(); zone.classList.add('dragover'); });
@@ -1676,6 +1699,9 @@ function setupDropZones() {
1676
  if (id === 'ref-drop-zone') {
1677
  refImageFile = file;
1678
  showRefPreview(file);
 
 
 
1679
  } else if (id === 'video-drop-zone') {
1680
  videoImageFile = file;
1681
  showVideoPreview(file);
@@ -1700,9 +1726,9 @@ function showRefPreview(file) {
1700
  const reader = new FileReader();
1701
  reader.onload = e => {
1702
  zone.innerHTML = `
1703
- <img src="${e.target.result}">
1704
- <div style="margin-top:6px;font-size:12px">${file.name}</div>
1705
- <button class="btn btn-secondary btn-small" onclick="event.stopPropagation();clearRefImage()" style="margin-top:8px">Remove</button>
1706
  `;
1707
  };
1708
  reader.readAsDataURL(file);
@@ -1713,13 +1739,46 @@ function clearRefImage() {
1713
  const zone = document.getElementById('ref-drop-zone');
1714
  zone.classList.remove('has-file');
1715
  zone.innerHTML = `
1716
- <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1.5" style="width:32px;height:32px;opacity:0.5;margin-bottom:8px"><path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/><polyline points="17 8 12 3 7 8"/><line x1="12" y1="3" x2="12" y2="15"/></svg>
1717
- <div>Drop image here or click to browse</div>
1718
- <div style="font-size:11px;margin-top:4px">PNG, JPG supported</div>
1719
  `;
1720
  document.getElementById('ref-file-input').value = '';
1721
  }
1722
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1723
  function handleVideoImage(input) {
1724
  if (input.files[0]) {
1725
  videoImageFile = input.files[0];
@@ -2158,6 +2217,10 @@ async function doGenerate() {
2158
  }
2159
  const formData = new FormData();
2160
  formData.append('image', refImageFile);
 
 
 
 
2161
  formData.append('positive_prompt', document.getElementById('gen-positive').value || '');
2162
  formData.append('negative_prompt', document.getElementById('gen-negative').value || '');
2163
  formData.append('content_rating', selectedRating);
 
909
  <option value="higgsfield-soul">Higgsfield Soul (Faces)</option>
910
  <option value="gpt-image-1.5-edit">GPT Image 1.5 Edit</option>
911
  </optgroup>
912
+ <optgroup label="Multi-Reference (2+ images)">
913
+ <option value="seedream-4.5-multi">SeeDream v4.5 Sequential (up to 3)</option>
914
+ <option value="seedream-4-multi">SeeDream v4 Sequential (up to 3)</option>
915
+ <option value="kling-o1-multi">Kling O1 (up to 10 refs)</option>
916
+ <option value="qwen-multi-angle">Qwen Multi-Angle</option>
917
+ </optgroup>
918
  <optgroup label="NSFW Friendly">
919
  <option value="seedream-4-edit">SeeDream v4 Edit</option>
920
  <option value="wan-2.6-edit">WAN 2.6 Edit</option>
 
933
  </optgroup>
934
  </select>
935
  <div style="font-size:11px;color:var(--text-secondary);margin-top:4px">
936
+ Single-ref models use character image. Multi-ref models combine both images for consistency.
937
  </div>
938
  </div>
939
 
 
1011
 
1012
  <!-- Reference image upload for img2img -->
1013
  <div id="img2img-section" style="display:none">
1014
+ <div style="display:grid;grid-template-columns:1fr 1fr;gap:12px">
1015
+ <div>
1016
+ <div class="section-title">Character Reference</div>
1017
+ <div class="drop-zone" id="ref-drop-zone" onclick="document.getElementById('ref-file-input').click()" style="min-height:140px">
1018
+ <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1.5" style="width:28px;height:28px;opacity:0.5;margin-bottom:6px"><path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/><polyline points="17 8 12 3 7 8"/><line x1="12" y1="3" x2="12" y2="15"/></svg>
1019
+ <div style="font-size:12px">Drop or click</div>
1020
+ <div style="font-size:10px;margin-top:2px;color:var(--text-secondary)">Main subject</div>
1021
+ </div>
1022
+ <input type="file" id="ref-file-input" accept="image/*" style="display:none" onchange="handleRefImage(this)">
1023
+ </div>
1024
+ <div>
1025
+ <div class="section-title">Pose/Style Reference <span style="font-weight:400;font-size:10px;color:var(--text-secondary)">(optional)</span></div>
1026
+ <div class="drop-zone" id="pose-drop-zone" onclick="document.getElementById('pose-file-input').click()" style="min-height:140px">
1027
+ <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1.5" style="width:28px;height:28px;opacity:0.5;margin-bottom:6px"><path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/><polyline points="17 8 12 3 7 8"/><line x1="12" y1="3" x2="12" y2="15"/></svg>
1028
+ <div style="font-size:12px">Drop or click</div>
1029
+ <div style="font-size:10px;margin-top:2px;color:var(--text-secondary)">Pose or style</div>
1030
+ </div>
1031
+ <input type="file" id="pose-file-input" accept="image/*" style="display:none" onchange="handlePoseImage(this)">
1032
+ </div>
1033
  </div>
1034
+ <div style="font-size:11px;color:var(--text-secondary);margin-top:8px">
1035
+ Multi-ref models (SeeDream Sequential, Kling O1) use both images for character consistency.
1036
+ </div>
1037
+ <label style="margin-top:12px">Denoise Strength (0 = keep original, 1 = ignore reference)</label>
1038
  <div class="slider-row">
1039
  <input type="range" id="gen-denoise" min="0" max="1" step="0.05" value="0.65" oninput="this.nextElementSibling.textContent=this.value">
1040
  <span class="value">0.65</span>
 
1533
  let batchPollInterval = null;
1534
  let trainingPollInterval = null;
1535
  let refImageFile = null;
1536
+ let poseImageFile = null;
1537
  let videoImageFile = null;
1538
  let trainImageFiles = [];
1539
  let trainCaptions = {}; // filename -> caption text
 
1686
 
1687
  // --- Drop zone setup ---
1688
  function setupDropZones() {
1689
+ ['ref-drop-zone', 'pose-drop-zone', 'train-drop-zone', 'video-drop-zone'].forEach(id => {
1690
  const zone = document.getElementById(id);
1691
  if (!zone) return;
1692
  zone.addEventListener('dragover', e => { e.preventDefault(); zone.classList.add('dragover'); });
 
1699
  if (id === 'ref-drop-zone') {
1700
  refImageFile = file;
1701
  showRefPreview(file);
1702
+ } else if (id === 'pose-drop-zone') {
1703
+ poseImageFile = file;
1704
+ showPosePreview(file);
1705
  } else if (id === 'video-drop-zone') {
1706
  videoImageFile = file;
1707
  showVideoPreview(file);
 
1726
  const reader = new FileReader();
1727
  reader.onload = e => {
1728
  zone.innerHTML = `
1729
+ <img src="${e.target.result}" style="max-height:100px;max-width:100%;border-radius:4px">
1730
+ <div style="margin-top:4px;font-size:11px">${file.name.substring(0,15)}${file.name.length > 15 ? '...' : ''}</div>
1731
+ <button class="btn btn-secondary btn-small" onclick="event.stopPropagation();clearRefImage()" style="margin-top:4px;padding:2px 8px;font-size:10px">Remove</button>
1732
  `;
1733
  };
1734
  reader.readAsDataURL(file);
 
1739
  const zone = document.getElementById('ref-drop-zone');
1740
  zone.classList.remove('has-file');
1741
  zone.innerHTML = `
1742
+ <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1.5" style="width:28px;height:28px;opacity:0.5;margin-bottom:6px"><path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/><polyline points="17 8 12 3 7 8"/><line x1="12" y1="3" x2="12" y2="15"/></svg>
1743
+ <div style="font-size:12px">Drop or click</div>
1744
+ <div style="font-size:10px;margin-top:2px;color:var(--text-secondary)">Main subject</div>
1745
  `;
1746
  document.getElementById('ref-file-input').value = '';
1747
  }
1748
 
1749
+ function handlePoseImage(input) {
1750
+ if (input.files[0]) {
1751
+ poseImageFile = input.files[0];
1752
+ showPosePreview(poseImageFile);
1753
+ }
1754
+ }
1755
+
1756
+ function showPosePreview(file) {
1757
+ const zone = document.getElementById('pose-drop-zone');
1758
+ zone.classList.add('has-file');
1759
+ const reader = new FileReader();
1760
+ reader.onload = e => {
1761
+ zone.innerHTML = `
1762
+ <img src="${e.target.result}" style="max-height:100px;max-width:100%;border-radius:4px">
1763
+ <div style="margin-top:4px;font-size:11px">${file.name.substring(0,15)}${file.name.length > 15 ? '...' : ''}</div>
1764
+ <button class="btn btn-secondary btn-small" onclick="event.stopPropagation();clearPoseImage()" style="margin-top:4px;padding:2px 8px;font-size:10px">Remove</button>
1765
+ `;
1766
+ };
1767
+ reader.readAsDataURL(file);
1768
+ }
1769
+
1770
+ function clearPoseImage() {
1771
+ poseImageFile = null;
1772
+ const zone = document.getElementById('pose-drop-zone');
1773
+ zone.classList.remove('has-file');
1774
+ zone.innerHTML = `
1775
+ <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1.5" style="width:28px;height:28px;opacity:0.5;margin-bottom:6px"><path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/><polyline points="17 8 12 3 7 8"/><line x1="12" y1="3" x2="12" y2="15"/></svg>
1776
+ <div style="font-size:12px">Drop or click</div>
1777
+ <div style="font-size:10px;margin-top:2px;color:var(--text-secondary)">Pose or style</div>
1778
+ `;
1779
+ document.getElementById('pose-file-input').value = '';
1780
+ }
1781
+
1782
  function handleVideoImage(input) {
1783
  if (input.files[0]) {
1784
  videoImageFile = input.files[0];
 
2217
  }
2218
  const formData = new FormData();
2219
  formData.append('image', refImageFile);
2220
+ // Add pose/style reference image if provided (for multi-ref models)
2221
+ if (poseImageFile) {
2222
+ formData.append('image2', poseImageFile);
2223
+ }
2224
  formData.append('positive_prompt', document.getElementById('gen-positive').value || '');
2225
  formData.append('negative_prompt', document.getElementById('gen-negative').value || '');
2226
  formData.append('content_rating', selectedRating);
src/content_engine/services/cloud_providers/wavespeed_provider.py CHANGED
@@ -103,6 +103,9 @@ EDIT_MODEL_MAP = {
103
  # SeeDream Edit (ByteDance) - NSFW OK
104
  "seedream-4.5-edit": "bytedance/seedream-v4.5/edit",
105
  "seedream-4-edit": "bytedance/seedream-v4/edit",
 
 
 
106
  # WAN Edit (Alibaba)
107
  "wan-2.6-edit": "alibaba/wan-2.6/image-edit",
108
  "wan-2.5-edit": "alibaba/wan-2.5/image-edit",
@@ -126,6 +129,27 @@ EDIT_MODEL_MAP = {
126
  "default": "bytedance/seedream-v4.5/edit",
127
  }
128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  WAVESPEED_API_BASE = "https://api.wavespeed.ai/api/v3"
130
 
131
 
@@ -153,6 +177,9 @@ class WaveSpeedProvider(CloudProvider):
153
  """Resolve a friendly name to a WaveSpeed edit model API path."""
154
  if model_name and model_name in EDIT_MODEL_MAP:
155
  return EDIT_MODEL_MAP[model_name]
 
 
 
156
  if model_name:
157
  return model_name
158
  return EDIT_MODEL_MAP["default"]
@@ -430,23 +457,38 @@ class WaveSpeedProvider(CloudProvider):
430
  *,
431
  prompt: str,
432
  image_bytes: bytes,
 
433
  model: str | None = None,
434
  size: str | None = None,
435
  ) -> CloudGenerationResult:
436
- """Full edit flow: upload image to temp host, call edit API, download result."""
 
 
 
 
 
 
 
 
437
  start = time.time()
438
 
439
  # WaveSpeed edit APIs require minimum image size (3686400 pixels = ~1920x1920)
440
  # Auto-upscale small images to meet the requirement
441
  image_bytes = self._ensure_min_image_size(image_bytes, min_pixels=3686400)
442
 
443
- # Upload reference image to a public URL
444
- image_url = await self._upload_temp_image(image_bytes)
 
 
 
 
 
 
445
 
446
  # Submit edit job
447
  job_id = await self.submit_edit(
448
  prompt=prompt,
449
- image_urls=[image_url],
450
  model=model,
451
  size=size,
452
  )
 
103
  # SeeDream Edit (ByteDance) - NSFW OK
104
  "seedream-4.5-edit": "bytedance/seedream-v4.5/edit",
105
  "seedream-4-edit": "bytedance/seedream-v4/edit",
106
+ # SeeDream Multi-Image (Character Consistency across images)
107
+ "seedream-4.5-multi": "bytedance/seedream-v4.5/edit-sequential",
108
+ "seedream-4-multi": "bytedance/seedream-v4/edit-sequential",
109
  # WAN Edit (Alibaba)
110
  "wan-2.6-edit": "alibaba/wan-2.6/image-edit",
111
  "wan-2.5-edit": "alibaba/wan-2.5/image-edit",
 
129
  "default": "bytedance/seedream-v4.5/edit",
130
  }
131
 
132
+ # Models that support multiple reference images
133
+ MULTI_REF_MODELS = {
134
+ # SeeDream Sequential (up to 3 images for character consistency)
135
+ "seedream-4.5-multi": "bytedance/seedream-v4.5/edit-sequential",
136
+ "seedream-4-multi": "bytedance/seedream-v4/edit-sequential",
137
+ # Kling O1 (up to 10 reference images)
138
+ "kling-o1-multi": "kwaivgi/kling-o1/image-to-image",
139
+ # Qwen Multi-Angle (multiple angles of same subject)
140
+ "qwen-multi-angle": "wavespeed-ai/qwen-image/edit-multiple-angles",
141
+ }
142
+
143
+ # Reference-to-Video models (character + pose reference)
144
+ REF_TO_VIDEO_MAP = {
145
+ # WAN 2.6 Reference-to-Video (multi-view identity consistency)
146
+ "wan-2.6-ref": "alibaba/wan-2.6/reference-to-video",
147
+ "wan-2.6-ref-flash": "alibaba/wan-2.6/reference-to-video-flash",
148
+ # Kling O3 Reference-to-Video
149
+ "kling-o3-ref": "kwaivgi/kling-video-o3-pro/reference-to-video",
150
+ "kling-o3-std-ref": "kwaivgi/kling-video-o3-std/reference-to-video",
151
+ }
152
+
153
  WAVESPEED_API_BASE = "https://api.wavespeed.ai/api/v3"
154
 
155
 
 
177
  """Resolve a friendly name to a WaveSpeed edit model API path."""
178
  if model_name and model_name in EDIT_MODEL_MAP:
179
  return EDIT_MODEL_MAP[model_name]
180
+ # Check multi-reference models
181
+ if model_name and model_name in MULTI_REF_MODELS:
182
+ return MULTI_REF_MODELS[model_name]
183
  if model_name:
184
  return model_name
185
  return EDIT_MODEL_MAP["default"]
 
457
  *,
458
  prompt: str,
459
  image_bytes: bytes,
460
+ image_bytes_2: bytes | None = None,
461
  model: str | None = None,
462
  size: str | None = None,
463
  ) -> CloudGenerationResult:
464
+ """Full edit flow: upload image(s) to temp host, call edit API, download result.
465
+
466
+ Args:
467
+ prompt: The edit prompt
468
+ image_bytes: Primary reference image (character/subject)
469
+ image_bytes_2: Optional second reference image (pose/style reference)
470
+ model: Model name (some models support multiple references)
471
+ size: Output size (widthxheight)
472
+ """
473
  start = time.time()
474
 
475
  # WaveSpeed edit APIs require minimum image size (3686400 pixels = ~1920x1920)
476
  # Auto-upscale small images to meet the requirement
477
  image_bytes = self._ensure_min_image_size(image_bytes, min_pixels=3686400)
478
 
479
+ # Upload reference image(s) to public URLs
480
+ image_urls = [await self._upload_temp_image(image_bytes)]
481
+
482
+ # Upload second reference if provided (for multi-ref models)
483
+ if image_bytes_2:
484
+ image_bytes_2 = self._ensure_min_image_size(image_bytes_2, min_pixels=3686400)
485
+ image_urls.append(await self._upload_temp_image(image_bytes_2))
486
+ logger.info("Multi-reference edit: uploading 2 images for model=%s", model)
487
 
488
  # Submit edit job
489
  job_id = await self.submit_edit(
490
  prompt=prompt,
491
+ image_urls=image_urls,
492
  model=model,
493
  size=size,
494
  )