vikashmakeit committed
Commit 70dedbc · verified · 1 parent: a956c3b

Fix VLM models: use actual vision models with correct providers (Qwen2.5-VL, Gemma-4 via novita, Kimi via fireworks-ai)

Files changed (1): app.py (+52 -110)
app.py CHANGED
@@ -65,10 +65,17 @@ Apply the edit and return ONLY the complete updated JSON (no markdown, no explan
 }}
 }}"""

+# Actual Vision-Language Models with correct providers
 VISION_MODELS = [
-    ("Qwen/Qwen3.5-9B", "together", "Qwen 3.5 9B"),
-    ("google/gemma-4-31B-it", "together", "Gemma 4 31B"),
-    ("moonshotai/Kimi-K2.5", "together", "Kimi K2.5"),
+    ("Qwen/Qwen2.5-VL-72B-Instruct", "together", "Qwen2.5-VL 72B"),
+    ("google/gemma-4-31B-it", "novita", "Gemma 4 31B"),
+    ("moonshotai/Kimi-K2.5", "fireworks-ai", "Kimi K2.5"),
+]
+# Text-only models (for chat edit without images)
+TEXT_MODELS = [
+    ("Qwen/Qwen2.5-VL-72B-Instruct", "together", "Qwen2.5-VL 72B"),
+    ("google/gemma-4-31B-it", "novita", "Gemma 4 31B"),
+    ("moonshotai/Kimi-K2.5", "fireworks-ai", "Kimi K2.5"),
 ]

 def _extract_response_text(message):
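The new (model, provider) pairs route through HF Inference Providers rather than a single provider. A minimal smoke-test sketch — assuming huggingface_hub >= 0.28, a valid HF_TOKEN, and that each listed provider actually serves the model; this is not the app's request path:

import os
from huggingface_hub import InferenceClient

VISION_MODELS = [
    ("Qwen/Qwen2.5-VL-72B-Instruct", "together", "Qwen2.5-VL 72B"),
    ("google/gemma-4-31B-it", "novita", "Gemma 4 31B"),
    ("moonshotai/Kimi-K2.5", "fireworks-ai", "Kimi K2.5"),
]

for model_id, provider, display_name in VISION_MODELS:
    # One client per provider; the HF token is exchanged for provider access.
    client = InferenceClient(provider=provider, api_key=os.environ["HF_TOKEN"])
    try:
        out = client.chat.completions.create(
            model=model_id,
            messages=[{"role": "user", "content": "Reply with the word ok."}],
            max_tokens=4,
        )
        print(f"[smoke] {display_name}: {out.choices[0].message.content!r}")
    except Exception as e:
        print(f"[smoke] {display_name} failed: {e}")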
@@ -104,7 +111,7 @@ def _call_vlm(messages, timeout=180):
                 analysis['_model_used'] = display_name
                 print(f"[VLM] OK: {display_name} detected {analysis.get('garment_type','?')}")
                 return analysis
-            else: print(f"[VLM] {display_name}: HTTP {response.status_code}")
+            else: print(f"[VLM] {display_name}: HTTP {response.status_code} - {response.text[:200]}")
         except Exception as e:
             print(f"[VLM] {display_name} failed: {e}"); continue
     return None
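Logging response.text[:200] surfaces the provider's error body (quota exhausted, model not served by that provider, malformed payload) that the old log discarded. A sketch of the requests-style call shape this implies — API_URL here is illustrative, since the hunk does not show the app's actual endpoint:

import os
import requests

API_URL = "https://router.huggingface.co/v1/chat/completions"  # illustrative, not taken from the diff

def post_chat(payload: dict, display_name: str, timeout: int = 180):
    headers = {"Authorization": f"Bearer {os.environ['HF_TOKEN']}"}
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    if response.status_code == 200:
        return response.json()
    # The first ~200 chars of the body usually name the real cause.
    print(f"[VLM] {display_name}: HTTP {response.status_code} - {response.text[:200]}")
    return None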
@@ -174,8 +181,7 @@ def process_image(image, garment_type_override="Auto-detect"):
         p2d, p3d, summary, j = _generate_all_outputs(analysis)
         return p2d, p3d, summary, j, []
     except Exception as e:
-        traceback.print_exc()
-        return None, None, f"Error: {e}", "{}", []
+        traceback.print_exc(); return None, None, f"Error: {e}", "{}", []

 def process_text(description):
     if not description.strip(): return None, None, "Enter a description.", "{}", []
@@ -187,8 +193,7 @@ def process_text(description):
     if analysis is None:
         desc_lower = description.lower()
         for gt in ['hoodie','jacket','coat','blazer','dress','skirt','pants','trousers','jeans','vest','shirt','blouse','top']:
-            if gt in desc_lower:
-                analysis = get_default_analysis(gt); analysis['description'] = description; break
+            if gt in desc_lower: analysis = get_default_analysis(gt); analysis['description'] = description; break
     if analysis is None: analysis = get_default_analysis("shirt"); analysis['description'] = description
     _current_analysis["data"] = copy.deepcopy(analysis)
     try:
@@ -215,8 +220,7 @@ def chat_edit(message, history):
     try: updated = _call_vlm([{"role": "user", "content": edit_prompt}], timeout=90)
     except: pass
     if updated is None:
-        updated = copy.deepcopy(current)
-        msg_lower = message.lower()
+        updated = copy.deepcopy(current); msg_lower = message.lower()
         if "long sleeve" in msg_lower: updated['measurements']['sleeve_length'] = 65
         elif "short sleeve" in msg_lower: updated['measurements']['sleeve_length'] = 25
         if "no collar" in msg_lower: updated['features']['has_collar'] = False; updated['features']['collar_type'] = 'none'
@@ -243,92 +247,43 @@ def chat_edit(message, history):
     history = history or []; history.append((message, ai_msg))
     return history, p2d, p3d, summary, j

-# ── Agentic Refinement ──────────────────────────────────────────────────────
 def run_refinement(image, garment_type_override, max_iters):
-    """Run the agentic refinement loop."""
     if image is None:
-        yield None, None, None, "Please upload a garment image.", "{}", None
-        return
-
-    # Step 1: Initial VLM analysis
+        yield None, None, None, "Please upload a garment image.", "{}", None; return
     analysis = None
-    try:
-        analysis = analyze_with_vlm(image)
-    except Exception as e:
-        print(f"VLM failed: {e}")
-
+    try: analysis = analyze_with_vlm(image)
+    except Exception as e: print(f"VLM failed: {e}")
     if analysis is None:
         gt = garment_type_override.lower() if garment_type_override != "Auto-detect" else "shirt"
         analysis = get_default_analysis(gt)
-
-    if garment_type_override != "Auto-detect":
-        analysis['garment_type'] = garment_type_override.lower()
-
-    # Generate function for the loop
-    def gen_fn(a):
-        return _generate_all_outputs(a)
-
-    # Run refinement loop
-    max_iters = int(max_iters)
-    result = refinement_loop(
-        original_image=image,
-        initial_analysis=analysis,
-        generate_fn=gen_fn,
-        max_iterations=max_iters,
-        target_composite=0.82,
-        plateau_patience=3,
-        lr=0.7,
-    )
-
-    # Build log markdown
-    log_lines = [f"## Refinement Results\n"]
-    log_lines.append(f"**Converged:** {'✅ Yes' if result['converged'] else '❌ No'}")
-    log_lines.append(f"**Iterations:** {result['total_iterations']}")
-    log_lines.append(f"**Best Score:** {result['best_score']:.4f}")
-    if result['scores']:
-        log_lines.append(f"**Score progression:** {' → '.join(f'{s:.3f}' for s in result['scores'])}")
+    if garment_type_override != "Auto-detect": analysis['garment_type'] = garment_type_override.lower()
+    def gen_fn(a): return _generate_all_outputs(a)
+    result = refinement_loop(original_image=image, initial_analysis=analysis, generate_fn=gen_fn,
+                             max_iterations=int(max_iters), target_composite=0.82, plateau_patience=3, lr=0.7)
+    log_lines = [f"## Refinement Results\n", f"**Converged:** {'✅ Yes' if result['converged'] else '❌ No'}",
+                 f"**Iterations:** {result['total_iterations']}", f"**Best Score:** {result['best_score']:.4f}"]
+    if result['scores']: log_lines.append(f"**Scores:** {' → '.join(f'{s:.3f}' for s in result['scores'])}")
     log_lines.append("")
-
     for step in result['history']:
-        it = step['iteration']
-        status = step.get('status', '?')
-        metrics = step.get('metrics', {})
+        it, status, metrics = step['iteration'], step.get('status','?'), step.get('metrics',{})
         log_lines.append(f"### Iteration {it} — {status}")
-        if metrics:
-            log_lines.append(f"SSIM={metrics.get('ssim',0):.3f} | Edge={metrics.get('edge_ssim',0):.3f} | Composite={metrics.get('composite',0):.3f}")
-        if step.get('new_best'):
-            log_lines.append("⭐ **New best!**")
+        if metrics: log_lines.append(f"SSIM={metrics.get('ssim',0):.3f} | Edge={metrics.get('edge_ssim',0):.3f} | Composite={metrics.get('composite',0):.3f}")
+        if step.get('new_best'): log_lines.append("⭐ **New best!**")
         diffs = step.get('vlm_differences', [])
-        if diffs:
-            log_lines.append("**Differences:** " + "; ".join(diffs[:3]))
+        if diffs: log_lines.append("**Differences:** " + "; ".join(diffs[:3]))
         adj = step.get('adjustments', {})
-        if adj:
-            log_lines.append("**Adjustments:** " + ", ".join(f"{k}={v}" for k, v in adj.items()))
+        if adj: log_lines.append("**Adjustments:** " + ", ".join(f"{k}={v}" for k, v in adj.items()))
         reason = step.get('reason', '')
-        if reason:
-            log_lines.append(f"*{reason}*")
+        if reason: log_lines.append(f"*{reason}*")
         log_lines.append("")
-
-    log_md = "\n".join(log_lines)
-
-    # Get best outputs
-    best = result['best_analysis']
-    _current_analysis["data"] = copy.deepcopy(best)
-    try:
-        p2d, p3d, summary, j = _generate_all_outputs(best)
-    except:
-        p2d, p3d, summary, j = None, None, "Error generating final outputs", "{}"
-
-    # Get last projection
+    best = result['best_analysis']; _current_analysis["data"] = copy.deepcopy(best)
+    try: p2d, p3d, summary, j = _generate_all_outputs(best)
+    except: p2d, p3d, summary, j = None, None, "Error", "{}"
     last_proj = None
     for step in reversed(result['history']):
-        if 'projection' in step:
-            last_proj = step['projection']
-            break
+        if 'projection' in step: last_proj = step['projection']; break
+    yield p2d, p3d, last_proj, "\n".join(log_lines), j, summary

-    yield p2d, p3d, last_proj, log_md, j, summary
-
-# ── UI ──────────────────────────────────────────────────────────────────────
 CSS = """
 .main-header { text-align: center; margin-bottom: 20px; }
 .info-box { padding: 15px; border-radius: 10px; background: #f0f7ff; border: 1px solid #cce0ff; margin: 10px 0; }
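refinement_loop itself is imported elsewhere and not part of this diff; its keyword arguments and the log fields read back in run_refinement imply a damped, keep-best loop roughly like this sketch (render_front_view, compare_images, and vlm_suggest_adjustments are hypothetical stand-ins, not the repo's functions):

import copy

def refinement_loop(original_image, initial_analysis, generate_fn, max_iterations=8,
                    target_composite=0.82, plateau_patience=3, lr=0.7):
    analysis = copy.deepcopy(initial_analysis)
    best, best_score, history, scores, stale = analysis, 0.0, [], [], 0
    for it in range(1, max_iterations + 1):
        outputs = generate_fn(analysis)                       # 2D pattern + 3D garment
        projection = render_front_view(outputs)               # hypothetical helper
        metrics = compare_images(original_image, projection)  # hypothetical: ssim / edge_ssim / composite
        score = metrics["composite"]; scores.append(score)
        step = {"iteration": it, "status": "scored", "metrics": metrics, "projection": projection}
        if score > best_score:
            best, best_score, stale = copy.deepcopy(analysis), score, 0
            step["new_best"] = True
        else:
            stale += 1
        history.append(step)
        if score >= target_composite or stale >= plateau_patience:
            break
        # Damped update: move each measurement only lr of the way toward the suggestion.
        for key, suggested in vlm_suggest_adjustments(original_image, projection).items():  # hypothetical
            current = analysis["measurements"].get(key, suggested)
            analysis["measurements"][key] = current + lr * (suggested - current)
    return {"converged": best_score >= target_composite, "total_iterations": len(history),
            "best_score": best_score, "scores": scores, "history": history, "best_analysis": best}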
@@ -336,15 +291,11 @@ CSS = """
 """

 with gr.Blocks(title="Garment Pattern Studio") as demo:
-    gr.HTML("""<div class="main-header">
-    <h1>🧵 Garment Pattern Studio</h1>
-    <p style="font-size: 1.1em; color: #555;">Analyze garments, edit with chat, preview in 3D, refine with AI agent</p>
-    </div>
-    <div class="info-box">
-    <b>Powered by:</b> Qwen 3.5 · Gemma 4 · Kimi K2.5 via
+    gr.HTML("""<div class="main-header"><h1>🧵 Garment Pattern Studio</h1>
+    <p style="font-size:1.1em;color:#555;">Analyze garments, edit with chat, preview in 3D, refine with AI agent</p></div>
+    <div class="info-box"><b>Powered by:</b> Qwen2.5-VL 72B · Gemma 4 31B · Kimi K2.5 via
     <a href="https://huggingface.co/docs/inference-providers">HF Inference Providers</a>
-    &nbsp;|&nbsp; <b>3D view built from actual 2D pattern pieces</b>
-    </div>""")
+    &nbsp;|&nbsp; <b>3D view built from actual 2D pattern pieces</b></div>""")

     with gr.Tab("📸 From Image"):
         with gr.Row():
@@ -365,14 +316,7 @@ with gr.Blocks(title="Garment Pattern Studio") as demo:
         with gr.Column(scale=1):
             text_input = gr.Textbox(label="Describe the garment", placeholder="e.g., A fitted A-line dress with cap sleeves", lines=3)
             text_btn = gr.Button("Generate Pattern", variant="primary", size="lg")
-            gr.Examples(examples=[
-                ["A classic dress shirt with long sleeves and button-down collar"],
-                ["A flared midi skirt with high waist"],
-                ["An oversized hoodie with kangaroo pocket"],
-                ["A fitted blazer with notched lapel collar"],
-                ["Slim-fit straight-leg jeans with pockets"],
-                ["A knee-length A-line dress with cap sleeves"],
-            ], inputs=text_input)
+            gr.Examples(examples=[["A classic dress shirt with long sleeves and button-down collar"],["A flared midi skirt with high waist"],["An oversized hoodie with kangaroo pocket"],["A fitted blazer with notched lapel collar"],["Slim-fit straight-leg jeans with pockets"],["A knee-length A-line dress with cap sleeves"]], inputs=text_input)
         with gr.Column(scale=2):
             with gr.Row():
                 with gr.Column(): txt_pattern_2d = gr.Image(label="2D Pattern", height=400)
@@ -425,12 +369,13 @@ with gr.Blocks(title="Garment Pattern Studio") as demo:
     with gr.Tab("🔄 Agentic Refinement"):
         gr.Markdown("""### Iterative Refinement Loop
 Upload a garment image. The AI agent will:
-1. **Analyze** → extract initial pattern parameters
-2. **Generate** → create 2D pattern + 3D garment
+1. **Analyze** → extract initial pattern parameters via VLM
+2. **Generate** → create 2D pattern + 3D garment from pattern pieces
 3. **Project** → render 3D to 2D front view
-4. **Compare** → measure similarity (SSIM + VLM visual comparison)
-5. **Refine** → VLM suggests parameter adjustments
-6. **Repeat** until convergence or max iterations
+4. **Compare** → SSIM + Edge-SSIM similarity metrics
+5. **VLM Assess** → visual comparison, identify differences, suggest adjustments
+6. **Refine** → apply damped parameter updates, keep-best tracking
+7. **Repeat** until convergence or max iterations

 *Requires HF_TOKEN for VLM-powered refinement.*""")
     with gr.Row():
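Step 4's composite blends raw and edge-map structural similarity. One plausible formulation, sketched with scikit-image (the 0.6/0.4 weights and the library choice are assumptions, not taken from the repo):

import numpy as np
from skimage import color, feature, transform
from skimage.metrics import structural_similarity

def composite_score(img_a: np.ndarray, img_b: np.ndarray) -> dict:
    # Normalize both inputs to same-size grayscale floats in [0, 1].
    a = transform.resize(color.rgb2gray(img_a), (256, 256))
    b = transform.resize(color.rgb2gray(img_b), (256, 256))
    ssim = structural_similarity(a, b, data_range=1.0)
    # Edge SSIM compares silhouettes, which dominate perceived garment shape.
    ea = feature.canny(a, sigma=2.0).astype(float)
    eb = feature.canny(b, sigma=2.0).astype(float)
    edge_ssim = structural_similarity(ea, eb, data_range=1.0)
    return {"ssim": ssim, "edge_ssim": edge_ssim,
            "composite": 0.6 * ssim + 0.4 * edge_ssim}  # weights assumed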
@@ -448,17 +393,14 @@ Upload a garment image. The AI agent will:
             with gr.Column(): refine_log = gr.Markdown(label="Refinement Log")
         refine_summary = gr.Markdown()
         with gr.Accordion("Best Parameters JSON", open=False): refine_json = gr.Code(language="json")
-
-        refine_btn.click(run_refinement,
-            inputs=[refine_image, refine_type, refine_iters],
+        refine_btn.click(run_refinement, inputs=[refine_image, refine_type, refine_iters],
             outputs=[refine_2d, refine_3d, refine_proj, refine_log, refine_json, refine_summary])

-    gr.HTML("""<div class="ref-box" style="margin-top: 20px;"><h4>Research References</h4><ul>
-    <li><b>ChatGarment</b> (2024) — VLM + dialogue for garment editing [<a href="https://arxiv.org/abs/2412.17811">Paper</a>]</li>
-    <li><b>NGL-Prompter</b> (2025) — Training-free VLM pattern estimation [<a href="https://arxiv.org/abs/2602.20700">Paper</a>]</li>
-    <li><b>RRVF</b> (2025) — Render-compare visual feedback loops [<a href="https://arxiv.org/abs/2507.20766">Paper</a>]</li>
-    <li><b>SceneAssistant</b> (2026) — Agentic VLM scene refinement [<a href="https://arxiv.org/abs/2603.12238">Paper</a>]</li>
-    </ul></div>""")
+    gr.HTML("""<div class="ref-box" style="margin-top:20px;"><h4>Research References</h4><ul>
+    <li><b>ChatGarment</b> (2024) [<a href="https://arxiv.org/abs/2412.17811">Paper</a>]</li>
+    <li><b>NGL-Prompter</b> (2025) [<a href="https://arxiv.org/abs/2602.20700">Paper</a>]</li>
+    <li><b>RRVF</b> (2025) — Render-compare visual feedback [<a href="https://arxiv.org/abs/2507.20766">Paper</a>]</li>
+    <li><b>SceneAssistant</b> (2026) — Agentic VLM refinement [<a href="https://arxiv.org/abs/2603.12238">Paper</a>]</li></ul></div>""")

 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860, css=CSS, theme=gr.themes.Soft())
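run_refinement is bound as a generator handler: in Gradio, each yield from a click handler pushes an intermediate update to the bound outputs (here it yields once, at the end, but the generator shape leaves room for per-iteration streaming). A minimal illustration of the mechanism, separate from the app above:

import time
import gradio as gr

def count_up(n):
    for i in range(int(n)):
        time.sleep(0.2)
        yield f"iteration {i + 1}"  # each yield updates the Textbox in place

with gr.Blocks() as mini_demo:
    steps = gr.Number(value=3, label="Steps")
    status = gr.Textbox(label="Progress")
    gr.Button("Run").click(count_up, inputs=steps, outputs=status)

if __name__ == "__main__":
    mini_demo.launch()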
 