Spaces:
Running
Running
Fix VLM models: use actual vision models with correct providers (Qwen2.5-VL, Gemma-4 via novita, Kimi via fireworks-ai)
Browse files
app.py
CHANGED
|
@@ -65,10 +65,17 @@ Apply the edit and return ONLY the complete updated JSON (no markdown, no explan
|
|
| 65 |
}}
|
| 66 |
}}"""
|
| 67 |
|
|
|
|
| 68 |
VISION_MODELS = [
|
| 69 |
-
("Qwen/
|
| 70 |
-
("google/gemma-4-31B-it", "
|
| 71 |
-
("moonshotai/Kimi-K2.5", "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
]
|
| 73 |
|
| 74 |
def _extract_response_text(message):
|
|
@@ -104,7 +111,7 @@ def _call_vlm(messages, timeout=180):
|
|
| 104 |
analysis['_model_used'] = display_name
|
| 105 |
print(f"[VLM] OK: {display_name} detected {analysis.get('garment_type','?')}")
|
| 106 |
return analysis
|
| 107 |
-
else: print(f"[VLM] {display_name}: HTTP {response.status_code}")
|
| 108 |
except Exception as e:
|
| 109 |
print(f"[VLM] {display_name} failed: {e}"); continue
|
| 110 |
return None
|
|
@@ -174,8 +181,7 @@ def process_image(image, garment_type_override="Auto-detect"):
|
|
| 174 |
p2d, p3d, summary, j = _generate_all_outputs(analysis)
|
| 175 |
return p2d, p3d, summary, j, []
|
| 176 |
except Exception as e:
|
| 177 |
-
traceback.print_exc()
|
| 178 |
-
return None, None, f"Error: {e}", "{}", []
|
| 179 |
|
| 180 |
def process_text(description):
|
| 181 |
if not description.strip(): return None, None, "Enter a description.", "{}", []
|
|
@@ -187,8 +193,7 @@ def process_text(description):
|
|
| 187 |
if analysis is None:
|
| 188 |
desc_lower = description.lower()
|
| 189 |
for gt in ['hoodie','jacket','coat','blazer','dress','skirt','pants','trousers','jeans','vest','shirt','blouse','top']:
|
| 190 |
-
if gt in desc_lower:
|
| 191 |
-
analysis = get_default_analysis(gt); analysis['description'] = description; break
|
| 192 |
if analysis is None: analysis = get_default_analysis("shirt"); analysis['description'] = description
|
| 193 |
_current_analysis["data"] = copy.deepcopy(analysis)
|
| 194 |
try:
|
|
@@ -215,8 +220,7 @@ def chat_edit(message, history):
|
|
| 215 |
try: updated = _call_vlm([{"role": "user", "content": edit_prompt}], timeout=90)
|
| 216 |
except: pass
|
| 217 |
if updated is None:
|
| 218 |
-
updated = copy.deepcopy(current)
|
| 219 |
-
msg_lower = message.lower()
|
| 220 |
if "long sleeve" in msg_lower: updated['measurements']['sleeve_length'] = 65
|
| 221 |
elif "short sleeve" in msg_lower: updated['measurements']['sleeve_length'] = 25
|
| 222 |
if "no collar" in msg_lower: updated['features']['has_collar'] = False; updated['features']['collar_type'] = 'none'
|
|
@@ -243,92 +247,43 @@ def chat_edit(message, history):
|
|
| 243 |
history = history or []; history.append((message, ai_msg))
|
| 244 |
return history, p2d, p3d, summary, j
|
| 245 |
|
| 246 |
-
# ── Agentic Refinement ──────────────────────────────────────────────────────
|
| 247 |
def run_refinement(image, garment_type_override, max_iters):
|
| 248 |
-
"""Run the agentic refinement loop."""
|
| 249 |
if image is None:
|
| 250 |
-
yield None, None, None, "Please upload a garment image.", "{}", None
|
| 251 |
-
return
|
| 252 |
-
|
| 253 |
-
# Step 1: Initial VLM analysis
|
| 254 |
analysis = None
|
| 255 |
-
try:
|
| 256 |
-
|
| 257 |
-
except Exception as e:
|
| 258 |
-
print(f"VLM failed: {e}")
|
| 259 |
-
|
| 260 |
if analysis is None:
|
| 261 |
gt = garment_type_override.lower() if garment_type_override != "Auto-detect" else "shirt"
|
| 262 |
analysis = get_default_analysis(gt)
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
#
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
# Run refinement loop
|
| 272 |
-
max_iters = int(max_iters)
|
| 273 |
-
result = refinement_loop(
|
| 274 |
-
original_image=image,
|
| 275 |
-
initial_analysis=analysis,
|
| 276 |
-
generate_fn=gen_fn,
|
| 277 |
-
max_iterations=max_iters,
|
| 278 |
-
target_composite=0.82,
|
| 279 |
-
plateau_patience=3,
|
| 280 |
-
lr=0.7,
|
| 281 |
-
)
|
| 282 |
-
|
| 283 |
-
# Build log markdown
|
| 284 |
-
log_lines = [f"## Refinement Results\n"]
|
| 285 |
-
log_lines.append(f"**Converged:** {'✅ Yes' if result['converged'] else '❌ No'}")
|
| 286 |
-
log_lines.append(f"**Iterations:** {result['total_iterations']}")
|
| 287 |
-
log_lines.append(f"**Best Score:** {result['best_score']:.4f}")
|
| 288 |
-
if result['scores']:
|
| 289 |
-
log_lines.append(f"**Score progression:** {' → '.join(f'{s:.3f}' for s in result['scores'])}")
|
| 290 |
log_lines.append("")
|
| 291 |
-
|
| 292 |
for step in result['history']:
|
| 293 |
-
it = step['iteration']
|
| 294 |
-
status = step.get('status', '?')
|
| 295 |
-
metrics = step.get('metrics', {})
|
| 296 |
log_lines.append(f"### Iteration {it} — {status}")
|
| 297 |
-
if metrics:
|
| 298 |
-
|
| 299 |
-
if step.get('new_best'):
|
| 300 |
-
log_lines.append("⭐ **New best!**")
|
| 301 |
diffs = step.get('vlm_differences', [])
|
| 302 |
-
if diffs:
|
| 303 |
-
log_lines.append("**Differences:** " + "; ".join(diffs[:3]))
|
| 304 |
adj = step.get('adjustments', {})
|
| 305 |
-
if adj:
|
| 306 |
-
log_lines.append("**Adjustments:** " + ", ".join(f"{k}={v}" for k, v in adj.items()))
|
| 307 |
reason = step.get('reason', '')
|
| 308 |
-
if reason:
|
| 309 |
-
log_lines.append(f"*{reason}*")
|
| 310 |
log_lines.append("")
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
# Get best outputs
|
| 315 |
-
best = result['best_analysis']
|
| 316 |
-
_current_analysis["data"] = copy.deepcopy(best)
|
| 317 |
-
try:
|
| 318 |
-
p2d, p3d, summary, j = _generate_all_outputs(best)
|
| 319 |
-
except:
|
| 320 |
-
p2d, p3d, summary, j = None, None, "Error generating final outputs", "{}"
|
| 321 |
-
|
| 322 |
-
# Get last projection
|
| 323 |
last_proj = None
|
| 324 |
for step in reversed(result['history']):
|
| 325 |
-
if 'projection' in step:
|
| 326 |
-
|
| 327 |
-
break
|
| 328 |
|
| 329 |
-
yield p2d, p3d, last_proj, log_md, j, summary
|
| 330 |
-
|
| 331 |
-
# ── UI ──────────────────────────────────────────────────────────────────────
|
| 332 |
CSS = """
|
| 333 |
.main-header { text-align: center; margin-bottom: 20px; }
|
| 334 |
.info-box { padding: 15px; border-radius: 10px; background: #f0f7ff; border: 1px solid #cce0ff; margin: 10px 0; }
|
|
@@ -336,15 +291,11 @@ CSS = """
|
|
| 336 |
"""
|
| 337 |
|
| 338 |
with gr.Blocks(title="Garment Pattern Studio") as demo:
|
| 339 |
-
gr.HTML("""<div class="main-header">
|
| 340 |
-
<
|
| 341 |
-
|
| 342 |
-
</div>
|
| 343 |
-
<div class="info-box">
|
| 344 |
-
<b>Powered by:</b> Qwen 3.5 · Gemma 4 · Kimi K2.5 via
|
| 345 |
<a href="https://huggingface.co/docs/inference-providers">HF Inference Providers</a>
|
| 346 |
-
| <b>3D view built from actual 2D pattern pieces</b>
|
| 347 |
-
</div>""")
|
| 348 |
|
| 349 |
with gr.Tab("📸 From Image"):
|
| 350 |
with gr.Row():
|
|
@@ -365,14 +316,7 @@ with gr.Blocks(title="Garment Pattern Studio") as demo:
|
|
| 365 |
with gr.Column(scale=1):
|
| 366 |
text_input = gr.Textbox(label="Describe the garment", placeholder="e.g., A fitted A-line dress with cap sleeves", lines=3)
|
| 367 |
text_btn = gr.Button("Generate Pattern", variant="primary", size="lg")
|
| 368 |
-
gr.Examples(examples=[
|
| 369 |
-
["A classic dress shirt with long sleeves and button-down collar"],
|
| 370 |
-
["A flared midi skirt with high waist"],
|
| 371 |
-
["An oversized hoodie with kangaroo pocket"],
|
| 372 |
-
["A fitted blazer with notched lapel collar"],
|
| 373 |
-
["Slim-fit straight-leg jeans with pockets"],
|
| 374 |
-
["A knee-length A-line dress with cap sleeves"],
|
| 375 |
-
], inputs=text_input)
|
| 376 |
with gr.Column(scale=2):
|
| 377 |
with gr.Row():
|
| 378 |
with gr.Column(): txt_pattern_2d = gr.Image(label="2D Pattern", height=400)
|
|
@@ -425,12 +369,13 @@ with gr.Blocks(title="Garment Pattern Studio") as demo:
|
|
| 425 |
with gr.Tab("🔄 Agentic Refinement"):
|
| 426 |
gr.Markdown("""### Iterative Refinement Loop
|
| 427 |
Upload a garment image. The AI agent will:
|
| 428 |
-
1. **Analyze** → extract initial pattern parameters
|
| 429 |
-
2. **Generate** → create 2D pattern + 3D garment
|
| 430 |
3. **Project** → render 3D to 2D front view
|
| 431 |
-
4. **Compare** →
|
| 432 |
-
5. **
|
| 433 |
-
6. **
|
|
|
|
| 434 |
|
| 435 |
*Requires HF_TOKEN for VLM-powered refinement.*""")
|
| 436 |
with gr.Row():
|
|
@@ -448,17 +393,14 @@ Upload a garment image. The AI agent will:
|
|
| 448 |
with gr.Column(): refine_log = gr.Markdown(label="Refinement Log")
|
| 449 |
refine_summary = gr.Markdown()
|
| 450 |
with gr.Accordion("Best Parameters JSON", open=False): refine_json = gr.Code(language="json")
|
| 451 |
-
|
| 452 |
-
refine_btn.click(run_refinement,
|
| 453 |
-
inputs=[refine_image, refine_type, refine_iters],
|
| 454 |
outputs=[refine_2d, refine_3d, refine_proj, refine_log, refine_json, refine_summary])
|
| 455 |
|
| 456 |
-
gr.HTML("""<div class="ref-box" style="margin-top:
|
| 457 |
-
<li><b>ChatGarment</b> (2024)
|
| 458 |
-
<li><b>NGL-Prompter</b> (2025)
|
| 459 |
-
<li><b>RRVF</b> (2025) — Render-compare visual feedback
|
| 460 |
-
<li><b>SceneAssistant</b> (2026) — Agentic VLM
|
| 461 |
-
</ul></div>""")
|
| 462 |
|
| 463 |
if __name__ == "__main__":
|
| 464 |
demo.launch(server_name="0.0.0.0", server_port=7860, css=CSS, theme=gr.themes.Soft())
|
|
|
|
| 65 |
}}
|
| 66 |
}}"""
|
| 67 |
|
| 68 |
+
# Vision-language models available through HF Inference Providers.
# Each entry is (model_id, provider, display_name).
# NOTE(review): presumably tried in list order by _call_vlm -- confirm.
VISION_MODELS = [
    ("Qwen/Qwen2.5-VL-72B-Instruct", "together", "Qwen2.5-VL 72B"),
    ("google/gemma-4-31B-it", "novita", "Gemma 4 31B"),
    ("moonshotai/Kimi-K2.5", "fireworks-ai", "Kimi K2.5"),
]

# Models used for text-only requests (chat edits without images).
# These are currently the same vision-capable models, so the list is derived
# from VISION_MODELS instead of being duplicated by hand; it is a separate
# list object so it can diverge later without touching VISION_MODELS.
TEXT_MODELS = list(VISION_MODELS)
|
| 80 |
|
| 81 |
def _extract_response_text(message):
|
|
|
|
| 111 |
analysis['_model_used'] = display_name
|
| 112 |
print(f"[VLM] OK: {display_name} detected {analysis.get('garment_type','?')}")
|
| 113 |
return analysis
|
| 114 |
+
else: print(f"[VLM] {display_name}: HTTP {response.status_code} - {response.text[:200]}")
|
| 115 |
except Exception as e:
|
| 116 |
print(f"[VLM] {display_name} failed: {e}"); continue
|
| 117 |
return None
|
|
|
|
| 181 |
p2d, p3d, summary, j = _generate_all_outputs(analysis)
|
| 182 |
return p2d, p3d, summary, j, []
|
| 183 |
except Exception as e:
|
| 184 |
+
traceback.print_exc(); return None, None, f"Error: {e}", "{}", []
|
|
|
|
| 185 |
|
| 186 |
def process_text(description):
|
| 187 |
if not description.strip(): return None, None, "Enter a description.", "{}", []
|
|
|
|
| 193 |
if analysis is None:
|
| 194 |
desc_lower = description.lower()
|
| 195 |
for gt in ['hoodie','jacket','coat','blazer','dress','skirt','pants','trousers','jeans','vest','shirt','blouse','top']:
|
| 196 |
+
if gt in desc_lower: analysis = get_default_analysis(gt); analysis['description'] = description; break
|
|
|
|
| 197 |
if analysis is None: analysis = get_default_analysis("shirt"); analysis['description'] = description
|
| 198 |
_current_analysis["data"] = copy.deepcopy(analysis)
|
| 199 |
try:
|
|
|
|
| 220 |
try: updated = _call_vlm([{"role": "user", "content": edit_prompt}], timeout=90)
|
| 221 |
except: pass
|
| 222 |
if updated is None:
|
| 223 |
+
updated = copy.deepcopy(current); msg_lower = message.lower()
|
|
|
|
| 224 |
if "long sleeve" in msg_lower: updated['measurements']['sleeve_length'] = 65
|
| 225 |
elif "short sleeve" in msg_lower: updated['measurements']['sleeve_length'] = 25
|
| 226 |
if "no collar" in msg_lower: updated['features']['has_collar'] = False; updated['features']['collar_type'] = 'none'
|
|
|
|
| 247 |
history = history or []; history.append((message, ai_msg))
|
| 248 |
return history, p2d, p3d, summary, j
|
| 249 |
|
|
|
|
| 250 |
def run_refinement(image, garment_type_override, max_iters):
    """Run the agentic refinement loop and yield the final UI outputs.

    This is a generator that yields exactly one 6-tuple:
    ``(pattern_2d, pattern_3d, last_projection, log_markdown, params_json,
    summary)``.

    Args:
        image: Uploaded garment image. ``None`` yields a prompt asking the
            user to upload one, and nothing else is run.
        garment_type_override: Garment type selected in the UI, or
            ``"Auto-detect"`` to keep the type produced by the analysis.
        max_iters: Maximum number of refinement iterations; coerced with
            ``int`` before being passed to ``refinement_loop``.
    """
    if image is None:
        yield None, None, None, "Please upload a garment image.", "{}", None
        return

    # The initial VLM analysis is best-effort: on any failure we fall back
    # to the default analysis for the requested (or default) garment type.
    analysis = None
    try:
        analysis = analyze_with_vlm(image)
    except Exception as e:
        print(f"VLM failed: {e}")

    override = None
    if garment_type_override != "Auto-detect":
        override = garment_type_override.lower()
    if analysis is None:
        analysis = get_default_analysis("shirt" if override is None else override)
    if override is not None:
        # An explicit user choice always wins over the detected type.
        analysis['garment_type'] = override

    def gen_fn(a):
        return _generate_all_outputs(a)

    result = refinement_loop(
        original_image=image,
        initial_analysis=analysis,
        generate_fn=gen_fn,
        max_iterations=int(max_iters),
        target_composite=0.82,
        plateau_patience=3,
        lr=0.7,
    )

    log_md = _format_refinement_log(result)

    best = result['best_analysis']
    # Store a copy so later edits do not mutate the refinement result.
    _current_analysis["data"] = copy.deepcopy(best)
    try:
        p2d, p3d, summary, j = _generate_all_outputs(best)
    except Exception as e:
        # Surface the failure instead of swallowing it silently, matching
        # the "Error: ..." summary that process_image reports.
        print(f"Final output generation failed: {e}")
        p2d, p3d, summary, j = None, None, f"Error: {e}", "{}"

    # Most recent step that produced a projection, if any.
    last_proj = next(
        (step['projection'] for step in reversed(result['history']) if 'projection' in step),
        None,
    )
    yield p2d, p3d, last_proj, log_md, j, summary


def _format_refinement_log(result):
    """Render a refinement result dict as the Markdown log shown in the UI."""
    converged = '✅ Yes' if result['converged'] else '❌ No'
    log_lines = [
        "## Refinement Results\n",
        f"**Converged:** {converged}",
        f"**Iterations:** {result['total_iterations']}",
        f"**Best Score:** {result['best_score']:.4f}",
    ]
    if result['scores']:
        progression = ' → '.join(f'{s:.3f}' for s in result['scores'])
        log_lines.append(f"**Scores:** {progression}")
    log_lines.append("")

    for step in result['history']:
        it = step['iteration']
        status = step.get('status', '?')
        metrics = step.get('metrics', {})
        log_lines.append(f"### Iteration {it} — {status}")
        if metrics:
            log_lines.append(
                f"SSIM={metrics.get('ssim',0):.3f} | "
                f"Edge={metrics.get('edge_ssim',0):.3f} | "
                f"Composite={metrics.get('composite',0):.3f}"
            )
        if step.get('new_best'):
            log_lines.append("⭐ **New best!**")
        diffs = step.get('vlm_differences', [])
        if diffs:
            # Only the first three differences are shown.
            log_lines.append("**Differences:** " + "; ".join(diffs[:3]))
        adj = step.get('adjustments', {})
        if adj:
            log_lines.append("**Adjustments:** " + ", ".join(f"{k}={v}" for k, v in adj.items()))
        reason = step.get('reason', '')
        if reason:
            log_lines.append(f"*{reason}*")
        log_lines.append("")

    return "\n".join(log_lines)
|
|
|
|
| 286 |
|
|
|
|
|
|
|
|
|
|
| 287 |
CSS = """
|
| 288 |
.main-header { text-align: center; margin-bottom: 20px; }
|
| 289 |
.info-box { padding: 15px; border-radius: 10px; background: #f0f7ff; border: 1px solid #cce0ff; margin: 10px 0; }
|
|
|
|
| 291 |
"""
|
| 292 |
|
| 293 |
with gr.Blocks(title="Garment Pattern Studio") as demo:
|
| 294 |
+
gr.HTML("""<div class="main-header"><h1>🧵 Garment Pattern Studio</h1>
|
| 295 |
+
<p style="font-size:1.1em;color:#555;">Analyze garments, edit with chat, preview in 3D, refine with AI agent</p></div>
|
| 296 |
+
<div class="info-box"><b>Powered by:</b> Qwen2.5-VL 72B · Gemma 4 31B · Kimi K2.5 via
|
|
|
|
|
|
|
|
|
|
| 297 |
<a href="https://huggingface.co/docs/inference-providers">HF Inference Providers</a>
|
| 298 |
+
| <b>3D view built from actual 2D pattern pieces</b></div>""")
|
|
|
|
| 299 |
|
| 300 |
with gr.Tab("📸 From Image"):
|
| 301 |
with gr.Row():
|
|
|
|
| 316 |
with gr.Column(scale=1):
|
| 317 |
text_input = gr.Textbox(label="Describe the garment", placeholder="e.g., A fitted A-line dress with cap sleeves", lines=3)
|
| 318 |
text_btn = gr.Button("Generate Pattern", variant="primary", size="lg")
|
| 319 |
+
gr.Examples(examples=[["A classic dress shirt with long sleeves and button-down collar"],["A flared midi skirt with high waist"],["An oversized hoodie with kangaroo pocket"],["A fitted blazer with notched lapel collar"],["Slim-fit straight-leg jeans with pockets"],["A knee-length A-line dress with cap sleeves"]], inputs=text_input)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 320 |
with gr.Column(scale=2):
|
| 321 |
with gr.Row():
|
| 322 |
with gr.Column(): txt_pattern_2d = gr.Image(label="2D Pattern", height=400)
|
|
|
|
| 369 |
with gr.Tab("🔄 Agentic Refinement"):
|
| 370 |
gr.Markdown("""### Iterative Refinement Loop
|
| 371 |
Upload a garment image. The AI agent will:
|
| 372 |
+
1. **Analyze** → extract initial pattern parameters via VLM
|
| 373 |
+
2. **Generate** → create 2D pattern + 3D garment from pattern pieces
|
| 374 |
3. **Project** → render 3D to 2D front view
|
| 375 |
+
4. **Compare** → SSIM + Edge-SSIM similarity metrics
|
| 376 |
+
5. **VLM Assess** → visual comparison, identify differences, suggest adjustments
|
| 377 |
+
6. **Refine** → apply damped parameter updates, keep-best tracking
|
| 378 |
+
7. **Repeat** until convergence or max iterations
|
| 379 |
|
| 380 |
*Requires HF_TOKEN for VLM-powered refinement.*""")
|
| 381 |
with gr.Row():
|
|
|
|
| 393 |
with gr.Column(): refine_log = gr.Markdown(label="Refinement Log")
|
| 394 |
refine_summary = gr.Markdown()
|
| 395 |
with gr.Accordion("Best Parameters JSON", open=False): refine_json = gr.Code(language="json")
|
| 396 |
+
refine_btn.click(run_refinement, inputs=[refine_image, refine_type, refine_iters],
|
|
|
|
|
|
|
| 397 |
outputs=[refine_2d, refine_3d, refine_proj, refine_log, refine_json, refine_summary])
|
| 398 |
|
| 399 |
+
gr.HTML("""<div class="ref-box" style="margin-top:20px;"><h4>Research References</h4><ul>
|
| 400 |
+
<li><b>ChatGarment</b> (2024) [<a href="https://arxiv.org/abs/2412.17811">Paper</a>]</li>
|
| 401 |
+
<li><b>NGL-Prompter</b> (2025) [<a href="https://arxiv.org/abs/2602.20700">Paper</a>]</li>
|
| 402 |
+
<li><b>RRVF</b> (2025) — Render-compare visual feedback [<a href="https://arxiv.org/abs/2507.20766">Paper</a>]</li>
|
| 403 |
+
<li><b>SceneAssistant</b> (2026) — Agentic VLM refinement [<a href="https://arxiv.org/abs/2603.12238">Paper</a>]</li></ul></div>""")
|
|
|
|
| 404 |
|
| 405 |
if __name__ == "__main__":
|
| 406 |
demo.launch(server_name="0.0.0.0", server_port=7860, css=CSS, theme=gr.themes.Soft())
|