vikashmakeit committed on
Commit
e9ba07a
·
verified ·
1 Parent(s): 6df9de5

Fix VLM: use Llama-4-Scout via nscale (confirmed working with images), Kimi-K2.6 and Qwen3.5-9B via together as fallbacks

Browse files
Files changed (1) hide show
  1. app.py +13 -22
app.py CHANGED
@@ -65,17 +65,14 @@ Apply the edit and return ONLY the complete updated JSON (no markdown, no explan
65
  }}
66
  }}"""
67
 
68
- # Actual Vision-Language Models with correct providers
 
 
 
69
  VISION_MODELS = [
70
- ("Qwen/Qwen2.5-VL-72B-Instruct", "together", "Qwen2.5-VL 72B"),
71
- ("google/gemma-4-31B-it", "novita", "Gemma 4 31B"),
72
- ("moonshotai/Kimi-K2.5", "fireworks-ai", "Kimi K2.5"),
73
- ]
74
- # Text-only models (for chat edit without images)
75
- TEXT_MODELS = [
76
- ("Qwen/Qwen2.5-VL-72B-Instruct", "together", "Qwen2.5-VL 72B"),
77
- ("google/gemma-4-31B-it", "novita", "Gemma 4 31B"),
78
- ("moonshotai/Kimi-K2.5", "fireworks-ai", "Kimi K2.5"),
79
  ]
80
 
81
  def _extract_response_text(message):
@@ -293,7 +290,7 @@ CSS = """
293
  with gr.Blocks(title="Garment Pattern Studio") as demo:
294
  gr.HTML("""<div class="main-header"><h1>🧵 Garment Pattern Studio</h1>
295
  <p style="font-size:1.1em;color:#555;">Analyze garments, edit with chat, preview in 3D, refine with AI agent</p></div>
296
- <div class="info-box"><b>Powered by:</b> Qwen2.5-VL 72B · Gemma 4 31B · Kimi K2.5 via
297
  <a href="https://huggingface.co/docs/inference-providers">HF Inference Providers</a>
298
  &nbsp;|&nbsp; <b>3D view built from actual 2D pattern pieces</b></div>""")
299
 
@@ -367,17 +364,11 @@ with gr.Blocks(title="Garment Pattern Studio") as demo:
367
  chat_clear.click(clear_chat, outputs=[chatbot, chat_pattern_2d, chat_3d, chat_summary, chat_json])
368
 
369
  with gr.Tab("🔄 Agentic Refinement"):
370
- gr.Markdown("""### Iterative Refinement Loop
371
- Upload a garment image. The AI agent will:
372
- 1. **Analyze** → extract initial pattern parameters via VLM
373
- 2. **Generate** → create 2D pattern + 3D garment from pattern pieces
374
- 3. **Project** → render 3D to 2D front view
375
- 4. **Compare** → SSIM + Edge-SSIM similarity metrics
376
- 5. **VLM Assess** → visual comparison, identify differences, suggest adjustments
377
- 6. **Refine** → apply damped parameter updates, keep-best tracking
378
- 7. **Repeat** until convergence or max iterations
379
-
380
- *Requires HF_TOKEN for VLM-powered refinement.*""")
381
  with gr.Row():
382
  with gr.Column(scale=1):
383
  refine_image = gr.Image(type="pil", label="Upload Garment Image", height=300)
 
65
  }}
66
  }}"""
67
 
68
+ # Verified working VLMs (tested 2026-04-25)
69
+ # Llama-4-Scout: confirmed image support, answers in content field
70
+ # Kimi-K2.6: image support, answers in reasoning field
71
+ # Qwen3.5-9B: image support unclear, answers in reasoning field
72
  VISION_MODELS = [
73
+ ("meta-llama/Llama-4-Scout-17B-16E-Instruct", "nscale", "Llama-4-Scout 17B"),
74
+ ("moonshotai/Kimi-K2.6", "together", "Kimi K2.6"),
75
+ ("Qwen/Qwen3.5-9B", "together", "Qwen 3.5 9B"),
 
 
 
 
 
 
76
  ]
77
 
78
  def _extract_response_text(message):
 
290
  with gr.Blocks(title="Garment Pattern Studio") as demo:
291
  gr.HTML("""<div class="main-header"><h1>🧵 Garment Pattern Studio</h1>
292
  <p style="font-size:1.1em;color:#555;">Analyze garments, edit with chat, preview in 3D, refine with AI agent</p></div>
293
+ <div class="info-box"><b>Powered by:</b> Llama-4-Scout · Kimi K2.6 · Qwen 3.5 via
294
  <a href="https://huggingface.co/docs/inference-providers">HF Inference Providers</a>
295
  &nbsp;|&nbsp; <b>3D view built from actual 2D pattern pieces</b></div>""")
296
 
 
364
  chat_clear.click(clear_chat, outputs=[chatbot, chat_pattern_2d, chat_3d, chat_summary, chat_json])
365
 
366
  with gr.Tab("🔄 Agentic Refinement"):
367
+ gr.Markdown("""### ⚠️ Work In Progress — Iterative Refinement Loop
368
+ Upload a garment image. The AI agent will iteratively refine pattern parameters.
369
+ See [README](https://huggingface.co/spaces/vikashmakeit/garment-to-pattern) for full design docs.
370
+
371
+ **Status:** Core components (projection, similarity, convergence loop) work. VLM feedback integration needs further testing.""")
 
 
 
 
 
 
372
  with gr.Row():
373
  with gr.Column(scale=1):
374
  refine_image = gr.Image(type="pil", label="Upload Garment Image", height=300)