vikashmakeit committed on
Commit
c7bbe0b
·
verified ·
1 Parent(s): 5bebd96

Upgrade to Qwen3.5-9B, Gemma 4, and Kimi-K2.5 (replace outdated Qwen2.5-VL)

Verified working via Together AI provider:
- Qwen/Qwen3.5-9B (primary VLM)
- google/gemma-4-31B-it (Gemma 4)
- moonshotai/Kimi-K2.5 (fallback)

Handles reasoning+content response format from newer models.

Browse files
Files changed (1) hide show
  1. app.py +137 -63
app.py CHANGED
@@ -1,9 +1,14 @@
1
  """
2
  Garment Image → 2D Sewing Pattern Demo
3
 
4
- Uses a VLM (via HF Inference API) to analyze garment images and extract
5
  structured parameters, then generates flat 2D sewing pattern pieces.
6
 
 
 
 
 
 
7
  Approach inspired by:
8
  - ChatGarment (arxiv:2412.17811): VLM → JSON → GarmentCode → 2D patterns
9
  - NGL-Prompter (arxiv:2602.20700): Training-free VLM → semantic params → patterns
@@ -61,8 +66,63 @@ Be precise with the garment type. Estimate realistic measurements for an average
61
  Only include measurements relevant to the garment type (e.g., skip pant_length for a shirt).
62
  """
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  def analyze_with_vlm(image: Image.Image) -> Dict:
65
- """Analyze garment image using HF Inference API (Qwen2.5-VL)."""
 
 
 
 
 
 
 
66
  import requests
67
  import base64
68
  from io import BytesIO
@@ -71,22 +131,26 @@ def analyze_with_vlm(image: Image.Image) -> Dict:
71
  if not hf_token:
72
  return None
73
 
 
 
 
 
 
 
 
74
  buf = BytesIO()
75
  image_rgb = image.convert('RGB')
76
  image_rgb.save(buf, format='JPEG', quality=85)
77
  img_b64 = base64.b64encode(buf.getvalue()).decode('utf-8')
78
 
79
- models = [
80
- "Qwen/Qwen2.5-VL-72B-Instruct",
81
- "Qwen/Qwen2.5-VL-32B-Instruct",
82
- "Qwen/Qwen2.5-VL-7B-Instruct",
83
- "meta-llama/Llama-4-Scout-17B-16E-Instruct",
84
- ]
85
-
86
- for model_id in models:
87
  try:
88
- url = f"https://router.huggingface.co/hf-inference/models/{model_id}/v1/chat/completions"
89
- headers = {"Authorization": f"Bearer {hf_token}", "Content-Type": "application/json"}
 
 
 
 
90
  payload = {
91
  "model": model_id,
92
  "messages": [{
@@ -100,28 +164,38 @@ def analyze_with_vlm(image: Image.Image) -> Dict:
100
  "temperature": 0.1,
101
  }
102
 
103
- response = requests.post(url, headers=headers, json=payload, timeout=120)
 
104
 
105
  if response.status_code == 200:
106
  result = response.json()
107
- content = result['choices'][0]['message']['content']
 
 
 
 
 
108
 
109
- json_match = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', content)
110
- if json_match:
111
- json_str = json_match.group(1)
112
- else:
113
- json_match = re.search(r'\{[\s\S]*\}', content)
114
- if json_match:
115
- json_str = json_match.group()
116
- else:
117
- continue
118
 
119
  analysis = json.loads(json_str)
120
- analysis['_model_used'] = model_id
 
121
  return analysis
122
-
 
 
 
 
 
 
 
 
123
  except Exception as e:
124
- print(f"Model {model_id} failed: {e}")
125
  continue
126
 
127
  return None
@@ -253,7 +327,7 @@ def process_image(image: Optional[Image.Image], garment_type_override: str = "Au
253
  try:
254
  analysis = analyze_with_vlm(image)
255
  if analysis:
256
- model_info = f"\n\n*Analysis by: {analysis.get('_model_used', 'VLM')}*"
257
  except Exception as e:
258
  print(f"VLM analysis failed: {e}")
259
  traceback.print_exc()
@@ -283,7 +357,7 @@ def process_image(image: Optional[Image.Image], garment_type_override: str = "Au
283
 
284
 
285
  def process_text_description(description: str) -> Tuple:
286
- """Generate pattern from text description."""
287
  import requests
288
 
289
  hf_token = os.environ.get("HF_TOKEN", "")
@@ -291,7 +365,7 @@ def process_text_description(description: str) -> Tuple:
291
  if not description.strip():
292
  return None, "Please enter a garment description.", "{}"
293
 
294
- # Try VLM-based analysis first
295
  if hf_token:
296
  TEXT_PROMPT = f"""You are a professional fashion pattern maker. Based on this garment description, extract precise sewing pattern parameters.
297
 
@@ -320,36 +394,33 @@ Return ONLY a JSON object (no markdown, no explanation) with this exact structur
320
 
321
  Only include measurements relevant to the garment type. Use realistic values in cm."""
322
 
323
- try:
324
- models = ["Qwen/Qwen2.5-72B-Instruct", "meta-llama/Llama-3.3-70B-Instruct"]
325
- for model_id in models:
326
- try:
327
- url = f"https://router.huggingface.co/hf-inference/models/{model_id}/v1/chat/completions"
328
- headers = {"Authorization": f"Bearer {hf_token}", "Content-Type": "application/json"}
329
- payload = {
330
- "model": model_id,
331
- "messages": [{"role": "user", "content": TEXT_PROMPT}],
332
- "max_tokens": 1500, "temperature": 0.1
333
- }
334
- response = requests.post(url, headers=headers, json=payload, timeout=60)
335
- if response.status_code == 200:
336
- content = response.json()['choices'][0]['message']['content']
337
- json_match = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', content)
338
- if json_match:
339
- json_str = json_match.group(1)
340
- else:
341
- json_match = re.search(r'\{[\s\S]*\}', content)
342
- json_str = json_match.group() if json_match else None
343
- if json_str:
344
- analysis = json.loads(json_str)
345
- pattern_image, summary = generate_pattern_from_analysis(analysis)
346
- summary += f"\n\n*Analysis by: {model_id}*"
347
- return pattern_image, summary, json.dumps(analysis, indent=2)
348
- except Exception as e:
349
- print(f"Text model {model_id} failed: {e}")
350
- continue
351
- except Exception as e:
352
- print(f"Text analysis failed: {e}")
353
 
354
  # Fallback: keyword matching
355
  desc_lower = description.lower()
@@ -417,13 +488,16 @@ with gr.Blocks(css=CSS, title="Garment → 2D Sewing Pattern", theme=gr.themes.S
417
 
418
  gr.HTML("""
419
  <div class="info-box">
420
- <b>How it works:</b> A Vision-Language Model analyzes the garment image to identify type, style, and proportions.
421
- These parameters feed into a parametric pattern generator that produces anatomically-correct 2D sewing pattern pieces
422
  with seam allowances, grain lines, and notches.
423
  <br><br>
 
 
 
424
  <b>Based on research:</b>
425
- <a href="https://arxiv.org/abs/2412.17811" target="_blank">ChatGarment</a> (VLM → JSON → 2D patterns) &
426
- <a href="https://arxiv.org/abs/2602.20700" target="_blank">NGL-Prompter</a> (training-free VLM approach)
427
  </div>
428
  """)
429
 
 
1
  """
2
  Garment Image → 2D Sewing Pattern Demo
3
 
4
+ Uses modern VLMs (via HF Inference Providers) to analyze garment images and extract
5
  structured parameters, then generates flat 2D sewing pattern pieces.
6
 
7
+ Models (verified working April 2026):
8
+ - Qwen/Qwen3.5-9B via Together AI (primary)
9
+ - google/gemma-4-31B-it via Together AI (Gemma 4)
10
+ - moonshotai/Kimi-K2.5 via Together AI (fallback)
11
+
12
  Approach inspired by:
13
  - ChatGarment (arxiv:2412.17811): VLM → JSON → GarmentCode → 2D patterns
14
  - NGL-Prompter (arxiv:2602.20700): Training-free VLM → semantic params → patterns
 
66
  Only include measurements relevant to the garment type (e.g., skip pant_length for a shirt).
67
  """
68
 
69
+ # Model configurations: (model_id, provider, display_name)
70
+ # Verified working via HF Inference Providers (April 2026)
71
+ VISION_MODELS = [
72
+ ("Qwen/Qwen3.5-9B", "together", "Qwen 3.5 9B"),
73
+ ("google/gemma-4-31B-it", "together", "Gemma 4 31B"),
74
+ ("moonshotai/Kimi-K2.5", "together", "Kimi K2.5"),
75
+ ]
76
+
77
+ TEXT_MODELS = [
78
+ ("Qwen/Qwen3.5-9B", "together", "Qwen 3.5 9B"),
79
+ ("google/gemma-4-31B-it", "together", "Gemma 4 31B"),
80
+ ("moonshotai/Kimi-K2.5", "together", "Kimi K2.5"),
81
+ ]
82
+
83
+
84
def _extract_response_text(message: dict) -> str:
    """
    Extract the usable text from a chat-completion response message.

    Newer models (Qwen3.5, Gemma4) use a 'reasoning' field for
    chain-of-thought and 'content' for the final answer. The final answer
    is preferred; reasoning text is only used when 'content' is empty.

    Args:
        message: The ``choices[i]['message']`` object of a chat-completion
            response. Anything that is not a dict is treated as empty.

    Returns:
        The stripped text of the first non-blank field, or '' when no
        field holds text.
    """
    if not isinstance(message, dict):
        return ''

    # Priority order: final answer first, then chain-of-thought.
    # 'reasoning_content' is the name some OpenAI-compatible servers use
    # for the same chain-of-thought field.
    for key in ('content', 'reasoning', 'reasoning_content'):
        value = message.get(key)

        if isinstance(value, list):
            # Content-parts form: [{"type": "text", "text": "..."}, ...].
            # Calling .strip() on the list would raise, so flatten the
            # text parts and ignore anything that is not text.
            texts = [part.get('text') for part in value if isinstance(part, dict)]
            value = ''.join(t for t in texts if isinstance(t, str))

        if isinstance(value, str):
            stripped = value.strip()
            if stripped:
                return stripped

    return ''
100
+
101
+
102
def _first_json_object(text: str) -> Optional[str]:
    """Return the first substring of *text* that decodes as a JSON object, else None."""
    decoder = json.JSONDecoder()
    start = text.find('{')
    while start != -1:
        try:
            value, end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            # This '{' does not open valid JSON (prose, code, ...); try the next one.
            pass
        else:
            if isinstance(value, dict):
                return text[start:end]
        start = text.find('{', start + 1)
    return None


def _extract_json_from_text(text: str) -> Optional[str]:
    """
    Extract a JSON object from text that may contain markdown or other wrappers.

    Search order:
      1. the contents of each markdown code fence, in order of appearance;
      2. the whole text.
    The first substring that actually decodes to a JSON object wins, so a
    non-JSON fence or stray braces in surrounding prose no longer hide or
    corrupt the real payload.

    If nothing decodes, the previous best-effort result is returned
    unchanged (first fence body, else the span from the first '{' to the
    last '}'), so callers still see a parse error for malformed JSON
    rather than a silent miss.

    Args:
        text: Raw model output.

    Returns:
        The JSON object as a string, a best-effort candidate when no valid
        object exists, or None when the text contains no candidate at all.
    """
    if not text:
        return None

    fenced_bodies = re.findall(r'```(?:json)?\s*([\s\S]*?)\s*```', text)

    # Fenced blocks take priority over the surrounding text.
    for source in fenced_bodies + [text]:
        found = _first_json_object(source)
        if found is not None:
            return found

    # Nothing decodable: keep the legacy best-effort candidates.
    if fenced_bodies:
        return fenced_bodies[0]
    greedy = re.search(r'\{[\s\S]*\}', text)
    return greedy.group() if greedy else None
115
+
116
+
117
  def analyze_with_vlm(image: Image.Image) -> Dict:
118
+ """
119
+ Analyze garment image using modern VLMs via HF Inference Providers.
120
+
121
+ Tries models in priority order:
122
+ 1. Qwen 3.5 9B (fast, good structured output)
123
+ 2. Gemma 4 31B (Google, strong vision)
124
+ 3. Kimi K2.5 (Moonshot AI, good general VLM)
125
+ """
126
  import requests
127
  import base64
128
  from io import BytesIO
 
131
  if not hf_token:
132
  return None
133
 
134
+ # Resize image if too large (save bandwidth & speed)
135
+ max_dim = 1024
136
+ if max(image.size) > max_dim:
137
+ ratio = max_dim / max(image.size)
138
+ new_size = (int(image.size[0] * ratio), int(image.size[1] * ratio))
139
+ image = image.resize(new_size, Image.LANCZOS)
140
+
141
  buf = BytesIO()
142
  image_rgb = image.convert('RGB')
143
  image_rgb.save(buf, format='JPEG', quality=85)
144
  img_b64 = base64.b64encode(buf.getvalue()).decode('utf-8')
145
 
146
+ for model_id, provider, display_name in VISION_MODELS:
 
 
 
 
 
 
 
147
  try:
148
+ url = f"https://router.huggingface.co/{provider}/v1/chat/completions"
149
+ headers = {
150
+ "Authorization": f"Bearer {hf_token}",
151
+ "Content-Type": "application/json"
152
+ }
153
+
154
  payload = {
155
  "model": model_id,
156
  "messages": [{
 
164
  "temperature": 0.1,
165
  }
166
 
167
+ print(f"[VLM] Trying {display_name} ({model_id}) via {provider}...")
168
+ response = requests.post(url, headers=headers, json=payload, timeout=180)
169
 
170
  if response.status_code == 200:
171
  result = response.json()
172
+ message = result['choices'][0]['message']
173
+ text = _extract_response_text(message)
174
+
175
+ if not text:
176
+ print(f"[VLM] {display_name}: empty response")
177
+ continue
178
 
179
+ json_str = _extract_json_from_text(text)
180
+ if not json_str:
181
+ print(f"[VLM] {display_name}: no JSON found in response")
182
+ continue
 
 
 
 
 
183
 
184
  analysis = json.loads(json_str)
185
+ analysis['_model_used'] = f"{display_name} ({model_id})"
186
+ print(f"[VLM] ✅ {display_name}: detected '{analysis.get('garment_type', '?')}'")
187
  return analysis
188
+ else:
189
+ print(f"[VLM] {display_name}: HTTP {response.status_code}")
190
+
191
+ except json.JSONDecodeError as e:
192
+ print(f"[VLM] {display_name}: JSON parse error: {e}")
193
+ continue
194
+ except requests.exceptions.Timeout:
195
+ print(f"[VLM] {display_name}: timeout (180s)")
196
+ continue
197
  except Exception as e:
198
+ print(f"[VLM] {display_name} failed: {e}")
199
  continue
200
 
201
  return None
 
327
  try:
328
  analysis = analyze_with_vlm(image)
329
  if analysis:
330
+ model_info = f"\n\n*🤖 Analysis by: {analysis.get('_model_used', 'VLM')}*"
331
  except Exception as e:
332
  print(f"VLM analysis failed: {e}")
333
  traceback.print_exc()
 
357
 
358
 
359
  def process_text_description(description: str) -> Tuple:
360
+ """Generate pattern from text description using VLM."""
361
  import requests
362
 
363
  hf_token = os.environ.get("HF_TOKEN", "")
 
365
  if not description.strip():
366
  return None, "Please enter a garment description.", "{}"
367
 
368
+ # Try VLM-based analysis
369
  if hf_token:
370
  TEXT_PROMPT = f"""You are a professional fashion pattern maker. Based on this garment description, extract precise sewing pattern parameters.
371
 
 
394
 
395
  Only include measurements relevant to the garment type. Use realistic values in cm."""
396
 
397
+ for model_id, provider, display_name in TEXT_MODELS:
398
+ try:
399
+ url = f"https://router.huggingface.co/{provider}/v1/chat/completions"
400
+ headers = {"Authorization": f"Bearer {hf_token}", "Content-Type": "application/json"}
401
+ payload = {
402
+ "model": model_id,
403
+ "messages": [{"role": "user", "content": TEXT_PROMPT}],
404
+ "max_tokens": 2000, "temperature": 0.1
405
+ }
406
+
407
+ print(f"[Text] Trying {display_name} via {provider}...")
408
+ response = requests.post(url, headers=headers, json=payload, timeout=90)
409
+
410
+ if response.status_code == 200:
411
+ message = response.json()['choices'][0]['message']
412
+ text = _extract_response_text(message)
413
+ json_str = _extract_json_from_text(text)
414
+
415
+ if json_str:
416
+ analysis = json.loads(json_str)
417
+ pattern_image, summary = generate_pattern_from_analysis(analysis)
418
+ summary += f"\n\n*🤖 Analysis by: {display_name} ({model_id})*"
419
+ return pattern_image, summary, json.dumps(analysis, indent=2)
420
+
421
+ except Exception as e:
422
+ print(f"[Text] {display_name} failed: {e}")
423
+ continue
 
 
 
424
 
425
  # Fallback: keyword matching
426
  desc_lower = description.lower()
 
488
 
489
  gr.HTML("""
490
  <div class="info-box">
491
+ <b>How it works:</b> A Vision-Language Model analyzes the garment to identify type, style, and proportions.
492
+ These parameters feed into a parametric pattern generator that produces 2D sewing pattern pieces
493
  with seam allowances, grain lines, and notches.
494
  <br><br>
495
+ <b>Powered by:</b> Qwen 3.5 · Gemma 4 · Kimi K2.5 via
496
+ <a href="https://huggingface.co/docs/inference-providers" target="_blank">HF Inference Providers</a>
497
+ <br>
498
  <b>Based on research:</b>
499
+ <a href="https://arxiv.org/abs/2412.17811" target="_blank">ChatGarment</a> &
500
+ <a href="https://arxiv.org/abs/2602.20700" target="_blank">NGL-Prompter</a>
501
  </div>
502
  """)
503