Spaces:

vikashmakeit
/

garment-to-pattern

Running

App Files Files Community

vikashmakeit committed 14 days ago

Commit

c7bbe0b

verified ·

1 Parent(s): 5bebd96

Upgrade to Qwen3.5-9B, Gemma 4, and Kimi-K2.5 (replace outdated Qwen2.5-VL)

Verified working via Together AI provider:
- Qwen/Qwen3.5-9B (primary VLM)
- google/gemma-4-31B-it (Gemma 4)
- moonshotai/Kimi-K2.5 (fallback)

Handles reasoning+content response format from newer models.

Browse files

Files changed (1) hide show

app.py +137 -63

app.py CHANGED Viewed

@@ -1,9 +1,14 @@
 """
 Garment Image → 2D Sewing Pattern Demo
-Uses a VLM (via HF Inference API) to analyze garment images and extract
 structured parameters, then generates flat 2D sewing pattern pieces.
 Approach inspired by:
 - ChatGarment (arxiv:2412.17811): VLM → JSON → GarmentCode → 2D patterns
 - NGL-Prompter (arxiv:2602.20700): Training-free VLM → semantic params → patterns
@@ -61,8 +66,63 @@ Be precise with the garment type. Estimate realistic measurements for an average
 Only include measurements relevant to the garment type (e.g., skip pant_length for a shirt).
 """
 def analyze_with_vlm(image: Image.Image) -> Dict:
-    """Analyze garment image using HF Inference API (Qwen2.5-VL)."""
     import requests
     import base64
     from io import BytesIO
@@ -71,22 +131,26 @@ def analyze_with_vlm(image: Image.Image) -> Dict:
     if not hf_token:
         return None
     buf = BytesIO()
     image_rgb = image.convert('RGB')
     image_rgb.save(buf, format='JPEG', quality=85)
     img_b64 = base64.b64encode(buf.getvalue()).decode('utf-8')
-    models = [
-        "Qwen/Qwen2.5-VL-72B-Instruct",
-        "Qwen/Qwen2.5-VL-32B-Instruct",
-        "Qwen/Qwen2.5-VL-7B-Instruct",
-        "meta-llama/Llama-4-Scout-17B-16E-Instruct",
-    ]
-    for model_id in models:
         try:
-            url = f"https://router.huggingface.co/hf-inference/models/{model_id}/v1/chat/completions"
-            headers = {"Authorization": f"Bearer {hf_token}", "Content-Type": "application/json"}
             payload = {
                 "model": model_id,
                 "messages": [{
@@ -100,28 +164,38 @@ def analyze_with_vlm(image: Image.Image) -> Dict:
                 "temperature": 0.1,
             }
-            response = requests.post(url, headers=headers, json=payload, timeout=120)
             if response.status_code == 200:
                 result = response.json()
-                content = result['choices'][0]['message']['content']
-                json_match = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', content)
-                if json_match:
-                    json_str = json_match.group(1)
-                else:
-                    json_match = re.search(r'\{[\s\S]*\}', content)
-                    if json_match:
-                        json_str = json_match.group()
-                    else:
-                        continue
                 analysis = json.loads(json_str)
-                analysis['_model_used'] = model_id
                 return analysis
         except Exception as e:
-            print(f"Model {model_id} failed: {e}")
             continue
     return None
@@ -253,7 +327,7 @@ def process_image(image: Optional[Image.Image], garment_type_override: str = "Au
         try:
             analysis = analyze_with_vlm(image)
             if analysis:
-                model_info = f"\n\n*Analysis by: {analysis.get('_model_used', 'VLM')}*"
         except Exception as e:
             print(f"VLM analysis failed: {e}")
             traceback.print_exc()
@@ -283,7 +357,7 @@ def process_image(image: Optional[Image.Image], garment_type_override: str = "Au
 def process_text_description(description: str) -> Tuple:
-    """Generate pattern from text description."""
     import requests
     hf_token = os.environ.get("HF_TOKEN", "")
@@ -291,7 +365,7 @@ def process_text_description(description: str) -> Tuple:
     if not description.strip():
         return None, "Please enter a garment description.", "{}"
-    # Try VLM-based analysis first
     if hf_token:
         TEXT_PROMPT = f"""You are a professional fashion pattern maker. Based on this garment description, extract precise sewing pattern parameters.
@@ -320,36 +394,33 @@ Return ONLY a JSON object (no markdown, no explanation) with this exact structur
 Only include measurements relevant to the garment type. Use realistic values in cm."""
-        try:
-            models = ["Qwen/Qwen2.5-72B-Instruct", "meta-llama/Llama-3.3-70B-Instruct"]
-            for model_id in models:
-                try:
-                    url = f"https://router.huggingface.co/hf-inference/models/{model_id}/v1/chat/completions"
-                    headers = {"Authorization": f"Bearer {hf_token}", "Content-Type": "application/json"}
-                    payload = {
-                        "model": model_id,
-                        "messages": [{"role": "user", "content": TEXT_PROMPT}],
-                        "max_tokens": 1500, "temperature": 0.1
-                    }
-                    response = requests.post(url, headers=headers, json=payload, timeout=60)
-                    if response.status_code == 200:
-                        content = response.json()['choices'][0]['message']['content']
-                        json_match = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', content)
-                        if json_match:
-                            json_str = json_match.group(1)
-                        else:
-                            json_match = re.search(r'\{[\s\S]*\}', content)
-                            json_str = json_match.group() if json_match else None
-                        if json_str:
-                            analysis = json.loads(json_str)
-                            pattern_image, summary = generate_pattern_from_analysis(analysis)
-                            summary += f"\n\n*Analysis by: {model_id}*"
-                            return pattern_image, summary, json.dumps(analysis, indent=2)
-                except Exception as e:
-                    print(f"Text model {model_id} failed: {e}")
-                    continue
-        except Exception as e:
-            print(f"Text analysis failed: {e}")
     # Fallback: keyword matching
     desc_lower = description.lower()
@@ -417,13 +488,16 @@ with gr.Blocks(css=CSS, title="Garment → 2D Sewing Pattern", theme=gr.themes.S
     gr.HTML("""
     <div class="info-box">
-        <b>How it works:</b> A Vision-Language Model analyzes the garment image to identify type, style, and proportions.
-        These parameters feed into a parametric pattern generator that produces anatomically-correct 2D sewing pattern pieces
         with seam allowances, grain lines, and notches.
         <br><br>
         <b>Based on research:</b>
-        <a href="https://arxiv.org/abs/2412.17811" target="_blank">ChatGarment</a> (VLM → JSON → 2D patterns) &
-        <a href="https://arxiv.org/abs/2602.20700" target="_blank">NGL-Prompter</a> (training-free VLM approach)
     </div>
     """)

 """
 Garment Image → 2D Sewing Pattern Demo
+Uses modern VLMs (via HF Inference Providers) to analyze garment images and extract
 structured parameters, then generates flat 2D sewing pattern pieces.
+Models (verified working April 2026):
+- Qwen/Qwen3.5-9B via Together AI (primary)
+- google/gemma-4-31B-it via Together AI (Gemma 4)
+- moonshotai/Kimi-K2.5 via Together AI (fallback)
 Approach inspired by:
 - ChatGarment (arxiv:2412.17811): VLM → JSON → GarmentCode → 2D patterns
 - NGL-Prompter (arxiv:2602.20700): Training-free VLM → semantic params → patterns
 Only include measurements relevant to the garment type (e.g., skip pant_length for a shirt).
 """
# Model configurations. Every entry is a (model_id, provider, display_name)
# tuple; the lists are ordered by priority (first entry is tried first).
# Verified working via HF Inference Providers (April 2026)
VISION_MODELS = [
    ("Qwen/Qwen3.5-9B", "together", "Qwen 3.5 9B"),
    ("google/gemma-4-31B-it", "together", "Gemma 4 31B"),
    ("moonshotai/Kimi-K2.5", "together", "Kimi K2.5"),
]
# Text-only requests currently use the same models in the same order.
# This is an independent list object, so either list can be edited later
# without affecting the other.
TEXT_MODELS = list(VISION_MODELS)
+def _extract_response_text(message: dict) -> str:
+    """
+    Extract text from a model response message.
+    Newer models (Qwen3.5, Gemma4) use 'reasoning' field for chain-of-thought
+    and 'content' for the final answer. We prefer 'content' when non-empty.
+    """
+    content = message.get('content', '') or ''
+    reasoning = message.get('reasoning', '') or ''
+    # Prefer content (final answer) over reasoning
+    if content.strip():
+        return content.strip()
+    if reasoning.strip():
+        return reasoning.strip()
+    return ''
+def _extract_json_from_text(text: str) -> Optional[str]:
+    """Extract JSON object from text that may contain markdown or other wrappers."""
+    # Try markdown code block first
+    json_match = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', text)
+    if json_match:
+        return json_match.group(1)
+    # Try raw JSON object
+    json_match = re.search(r'\{[\s\S]*\}', text)
+    if json_match:
+        return json_match.group()
+    return None
 def analyze_with_vlm(image: Image.Image) -> Dict:
+    """
+    Analyze garment image using modern VLMs via HF Inference Providers.
+    Tries models in priority order:
+    1. Qwen 3.5 9B (fast, good structured output)
+    2. Gemma 4 31B (Google, strong vision)
+    3. Kimi K2.5 (Moonshot AI, good general VLM)
+    """
     import requests
     import base64
     from io import BytesIO
     if not hf_token:
         return None
+    # Resize image if too large (save bandwidth & speed)
+    max_dim = 1024
+    if max(image.size) > max_dim:
+        ratio = max_dim / max(image.size)
+        new_size = (int(image.size[0] * ratio), int(image.size[1] * ratio))
+        image = image.resize(new_size, Image.LANCZOS)
     buf = BytesIO()
     image_rgb = image.convert('RGB')
     image_rgb.save(buf, format='JPEG', quality=85)
     img_b64 = base64.b64encode(buf.getvalue()).decode('utf-8')
+    for model_id, provider, display_name in VISION_MODELS:
         try:
+            url = f"https://router.huggingface.co/{provider}/v1/chat/completions"
+            headers = {
+                "Authorization": f"Bearer {hf_token}",
+                "Content-Type": "application/json"
+            }
             payload = {
                 "model": model_id,
                 "messages": [{
                 "temperature": 0.1,
             }
+            print(f"[VLM] Trying {display_name} ({model_id}) via {provider}...")
+            response = requests.post(url, headers=headers, json=payload, timeout=180)
             if response.status_code == 200:
                 result = response.json()
+                message = result['choices'][0]['message']
+                text = _extract_response_text(message)
+                if not text:
+                    print(f"[VLM] {display_name}: empty response")
+                    continue
+                json_str = _extract_json_from_text(text)
+                if not json_str:
+                    print(f"[VLM] {display_name}: no JSON found in response")
+                    continue
                 analysis = json.loads(json_str)
+                analysis['_model_used'] = f"{display_name} ({model_id})"
+                print(f"[VLM] ✅ {display_name}: detected '{analysis.get('garment_type', '?')}'")
                 return analysis
+            else:
+                print(f"[VLM] {display_name}: HTTP {response.status_code}")
+        except json.JSONDecodeError as e:
+            print(f"[VLM] {display_name}: JSON parse error: {e}")
+            continue
+        except requests.exceptions.Timeout:
+            print(f"[VLM] {display_name}: timeout (180s)")
+            continue
         except Exception as e:
+            print(f"[VLM] {display_name} failed: {e}")
             continue
     return None
         try:
             analysis = analyze_with_vlm(image)
             if analysis:
+                model_info = f"\n\n*🤖 Analysis by: {analysis.get('_model_used', 'VLM')}*"
         except Exception as e:
             print(f"VLM analysis failed: {e}")
             traceback.print_exc()
 def process_text_description(description: str) -> Tuple:
+    """Generate pattern from text description using VLM."""
     import requests
     hf_token = os.environ.get("HF_TOKEN", "")
     if not description.strip():
         return None, "Please enter a garment description.", "{}"
+    # Try VLM-based analysis
     if hf_token:
         TEXT_PROMPT = f"""You are a professional fashion pattern maker. Based on this garment description, extract precise sewing pattern parameters.
 Only include measurements relevant to the garment type. Use realistic values in cm."""
+        for model_id, provider, display_name in TEXT_MODELS:
+            try:
+                url = f"https://router.huggingface.co/{provider}/v1/chat/completions"
+                headers = {"Authorization": f"Bearer {hf_token}", "Content-Type": "application/json"}
+                payload = {
+                    "model": model_id,
+                    "messages": [{"role": "user", "content": TEXT_PROMPT}],
+                    "max_tokens": 2000, "temperature": 0.1
+                }
+                print(f"[Text] Trying {display_name} via {provider}...")
+                response = requests.post(url, headers=headers, json=payload, timeout=90)
+                if response.status_code == 200:
+                    message = response.json()['choices'][0]['message']
+                    text = _extract_response_text(message)
+                    json_str = _extract_json_from_text(text)
+                    if json_str:
+                        analysis = json.loads(json_str)
+                        pattern_image, summary = generate_pattern_from_analysis(analysis)
+                        summary += f"\n\n*🤖 Analysis by: {display_name} ({model_id})*"
+                        return pattern_image, summary, json.dumps(analysis, indent=2)
+            except Exception as e:
+                print(f"[Text] {display_name} failed: {e}")
+                continue
     # Fallback: keyword matching
     desc_lower = description.lower()
     gr.HTML("""
     <div class="info-box">
+        <b>How it works:</b> A Vision-Language Model analyzes the garment to identify type, style, and proportions.
+        These parameters feed into a parametric pattern generator that produces 2D sewing pattern pieces
         with seam allowances, grain lines, and notches.
         <br><br>
+        <b>Powered by:</b> Qwen 3.5 · Gemma 4 · Kimi K2.5 via
+        <a href="https://huggingface.co/docs/inference-providers" target="_blank">HF Inference Providers</a>
+        <br>
         <b>Based on research:</b>
+        <a href="https://arxiv.org/abs/2412.17811" target="_blank">ChatGarment</a> &
+        <a href="https://arxiv.org/abs/2602.20700" target="_blank">NGL-Prompter</a>
     </div>
     """)