Elliot89 committed (verified) · Commit 07c2bbf · 1 Parent(s): 6d327cd

Upload 3 files

Files changed (3):
  1. README.md +46 -6
  2. app.py +224 -0
  3. requirements.txt +38 -0
README.md CHANGED
@@ -1,13 +1,53 @@
  ---
  title: Universal Cross-Domain Vision Model
- emoji: 🏆
- colorFrom: purple
- colorTo: indigo
  sdk: gradio
- sdk_version: 6.14.0
- python_version: '3.13'
  app_file: app.py
  pinned: false
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
  ---
  title: Universal Cross-Domain Vision Model
+ emoji: 🏥🎾
+ colorFrom: blue
+ colorTo: green
  sdk: gradio
+ sdk_version: "4.0.0"
  app_file: app.py
  pinned: false
+ license: mit
  ---

+ # Universal Cross-Domain Vision Model
+
+ A BiomedCLIP-powered vision model that classifies images across **medical** and **sports** domains using multi-modal attention fusion.
+
+ ## How to deploy to Hugging Face Spaces
+
+ 1. Create a new Space at https://huggingface.co/new-space
+    - SDK: **Gradio**
+    - Visibility: Public or Private
+
+ 2. Upload these files to the Space repository:
+    ```
+    app.py
+    requirements.txt
+    README_HF_SPACES.md  ← rename this to README.md in the Space
+    ```
+
+ 3. Upload your checkpoint:
+    ```
+    universal_vision_checkpoints/best_model_phase1.pt
+    ```
+    > For large files (>1 GB) use Git LFS:
+    > ```bash
+    > git lfs install
+    > git lfs track "*.pt"
+    > git add .gitattributes
+    > ```
+
+ 4. Set the environment variable in Space Settings → Variables:
+    ```
+    CHECKPOINT_PATH = universal_vision_checkpoints/best_model_phase1.pt
+    ```
+
+ 5. The Space will build automatically; the first build takes ~5 minutes.
+
+ ## Classes
+
+ | Domain  | Classes |
+ |---------|---------|
+ | Medical | Normal, Pneumonia, COVID-19, Tuberculosis, Cardiomegaly, Rib Fracture, Lung Mass, Pleural Effusion |
+ | Sports  | Running, Jumping, Swimming, Cycling, Tennis, Football |
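
The numbered steps above use the web UI; the same upload can also be scripted with `huggingface_hub` (already pinned in requirements.txt). A minimal sketch, assuming a write token is configured locally and using a placeholder `repo_id` that is not part of this commit:

```python
# push_to_space.py (illustrative only; repo_id is a placeholder)
from huggingface_hub import HfApi, create_repo

repo_id = "your-username/universal-vision-model"  # hypothetical Space name

# Create the Space (no-op if it already exists) and upload the app files.
create_repo(repo_id, repo_type="space", space_sdk="gradio", exist_ok=True)
api = HfApi()
for path in ["app.py", "requirements.txt", "README.md"]:
    api.upload_file(path_or_fileobj=path, path_in_repo=path, repo_id=repo_id, repo_type="space")

# The checkpoint is uploaded the same way; the Hub API streams large binaries
# without requiring a local git-lfs setup.
api.upload_file(
    path_or_fileobj="universal_vision_checkpoints/best_model_phase1.pt",
    path_in_repo="universal_vision_checkpoints/best_model_phase1.pt",
    repo_id=repo_id,
    repo_type="space",
)
```

The `CHECKPOINT_PATH` variable from step 4 still has to be set in the Space settings so `app.py` can find the uploaded file.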
app.py ADDED
@@ -0,0 +1,224 @@
+ """
+ Universal Cross-Domain Vision Model — Gradio Demo
+ ==================================================
+ Runs locally: python app.py
+ HF Spaces: push this folder to a Space (SDK: gradio)
+
+ The app loads the trained BiomedCLIP checkpoint and classifies uploaded images
+ across medical (8 pathologies) and sports (6 action categories) domains.
+ """
+
+ import os
+ import io
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import numpy as np
+ from PIL import Image
+ import gradio as gr
+
+ # ─────────────────────────────────────────────────────────────────────────────
+ # Configuration
+ # ─────────────────────────────────────────────────────────────────────────────
+ CHECKPOINT_PATH = os.environ.get(
+     "CHECKPOINT_PATH",
+     os.path.join(os.path.dirname(__file__), "..", "universal_vision_checkpoints", "best_model_phase1.pt"),
+ )
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ MEDICAL_CLASSES = [
+     "Normal",
+     "Pneumonia",
+     "COVID-19",
+     "Tuberculosis",
+     "Cardiomegaly",
+     "Rib Fracture",
+     "Lung Mass",
+     "Pleural Effusion",
+ ]
+
+ SPORTS_CLASSES = [
+     "Running",
+     "Jumping",
+     "Swimming",
+     "Cycling",
+     "Tennis",
+     "Football",
+ ]
+
+ ALL_CLASSES = MEDICAL_CLASSES + SPORTS_CLASSES
+
+ # ─────────────────────────────────────────────────────────────────────────────
+ # Model Definition (must match training architecture)
+ # ─────────────────────────────────────────────────────────────────────────────
+ class BiomedCLIPMultiModalFusion(nn.Module):
+     """Lightweight inference-only wrapper matching the training architecture."""
+
+     def __init__(self, embed_dim: int = 512, num_classes: int = len(ALL_CLASSES), dropout: float = 0.2):
+         super().__init__()
+         self.embed_dim = embed_dim
+
+         # Domain discriminator (kept for architecture compatibility)
+         self.domain_discriminator = nn.Sequential(
+             nn.Linear(embed_dim, embed_dim // 2),
+             nn.ReLU(),
+             nn.Dropout(dropout),
+             nn.Linear(embed_dim // 2, 2),
+         )
+
+         # Multi-head attention fusion
+         self.attention = nn.MultiheadAttention(
+             embed_dim=embed_dim, num_heads=8, dropout=dropout, batch_first=True
+         )
+
+         # Feed-forward network
+         self.ffn = nn.Sequential(
+             nn.Linear(embed_dim, embed_dim * 4),
+             nn.GELU(),
+             nn.Dropout(dropout),
+             nn.Linear(embed_dim * 4, embed_dim),
+             nn.Dropout(dropout),
+         )
+
+         self.norm1 = nn.LayerNorm(embed_dim)
+         self.norm2 = nn.LayerNorm(embed_dim)
+
+         # Classifier head
+         self.classifier = nn.Sequential(
+             nn.Linear(embed_dim, embed_dim // 2),
+             nn.GELU(),
+             nn.Dropout(dropout),
+             nn.Linear(embed_dim // 2, num_classes),
+         )
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         # x: [B, embed_dim] — pre-extracted image features
+         x = x.unsqueeze(1)                           # [B, 1, D]
+         attn_out, _ = self.attention(x, x, x)        # self-attention over the single token
+         x = self.norm1(x + attn_out)
+         ffn_out = self.ffn(x)
+         fused = self.norm2(x + ffn_out).squeeze(1)   # [B, D]
+         return self.classifier(fused)
+
+
+ # ─────────────────────────────────────────────────────────────────────────────
+ # Load model + backbone
+ # ─────────────────────────────────────────────────────────────────────────────
+ _model = None
+ _backbone = None
+ _preprocess = None
+
+
+ def _load_models():
+     global _model, _backbone, _preprocess
+
+     if _model is not None:
+         return
+
+     print(f"[INFO] Loading models on {DEVICE} …")
+
+     # Try BiomedCLIP first, fall back to standard CLIP
+     try:
+         import open_clip
+         # create_model_and_transforms returns (model, preprocess_train, preprocess_val);
+         # use the eval transform for inference
+         _backbone, _, _preprocess = open_clip.create_model_and_transforms(
+             "hf-hub:microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224"
+         )
+         embed_dim = 512
+         print("[INFO] BiomedCLIP backbone loaded.")
+     except Exception as e:
+         print(f"[WARN] BiomedCLIP failed ({e}), using CLIP-ViT-B/32.")
+         import open_clip
+         _backbone, _, _preprocess = open_clip.create_model_and_transforms("ViT-B-32", pretrained="openai")
+         embed_dim = 512
+
+     _backbone = _backbone.to(DEVICE).eval()
+
+     # Build fusion model
+     _model = BiomedCLIPMultiModalFusion(embed_dim=embed_dim, num_classes=len(ALL_CLASSES))
+
+     # Load checkpoint weights (graceful fallback if checkpoint is missing)
+     if os.path.isfile(CHECKPOINT_PATH):
+         try:
+             ckpt = torch.load(CHECKPOINT_PATH, map_location=DEVICE, weights_only=False)
+             state = ckpt.get("model_state_dict", ckpt)
+             _model.load_state_dict(state, strict=False)
+             print(f"[INFO] Checkpoint loaded from {CHECKPOINT_PATH}")
+         except Exception as e:
+             print(f"[WARN] Could not load checkpoint: {e}. Running with random weights.")
+     else:
+         print(f"[WARN] Checkpoint not found at {CHECKPOINT_PATH}. Running with random weights.")
+
+     _model = _model.to(DEVICE).eval()
+     print("[INFO] Model ready.")
+
+
+ # ─────────────────────────────────────────────────────────────────────────────
+ # Inference
+ # ─────────────────────────────────────────────────────────────────────────────
+ def predict(image: Image.Image) -> dict:
+     """Run inference on a PIL image. Returns a {label: confidence} dict."""
+     _load_models()
+
+     # Pre-process
+     tensor = _preprocess(image).unsqueeze(0).to(DEVICE)
+
+     with torch.no_grad():
+         features = _backbone.encode_image(tensor)    # [1, D]
+         features = F.normalize(features.float(), dim=-1)
+         logits = _model(features)                    # [1, num_classes]
+         probs = F.softmax(logits, dim=-1).squeeze(0).cpu().numpy()
+
+     return {label: float(prob) for label, prob in zip(ALL_CLASSES, probs)}
+
+
+ def classify(image):
+     if image is None:
+         return {}
+     try:
+         pil_image = Image.fromarray(image).convert("RGB")
+         scores = predict(pil_image)
+         # Sort by confidence descending
+         return dict(sorted(scores.items(), key=lambda x: x[1], reverse=True))
+     except Exception as e:
+         return {"Error": str(e)}
+
+
+ # ─────────────────────────────────────────────────────────────────────────────
+ # Gradio Interface
+ # ─────────────────────────────────────────────────────────────────────────────
+ DESCRIPTION = """
+ ## 🏥🎾 Universal Cross-Domain Vision Model
+
+ Classifies images across **medical** (X-ray pathologies) and **sports** domains using a
+ BiomedCLIP backbone with multi-modal attention fusion.
+
+ **Medical classes:** Normal, Pneumonia, COVID-19, Tuberculosis, Cardiomegaly, Rib Fracture, Lung Mass, Pleural Effusion
+ **Sports classes:** Running, Jumping, Swimming, Cycling, Tennis, Football
+
+ Upload any image to get started.
+ """
+
+ with gr.Blocks(title="Universal Vision Model", theme=gr.themes.Soft()) as demo:
+     gr.Markdown(DESCRIPTION)
+
+     with gr.Row():
+         with gr.Column(scale=1):
+             img_input = gr.Image(label="Upload Image", type="numpy")
+             submit_btn = gr.Button("Classify", variant="primary")
+         with gr.Column(scale=1):
+             label_output = gr.Label(num_top_classes=8, label="Predictions")
+
+     submit_btn.click(fn=classify, inputs=img_input, outputs=label_output)
+     img_input.change(fn=classify, inputs=img_input, outputs=label_output)
+
+     gr.Examples(
+         examples=[],  # Add example image paths here if available
+         inputs=img_input,
+     )
+
+ if __name__ == "__main__":
+     demo.launch(
+         server_name="0.0.0.0",
+         server_port=int(os.environ.get("PORT", 7860)),
+         share=False,
+     )
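
Because `_load_models()` downloads the backbone and checkpoint lazily, the fusion head itself can be sanity-checked offline. A minimal sketch (not part of this commit, and assuming `app.py` imports cleanly in the local environment) that feeds random 512-dimensional features through `BiomedCLIPMultiModalFusion` to confirm the output shape:

```python
# smoke_test.py (illustrative; run from the same folder as app.py)
import torch

from app import ALL_CLASSES, BiomedCLIPMultiModalFusion

# Random embeddings stand in for _backbone.encode_image output: [batch, embed_dim]
fake_features = torch.randn(4, 512)

model = BiomedCLIPMultiModalFusion(embed_dim=512, num_classes=len(ALL_CLASSES)).eval()
with torch.no_grad():
    logits = model(fake_features)

assert logits.shape == (4, len(ALL_CLASSES))       # 8 medical + 6 sports = 14 classes
print(torch.softmax(logits, dim=-1).sum(dim=-1))   # each row sums to ~1.0
```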
requirements.txt ADDED
@@ -0,0 +1,38 @@
+ # ── Core ML ──────────────────────────────────────────────────────────────────
+ torch>=2.1.0
+ torchvision>=0.16.0
+ timm>=0.9.12
+
+ # ── Vision-Language Models ───────────────────────────────────────────────────
+ transformers>=4.35.0
+ open-clip-torch>=2.24.0
+
+ # ── Medical Datasets ─────────────────────────────────────────────────────────
+ medmnist>=2.2.0
+
+ # ── Image Processing ─────────────────────────────────────────────────────────
+ Pillow>=10.0.0
+ opencv-python-headless>=4.8.0
+ albumentations>=1.3.0
+
+ # ── Data / Metrics ───────────────────────────────────────────────────────────
+ numpy>=1.24.0
+ scikit-learn>=1.3.0
+ scipy>=1.11.0
+ pandas>=2.0.0
+
+ # ── Visualisation ────────────────────────────────────────────────────────────
+ matplotlib>=3.7.0
+ seaborn>=0.12.0
+ tqdm>=4.65.0
+
+ # ── Web Demo ─────────────────────────────────────────────────────────────────
+ gradio>=4.0.0
+
+ # ── REST API ─────────────────────────────────────────────────────────────────
+ fastapi>=0.104.0
+ uvicorn[standard]>=0.24.0
+ python-multipart>=0.0.6
+
+ # ── Utilities ────────────────────────────────────────────────────────────────
+ huggingface_hub>=0.19.0
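
The REST API dependencies (fastapi, uvicorn, python-multipart) are pinned here, but app.py only serves the Gradio demo; an HTTP endpoint would live in a separate module. A minimal sketch of how those dependencies could wrap the same `predict` helper (hypothetical file, not included in this commit):

```python
# rest_api.py (hypothetical companion module, not part of this commit)
import io

from fastapi import FastAPI, File, UploadFile
from PIL import Image

from app import predict  # reuses the lazily loaded backbone + fusion head

api = FastAPI(title="Universal Vision Model API")

@api.post("/classify")
async def classify_image(file: UploadFile = File(...)):
    # python-multipart handles the multipart form parsing behind File(...)
    image = Image.open(io.BytesIO(await file.read())).convert("RGB")
    return predict(image)  # {label: confidence}

# Run with: uvicorn rest_api:api --port 8000
```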