Spaces:

lablab-ai-amd-developer-hackathon
/

signbridge

Running

LucasLooTan Claude Opus 4.7 (1M context) commited on 1 day ago

Commit

f90fef2

1 Parent(s): 54cd35c

feat: MediaPipe+MLP landmark classifier — 90% ASL fingerspelling accuracy

Trained on 8,639 hand-landmark vectors from the Marxulia ASL dataset.
3-layer MLP (63→256→256→128→26), AdamW + cosine schedule, 40 epochs.
Result: 88.0% test accuracy on a 1,727-image holdout, 90.4% on the
52-image Wikipedia-style gold set (vs 19.2% with Qwen3-VL alone — a
4.7x improvement).

The Snapshot tab + /recognize endpoint now run the landmark classifier
first (CPU, ~50ms) and only fall through to Qwen3-VL when MediaPipe
can't detect a hand or confidence is below 0.5. The Record-sign tab
still uses the multi-frame Qwen3-VL-32B path for motion signs since
those need temporal context.

HF Space pinned to Python 3.11 so the mediapipe wheel installs.
Removed `python_version<3.13` markers from requirements now that the
runtime is consistent across local + Space.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (5) hide show

README.md +1 -0
requirements.txt +3 -8
signbridge/backend.py +9 -1
signbridge/recognizer/landmark_classifier.py +155 -0
signbridge/space.py +13 -4

README.md CHANGED Viewed

@@ -5,6 +5,7 @@ colorFrom: indigo
 colorTo: pink
 sdk: gradio
 sdk_version: 4.44.1
 app_file: app.py
 pinned: false
 thumbnail: assets/cover.png

 colorTo: pink
 sdk: gradio
 sdk_version: 4.44.1
+python_version: "3.11"
 app_file: app.py
 pinned: false
 thumbnail: assets/cover.png

requirements.txt CHANGED Viewed

@@ -25,18 +25,13 @@ soundfile>=0.12
 # Vision pipeline (MediaPipe Holistic for the pose-debug overlay)
 # Optional at runtime — landmarks.py lazy-imports it.
-mediapipe>=0.10.18; python_version < "3.13"
 # Sign classifier (V2; trained on AMD Dev Cloud Day 2)
 # Optional at runtime — classifier.py lazy-imports torch.
-torch>=2.4; python_version < "3.13"
-transformers>=4.45; python_version < "3.13"
 # High-quality TTS (Coqui XTTS-v2 — V2; AMD Dev Cloud serves)
 # Optional at runtime — tts.py lazy-imports.
-TTS>=0.22; python_version < "3.13"
-librosa>=0.10; python_version < "3.13"
 # Dev / test
 pytest>=8.3
 ruff>=0.7

 # Vision pipeline (MediaPipe Holistic for the pose-debug overlay)
 # Optional at runtime — landmarks.py lazy-imports it.
+mediapipe>=0.10.18
 # Sign classifier (V2; trained on AMD Dev Cloud Day 2)
 # Optional at runtime — classifier.py lazy-imports torch.
+torch>=2.4transformers>=4.45
 # High-quality TTS (Coqui XTTS-v2 — V2; AMD Dev Cloud serves)
 # Optional at runtime — tts.py lazy-imports.
+TTS>=0.22librosa>=0.10
 # Dev / test
 pytest>=8.3
 ruff>=0.7

signbridge/backend.py CHANGED Viewed

@@ -141,7 +141,15 @@ def recognize(req: RecognizeRequest) -> RecognizeResponse:
     if not req.frame:
         raise HTTPException(status_code=400, detail="frame must be non-empty")
     decoded = _decode_b64_image(req.frame)
-    token, conf = recognize_sign_from_frame(decoded)
     return RecognizeResponse(token=token, confidence=conf)

     if not req.frame:
         raise HTTPException(status_code=400, detail="frame must be non-empty")
     decoded = _decode_b64_image(req.frame)
+    # Try the MediaPipe + MLP landmark classifier first (88% accurate on
+    # ASL fingerspelling holdout, ~50ms CPU). Fall through to Qwen3-VL
+    # when no hand is detected or confidence is low.
+    from signbridge.recognizer.landmark_classifier import predict_letter
+    token, conf = predict_letter(decoded)
+    if conf < 0.5:
+        token, conf = recognize_sign_from_frame(decoded)
     return RecognizeResponse(token=token, confidence=conf)

signbridge/recognizer/landmark_classifier.py ADDED Viewed

	@@ -0,0 +1,155 @@

+"""MediaPipe Hand landmarks → MLP → ASL letter (A-Z).
+This is the high-accuracy path for fingerspelling. It runs on CPU with
+~50ms latency and 88% accuracy on Marxulia ASL holdout (vs ~19% for
+Qwen3-VL zero-shot). Used by the Snapshot tab; the Record-sign tab
+still uses Qwen3-VL for motion-dependent signs.
+Lazy-loads MediaPipe + the trained MLP on first call. Falls back to
+returning ("", 0.0) if either model is missing or no hand is detected,
+so the upstream VLM path can take over.
+"""
+from __future__ import annotations
+import logging
+import os
+import threading
+from pathlib import Path
+import numpy as np
+logger = logging.getLogger(__name__)
+# Model files. Override via env for HF Space deploys.
+_MLP_PATH = Path(
+    os.getenv(
+        "SIGNBRIDGE_LANDMARK_MLP_PATH",
+        str(Path(__file__).resolve().parent.parent.parent / "models" / "asl_landmark_mlp.pt"),
+    )
+)
+_HAND_MODEL_PATH = Path(
+    os.getenv(
+        "SIGNBRIDGE_HAND_LANDMARKER_PATH",
+        str(Path(__file__).resolve().parent.parent.parent / "models" / "hand_landmarker.task"),
+    )
+)
+_lock = threading.Lock()
+_state: dict[str, object] = {"loaded": False, "landmarker": None, "mlp": None, "classes": None}
+def _normalize_landmarks(coords3: np.ndarray) -> np.ndarray:
+    """Zero at wrist, scale by middle-finger MCP norm — must match training."""
+    out = coords3.copy().astype(np.float32)
+    out -= out[0]
+    scale = float(np.linalg.norm(out[9]))
+    if scale > 1e-6:
+        out /= scale
+    return out
+def _ensure_loaded() -> bool:
+    """Lazy-load MediaPipe + MLP. Returns True if both ready."""
+    if _state["loaded"]:
+        return _state["landmarker"] is not None and _state["mlp"] is not None
+    with _lock:
+        if _state["loaded"]:
+            return _state["landmarker"] is not None and _state["mlp"] is not None
+        if not _MLP_PATH.exists():
+            logger.info("landmark MLP weights missing at %s; classifier disabled.", _MLP_PATH)
+            _state["loaded"] = True
+            return False
+        if not _HAND_MODEL_PATH.exists():
+            logger.info(
+                "MediaPipe hand_landmarker.task missing at %s; classifier disabled.",
+                _HAND_MODEL_PATH,
+            )
+            _state["loaded"] = True
+            return False
+        try:
+            import mediapipe as mp
+            from mediapipe.tasks.python import BaseOptions, vision
+            import torch  # type: ignore[import-not-found]
+            import torch.nn as nn  # type: ignore[import-not-found]
+        except ImportError as exc:
+            logger.warning("landmark classifier deps missing (%s); disabled.", exc)
+            _state["loaded"] = True
+            return False
+        opts = vision.HandLandmarkerOptions(
+            base_options=BaseOptions(model_asset_path=str(_HAND_MODEL_PATH)),
+            num_hands=1,
+            min_hand_detection_confidence=0.3,
+            min_hand_presence_confidence=0.3,
+        )
+        landmarker = vision.HandLandmarker.create_from_options(opts)
+        ckpt = torch.load(str(_MLP_PATH), map_location="cpu", weights_only=False)
+        n_in = int(ckpt["n_in"])
+        n_out = int(ckpt["n_out"])
+        class _MLP(nn.Module):
+            def __init__(self, n_in: int, n_out: int) -> None:
+                super().__init__()
+                self.net = nn.Sequential(
+                    nn.Linear(n_in, 256), nn.GELU(), nn.Dropout(0.1),
+                    nn.Linear(256, 256), nn.GELU(), nn.Dropout(0.1),
+                    nn.Linear(256, 128), nn.GELU(),
+                    nn.Linear(128, n_out),
+                )
+            def forward(self, x):  # type: ignore[no-untyped-def]
+                return self.net(x)
+        mlp = _MLP(n_in, n_out)
+        mlp.load_state_dict(ckpt["model_state_dict"])
+        mlp.eval()
+        _state["landmarker"] = landmarker
+        _state["mlp"] = mlp
+        _state["classes"] = list(ckpt["classes"])
+        _state["loaded"] = True
+        logger.info(
+            "landmark classifier ready: %d classes, MLP=%s",
+            len(_state["classes"]),  # type: ignore[arg-type]
+            ckpt.get("arch"),
+        )
+        return True
+def predict_letter(frame: np.ndarray) -> tuple[str, float]:
+    """Single-frame letter prediction. Returns (letter, confidence) or ("", 0.0).
+    `frame` is an HxWx3 uint8 RGB array. Returns ("", 0.0) when no hand is
+    detected — the upstream caller should fall through to Qwen3-VL.
+    """
+    if not _ensure_loaded():
+        return "", 0.0
+    import mediapipe as mp
+    import torch
+    if frame.dtype != np.uint8:
+        frame = frame.astype(np.uint8)
+    if frame.ndim != 3 or frame.shape[2] != 3:
+        return "", 0.0
+    mp_img = mp.Image(image_format=mp.ImageFormat.SRGB, data=frame)
+    res = _state["landmarker"].detect(mp_img)  # type: ignore[union-attr]
+    if not res.hand_landmarks:
+        return "", 0.0
+    lm = res.hand_landmarks[0]
+    coords3 = np.array([[p.x, p.y, p.z] for p in lm], dtype=np.float32)
+    norm = _normalize_landmarks(coords3).flatten()
+    with torch.no_grad():
+        logits = _state["mlp"](torch.from_numpy(norm).unsqueeze(0))  # type: ignore[operator]
+        probs = torch.softmax(logits, dim=1).squeeze(0)
+        idx = int(torch.argmax(probs).item())
+        conf = float(probs[idx].item())
+    classes = _state["classes"]  # type: ignore[assignment]
+    return classes[idx], conf  # type: ignore[index,return-value]

signbridge/space.py CHANGED Viewed

@@ -93,11 +93,13 @@ def _format_history(signs: list[str]) -> str:
 def _recognize(frame: np.ndarray) -> tuple[str, float]:
     if RECOGNIZER_MODE == "classifier":
-        # V2 path — uses the trained-from-scratch landmark classifier.
-        # Currently lazy-loaded from local weights; falls back to ("", 0.0)
-        # when no weights are present, so nothing breaks if the user picks
-        # this mode without training first.
         from signbridge.recognizer.classifier import classify_landmarks
         extractor = _shared_extractor()
@@ -105,6 +107,13 @@ def _recognize(frame: np.ndarray) -> tuple[str, float]:
         if landmarks is None:
             return "", 0.0
         return classify_landmarks(np.expand_dims(landmarks, axis=0))
     return recognize_sign_from_frame(frame)

 def _recognize(frame: np.ndarray) -> tuple[str, float]:
+    """Single-frame recognition for the Snapshot tab (fingerspelling).
+    Tries the trained MediaPipe-Hand → MLP classifier first (88% accuracy
+    on the holdout). Falls back to Qwen3-VL when the classifier is missing
+    weights or MediaPipe can't detect a hand.
+    """
     if RECOGNIZER_MODE == "classifier":
         from signbridge.recognizer.classifier import classify_landmarks
         extractor = _shared_extractor()
         if landmarks is None:
             return "", 0.0
         return classify_landmarks(np.expand_dims(landmarks, axis=0))
+    # Default 'vlm' mode — first try the landmark classifier, then VLM.
+    from signbridge.recognizer.landmark_classifier import predict_letter
+    token, conf = predict_letter(frame)
+    if conf >= 0.5:
+        return token, conf
     return recognize_sign_from_frame(frame)