LucasLooTan and Claude Opus 4.7 (1M context) committed
Commit e7597f3 · 1 Parent(s): 9ad7573

fix: snapshot mode (drop streaming) — HF proxy can't sustain frame uploads


HF Space logs were full of starlette.requests.ClientDisconnect on the
upload_file endpoint: every gradio .stream() frame upload was cut off
before completing. Result: stash_count stayed 0 forever, the capture
button always saw frame_present=False, and recognition never fired on
the deployed Space (it worked fine locally, where bandwidth is
effectively unlimited).

Switch:
- gr.Image now non-streaming (one upload per click; see the wiring sketch below).
- webcam.change() handler runs recognition + auto-clears the value so
the Webcam component re-mounts.
- New JS auto-clicks the "Click to Access Webcam" placeholder after
the first user-gesture grant — browser remembers permission, so
per-letter UX stays one click.
- Accept threshold raised to 0.75; the status text now shows the top-3
alternatives so users can see ambiguity ("U 65%, R 29%").
- Recognition path now logs MediaPipe hand detection result + MLP
top-3 + composer/TTS calls end-to-end.
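
For readers skimming the message before the diff, here is a minimal
standalone sketch of the new wiring; component and handler names are
illustrative, not the exact SignBridge code:

    import gradio as gr

    def on_snapshot(frame, history):
        # Bounce guard: clearing the value below re-fires .change() with None.
        if frame is None:
            return gr.update(), history
        letter = "A"  # recognition stub; the real handler calls predict_letter()
        history = history + [letter]
        # Clearing the value re-mounts the Webcam so the next letter is one click.
        return gr.update(value=None), history

    with gr.Blocks() as demo:
        cam = gr.Image(sources=["webcam"], type="numpy")  # note: no streaming=True
        hist = gr.State([])
        cam.change(on_snapshot, inputs=[cam, hist], outputs=[cam, hist])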

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

signbridge/recognizer/landmark_classifier.py CHANGED
@@ -160,12 +160,21 @@ def _ensure_loaded() -> bool:
    return True


+# Per-call top-k cache for the most recent prediction. Lets the UI
+# surface alternative letters when the top-1 confidence is borderline.
+last_top3: list[tuple[str, float]] = []
+
+
def predict_letter(frame: np.ndarray) -> tuple[str, float]:
    """Single-frame letter prediction. Returns (letter, confidence) or ("", 0.0).

    `frame` is an HxWx3 uint8 RGB array. Returns ("", 0.0) when no hand is
    detected — the upstream caller should fall through to Qwen3-VL.
+    Side effect: updates `last_top3` with the top-3 alternatives so the UI
+    can show ambiguity when the top-1 is borderline.
    """
+    global last_top3
+    last_top3 = []
    if not _ensure_loaded():
        return "", 0.0

@@ -180,6 +189,7 @@ def predict_letter(frame: np.ndarray) -> tuple[str, float]:
    mp_img = mp.Image(image_format=mp.ImageFormat.SRGB, data=frame)
    res = _state["landmarker"].detect(mp_img)  # type: ignore[union-attr]
    if not res.hand_landmarks:
+        print(f"[mp+mlp] hand NOT detected in frame {frame.shape}", flush=True)
        return "", 0.0

    lm = res.hand_landmarks[0]
@@ -192,4 +202,13 @@ def predict_letter(frame: np.ndarray) -> tuple[str, float]:
    conf = float(probs[idx].item())

    classes = _state["classes"]  # type: ignore[assignment]
+    # Top-3 alternatives for debugging ambiguous classifications.
+    top_vals, top_idx = torch.topk(probs, k=min(3, len(classes)))
+    top3 = [(classes[int(i)], float(v)) for v, i in zip(top_vals, top_idx)]
+    last_top3.clear()
+    last_top3.extend(top3)
+    print(
+        f"[mp+mlp] hand OK; top3={[(t, round(c, 2)) for t, c in top3]}",
+        flush=True,
+    )
    return classes[idx], conf  # type: ignore[index,return-value]
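
In isolation, the top-3 extraction added above works like this; the class
list and probability vector here are made up for illustration, not
SignBridge's real model output:

    import torch

    classes = ["A", "B", "C", "R", "U"]
    probs = torch.tensor([0.02, 0.01, 0.03, 0.29, 0.65])

    top_vals, top_idx = torch.topk(probs, k=min(3, len(classes)))
    top3 = [(classes[int(i)], float(v)) for v, i in zip(top_vals, top_idx)]
    print(top3)  # approximately [('U', 0.65), ('R', 0.29), ('C', 0.03)]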
signbridge/space.py CHANGED
@@ -82,37 +82,10 @@ class _SessionState:
    last_audio_path: str | None = None


-# Single-user demo: one global latest-frame variable populated by the
-# .stream() handler. The Take-image button reads from here.
-_latest_frame: np.ndarray | None = None
-_frame_lock = threading.Lock()
-_stash_count = 0
-
-
def _new_session() -> _SessionState:
    return _SessionState()


-def _stash_frame(frame: np.ndarray | None) -> int:
-    """Webcam .stream() callback. Fires every ~500ms (gradio's internal
-    setInterval in Webcam.svelte) once `recording=true`. Writes the
-    latest live frame to the global cache. Returns _stash_count so we
-    can wire a real (hidden) output — empty outputs=[] silently
-    disables the handler in gradio 4.44.1."""
-    global _latest_frame, _stash_count
-    if frame is None:
-        return _stash_count
-    with _frame_lock:
-        _latest_frame = frame
-        _stash_count += 1
-    if _stash_count == 1 or _stash_count % 30 == 0:
-        print(
-            f"[stash] fired #{_stash_count} shape={frame.shape}",
-            flush=True,
-        )
-    return _stash_count
-
-
def _format_history(signs: list[str]) -> str:
    if not signs:
        return "_(no signs captured yet — try signing the letter A and pressing Capture)_"
@@ -132,16 +105,23 @@ def _recognize(frame: np.ndarray) -> tuple[str, float]:
        extractor = _shared_extractor()
        _, landmarks = extractor.extract(frame)
        if landmarks is None:
+            print("[recognize] holistic: no landmarks detected", flush=True)
            return "", 0.0
-        return classify_landmarks(np.expand_dims(landmarks, axis=0))
+        token, conf = classify_landmarks(np.expand_dims(landmarks, axis=0))
+        print(f"[recognize] holistic-classifier: token={token!r} conf={conf:.2f}", flush=True)
+        return token, conf

    # Default 'vlm' mode — first try the landmark classifier, then VLM.
    from signbridge.recognizer.landmark_classifier import predict_letter

    token, conf = predict_letter(frame)
+    print(f"[recognize] mediapipe+MLP: token={token!r} conf={conf:.2f}", flush=True)
    if conf >= 0.5:
        return token, conf
-    return recognize_sign_from_frame(frame)
+    print("[recognize] MLP below threshold; falling through to VLM", flush=True)
+    vtoken, vconf = recognize_sign_from_frame(frame)
+    print(f"[recognize] VLM result: token={vtoken!r} conf={vconf:.2f}", flush=True)
+    return vtoken, vconf


_extractor_singleton: LandmarkExtractor | None = None
@@ -163,39 +143,55 @@ def _shared_extractor() -> LandmarkExtractor:
    return _extractor_singleton


-def _capture_sign(state: _SessionState) -> tuple[str, str, _SessionState]:
-    """Take-image button handler. Reads the latest streamed frame from
-    the global cache, runs recognition, appends to history."""
-    with _frame_lock:
-        frame = _latest_frame
+_MIN_CONF_ACCEPT = 0.75  # at or above this, the token is accepted into history
+_MIN_CONF_SHOW = 0.50  # below this, report "couldn't recognise"
+
+
+def _on_snapshot(
+    frame: np.ndarray | None, state: _SessionState
+) -> tuple[str, str, _SessionState, "gr.components.Image"]:
+    """Webcam .change() callback. Fires once per user snapshot in
+    non-streaming mode. Recognises the frame, appends to history,
+    then returns gr.update(value=None) so the Webcam re-mounts and
+    _AUTO_ACCESS_WEBCAM_JS auto-clicks the access placeholder.
+
+    Bounce guard: when we set value=None below, gradio dispatches
+    another .change() with frame=None. The first branch makes that
+    a no-op so we don't loop forever."""
    print(
-        f"[capture] stash_count={_stash_count} frame_present={frame is not None}",
+        f"[snapshot] frame_present={frame is not None}"
+        + (f" shape={frame.shape}" if frame is not None else ""),
        flush=True,
    )

    if frame is None:
-        return (
-            "_no frame yet — wait a moment for the camera to start streaming, then try again_",
-            _format_history(state.sign_history),
-            state,
-        )
+        return ("", _format_history(state.sign_history), state, gr.update())

    token, confidence = _recognize(frame)
-    print(f"[capture] recognised token={token!r} conf={confidence:.2f}", flush=True)
-
-    if not token or confidence < 0.5:
-        return (
-            "_couldn't recognise that one try centering the gesture and a plain background_",
-            _format_history(state.sign_history),
-            state,
+    print(f"[snapshot] recognised token={token!r} conf={confidence:.2f}", flush=True)
+
+    from signbridge.recognizer import landmark_classifier as lc
+    top3 = list(lc.last_top3)
+    top3_str = ", ".join(f"`{t}` ({c:.0%})" for t, c in top3) if top3 else ""
+
+    if not token or confidence < _MIN_CONF_SHOW:
+        msg = "_couldn't recognise — try centering your hand on a plain background_"
+        if top3_str:
+            msg += f"  \nbest guesses: {top3_str}"
+        return (msg, _format_history(state.sign_history), state, gr.update(value=None))
+
+    if confidence < _MIN_CONF_ACCEPT:
+        msg = (
+            f"_low confidence on **{token}** ({confidence:.0%}) — re-sign with a clearer pose._  \n"
+            f"top alternatives: {top3_str}"
        )
+        return (msg, _format_history(state.sign_history), state, gr.update(value=None))

    state.sign_history.append(token)
-    return (
-        f"detected: **{token}** ({confidence:.0%})",
-        _format_history(state.sign_history),
-        state,
-    )
+    status = f"detected: **{token}** ({confidence:.0%})"
+    if top3_str:
+        status += f"  \nalternatives: {top3_str}"
+    return (status, _format_history(state.sign_history), state, gr.update(value=None))


def _show_landmarks(frame: np.ndarray | None) -> np.ndarray | None:
@@ -207,11 +203,16 @@ def _show_landmarks(frame: np.ndarray | None) -> np.ndarray | None:

def _speak(state: _SessionState) -> tuple[str, str | None, _SessionState]:
    if not state.sign_history:
+        print("[speak] no signs to compose; returning empty.", flush=True)
        return "(no signs captured yet)", None, state

+    print(f"[speak] composing from {len(state.sign_history)} tokens: {state.sign_history}", flush=True)
    sentence = compose_sentence(list(state.sign_history))
+    print(f"[speak] composed sentence: {sentence!r}", flush=True)
    state.last_sentence = sentence
+    print("[speak] synthesising speech...", flush=True)
    state.last_audio_path = synthesize_speech(sentence)
+    print(f"[speak] audio_path={state.last_audio_path}", flush=True)
    return sentence, state.last_audio_path, state

@@ -264,42 +265,45 @@ _WEBCAM_BUTTON_LABEL_CSS = """
    font-size: 13px;
    color: #1e1b4b;
}
-/* Snapshot tab uses streaming + a custom Take-image button. We hide
-   gradio's built-in controls so the user only sees the live preview
-   and our button. A small JS snippet auto-clicks the (hidden) record
-   toggle once after permission is granted, which makes Webcam.svelte
-   start dispatching the .stream() event every 500ms. The
-   "Click to Access Webcam" placeholder is a separate DOM node and
-   stays visible — browsers require a user gesture for getUserMedia(). */
-.signbridge-webcam-snapshot .source-selection,
-.signbridge-webcam-snapshot .controls,
-.signbridge-webcam-snapshot .button-wrap {
+/* Snapshot tab uses gradio's built-in snapshot camera button as the
+   sole capture trigger. Streaming had to be dropped because HF Space's
+   proxy can't sustain the per-500ms upload rate. Source-select dropdown
+   is hidden to keep the UI clean. */
+.signbridge-webcam-snapshot .source-selection {
    display: none !important;
}
"""


-# JS injected at app load. Runs in the browser. Polls for gradio's
-# hidden record button inside our snapshot webcam and clicks it once
-# per mount, which flips Webcam.svelte's `recording=true` and starts
-# the .stream() frame loop. Without this, .stream() never fires;
-# gradio gates frame dispatch on the record toggle.
-_AUTO_ARM_STREAM_JS = """
+# JS injected at app load. Runs in the browser.
+#
+# Non-streaming gr.Image webcam unmounts the Webcam component each time
+# the value clears (gradio's ImageUploader.svelte shows the captured
+# image when value != null, and the Webcam only when value == null). Each
+# remount re-renders the "Click to Access Webcam" placeholder. After
+# the first user-gesture grant, the browser remembers permission, so
+# we can programmatically click that placeholder to snap straight back
+# to live preview — making per-letter UX a single click on the camera
+# button instead of click-allow-then-camera.
+_AUTO_ACCESS_WEBCAM_JS = """
() => {
-  const SELECTOR = '.signbridge-webcam-snapshot .button-wrap > button';
+  const SELECTOR = '.signbridge-webcam-snapshot button[title="grant webcam access" i], .signbridge-webcam-snapshot div[title="grant webcam access" i] button';
+  let firstGrantSeen = false;
  const tick = () => {
    document.querySelectorAll(SELECTOR).forEach((btn) => {
-      if (btn.dataset.signbridgeArmed) return;
-      // Only arm a freshly-mounted (not-yet-recording) button.
-      const titleDiv = btn.querySelector('div[title]');
-      if (titleDiv && titleDiv.title === 'start recording') {
-        btn.click();
-        btn.dataset.signbridgeArmed = '1';
-        console.log('[signbridge] auto-armed webcam stream');
+      if (btn.dataset.signbridgeAutoaccessed) return;
+      if (!firstGrantSeen) {
+        // Wait for the user's first click — getUserMedia needs a
+        // genuine user gesture initially. Mark seen on next tick.
+        firstGrantSeen = true;
+        return;
      }
+      btn.click();
+      btn.dataset.signbridgeAutoaccessed = '1';
+      console.log('[signbridge] auto-accessed re-mounted webcam');
    });
  };
-  setInterval(tick, 500);
+  setInterval(tick, 300);
}
"""

@@ -309,7 +313,7 @@ def build_demo() -> gr.Blocks:
        title="SignBridge",
        theme=gr.themes.Soft(),
        css=_WEBCAM_BUTTON_LABEL_CSS,
-        js=_AUTO_ARM_STREAM_JS,
+        js=_AUTO_ACCESS_WEBCAM_JS,
    ) as demo:
        gr.Markdown(
            "# 🤟 SignBridge — real-time ASL → English speech\n"
@@ -333,34 +337,33 @@ def build_demo() -> gr.Blocks:
                gr.HTML(
                    '<div class="signbridge-webcam-help">'
                    '<b>How it works:</b> '
-                    '<b>1.</b> click the preview once to grant camera access · '
+                    '<b>1.</b> click the preview once to grant camera access (one-time) · '
                    '<b>2.</b> sign a letter (A–Z) · '
-                    '<b>3.</b> click <b>📸 Take image</b> — recognition is automatic · '
+                    '<b>3.</b> click the <b>📷 camera button</b> in the preview — recognition is automatic, then the preview re-arms · '
                    '<b>4.</b> repeat for the next letter, then press <b>🔊 Speak</b>.'
                    "</div>"
                )
-                # streaming=True keeps the live preview running
-                # continuously. _AUTO_ARM_STREAM_JS clicks the
-                # hidden record button after permission grant
-                # so Webcam.svelte starts dispatching frames
-                # via the .stream() event (gated on
-                # `recording=true`). We hide the record/stop
-                # controls via CSS so the user only sees a
-                # clean preview + our Take-image button.
+                # streaming=True worked locally, but HF
+                # Space's proxy can't sustain the per-500ms
+                # frame uploads; every upload_file POST hit
+                # ClientDisconnect and no frame ever reached
+                # Python. Switching to non-streaming snapshot
+                # mode: one upload per click, reliable on HF.
+                # The webcam re-mounts after auto-clear; the
+                # _AUTO_ACCESS_WEBCAM_JS injected at app load
+                # re-clicks the access placeholder so per-letter
+                # UX stays a single click on gradio's snapshot
+                # camera button (no double-grant per letter).
                webcam = gr.Image(
                    sources=["webcam"],
-                    streaming=True,
-                    label="Sign here",
+                    label="Sign here — click the 📷 camera button",
                    height=420,
                    type="numpy",
                    elem_classes=["signbridge-webcam", "signbridge-webcam-snapshot"],
                )
                with gr.Row():
-                    capture_btn = gr.Button(
-                        "📸 Take image", variant="primary", size="lg"
-                    )
                    clear_btn = gr.Button(
-                        "🧹 Clear", variant="secondary", size="lg"
+                        "🧹 Clear history", variant="secondary", size="lg"
                    )
                latest = gr.Markdown(value="")

@@ -382,19 +385,17 @@ def build_demo() -> gr.Blocks:
                    "Spell out a word letter-by-letter, then press Speak."
                )

-                # Hidden Number sink for the .stream() handler; empty
-                # outputs=[] silently disables it in gradio 4.44.1.
-                _stash_sink = gr.Number(value=0, visible=False)
-                webcam.stream(
-                    fn=_stash_frame,
-                    inputs=[webcam],
-                    outputs=[_stash_sink],
-                    show_progress="hidden",
-                )
-                capture_btn.click(
-                    fn=_capture_sign,
-                    inputs=[state],
-                    outputs=[latest, history, state],
+                # In non-streaming mode, .change() fires once per user
+                # snapshot (camera button click). We get the frame
+                # directly as the input — no global cache or stash
+                # plumbing needed. Auto-clear the value at the end so
+                # gradio re-mounts the Webcam component, which together
+                # with _AUTO_ACCESS_WEBCAM_JS makes per-letter UX one
+                # click.
+                webcam.change(
+                    fn=_on_snapshot,
+                    inputs=[webcam, state],
+                    outputs=[latest, history, state, webcam],
                )
                speak_btn.click(
                    fn=_speak,
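
Distilled, the two-threshold gating inside _on_snapshot behaves as below;
the gate() helper is ours for illustration, not part of the codebase:

    _MIN_CONF_ACCEPT = 0.75  # at or above: token appended to history
    _MIN_CONF_SHOW = 0.50    # below: "couldn't recognise", best guesses only

    def gate(token: str, conf: float) -> str:
        if not token or conf < _MIN_CONF_SHOW:
            return "reject"          # show best guesses, history unchanged
        if conf < _MIN_CONF_ACCEPT:
            return "low-confidence"  # show token + alternatives, don't append
        return "accept"              # append token to history

    assert gate("U", 0.65) == "low-confidence"
    assert gate("A", 0.80) == "accept"
    assert gate("", 0.00) == "reject"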