LucasLooTan and Claude Opus 4.7 (1M context) committed
Commit e7597f3 · 1 Parent(s): 9ad7573

fix: snapshot mode (drop streaming) — HF proxy can't sustain frame uploads


HF Space logs were full of starlette.requests.ClientDisconnect on the
upload_file endpoint: every gradio .stream() frame upload was cut off
before completing. Result: stash_count stayed 0 forever, the capture
button always saw frame_present=False, and recognition never fired on
the deployed Space (it worked fine locally, where bandwidth is
effectively unlimited).

Switch:
- gr.Image now non-streaming (one upload per click; see the wiring sketch below).
- webcam.change() handler runs recognition + auto-clears the value so
the Webcam component re-mounts.
- New JS auto-clicks the "Click to Access Webcam" placeholder after
the first user-gesture grant — browser remembers permission, so
per-letter UX stays one click.
- Accept threshold raised to 0.75; the status text now shows the top-3
alternatives so users can see ambiguity ("U 65%, R 29%").
- Recognition path now logs MediaPipe hand detection result + MLP
top-3 + composer/TTS calls end-to-end.
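
For readers skimming the message before the diff, here is a minimal
standalone sketch of the new wiring; component and handler names are
illustrative, not the exact SignBridge code:

    import gradio as gr

    def on_snapshot(frame, history):
        # Bounce guard: clearing the value below re-fires .change() with None.
        if frame is None:
            return gr.update(), history
        letter = "A"  # recognition stub; the real handler calls predict_letter()
        history = history + [letter]
        # Clearing the value re-mounts the Webcam so the next letter is one click.
        return gr.update(value=None), history

    with gr.Blocks() as demo:
        cam = gr.Image(sources=["webcam"], type="numpy")  # note: no streaming=True
        hist = gr.State([])
        cam.change(on_snapshot, inputs=[cam, hist], outputs=[cam, hist])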

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

signbridge/recognizer/landmark_classifier.py CHANGED
@@ -160,12 +160,21 @@ def _ensure_loaded() -> bool:
    return True


+# Per-call top-k cache for the most recent prediction. Lets the UI
+# surface alternative letters when the top-1 confidence is borderline.
+last_top3: list[tuple[str, float]] = []
+
+
def predict_letter(frame: np.ndarray) -> tuple[str, float]:
    """Single-frame letter prediction. Returns (letter, confidence) or ("", 0.0).

    `frame` is an HxWx3 uint8 RGB array. Returns ("", 0.0) when no hand is
    detected — the upstream caller should fall through to Qwen3-VL.
+    Side effect: updates `last_top3` with the top-3 alternatives so the UI
+    can show ambiguity when the top-1 is borderline.
    """
+    global last_top3
+    last_top3 = []
    if not _ensure_loaded():
        return "", 0.0

@@ -180,6 +189,7 @@ def predict_letter(frame: np.ndarray) -> tuple[str, float]:
    mp_img = mp.Image(image_format=mp.ImageFormat.SRGB, data=frame)
    res = _state["landmarker"].detect(mp_img)  # type: ignore[union-attr]
    if not res.hand_landmarks:
+        print(f"[mp+mlp] hand NOT detected in frame {frame.shape}", flush=True)
        return "", 0.0

    lm = res.hand_landmarks[0]
@@ -192,4 +202,13 @@ def predict_letter(frame: np.ndarray) -> tuple[str, float]:
    conf = float(probs[idx].item())

    classes = _state["classes"]  # type: ignore[assignment]
+    # Top-3 alternatives for debugging ambiguous classifications.
+    top_vals, top_idx = torch.topk(probs, k=min(3, len(classes)))
+    top3 = [(classes[int(i)], float(v)) for v, i in zip(top_vals, top_idx)]
+    last_top3.clear()
+    last_top3.extend(top3)
+    print(
+        f"[mp+mlp] hand OK; top3={[(t, round(c, 2)) for t, c in top3]}",
+        flush=True,
+    )
    return classes[idx], conf  # type: ignore[index,return-value]
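
In isolation, the top-3 extraction added above works like this; the class
list and probability vector here are made up for illustration, not
SignBridge's real model output:

    import torch

    classes = ["A", "B", "C", "R", "U"]
    probs = torch.tensor([0.02, 0.01, 0.03, 0.29, 0.65])

    top_vals, top_idx = torch.topk(probs, k=min(3, len(classes)))
    top3 = [(classes[int(i)], float(v)) for v, i in zip(top_vals, top_idx)]
    print(top3)  # approximately [('U', 0.65), ('R', 0.29), ('C', 0.03)]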
signbridge/space.py CHANGED
@@ -82,37 +82,10 @@ class _SessionState:
    last_audio_path: str | None = None


-# Single-user demo: one global latest-frame variable populated by the
-# .stream() handler. The Take-image button reads from here.
-_latest_frame: np.ndarray | None = None
-_frame_lock = threading.Lock()
-_stash_count = 0
-
-
def _new_session() -> _SessionState:
    return _SessionState()


-def _stash_frame(frame: np.ndarray | None) -> int:
-    """Webcam .stream() callback. Fires every ~500ms (gradio's internal
-    setInterval in Webcam.svelte) once `recording=true`. Writes the
-    latest live frame to the global cache. Returns _stash_count so we
-    can wire a real (hidden) output — empty outputs=[] silently
-    disables the handler in gradio 4.44.1."""
-    global _latest_frame, _stash_count
-    if frame is None:
-        return _stash_count
-    with _frame_lock:
-        _latest_frame = frame
-        _stash_count += 1
-    if _stash_count == 1 or _stash_count % 30 == 0:
-        print(
-            f"[stash] fired #{_stash_count} shape={frame.shape}",
-            flush=True,
-        )
-    return _stash_count
-
-
def _format_history(signs: list[str]) -> str:
    if not signs:
        return "_(no signs captured yet — try signing the letter A and pressing Capture)_"
@@ -132,16 +105,23 @@ def _recognize(frame: np.ndarray) -> tuple[str, float]:
        extractor = _shared_extractor()
        _, landmarks = extractor.extract(frame)
        if landmarks is None:
+            print("[recognize] holistic: no landmarks detected", flush=True)
            return "", 0.0
-        return classify_landmarks(np.expand_dims(landmarks, axis=0))
+        token, conf = classify_landmarks(np.expand_dims(landmarks, axis=0))
+        print(f"[recognize] holistic-classifier: token={token!r} conf={conf:.2f}", flush=True)
+        return token, conf

    # Default 'vlm' mode — first try the landmark classifier, then VLM.
    from signbridge.recognizer.landmark_classifier import predict_letter

    token, conf = predict_letter(frame)
+    print(f"[recognize] mediapipe+MLP: token={token!r} conf={conf:.2f}", flush=True)
    if conf >= 0.5:
        return token, conf
-    return recognize_sign_from_frame(frame)
+    print("[recognize] MLP below threshold; falling through to VLM", flush=True)
+    vtoken, vconf = recognize_sign_from_frame(frame)
+    print(f"[recognize] VLM result: token={vtoken!r} conf={vconf:.2f}", flush=True)
+    return vtoken, vconf


_extractor_singleton: LandmarkExtractor | None = None
@@ -163,39 +143,55 @@ def _shared_extractor() -> LandmarkExtractor:
    return _extractor_singleton


-def _capture_sign(state: _SessionState) -> tuple[str, str, _SessionState]:
-    """Take-image button handler. Reads the latest streamed frame from
-    the global cache, runs recognition, appends to history."""
-    with _frame_lock:
-        frame = _latest_frame
+_MIN_CONF_ACCEPT = 0.75  # at or above this, the token is accepted into history
+_MIN_CONF_SHOW = 0.50  # below this, report "couldn't recognise"
+
+
+def _on_snapshot(
+    frame: np.ndarray | None, state: _SessionState
+) -> tuple[str, str, _SessionState, "gr.components.Image"]:
+    """Webcam .change() callback. Fires once per user snapshot in
+    non-streaming mode. Recognises the frame, appends to history,
+    then returns gr.update(value=None) so the Webcam re-mounts and
+    _AUTO_ACCESS_WEBCAM_JS auto-clicks the access placeholder.
+
+    Bounce guard: when we set value=None below, gradio dispatches
+    another .change() with frame=None. The first branch makes that
+    a no-op so we don't loop forever."""
    print(
-        f"[capture] stash_count={_stash_count} frame_present={frame is not None}",
+        f"[snapshot] frame_present={frame is not None}"
+        + (f" shape={frame.shape}" if frame is not None else ""),
        flush=True,
    )

    if frame is None:
-        return (
-            "_no frame yet — wait a moment for the camera to start streaming, then try again_",
-            _format_history(state.sign_history),
-            state,
-        )
+        return ("", _format_history(state.sign_history), state, gr.update())

    token, confidence = _recognize(frame)
-    print(f"[capture] recognised token={token!r} conf={confidence:.2f}", flush=True)
-
-    if not token or confidence < 0.5:
-        return (
-            "_couldn't recognise that one try centering the gesture and a plain background_",
-            _format_history(state.sign_history),
-            state,
+    print(f"[snapshot] recognised token={token!r} conf={confidence:.2f}", flush=True)
+
+    from signbridge.recognizer import landmark_classifier as lc
+    top3 = list(lc.last_top3)
+    top3_str = ", ".join(f"`{t}` ({c:.0%})" for t, c in top3) if top3 else ""
+
+    if not token or confidence < _MIN_CONF_SHOW:
+        msg = "_couldn't recognise — try centering your hand on a plain background_"
+        if top3_str:
+            msg += f"  \nbest guesses: {top3_str}"
+        return (msg, _format_history(state.sign_history), state, gr.update(value=None))
+
+    if confidence < _MIN_CONF_ACCEPT:
+        msg = (
+            f"_low confidence on **{token}** ({confidence:.0%}) — re-sign with a clearer pose._  \n"
+            f"top alternatives: {top3_str}"
        )
+        return (msg, _format_history(state.sign_history), state, gr.update(value=None))

    state.sign_history.append(token)
-    return (
-        f"detected: **{token}** ({confidence:.0%})",
-        _format_history(state.sign_history),
-        state,
-    )
+    status = f"detected: **{token}** ({confidence:.0%})"
+    if top3_str:
+        status += f"  \nalternatives: {top3_str}"
+    return (status, _format_history(state.sign_history), state, gr.update(value=None))


def _show_landmarks(frame: np.ndarray | None) -> np.ndarray | None:
@@ -207,11 +203,16 @@ def _show_landmarks(frame: np.ndarray | None) -> np.ndarray | None:

def _speak(state: _SessionState) -> tuple[str, str | None, _SessionState]:
    if not state.sign_history:
+        print("[speak] no signs to compose; returning empty.", flush=True)
        return "(no signs captured yet)", None, state

+    print(f"[speak] composing from {len(state.sign_history)} tokens: {state.sign_history}", flush=True)
    sentence = compose_sentence(list(state.sign_history))
+    print(f"[speak] composed sentence: {sentence!r}", flush=True)
    state.last_sentence = sentence
+    print("[speak] synthesising speech...", flush=True)
    state.last_audio_path = synthesize_speech(sentence)
+    print(f"[speak] audio_path={state.last_audio_path}", flush=True)
    return sentence, state.last_audio_path, state

@@ -264,42 +265,45 @@ _WEBCAM_BUTTON_LABEL_CSS = """
    font-size: 13px;
    color: #1e1b4b;
}
-/* Snapshot tab uses streaming + a custom Take-image button. We hide
-   gradio's built-in controls so the user only sees the live preview
-   and our button. A small JS snippet auto-clicks the (hidden) record
-   toggle once after permission is granted, which makes Webcam.svelte
-   start dispatching the .stream() event every 500ms. The
-   "Click to Access Webcam" placeholder is a separate DOM node and
-   stays visible — browsers require a user gesture for getUserMedia(). */
-.signbridge-webcam-snapshot .source-selection,
-.signbridge-webcam-snapshot .controls,
-.signbridge-webcam-snapshot .button-wrap {
+/* Snapshot tab uses gradio's built-in snapshot camera button as the
+   sole capture trigger. Streaming had to be dropped because HF Space's
+   proxy can't sustain the per-500ms upload rate. Source-select dropdown
+   is hidden to keep the UI clean. */
+.signbridge-webcam-snapshot .source-selection {
    display: none !important;
}
"""


-# JS injected at app load. Runs in the browser. Polls for gradio's
-# hidden record button inside our snapshot webcam and clicks it once
-# per mount, which flips Webcam.svelte's `recording=true` and starts
-# the .stream() frame loop. Without this, .stream() never fires;
-# gradio gates frame dispatch on the record toggle.
-_AUTO_ARM_STREAM_JS = """
+# JS injected at app load. Runs in the browser.
+#
+# Non-streaming gr.Image webcam unmounts the Webcam component each time
+# the value clears (gradio's ImageUploader.svelte shows the captured
+# image when value != null, and the Webcam only when value == null). Each
+# remount re-renders the "Click to Access Webcam" placeholder. After
+# the first user-gesture grant, the browser remembers permission, so
+# we can programmatically click that placeholder to snap straight back
+# to live preview — making per-letter UX a single click on the camera
+# button instead of click-allow-then-camera.
+_AUTO_ACCESS_WEBCAM_JS = """
() => {
-  const SELECTOR = '.signbridge-webcam-snapshot .button-wrap > button';
+  const SELECTOR = '.signbridge-webcam-snapshot button[title="grant webcam access" i], .signbridge-webcam-snapshot div[title="grant webcam access" i] button';
+  let firstGrantSeen = false;
  const tick = () => {
    document.querySelectorAll(SELECTOR).forEach((btn) => {
-      if (btn.dataset.signbridgeArmed) return;
-      // Only arm a freshly-mounted (not-yet-recording) button.
-      const titleDiv = btn.querySelector('div[title]');
-      if (titleDiv && titleDiv.title === 'start recording') {
-        btn.click();
-        btn.dataset.signbridgeArmed = '1';
-        console.log('[signbridge] auto-armed webcam stream');
+      if (btn.dataset.signbridgeAutoaccessed) return;
+      if (!firstGrantSeen) {
+        // Wait for the user's first click — getUserMedia needs a
+        // genuine user gesture initially. Mark seen on next tick.
+        firstGrantSeen = true;
+        return;
      }
+      btn.click();
+      btn.dataset.signbridgeAutoaccessed = '1';
+      console.log('[signbridge] auto-accessed re-mounted webcam');
    });
  };
-  setInterval(tick, 500);
+  setInterval(tick, 300);
}
"""

@@ -309,7 +313,7 @@ def build_demo() -> gr.Blocks:
        title="SignBridge",
        theme=gr.themes.Soft(),
        css=_WEBCAM_BUTTON_LABEL_CSS,
-        js=_AUTO_ARM_STREAM_JS,
+        js=_AUTO_ACCESS_WEBCAM_JS,
    ) as demo:
        gr.Markdown(
            "# 🤟 SignBridge — real-time ASL → English speech\n"
@@ -333,34 +337,33 @@ def build_demo() -> gr.Blocks:
                gr.HTML(
                    '<div class="signbridge-webcam-help">'
                    '<b>How it works:</b> '
-                    '<b>1.</b> click the preview once to grant camera access · '
+                    '<b>1.</b> click the preview once to grant camera access (one-time) · '
                    '<b>2.</b> sign a letter (A–Z) · '
-                    '<b>3.</b> click <b>📸 Take image</b> — recognition is automatic · '
+                    '<b>3.</b> click the <b>📷 camera button</b> in the preview — recognition is automatic, then the preview re-arms · '
                    '<b>4.</b> repeat for the next letter, then press <b>🔊 Speak</b>.'
                    "</div>"
                )
-                # streaming=True keeps the live preview running
-                # continuously. _AUTO_ARM_STREAM_JS clicks the
-                # hidden record button after permission grant
-                # so Webcam.svelte starts dispatching frames
-                # via the .stream() event (gated on
-                # `recording=true`). We hide the record/stop
-                # controls via CSS so the user only sees a
-                # clean preview + our Take-image button.
+                # streaming=True worked locally, but HF
+                # Space's proxy can't sustain the per-500ms
+                # frame uploads; every upload_file POST hit
+                # ClientDisconnect and no frame ever reached
+                # Python. Switching to non-streaming snapshot
+                # mode: one upload per click, reliable on HF.
+                # The webcam re-mounts after auto-clear; the
+                # _AUTO_ACCESS_WEBCAM_JS injected at app load
+                # re-clicks the access placeholder so per-letter
+                # UX stays a single click on gradio's snapshot
+                # camera button (no double-grant per letter).
                webcam = gr.Image(
                    sources=["webcam"],
-                    streaming=True,
-                    label="Sign here",
+                    label="Sign here — click the 📷 camera button",
                    height=420,
                    type="numpy",
                    elem_classes=["signbridge-webcam", "signbridge-webcam-snapshot"],
                )
                with gr.Row():
-                    capture_btn = gr.Button(
-                        "📸 Take image", variant="primary", size="lg"
-                    )
                    clear_btn = gr.Button(
-                        "🧹 Clear", variant="secondary", size="lg"
+                        "🧹 Clear history", variant="secondary", size="lg"
                    )
                latest = gr.Markdown(value="")

@@ -382,19 +385,17 @@ def build_demo() -> gr.Blocks:
                    "Spell out a word letter-by-letter, then press Speak."
                )

-                # Hidden Number sink for the .stream() handler; empty
-                # outputs=[] silently disables it in gradio 4.44.1.
-                _stash_sink = gr.Number(value=0, visible=False)
-                webcam.stream(
-                    fn=_stash_frame,
-                    inputs=[webcam],
-                    outputs=[_stash_sink],
-                    show_progress="hidden",
-                )
-                capture_btn.click(
-                    fn=_capture_sign,
-                    inputs=[state],
-                    outputs=[latest, history, state],
+                # In non-streaming mode, .change() fires once per user
+                # snapshot (camera button click). We get the frame
+                # directly as the input — no global cache or stash
+                # plumbing needed. Auto-clear the value at the end so
+                # gradio re-mounts the Webcam component, which together
+                # with _AUTO_ACCESS_WEBCAM_JS makes per-letter UX one
+                # click.
+                webcam.change(
+                    fn=_on_snapshot,
+                    inputs=[webcam, state],
+                    outputs=[latest, history, state, webcam],
                )
                speak_btn.click(
                    fn=_speak,
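
Distilled, the two-threshold gating inside _on_snapshot behaves as below;
the gate() helper is ours for illustration, not part of the codebase:

    _MIN_CONF_ACCEPT = 0.75  # at or above: token appended to history
    _MIN_CONF_SHOW = 0.50    # below: "couldn't recognise", best guesses only

    def gate(token: str, conf: float) -> str:
        if not token or conf < _MIN_CONF_SHOW:
            return "reject"          # show best guesses, history unchanged
        if conf < _MIN_CONF_ACCEPT:
            return "low-confidence"  # show token + alternatives, don't append
        return "accept"              # append token to history

    assert gate("U", 0.65) == "low-confidence"
    assert gate("A", 0.80) == "accept"
    assert gate("", 0.00) == "reject"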