fix: snapshot mode (drop streaming) — HF proxy can't sustain frame uploads
HF Space logs were full of starlette.requests.ClientDisconnect on the
upload_file endpoint: every gradio .stream() frame upload was getting
chopped before completion. Result: stash_count=0 forever, the capture
button always saw frame_present=False, and recognition never fired on
the deployed Space (it worked fine locally on unlimited bandwidth).
Switch:
- gr.Image now non-streaming (one upload per click).
- webcam.change() handler runs recognition + auto-clears the value so
the Webcam component re-mounts.
- New JS auto-clicks the "Click to Access Webcam" placeholder after
the first user-gesture grant — browser remembers permission, so
per-letter UX stays one click.
- Confidence threshold raised to 0.75 + show top-3 alternatives in
the status text so users can see ambiguity ("U 65%, R 29%").
- Recognition path now logs MediaPipe hand detection result + MLP
top-3 + composer/TTS calls end-to-end.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
- signbridge/recognizer/landmark_classifier.py +19 -0
- signbridge/space.py +108 -107
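
The heart of the change, as a minimal sketch (assumes gradio 4.x; `recognize_letter` is a stand-in for the real recognition path, the rest mirrors the space.py diff below):

```python
import gradio as gr
import numpy as np


def recognize_letter(frame: np.ndarray) -> str:
    # Stand-in for the real path (MediaPipe+MLP, falling back to the VLM).
    return "A"


def _on_snapshot(frame: np.ndarray | None, history: list[str]):
    # Bounce guard: returning value=None below re-fires .change() with None.
    if frame is None:
        return "", history, gr.update()
    history.append(recognize_letter(frame))
    # Auto-clear so the Webcam component re-mounts; the injected JS then
    # re-clicks the access placeholder, keeping per-letter UX one click.
    return f"detected: {history[-1]}", history, gr.update(value=None)


with gr.Blocks() as demo:
    history = gr.State([])
    webcam = gr.Image(sources=["webcam"], type="numpy")  # note: no streaming=True
    status = gr.Markdown()
    webcam.change(_on_snapshot, inputs=[webcam, history], outputs=[status, history, webcam])
```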
signbridge/recognizer/landmark_classifier.py

```diff
@@ -160,12 +160,21 @@ def _ensure_loaded() -> bool:
     return True


+# Per-call top-k cache for the most recent prediction. Lets the UI
+# surface alternative letters when the top-1 confidence is borderline.
+last_top3: list[tuple[str, float]] = []
+
+
 def predict_letter(frame: np.ndarray) -> tuple[str, float]:
     """Single-frame letter prediction. Returns (letter, confidence) or ("", 0.0).

     `frame` is an HxWx3 uint8 RGB array. Returns ("", 0.0) when no hand is
     detected — the upstream caller should fall through to Qwen3-VL.
+    Side effect: updates `last_top3` with the top-3 alternatives so the UI
+    can show ambiguity when the top-1 is borderline.
     """
+    global last_top3
+    last_top3 = []
     if not _ensure_loaded():
         return "", 0.0

@@ -180,6 +189,7 @@ def predict_letter(frame: np.ndarray) -> tuple[str, float]:
     mp_img = mp.Image(image_format=mp.ImageFormat.SRGB, data=frame)
     res = _state["landmarker"].detect(mp_img)  # type: ignore[union-attr]
     if not res.hand_landmarks:
+        print(f"[mp+mlp] hand NOT detected in frame {frame.shape}", flush=True)
         return "", 0.0

     lm = res.hand_landmarks[0]
@@ -192,4 +202,13 @@ def predict_letter(frame: np.ndarray) -> tuple[str, float]:
     conf = float(probs[idx].item())

     classes = _state["classes"]  # type: ignore[assignment]
+    # Top-3 alternatives for debugging ambiguous classifications.
+    top_vals, top_idx = torch.topk(probs, k=min(3, len(classes)))
+    top3 = [(classes[int(i)], float(v)) for v, i in zip(top_vals, top_idx)]
+    last_top3.clear()
+    last_top3.extend(top3)
+    print(
+        f"[mp+mlp] hand OK; top3={[(t, round(c, 2)) for t, c in top3]}",
+        flush=True,
+    )
     return classes[idx], conf  # type: ignore[index,return-value]
```
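
For context, `_state["landmarker"]` above is a MediaPipe Tasks HandLandmarker. Its setup isn't part of this diff; `_ensure_loaded()` presumably builds it once, roughly like this (sketch; the model path and option values are assumptions, only the detect() usage is confirmed by the hunk above):

```python
from mediapipe.tasks.python import BaseOptions
from mediapipe.tasks.python import vision


def _build_landmarker(model_path: str = "hand_landmarker.task") -> vision.HandLandmarker:
    # One-time, image-mode landmarker; predict_letter() then wraps each
    # RGB frame in mp.Image(...) and calls .detect() on it, as in the diff.
    options = vision.HandLandmarkerOptions(
        base_options=BaseOptions(model_asset_path=model_path),
        running_mode=vision.RunningMode.IMAGE,
        num_hands=1,
    )
    return vision.HandLandmarker.create_from_options(options)
```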
signbridge/space.py

```diff
@@ -82,37 +82,10 @@ class _SessionState:
     last_audio_path: str | None = None


-# Single-user demo: one global latest-frame variable populated by the
-# .stream() handler. The Take-image button reads from here.
-_latest_frame: np.ndarray | None = None
-_frame_lock = threading.Lock()
-_stash_count = 0
-
-
 def _new_session() -> _SessionState:
     return _SessionState()


-def _stash_frame(frame: np.ndarray | None) -> int:
-    """Webcam .stream() callback. Fires every ~500ms (gradio's internal
-    setInterval in Webcam.svelte) once `recording=true`. Writes the
-    latest live frame to the global cache. Returns _stash_count so we
-    can wire a real (hidden) output — empty outputs=[] silently
-    disables the handler in gradio 4.44.1."""
-    global _latest_frame, _stash_count
-    if frame is None:
-        return _stash_count
-    with _frame_lock:
-        _latest_frame = frame
-        _stash_count += 1
-    if _stash_count == 1 or _stash_count % 30 == 0:
-        print(
-            f"[stash] fired #{_stash_count} shape={frame.shape}",
-            flush=True,
-        )
-    return _stash_count
-
-
 def _format_history(signs: list[str]) -> str:
     if not signs:
         return "_(no signs captured yet — try signing the letter A and pressing Capture)_"
@@ -132,16 +105,23 @@ def _recognize(frame: np.ndarray) -> tuple[str, float]:
         extractor = _shared_extractor()
         _, landmarks = extractor.extract(frame)
         if landmarks is None:
+            print("[recognize] holistic: no landmarks detected", flush=True)
            return "", 0.0
-        …
+        token, conf = classify_landmarks(np.expand_dims(landmarks, axis=0))
+        print(f"[recognize] holistic-classifier: token={token!r} conf={conf:.2f}", flush=True)
+        return token, conf

     # Default 'vlm' mode — first try the landmark classifier, then VLM.
     from signbridge.recognizer.landmark_classifier import predict_letter

     token, conf = predict_letter(frame)
+    print(f"[recognize] mediapipe+MLP: token={token!r} conf={conf:.2f}", flush=True)
     if conf >= 0.5:
         return token, conf
-    …
+    print("[recognize] MLP below threshold; falling through to VLM", flush=True)
+    vtoken, vconf = recognize_sign_from_frame(frame)
+    print(f"[recognize] VLM result: token={vtoken!r} conf={vconf:.2f}", flush=True)
+    return vtoken, vconf


 _extractor_singleton: LandmarkExtractor | None = None
@@ -163,39 +143,55 @@ def _shared_extractor() -> LandmarkExtractor:
     return _extractor_singleton


-…
+_MIN_CONF_ACCEPT = 0.75  # ≥ this → token accepted into history
+_MIN_CONF_SHOW = 0.50  # below this → "couldn't recognise"
+
+
+def _on_snapshot(
+    frame: np.ndarray | None, state: _SessionState
+) -> tuple[str, str, _SessionState, "gr.components.Image"]:
+    """Webcam .change() callback. Fires once per user snapshot in
+    non-streaming mode. Recognises the frame, appends to history,
+    then returns gr.update(value=None) so the Webcam re-mounts and
+    _AUTO_ACCESS_WEBCAM_JS auto-clicks the access placeholder.
+
+    Bounce guard: when we set value=None below, gradio dispatches
+    another .change() with frame=None. The first branch makes that
+    a no-op so we don't loop forever."""
     print(
-        f"[
+        f"[snapshot] frame_present={frame is not None}"
+        + (f" shape={frame.shape}" if frame is not None else ""),
         flush=True,
     )

     if frame is None:
-        return (
-            "_no frame yet — wait a moment for the camera to start streaming, then try again_",
-            _format_history(state.sign_history),
-            state,
-        )
+        return ("", _format_history(state.sign_history), state, gr.update())

     token, confidence = _recognize(frame)
-    print(f"[
-    …
+    print(f"[snapshot] recognised token={token!r} conf={confidence:.2f}", flush=True)
+
+    from signbridge.recognizer import landmark_classifier as lc
+    top3 = list(lc.last_top3)
+    top3_str = ", ".join(f"`{t}` ({c:.0%})" for t, c in top3) if top3 else ""
+
+    if not token or confidence < _MIN_CONF_SHOW:
+        msg = "_couldn't recognise — try centering your hand on a plain background_"
+        if top3_str:
+            msg += f" \nbest guesses: {top3_str}"
+        return (msg, _format_history(state.sign_history), state, gr.update(value=None))
+
+    if confidence < _MIN_CONF_ACCEPT:
+        msg = (
+            f"_low confidence on **{token}** ({confidence:.0%}) — re-sign with a clearer pose._ \n"
+            f"top alternatives: {top3_str}"
         )
+        return (msg, _format_history(state.sign_history), state, gr.update(value=None))

     state.sign_history.append(token)
-    …
-    )
+    status = f"detected: **{token}** ({confidence:.0%})"
+    if top3_str:
+        status += f" \nalternatives: {top3_str}"
+    return (status, _format_history(state.sign_history), state, gr.update(value=None))


 def _show_landmarks(frame: np.ndarray | None) -> np.ndarray | None:
@@ -207,11 +203,16 @@ def _show_landmarks(frame: np.ndarray | None) -> np.ndarray | None:

 def _speak(state: _SessionState) -> tuple[str, str | None, _SessionState]:
     if not state.sign_history:
+        print("[speak] no signs to compose; returning empty.", flush=True)
         return "(no signs captured yet)", None, state

+    print(f"[speak] composing from {len(state.sign_history)} tokens: {state.sign_history}", flush=True)
     sentence = compose_sentence(list(state.sign_history))
+    print(f"[speak] composed sentence: {sentence!r}", flush=True)
     state.last_sentence = sentence
+    print("[speak] synthesising speech...", flush=True)
     state.last_audio_path = synthesize_speech(sentence)
+    print(f"[speak] audio_path={state.last_audio_path}", flush=True)
     return sentence, state.last_audio_path, state


@@ -264,42 +265,45 @@ _WEBCAM_BUTTON_LABEL_CSS = """
     font-size: 13px;
     color: #1e1b4b;
 }
-/* Snapshot tab uses …
-   …
-   "Click to Access Webcam" placeholder is a separate DOM node and
-   stays visible — browsers require a user gesture for getUserMedia(). */
-.signbridge-webcam-snapshot .source-selection,
-.signbridge-webcam-snapshot .controls,
-.signbridge-webcam-snapshot .button-wrap {
+/* Snapshot tab uses gradio's built-in snapshot camera button as the
+   sole capture trigger. Streaming had to be dropped because HF Space's
+   proxy can't sustain the per-500ms upload rate. Source-select dropdown
+   is hidden to keep the UI clean. */
+.signbridge-webcam-snapshot .source-selection {
     display: none !important;
 }
 """


-# JS injected at app load. Runs in the browser.
-#
-# …
-# the …
-…
+# JS injected at app load. Runs in the browser.
+#
+# Non-streaming gr.Image webcam unmounts the Webcam component each time
+# the value clears (gradio's ImageUploader.svelte: shows the captured
+# image when value!=null, shows Webcam only when value==null). Each
+# remount re-renders the "Click to Access Webcam" placeholder. After
+# the first user-gesture grant, the browser remembers permission, so
+# we can programmatically click that placeholder to snap straight back
+# to live preview — making per-letter UX a single click on the camera
+# button instead of click-allow-then-camera.
+_AUTO_ACCESS_WEBCAM_JS = """
 () => {
-  const SELECTOR = '.signbridge-webcam-snapshot …
+  const SELECTOR = '.signbridge-webcam-snapshot button[title="grant webcam access" i], .signbridge-webcam-snapshot div[title="grant webcam access" i] button';
+  let firstGrantSeen = false;
   const tick = () => {
     document.querySelectorAll(SELECTOR).forEach((btn) => {
-      if (btn.dataset.…
-      …
-      console.log('[signbridge] auto-armed webcam stream');
+      if (btn.dataset.signbridgeAutoaccessed) return;
+      if (!firstGrantSeen) {
+        // Wait for the user's first click — getUserMedia needs a
+        // genuine user gesture initially. Mark seen on next tick.
+        firstGrantSeen = true;
+        return;
       }
+      btn.click();
+      btn.dataset.signbridgeAutoaccessed = '1';
+      console.log('[signbridge] auto-accessed re-mounted webcam');
     });
   };
-  setInterval(tick, …
+  setInterval(tick, 300);
 }
 """

@@ -309,7 +313,7 @@ def build_demo() -> gr.Blocks:
         title="SignBridge",
         theme=gr.themes.Soft(),
         css=_WEBCAM_BUTTON_LABEL_CSS,
-        js=…
+        js=_AUTO_ACCESS_WEBCAM_JS,
     ) as demo:
         gr.Markdown(
             "# 🤟 SignBridge — real-time ASL → English speech\n"
@@ -333,34 +337,33 @@ def build_demo() -> gr.Blocks:
             gr.HTML(
                 '<div class="signbridge-webcam-help">'
                 '<b>How it works:</b> '
-                '<b>1.</b> click the preview once to grant camera access · '
+                '<b>1.</b> click the preview once to grant camera access (one-time) · '
                 '<b>2.</b> sign a letter (A–Z) · '
-                '<b>3.</b> click <b>…
+                '<b>3.</b> click the <b>📷 camera button</b> in the preview — recognition is automatic, then the preview re-arms · '
                 '<b>4.</b> repeat for the next letter, then press <b>🔊 Speak</b>.'
                 "</div>"
             )
-            # streaming=True …
-            # …
+            # streaming=True was deployable locally but HF
+            # Space's proxy can't sustain the per-500ms
+            # frame uploads — every upload_file POST hit
+            # ClientDisconnect, no frame ever reached
+            # Python. Switching to non-streaming snapshot
+            # mode: one upload per click, reliable on HF.
+            # The webcam re-mounts after auto-clear; the
+            # _AUTO_ACCESS_WEBCAM_JS injected at app load
+            # re-clicks the access placeholder so per-letter
+            # UX stays a single click on gradio's snapshot
+            # camera button (no double-grant per letter).
             webcam = gr.Image(
                 sources=["webcam"],
-                …
-                label="Sign here",
+                label="Sign here — click the 📷 camera button",
                 height=420,
                 type="numpy",
                 elem_classes=["signbridge-webcam", "signbridge-webcam-snapshot"],
             )
             with gr.Row():
-                capture_btn = gr.Button(
-                    "📸 Take image", variant="primary", size="lg"
-                )
                 clear_btn = gr.Button(
-                    "🧹 Clear", variant="secondary", size="lg"
+                    "🧹 Clear history", variant="secondary", size="lg"
                 )
             latest = gr.Markdown(value="")

@@ -382,19 +385,17 @@ def build_demo() -> gr.Blocks:
                 "Spell out a word letter-by-letter, then press Speak."
             )

-            # …
-            …
-                inputs=[state],
-                outputs=[latest, history, state],
+            # In non-streaming mode, .change() fires once per user
+            # snapshot (camera button click). We get the frame
+            # directly as the input — no global cache or stash
+            # plumbing needed. Auto-clear the value at the end so
+            # gradio re-mounts the Webcam component, which together
+            # with _AUTO_ACCESS_WEBCAM_JS makes per-letter UX one
+            # click.
+            webcam.change(
+                fn=_on_snapshot,
+                inputs=[webcam, state],
+                outputs=[latest, history, state, webcam],
             )
             speak_btn.click(
                 fn=_speak,
```