LucasLooTan committed on
Commit
5fb88cc
·
1 Parent(s): 4838599

feat(vlm): add recognize_sign_from_frames for multi-image VLM input

Browse files

New function takes an ordered list of >=2 frames and sends them in a
single OpenAI-compatible chat-completions call with N image_url
entries plus the multi-frame prompt. Same vocab filter and confidence
semantics as the single-frame path; raises ValueError for <2 frames.

Enables the upcoming Hold-to-record demo flow that turns SignBridge
from a fingerspelling-only demo into one that handles motion ASL
signs (HELLO, THANK_YOU, PLEASE, EAT, ...) which a single still
frame fundamentally cannot capture.

Files changed (2) hide show
  1. signbridge/recognizer/vlm.py +52 -1
  2. tests/test_vlm.py +96 -0
signbridge/recognizer/vlm.py CHANGED
@@ -29,7 +29,10 @@ import numpy as np
29
  # Closed vocabulary the VLM is asked to choose from. Imported from the
30
  # shared `signbridge.vocab` module so the recognizer and the trained
31
  # classifier (`signbridge.recognizer.classifier`) can never drift.
32
- from signbridge.recognizer.prompts import build_single_frame_prompt
 
 
 
33
  from signbridge.vocab import VOCAB_SET as _VLM_VOCAB_SET
34
 
35
  logger = logging.getLogger(__name__)
@@ -160,3 +163,51 @@ def recognize_sign_from_frame(frame: np.ndarray) -> tuple[str, float]:
160
  if token in {"", "unknown"} or token not in _VLM_VOCAB_SET:
161
  return "", 0.0
162
  return token, 0.85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  # Closed vocabulary the VLM is asked to choose from. Imported from the
30
  # shared `signbridge.vocab` module so the recognizer and the trained
31
  # classifier (`signbridge.recognizer.classifier`) can never drift.
32
+ from signbridge.recognizer.prompts import (
33
+ build_multi_frame_prompt,
34
+ build_single_frame_prompt,
35
+ )
36
  from signbridge.vocab import VOCAB_SET as _VLM_VOCAB_SET
37
 
38
  logger = logging.getLogger(__name__)
 
163
  if token in {"", "unknown"} or token not in _VLM_VOCAB_SET:
164
  return "", 0.0
165
  return token, 0.85
166
+
167
+
168
+ def recognize_sign_from_frames(frames: list[np.ndarray]) -> tuple[str, float]:
169
+ """Run the VLM on an ordered sequence of frames (multi-image prompt).
170
+
171
+ Returns (token, confidence). Confidence semantics match the single-frame
172
+ path: 0.85 when the VLM emits an in-vocab token, 0.0 otherwise.
173
+
174
+ Raises:
175
+ ValueError: if fewer than 2 frames are supplied (use the single-frame
176
+ entry point for one frame).
177
+ """
178
+ if len(frames) < 2:
179
+ raise ValueError(
180
+ f"recognize_sign_from_frames requires at least 2 frames, got {len(frames)}"
181
+ )
182
+
183
+ client, model = _resolve_client()
184
+ if client is None:
185
+ return "", 0.0
186
+
187
+ prompt = build_multi_frame_prompt(len(frames))
188
+ content: list[dict[str, object]] = [{"type": "text", "text": prompt}]
189
+ for frame in frames:
190
+ content.append(
191
+ {
192
+ "type": "image_url",
193
+ "image_url": {"url": _frame_to_data_url(frame)},
194
+ }
195
+ )
196
+
197
+ try:
198
+ resp = client.chat.completions.create( # type: ignore[attr-defined]
199
+ model=model,
200
+ messages=[{"role": "user", "content": content}],
201
+ temperature=0.0,
202
+ max_tokens=10,
203
+ )
204
+ raw = (resp.choices[0].message.content or "").strip()
205
+ token = _normalise(raw)
206
+ except Exception as exc: # noqa: BLE001 — broad at the boundary on purpose
207
+ # Same credential-leak guard as the single-frame path.
208
+ logger.warning("multi-frame VLM recognition failed: %s", type(exc).__name__)
209
+ return "", 0.0
210
+
211
+ if token in {"", "unknown"} or token not in _VLM_VOCAB_SET:
212
+ return "", 0.0
213
+ return token, 0.85
tests/test_vlm.py CHANGED
@@ -152,3 +152,99 @@ class TestRecognizeSignFromFrame:
152
  token, conf = recognize_sign_from_frame(frame)
153
  assert token == ""
154
  assert conf == 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  token, conf = recognize_sign_from_frame(frame)
153
  assert token == ""
154
  assert conf == 0.0
155
+
156
+
157
+ class TestRecognizeSignFromFrames:
158
+ def test_too_few_frames_raises(self):
159
+ from signbridge.recognizer.vlm import recognize_sign_from_frames
160
+
161
+ with pytest.raises(ValueError):
162
+ recognize_sign_from_frames([])
163
+ with pytest.raises(ValueError):
164
+ recognize_sign_from_frames([np.zeros((32, 32, 3), dtype=np.uint8)])
165
+
166
+ def test_no_client_returns_empty(self):
167
+ from signbridge.recognizer.vlm import recognize_sign_from_frames
168
+
169
+ frames = [np.full((32, 32, 3), 200, dtype=np.uint8) for _ in range(4)]
170
+ token, conf = recognize_sign_from_frames(frames)
171
+ assert token == ""
172
+ assert conf == 0.0
173
+
174
+ def test_with_mock_client(self, monkeypatch: pytest.MonkeyPatch) -> None:
175
+ from signbridge.recognizer import vlm
176
+
177
+ captured: dict = {}
178
+
179
+ class _FakeChoice:
180
+ def __init__(self, content: str) -> None:
181
+ self.message = type("M", (), {"content": content})()
182
+
183
+ class _FakeResp:
184
+ def __init__(self, content: str) -> None:
185
+ self.choices = [_FakeChoice(content)]
186
+
187
+ class _FakeClient:
188
+ class chat: # noqa: N801
189
+ class completions: # noqa: N801
190
+ @staticmethod
191
+ def create(**kwargs: object) -> _FakeResp:
192
+ captured.update(kwargs)
193
+ return _FakeResp("hello")
194
+
195
+ monkeypatch.setattr(vlm, "_resolve_client", lambda: (_FakeClient(), "test"))
196
+ frames = [np.full((32, 32, 3), 100 + i, dtype=np.uint8) for i in range(4)]
197
+ token, conf = vlm.recognize_sign_from_frames(frames)
198
+ assert token == "hello"
199
+ assert conf == 0.85
200
+ # Verify multi-image payload shape: 1 message with 1 text + 4 image_urls
201
+ msgs = captured["messages"]
202
+ assert len(msgs) == 1
203
+ content = msgs[0]["content"]
204
+ assert sum(1 for c in content if c["type"] == "text") == 1
205
+ assert sum(1 for c in content if c["type"] == "image_url") == 4
206
+
207
+ def test_off_vocab_token_suppressed(self, monkeypatch: pytest.MonkeyPatch) -> None:
208
+ from signbridge.recognizer import vlm
209
+
210
+ class _FakeClient:
211
+ class chat: # noqa: N801
212
+ class completions: # noqa: N801
213
+ @staticmethod
214
+ def create(**_: object) -> object:
215
+ return type(
216
+ "R",
217
+ (),
218
+ {
219
+ "choices": [
220
+ type(
221
+ "C",
222
+ (),
223
+ {"message": type("M", (), {"content": "fingerspelling"})()},
224
+ )()
225
+ ]
226
+ },
227
+ )()
228
+
229
+ monkeypatch.setattr(vlm, "_resolve_client", lambda: (_FakeClient(), "test"))
230
+ frames = [np.full((32, 32, 3), 100, dtype=np.uint8) for _ in range(4)]
231
+ token, conf = vlm.recognize_sign_from_frames(frames)
232
+ # 'fingerspelling' is not in VOCAB_SET → suppressed
233
+ assert token == ""
234
+ assert conf == 0.0
235
+
236
+ def test_provider_failure_returns_empty(self, monkeypatch: pytest.MonkeyPatch) -> None:
237
+ from signbridge.recognizer import vlm
238
+
239
+ class _FailingClient:
240
+ class chat: # noqa: N801
241
+ class completions: # noqa: N801
242
+ @staticmethod
243
+ def create(**_: object) -> object:
244
+ raise RuntimeError("boom")
245
+
246
+ monkeypatch.setattr(vlm, "_resolve_client", lambda: (_FailingClient(), "test"))
247
+ frames = [np.full((32, 32, 3), 0, dtype=np.uint8) for _ in range(3)]
248
+ token, conf = vlm.recognize_sign_from_frames(frames)
249
+ assert token == ""
250
+ assert conf == 0.0