Spaces:
Build error
Build error
Commit ·
5fb88cc
1
Parent(s): 4838599
feat(vlm): add recognize_sign_from_frames for multi-image VLM input
New function takes an ordered list of >=2 frames and sends them in a
single OpenAI-compatible chat-completions call with N image_url
entries plus the multi-frame prompt. Same vocab filter and confidence
semantics as the single-frame path; raises ValueError for <2 frames.
Enables the upcoming Hold-to-record demo flow that turns SignBridge
from a fingerspelling-only demo into one that handles motion ASL
signs (HELLO, THANK_YOU, PLEASE, EAT, ...) which a single still
frame fundamentally cannot capture.
- signbridge/recognizer/vlm.py +52 -1
- tests/test_vlm.py +96 -0
signbridge/recognizer/vlm.py
CHANGED
|
@@ -29,7 +29,10 @@ import numpy as np
|
|
| 29 |
# Closed vocabulary the VLM is asked to choose from. Imported from the
|
| 30 |
# shared `signbridge.vocab` module so the recognizer and the trained
|
| 31 |
# classifier (`signbridge.recognizer.classifier`) can never drift.
|
| 32 |
-
from signbridge.recognizer.prompts import
|
|
|
|
|
|
|
|
|
|
| 33 |
from signbridge.vocab import VOCAB_SET as _VLM_VOCAB_SET
|
| 34 |
|
| 35 |
logger = logging.getLogger(__name__)
|
|
@@ -160,3 +163,51 @@ def recognize_sign_from_frame(frame: np.ndarray) -> tuple[str, float]:
|
|
| 160 |
if token in {"", "unknown"} or token not in _VLM_VOCAB_SET:
|
| 161 |
return "", 0.0
|
| 162 |
return token, 0.85
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
# Closed vocabulary the VLM is asked to choose from. Imported from the
|
| 30 |
# shared `signbridge.vocab` module so the recognizer and the trained
|
| 31 |
# classifier (`signbridge.recognizer.classifier`) can never drift.
|
| 32 |
+
from signbridge.recognizer.prompts import (
|
| 33 |
+
build_multi_frame_prompt,
|
| 34 |
+
build_single_frame_prompt,
|
| 35 |
+
)
|
| 36 |
from signbridge.vocab import VOCAB_SET as _VLM_VOCAB_SET
|
| 37 |
|
| 38 |
logger = logging.getLogger(__name__)
|
|
|
|
| 163 |
if token in {"", "unknown"} or token not in _VLM_VOCAB_SET:
|
| 164 |
return "", 0.0
|
| 165 |
return token, 0.85
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
def recognize_sign_from_frames(frames: list[np.ndarray]) -> tuple[str, float]:
    """Classify an ordered multi-frame sequence with the VLM.

    Every frame is attached as its own image_url entry in a single
    chat-completions request, preceded by the multi-frame prompt.
    Confidence semantics mirror the single-frame path: 0.85 when the VLM
    emits an in-vocabulary token, ("", 0.0) otherwise.

    Raises:
        ValueError: when fewer than two frames are supplied (use the
            single-frame entry point for one frame).
    """
    if len(frames) < 2:
        raise ValueError(
            f"recognize_sign_from_frames requires at least 2 frames, got {len(frames)}"
        )

    client, model = _resolve_client()
    if client is None:
        return "", 0.0

    # One text part (the multi-frame prompt) followed by one image part per frame.
    parts: list[dict[str, object]] = [
        {"type": "text", "text": build_multi_frame_prompt(len(frames))}
    ]
    parts.extend(
        {"type": "image_url", "image_url": {"url": _frame_to_data_url(f)}}
        for f in frames
    )

    try:
        response = client.chat.completions.create(  # type: ignore[attr-defined]
            model=model,
            messages=[{"role": "user", "content": parts}],
            temperature=0.0,
            max_tokens=10,
        )
        reply = (response.choices[0].message.content or "").strip()
        token = _normalise(reply)
    except Exception as exc:  # noqa: BLE001 — broad at the boundary on purpose
        # Log only the exception type so provider error bodies cannot leak credentials.
        logger.warning("multi-frame VLM recognition failed: %s", type(exc).__name__)
        return "", 0.0

    if token in {"", "unknown"} or token not in _VLM_VOCAB_SET:
        return "", 0.0
    return token, 0.85
|
tests/test_vlm.py
CHANGED
|
@@ -152,3 +152,99 @@ class TestRecognizeSignFromFrame:
|
|
| 152 |
token, conf = recognize_sign_from_frame(frame)
|
| 153 |
assert token == ""
|
| 154 |
assert conf == 0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
token, conf = recognize_sign_from_frame(frame)
|
| 153 |
assert token == ""
|
| 154 |
assert conf == 0.0
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
class TestRecognizeSignFromFrames:
    def test_too_few_frames_raises(self):
        from signbridge.recognizer.vlm import recognize_sign_from_frames

        # Zero frames and a single frame are both below the minimum of two.
        for too_short in ([], [np.zeros((32, 32, 3), dtype=np.uint8)]):
            with pytest.raises(ValueError):
                recognize_sign_from_frames(too_short)

    def test_no_client_returns_empty(self):
        from signbridge.recognizer.vlm import recognize_sign_from_frames

        grey = [np.full((32, 32, 3), 200, dtype=np.uint8) for _ in range(4)]
        assert recognize_sign_from_frames(grey) == ("", 0.0)

    def test_with_mock_client(self, monkeypatch: pytest.MonkeyPatch) -> None:
        from signbridge.recognizer import vlm

        seen: dict = {}

        class _Message:
            def __init__(self, text: str) -> None:
                self.content = text

        class _Choice:
            def __init__(self, text: str) -> None:
                self.message = _Message(text)

        class _Response:
            def __init__(self, text: str) -> None:
                self.choices = [_Choice(text)]

        class _Client:
            class chat:  # noqa: N801
                class completions:  # noqa: N801
                    @staticmethod
                    def create(**kwargs: object) -> _Response:
                        seen.update(kwargs)
                        return _Response("hello")

        monkeypatch.setattr(vlm, "_resolve_client", lambda: (_Client(), "test"))
        frames = [np.full((32, 32, 3), 100 + i, dtype=np.uint8) for i in range(4)]
        assert vlm.recognize_sign_from_frames(frames) == ("hello", 0.85)
        # Payload shape: exactly one message carrying 1 text + 4 image_url parts.
        (message,) = seen["messages"]
        kinds = [part["type"] for part in message["content"]]
        assert kinds.count("text") == 1
        assert kinds.count("image_url") == 4

    def test_off_vocab_token_suppressed(self, monkeypatch: pytest.MonkeyPatch) -> None:
        from signbridge.recognizer import vlm

        class _Client:
            class chat:  # noqa: N801
                class completions:  # noqa: N801
                    @staticmethod
                    def create(**_: object) -> object:
                        message = type("M", (), {"content": "fingerspelling"})()
                        choice = type("C", (), {"message": message})()
                        return type("R", (), {"choices": [choice]})()

        monkeypatch.setattr(vlm, "_resolve_client", lambda: (_Client(), "test"))
        frames = [np.full((32, 32, 3), 100, dtype=np.uint8) for _ in range(4)]
        # 'fingerspelling' is outside VOCAB_SET, so the result is suppressed.
        assert vlm.recognize_sign_from_frames(frames) == ("", 0.0)

    def test_provider_failure_returns_empty(self, monkeypatch: pytest.MonkeyPatch) -> None:
        from signbridge.recognizer import vlm

        class _ExplodingClient:
            class chat:  # noqa: N801
                class completions:  # noqa: N801
                    @staticmethod
                    def create(**_: object) -> object:
                        raise RuntimeError("boom")

        monkeypatch.setattr(vlm, "_resolve_client", lambda: (_ExplodingClient(), "test"))
        frames = [np.full((32, 32, 3), 0, dtype=np.uint8) for _ in range(3)]
        assert vlm.recognize_sign_from_frames(frames) == ("", 0.0)
|