Spaces:
Build error
Build error
Commit ·
5fb88cc
1
Parent(s): 4838599
feat(vlm): add recognize_sign_from_frames for multi-image VLM input
New function takes an ordered list of >=2 frames and sends them in a
single OpenAI-compatible chat-completions call with N image_url
entries plus the multi-frame prompt. Same vocab filter and confidence
semantics as the single-frame path; raises ValueError for <2 frames.
Enables the upcoming Hold-to-record demo flow that turns SignBridge
from a fingerspelling-only demo into one that handles motion ASL
signs (HELLO, THANK_YOU, PLEASE, EAT, ...) which a single still
frame fundamentally cannot capture.
- signbridge/recognizer/vlm.py +52 -1
- tests/test_vlm.py +96 -0
signbridge/recognizer/vlm.py
CHANGED
|
@@ -29,7 +29,10 @@ import numpy as np
|
|
| 29 |
# Closed vocabulary the VLM is asked to choose from. Imported from the
|
| 30 |
# shared `signbridge.vocab` module so the recognizer and the trained
|
| 31 |
# classifier (`signbridge.recognizer.classifier`) can never drift.
|
| 32 |
-
from signbridge.recognizer.prompts import
|
|
|
|
|
|
|
|
|
|
| 33 |
from signbridge.vocab import VOCAB_SET as _VLM_VOCAB_SET
|
| 34 |
|
| 35 |
logger = logging.getLogger(__name__)
|
|
@@ -160,3 +163,51 @@ def recognize_sign_from_frame(frame: np.ndarray) -> tuple[str, float]:
|
|
| 160 |
if token in {"", "unknown"} or token not in _VLM_VOCAB_SET:
|
| 161 |
return "", 0.0
|
| 162 |
return token, 0.85
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
# Closed vocabulary the VLM is asked to choose from. Imported from the
|
| 30 |
# shared `signbridge.vocab` module so the recognizer and the trained
|
| 31 |
# classifier (`signbridge.recognizer.classifier`) can never drift.
|
| 32 |
+
from signbridge.recognizer.prompts import (
|
| 33 |
+
build_multi_frame_prompt,
|
| 34 |
+
build_single_frame_prompt,
|
| 35 |
+
)
|
| 36 |
from signbridge.vocab import VOCAB_SET as _VLM_VOCAB_SET
|
| 37 |
|
| 38 |
logger = logging.getLogger(__name__)
|
|
|
|
| 163 |
if token in {"", "unknown"} or token not in _VLM_VOCAB_SET:
|
| 164 |
return "", 0.0
|
| 165 |
return token, 0.85
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
def recognize_sign_from_frames(frames: list[np.ndarray]) -> tuple[str, float]:
    """Classify an ordered multi-frame sequence with the VLM.

    Every frame is attached as its own image_url entry in a single
    chat-completions request, preceded by the multi-frame prompt.
    Confidence semantics mirror the single-frame path: 0.85 when the VLM
    emits an in-vocabulary token, ("", 0.0) otherwise.

    Raises:
        ValueError: when fewer than two frames are supplied (use the
            single-frame entry point for one frame).
    """
    if len(frames) < 2:
        raise ValueError(
            f"recognize_sign_from_frames requires at least 2 frames, got {len(frames)}"
        )

    client, model = _resolve_client()
    if client is None:
        return "", 0.0

    # One text part (the multi-frame prompt) followed by one image part per frame.
    parts: list[dict[str, object]] = [
        {"type": "text", "text": build_multi_frame_prompt(len(frames))}
    ]
    parts.extend(
        {"type": "image_url", "image_url": {"url": _frame_to_data_url(f)}}
        for f in frames
    )

    try:
        response = client.chat.completions.create(  # type: ignore[attr-defined]
            model=model,
            messages=[{"role": "user", "content": parts}],
            temperature=0.0,
            max_tokens=10,
        )
        reply = (response.choices[0].message.content or "").strip()
        token = _normalise(reply)
    except Exception as exc:  # noqa: BLE001 — broad at the boundary on purpose
        # Log only the exception type so provider error bodies cannot leak credentials.
        logger.warning("multi-frame VLM recognition failed: %s", type(exc).__name__)
        return "", 0.0

    if token in {"", "unknown"} or token not in _VLM_VOCAB_SET:
        return "", 0.0
    return token, 0.85
|
tests/test_vlm.py
CHANGED
|
@@ -152,3 +152,99 @@ class TestRecognizeSignFromFrame:
|
|
| 152 |
token, conf = recognize_sign_from_frame(frame)
|
| 153 |
assert token == ""
|
| 154 |
assert conf == 0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
token, conf = recognize_sign_from_frame(frame)
|
| 153 |
assert token == ""
|
| 154 |
assert conf == 0.0
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
class TestRecognizeSignFromFrames:
    def test_too_few_frames_raises(self):
        from signbridge.recognizer.vlm import recognize_sign_from_frames

        # Zero frames and a single frame are both below the minimum of two.
        for too_short in ([], [np.zeros((32, 32, 3), dtype=np.uint8)]):
            with pytest.raises(ValueError):
                recognize_sign_from_frames(too_short)

    def test_no_client_returns_empty(self):
        from signbridge.recognizer.vlm import recognize_sign_from_frames

        grey = [np.full((32, 32, 3), 200, dtype=np.uint8) for _ in range(4)]
        assert recognize_sign_from_frames(grey) == ("", 0.0)

    def test_with_mock_client(self, monkeypatch: pytest.MonkeyPatch) -> None:
        from signbridge.recognizer import vlm

        seen: dict = {}

        class _Message:
            def __init__(self, text: str) -> None:
                self.content = text

        class _Choice:
            def __init__(self, text: str) -> None:
                self.message = _Message(text)

        class _Response:
            def __init__(self, text: str) -> None:
                self.choices = [_Choice(text)]

        class _Client:
            class chat:  # noqa: N801
                class completions:  # noqa: N801
                    @staticmethod
                    def create(**kwargs: object) -> _Response:
                        seen.update(kwargs)
                        return _Response("hello")

        monkeypatch.setattr(vlm, "_resolve_client", lambda: (_Client(), "test"))
        frames = [np.full((32, 32, 3), 100 + i, dtype=np.uint8) for i in range(4)]
        assert vlm.recognize_sign_from_frames(frames) == ("hello", 0.85)
        # Payload shape: exactly one message carrying 1 text + 4 image_url parts.
        (message,) = seen["messages"]
        kinds = [part["type"] for part in message["content"]]
        assert kinds.count("text") == 1
        assert kinds.count("image_url") == 4

    def test_off_vocab_token_suppressed(self, monkeypatch: pytest.MonkeyPatch) -> None:
        from signbridge.recognizer import vlm

        class _Client:
            class chat:  # noqa: N801
                class completions:  # noqa: N801
                    @staticmethod
                    def create(**_: object) -> object:
                        message = type("M", (), {"content": "fingerspelling"})()
                        choice = type("C", (), {"message": message})()
                        return type("R", (), {"choices": [choice]})()

        monkeypatch.setattr(vlm, "_resolve_client", lambda: (_Client(), "test"))
        frames = [np.full((32, 32, 3), 100, dtype=np.uint8) for _ in range(4)]
        # 'fingerspelling' is outside VOCAB_SET, so the result is suppressed.
        assert vlm.recognize_sign_from_frames(frames) == ("", 0.0)

    def test_provider_failure_returns_empty(self, monkeypatch: pytest.MonkeyPatch) -> None:
        from signbridge.recognizer import vlm

        class _ExplodingClient:
            class chat:  # noqa: N801
                class completions:  # noqa: N801
                    @staticmethod
                    def create(**_: object) -> object:
                        raise RuntimeError("boom")

        monkeypatch.setattr(vlm, "_resolve_client", lambda: (_ExplodingClient(), "test"))
        frames = [np.full((32, 32, 3), 0, dtype=np.uint8) for _ in range(3)]
        assert vlm.recognize_sign_from_frames(frames) == ("", 0.0)
|