LucasLooTan committed on
Commit
4838599
·
1 Parent(s): d20d5a9

feat(prompts): NVIDIA-pattern single-frame + multi-frame VLM prompts

Browse files

Per the deep-research finding (NVIDIA Vision-Language-Model Prompt
Engineering Guide, March 2025): closed-vocabulary forcing + domain
priming + sequential frame markers. Builds ASL-specific prompts that
explicitly distinguish static fingerspelling from motion-dependent
lexical signs.

- Single-frame builder (build_single_frame_prompt) replaces the existing
inline _PROMPT in vlm.py. Adds the ASL domain-priming preface; same
closed-vocabulary forcing and output contract as before.
- Multi-frame builder (build_multi_frame_prompt) generates 'Frame 1:
... Frame N: ...' markers so the VLM treats the inputs as a temporal
sequence. Used by the upcoming Hold-to-record burst-capture pipeline
in Task 4.

signbridge/recognizer/prompts.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""VLM prompt builders for SignBridge.

Two builders share a common closed-vocabulary block and a common output
contract ("ONE token only, no explanation"). Single-frame and multi-frame
prompts diverge only in the framing — multi-frame adds explicit frame
markers and a 'motion across frames' priming line.

Patterns from NVIDIA's Vision-Language-Model Prompt Engineering Guide
(March 2025): closed-vocabulary forcing, domain priming preface,
sequential frame markers for temporal content.
"""

from __future__ import annotations

# Shared vocabulary literal so the prompt and the recognizer's accepted
# token set can never drift apart (see signbridge.vocab).
from signbridge.vocab import VOCAB_PROMPT_LITERAL

# Domain-priming preface prepended to every prompt. Distinguishes STATIC
# fingerspelling (single frame is enough) from MOTION-based lexical signs
# (meaning lives in the movement) so the VLM knows which cue to look for.
_DOMAIN_PRIME = (
    "American Sign Language (ASL) is a visual language with one-handed "
    "and two-handed signs. Fingerspelled letters (A-Z) and numbers (0-9) "
    "are typically held STATIC for ~0.5 seconds. Lexical signs (hello, "
    "thank_you, please, eat, drink, ...) are typically MOTION over ~1-2 "
    "seconds — the meaning is in the movement, not a single frame."
)

# Closed-vocabulary forcing + strict output contract appended to every
# prompt. VOCAB_PROMPT_LITERAL is interpolated once, at import time.
# Downstream parsing relies on the model emitting exactly one token.
_OUTPUT_CONTRACT = (
    "Reply with EXACTLY ONE token from this list, no other text, no "
    "quotes, no explanation:\n"
    f"{VOCAB_PROMPT_LITERAL}\n\n"
    "Rules:\n"
    "- Single uppercase letter (A-Z) for fingerspelling letters.\n"
    "- Single digit (0-9) for fingerspelled numbers.\n"
    "- Lowercase word with underscores for full-word signs (e.g. thank_you).\n"
    "- 'unknown' if no sign is visible or the gesture isn't in the list.\n"
    "- Do NOT explain. Do NOT add punctuation. Single token only."
)
36
+
37
+
38
def build_single_frame_prompt() -> str:
    """Build the prompt for single-image recognition (static signs/letters).

    Layout: domain-priming preface, then the task instruction, then the
    shared closed-vocabulary output contract, separated by blank lines.
    """
    instruction = (
        "Look at this image of a single signed gesture and identify which "
        "ASL sign or fingerspelled letter is shown."
    )
    return "\n\n".join((_DOMAIN_PRIME, instruction, _OUTPUT_CONTRACT))
46
+
47
+
48
def build_multi_frame_prompt(n_frames: int, *, window_seconds: float = 1.5) -> str:
    """Prompt for multi-image sign recognition (motion-dependent signs).

    Generates explicit 'Frame 1: ... Frame N: ...' markers so the VLM treats
    the inputs as a temporal sequence rather than independent samples.

    Args:
        n_frames: Number of frames in the burst; must be >= 2 because a
            single frame cannot convey motion.
        window_seconds: Length of the capture window quoted in the prompt.
            Keyword-only with a default of 1.5 (the Hold-to-record burst
            window), so existing callers are unaffected; pass the actual
            window if the capture pipeline uses a different duration, to
            keep the prompt truthful.

    Returns:
        The full prompt string: domain priming, temporal framing, frame
        markers, and the shared closed-vocabulary output contract.

    Raises:
        ValueError: If ``n_frames`` < 2 or ``window_seconds`` is not positive.
    """
    if n_frames < 2:
        raise ValueError(
            f"multi-frame prompt requires n_frames >= 2, got {n_frames}"
        )
    if window_seconds <= 0:
        raise ValueError(
            f"window_seconds must be positive, got {window_seconds}"
        )
    frame_markers = "\n".join(
        f"- Frame {i}: image {i} of {n_frames}" for i in range(1, n_frames + 1)
    )
    # ':g' renders 1.5 -> "1.5" and 2.0 -> "2", keeping the default output
    # byte-identical to the previous hard-coded "1.5-second" wording.
    return (
        f"{_DOMAIN_PRIME}\n\n"
        f"You are about to see {n_frames} images captured in temporal order, "
        f"spaced evenly over a {window_seconds:g}-second window. The images "
        "are frames from a single gesture. The meaning is the MOTION across "
        "the frames, not any single image.\n\n"
        f"{frame_markers}\n\n"
        "Identify the ASL sign or fingerspelled letter being made across "
        "this sequence.\n\n"
        f"{_OUTPUT_CONTRACT}"
    )
signbridge/recognizer/vlm.py CHANGED
@@ -29,28 +29,13 @@ import numpy as np
29
  # Closed vocabulary the VLM is asked to choose from. Imported from the
30
  # shared `signbridge.vocab` module so the recognizer and the trained
31
  # classifier (`signbridge.recognizer.classifier`) can never drift.
32
- from signbridge.vocab import VOCAB_PROMPT_LITERAL as _VLM_VOCAB
33
  from signbridge.vocab import VOCAB_SET as _VLM_VOCAB_SET
34
 
35
  logger = logging.getLogger(__name__)
36
 
37
  DEFAULT_VLM_MODEL = os.getenv("SIGNBRIDGE_VLM_MODEL", "Qwen/Qwen2-VL-7B-Instruct")
38
 
39
- _PROMPT = (
40
- "You are an expert in American Sign Language (ASL). Look at this image of a "
41
- "single signed gesture. Identify which ASL sign or fingerspelled letter is "
42
- "being shown.\n\n"
43
- "Reply with EXACTLY ONE token from this list, no other text, no quotes, no "
44
- "explanation:\n"
45
- f"{_VLM_VOCAB}\n\n"
46
- "Rules:\n"
47
- "- Single uppercase letter (A-Z) for fingerspelling letters.\n"
48
- "- Single digit (0-9) for fingerspelled numbers.\n"
49
- "- Lowercase word with underscores for full-word signs (e.g. thank_you).\n"
50
- "- 'unknown' if no sign is visible or the gesture isn't in the list.\n"
51
- "- Do NOT explain. Do NOT add punctuation. Single token only."
52
- )
53
-
54
 
55
  @lru_cache(maxsize=4)
56
  def _build_client(provider: str, base_url: str, api_key: str, model: str) -> tuple[object | None, str]:
@@ -151,7 +136,7 @@ def recognize_sign_from_frame(frame: np.ndarray) -> tuple[str, float]:
151
  {
152
  "role": "user",
153
  "content": [
154
- {"type": "text", "text": _PROMPT},
155
  {"type": "image_url", "image_url": {"url": data_url}},
156
  ],
157
  }
 
29
  # Closed vocabulary the VLM is asked to choose from. Imported from the
30
  # shared `signbridge.vocab` module so the recognizer and the trained
31
  # classifier (`signbridge.recognizer.classifier`) can never drift.
32
+ from signbridge.recognizer.prompts import build_single_frame_prompt
33
  from signbridge.vocab import VOCAB_SET as _VLM_VOCAB_SET
34
 
35
  logger = logging.getLogger(__name__)
36
 
37
  DEFAULT_VLM_MODEL = os.getenv("SIGNBRIDGE_VLM_MODEL", "Qwen/Qwen2-VL-7B-Instruct")
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
  @lru_cache(maxsize=4)
41
  def _build_client(provider: str, base_url: str, api_key: str, model: str) -> tuple[object | None, str]:
 
136
  {
137
  "role": "user",
138
  "content": [
139
+ {"type": "text", "text": build_single_frame_prompt()},
140
  {"type": "image_url", "image_url": {"url": data_url}},
141
  ],
142
  }
tests/test_prompts.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for the prompt-builder module."""
2
+
3
+ from signbridge.recognizer.prompts import (
4
+ build_single_frame_prompt,
5
+ build_multi_frame_prompt,
6
+ )
7
+ from signbridge.vocab import VOCAB_SET # noqa: F401 — reserved for future vocab-coverage tests
8
+
9
+
10
class TestSingleFramePrompt:
    """Contract checks for the single-image prompt builder."""

    def test_returns_string(self):
        result = build_single_frame_prompt()
        assert isinstance(result, str)

    def test_mentions_asl(self):
        assert "ASL" in build_single_frame_prompt()

    def test_includes_vocab_tokens(self):
        prompt = build_single_frame_prompt()
        # Spot-check letters, digits, lexical signs, and the fallback token.
        for expected in ("A", "Z", "0", "9", "hello", "thank_you", "unknown"):
            assert expected in prompt, f"missing {expected!r}"

    def test_demands_single_token_output(self):
        prompt = build_single_frame_prompt()
        markers = ("Single token only", "single token", "ONE token")
        assert any(marker in prompt for marker in markers)

    def test_includes_domain_priming(self):
        prompt = build_single_frame_prompt()
        assert "ASL" in prompt
        assert any(word in prompt for word in ("fingerspelled", "fingerspelling"))
30
+
31
+
32
class TestMultiFramePrompt:
    """Contract checks for the multi-image (temporal) prompt builder."""

    def test_returns_string(self):
        result = build_multi_frame_prompt(4)
        assert isinstance(result, str)

    def test_includes_frame_markers(self):
        prompt = build_multi_frame_prompt(4)
        # Every frame in the burst must get an explicit sequential marker.
        for i in range(1, 5):
            assert f"Frame {i}" in prompt, f"missing 'Frame {i}' marker"

    def test_mentions_motion(self):
        lowered = build_multi_frame_prompt(4).lower()
        assert "motion" in lowered or "across" in lowered

    def test_demands_single_token_output(self):
        prompt = build_multi_frame_prompt(4)
        markers = ("Single token only", "single token", "ONE token")
        assert any(marker in prompt for marker in markers)

    def test_includes_vocab(self):
        prompt = build_multi_frame_prompt(4)
        for expected in ("A", "hello", "unknown"):
            assert expected in prompt

    def test_n_frames_parameter(self):
        two_frame = build_multi_frame_prompt(2)
        four_frame = build_multi_frame_prompt(4)
        assert two_frame != four_frame
        assert "Frame 1" in two_frame and "Frame 2" in two_frame
        assert "Frame 3" not in two_frame
        assert "Frame 4" in four_frame

    def test_too_few_frames_raises(self):
        import pytest

        for bad_count in (1, 0):
            with pytest.raises(ValueError):
                build_multi_frame_prompt(bad_count)