LucasLooTan committed on
Commit
4838599
·
1 Parent(s): d20d5a9

feat(prompts): NVIDIA-pattern single-frame + multi-frame VLM prompts

Browse files

Per the deep-research finding (NVIDIA Vision-Language-Model Prompt
Engineering Guide, March 2025): closed-vocabulary forcing + domain
priming + sequential frame markers. Builds ASL-specific prompts that
explicitly distinguish static fingerspelling from motion-dependent
lexical signs.

- Single-frame builder (build_single_frame_prompt) replaces the existing
inline _PROMPT in vlm.py. Adds the ASL domain-priming preface; same
closed-vocabulary forcing and output contract as before.
- Multi-frame builder (build_multi_frame_prompt) generates 'Frame 1:
... Frame N: ...' markers so the VLM treats the inputs as a temporal
sequence. Used by the upcoming Hold-to-record burst-capture pipeline
in Task 4.

signbridge/recognizer/prompts.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""VLM prompt builders for SignBridge.

Two builders share a common closed-vocabulary block and a common output
contract ("ONE token only, no explanation"). Single-frame and multi-frame
prompts diverge only in the framing — multi-frame adds explicit frame
markers and a 'motion across frames' priming line.

Patterns from NVIDIA's Vision-Language-Model Prompt Engineering Guide
(March 2025): closed-vocabulary forcing, domain priming preface,
sequential frame markers for temporal content.
"""

from __future__ import annotations

# Shared vocabulary literal so the prompt and the recognizer's accepted
# token set can never drift apart (see signbridge.vocab).
from signbridge.vocab import VOCAB_PROMPT_LITERAL

# Domain-priming preface prepended to every prompt. Distinguishes STATIC
# fingerspelling (single frame is enough) from MOTION-based lexical signs
# (meaning lives in the movement) so the VLM knows which cue to look for.
_DOMAIN_PRIME = (
    "American Sign Language (ASL) is a visual language with one-handed "
    "and two-handed signs. Fingerspelled letters (A-Z) and numbers (0-9) "
    "are typically held STATIC for ~0.5 seconds. Lexical signs (hello, "
    "thank_you, please, eat, drink, ...) are typically MOTION over ~1-2 "
    "seconds — the meaning is in the movement, not a single frame."
)

# Closed-vocabulary forcing + strict output contract appended to every
# prompt. VOCAB_PROMPT_LITERAL is interpolated once, at import time.
# Downstream parsing relies on the model emitting exactly one token.
_OUTPUT_CONTRACT = (
    "Reply with EXACTLY ONE token from this list, no other text, no "
    "quotes, no explanation:\n"
    f"{VOCAB_PROMPT_LITERAL}\n\n"
    "Rules:\n"
    "- Single uppercase letter (A-Z) for fingerspelling letters.\n"
    "- Single digit (0-9) for fingerspelled numbers.\n"
    "- Lowercase word with underscores for full-word signs (e.g. thank_you).\n"
    "- 'unknown' if no sign is visible or the gesture isn't in the list.\n"
    "- Do NOT explain. Do NOT add punctuation. Single token only."
)
36
+
37
+
38
def build_single_frame_prompt() -> str:
    """Build the prompt for single-image recognition (static signs/letters).

    Layout: domain-priming preface, then the task instruction, then the
    shared closed-vocabulary output contract, separated by blank lines.
    """
    instruction = (
        "Look at this image of a single signed gesture and identify which "
        "ASL sign or fingerspelled letter is shown."
    )
    return "\n\n".join((_DOMAIN_PRIME, instruction, _OUTPUT_CONTRACT))
46
+
47
+
48
def build_multi_frame_prompt(n_frames: int, *, window_seconds: float = 1.5) -> str:
    """Prompt for multi-image sign recognition (motion-dependent signs).

    Generates explicit 'Frame 1: ... Frame N: ...' markers so the VLM treats
    the inputs as a temporal sequence rather than independent samples.

    Args:
        n_frames: Number of frames in the burst; must be >= 2 because a
            single frame cannot convey motion.
        window_seconds: Length of the capture window quoted in the prompt.
            Keyword-only with a default of 1.5 (the Hold-to-record burst
            window), so existing callers are unaffected; pass the actual
            window if the capture pipeline uses a different duration, to
            keep the prompt truthful.

    Returns:
        The full prompt string: domain priming, temporal framing, frame
        markers, and the shared closed-vocabulary output contract.

    Raises:
        ValueError: If ``n_frames`` < 2 or ``window_seconds`` is not positive.
    """
    if n_frames < 2:
        raise ValueError(
            f"multi-frame prompt requires n_frames >= 2, got {n_frames}"
        )
    if window_seconds <= 0:
        raise ValueError(
            f"window_seconds must be positive, got {window_seconds}"
        )
    frame_markers = "\n".join(
        f"- Frame {i}: image {i} of {n_frames}" for i in range(1, n_frames + 1)
    )
    # ':g' renders 1.5 -> "1.5" and 2.0 -> "2", keeping the default output
    # byte-identical to the previous hard-coded "1.5-second" wording.
    return (
        f"{_DOMAIN_PRIME}\n\n"
        f"You are about to see {n_frames} images captured in temporal order, "
        f"spaced evenly over a {window_seconds:g}-second window. The images "
        "are frames from a single gesture. The meaning is the MOTION across "
        "the frames, not any single image.\n\n"
        f"{frame_markers}\n\n"
        "Identify the ASL sign or fingerspelled letter being made across "
        "this sequence.\n\n"
        f"{_OUTPUT_CONTRACT}"
    )
signbridge/recognizer/vlm.py CHANGED
@@ -29,28 +29,13 @@ import numpy as np
29
  # Closed vocabulary the VLM is asked to choose from. Imported from the
30
  # shared `signbridge.vocab` module so the recognizer and the trained
31
  # classifier (`signbridge.recognizer.classifier`) can never drift.
32
- from signbridge.vocab import VOCAB_PROMPT_LITERAL as _VLM_VOCAB
33
  from signbridge.vocab import VOCAB_SET as _VLM_VOCAB_SET
34
 
35
  logger = logging.getLogger(__name__)
36
 
37
  DEFAULT_VLM_MODEL = os.getenv("SIGNBRIDGE_VLM_MODEL", "Qwen/Qwen2-VL-7B-Instruct")
38
 
39
- _PROMPT = (
40
- "You are an expert in American Sign Language (ASL). Look at this image of a "
41
- "single signed gesture. Identify which ASL sign or fingerspelled letter is "
42
- "being shown.\n\n"
43
- "Reply with EXACTLY ONE token from this list, no other text, no quotes, no "
44
- "explanation:\n"
45
- f"{_VLM_VOCAB}\n\n"
46
- "Rules:\n"
47
- "- Single uppercase letter (A-Z) for fingerspelling letters.\n"
48
- "- Single digit (0-9) for fingerspelled numbers.\n"
49
- "- Lowercase word with underscores for full-word signs (e.g. thank_you).\n"
50
- "- 'unknown' if no sign is visible or the gesture isn't in the list.\n"
51
- "- Do NOT explain. Do NOT add punctuation. Single token only."
52
- )
53
-
54
 
55
  @lru_cache(maxsize=4)
56
  def _build_client(provider: str, base_url: str, api_key: str, model: str) -> tuple[object | None, str]:
@@ -151,7 +136,7 @@ def recognize_sign_from_frame(frame: np.ndarray) -> tuple[str, float]:
151
  {
152
  "role": "user",
153
  "content": [
154
- {"type": "text", "text": _PROMPT},
155
  {"type": "image_url", "image_url": {"url": data_url}},
156
  ],
157
  }
 
29
  # Closed vocabulary the VLM is asked to choose from. Imported from the
30
  # shared `signbridge.vocab` module so the recognizer and the trained
31
  # classifier (`signbridge.recognizer.classifier`) can never drift.
32
+ from signbridge.recognizer.prompts import build_single_frame_prompt
33
  from signbridge.vocab import VOCAB_SET as _VLM_VOCAB_SET
34
 
35
  logger = logging.getLogger(__name__)
36
 
37
  DEFAULT_VLM_MODEL = os.getenv("SIGNBRIDGE_VLM_MODEL", "Qwen/Qwen2-VL-7B-Instruct")
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
  @lru_cache(maxsize=4)
41
  def _build_client(provider: str, base_url: str, api_key: str, model: str) -> tuple[object | None, str]:
 
136
  {
137
  "role": "user",
138
  "content": [
139
+ {"type": "text", "text": build_single_frame_prompt()},
140
  {"type": "image_url", "image_url": {"url": data_url}},
141
  ],
142
  }
tests/test_prompts.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for the prompt-builder module."""
2
+
3
+ from signbridge.recognizer.prompts import (
4
+ build_single_frame_prompt,
5
+ build_multi_frame_prompt,
6
+ )
7
+ from signbridge.vocab import VOCAB_SET # noqa: F401 — reserved for future vocab-coverage tests
8
+
9
+
10
class TestSingleFramePrompt:
    """Contract checks for the single-image prompt builder."""

    def test_returns_string(self):
        result = build_single_frame_prompt()
        assert isinstance(result, str)

    def test_mentions_asl(self):
        assert "ASL" in build_single_frame_prompt()

    def test_includes_vocab_tokens(self):
        prompt = build_single_frame_prompt()
        # Spot-check letters, digits, lexical signs, and the fallback token.
        for expected in ("A", "Z", "0", "9", "hello", "thank_you", "unknown"):
            assert expected in prompt, f"missing {expected!r}"

    def test_demands_single_token_output(self):
        prompt = build_single_frame_prompt()
        markers = ("Single token only", "single token", "ONE token")
        assert any(marker in prompt for marker in markers)

    def test_includes_domain_priming(self):
        prompt = build_single_frame_prompt()
        assert "ASL" in prompt
        assert any(word in prompt for word in ("fingerspelled", "fingerspelling"))
30
+
31
+
32
class TestMultiFramePrompt:
    """Contract checks for the multi-image (temporal) prompt builder."""

    def test_returns_string(self):
        result = build_multi_frame_prompt(4)
        assert isinstance(result, str)

    def test_includes_frame_markers(self):
        prompt = build_multi_frame_prompt(4)
        # Every frame in the burst must get an explicit sequential marker.
        for i in range(1, 5):
            assert f"Frame {i}" in prompt, f"missing 'Frame {i}' marker"

    def test_mentions_motion(self):
        lowered = build_multi_frame_prompt(4).lower()
        assert "motion" in lowered or "across" in lowered

    def test_demands_single_token_output(self):
        prompt = build_multi_frame_prompt(4)
        markers = ("Single token only", "single token", "ONE token")
        assert any(marker in prompt for marker in markers)

    def test_includes_vocab(self):
        prompt = build_multi_frame_prompt(4)
        for expected in ("A", "hello", "unknown"):
            assert expected in prompt

    def test_n_frames_parameter(self):
        two_frame = build_multi_frame_prompt(2)
        four_frame = build_multi_frame_prompt(4)
        assert two_frame != four_frame
        assert "Frame 1" in two_frame and "Frame 2" in two_frame
        assert "Frame 3" not in two_frame
        assert "Frame 4" in four_frame

    def test_too_few_frames_raises(self):
        import pytest

        for bad_count in (1, 0):
            with pytest.raises(ValueError):
                build_multi_frame_prompt(bad_count)