Spaces:
Build error
Build error
Commit ·
4838599
1
Parent(s): d20d5a9
feat(prompts): NVIDIA-pattern single-frame + multi-frame VLM prompts
Browse filesPer the deep-research finding (NVIDIA Vision-Language-Model Prompt
Engineering Guide, March 2025): closed-vocabulary forcing + domain
priming + sequential frame markers. Builds ASL-specific prompts that
explicitly distinguish static fingerspelling from motion-dependent
lexical signs.
- Single-frame builder (build_single_frame_prompt) replaces the existing
inline _PROMPT in vlm.py. Adds the ASL domain-priming preface; same
closed-vocabulary forcing and output contract as before.
- Multi-frame builder (build_multi_frame_prompt) generates 'Frame 1:
... Frame N: ...' markers so the VLM treats the inputs as a temporal
sequence. Used by the upcoming Hold-to-record burst-capture pipeline
in Task 4.
- signbridge/recognizer/prompts.py +71 -0
- signbridge/recognizer/vlm.py +2 -17
- tests/test_prompts.py +66 -0
signbridge/recognizer/prompts.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""VLM prompt builders for SignBridge.
|
| 2 |
+
|
| 3 |
+
Two builders share a common closed-vocabulary block and a common output
|
| 4 |
+
contract ("ONE token only, no explanation"). Single-frame and multi-frame
|
| 5 |
+
prompts diverge only in the framing — multi-frame adds explicit frame
|
| 6 |
+
markers and a 'motion across frames' priming line.
|
| 7 |
+
|
| 8 |
+
Patterns from NVIDIA's Vision-Language-Model Prompt Engineering Guide
|
| 9 |
+
(March 2025): closed-vocabulary forcing, domain priming preface,
|
| 10 |
+
sequential frame markers for temporal content.
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
from __future__ import annotations
|
| 14 |
+
|
| 15 |
+
from signbridge.vocab import VOCAB_PROMPT_LITERAL
|
| 16 |
+
|
| 17 |
+
_DOMAIN_PRIME = (
|
| 18 |
+
"American Sign Language (ASL) is a visual language with one-handed "
|
| 19 |
+
"and two-handed signs. Fingerspelled letters (A-Z) and numbers (0-9) "
|
| 20 |
+
"are typically held STATIC for ~0.5 seconds. Lexical signs (hello, "
|
| 21 |
+
"thank_you, please, eat, drink, ...) are typically MOTION over ~1-2 "
|
| 22 |
+
"seconds — the meaning is in the movement, not a single frame."
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
# Output contract shared by both prompt builders: closed-vocabulary forcing
# (the literal token list) plus a strict one-token reply format.
_OUTPUT_CONTRACT = (
    "Reply with EXACTLY ONE token from this list, no other text, no "
    "quotes, no explanation:\n"
    + VOCAB_PROMPT_LITERAL
    + "\n\nRules:\n"
    + "\n".join(
        (
            "- Single uppercase letter (A-Z) for fingerspelling letters.",
            "- Single digit (0-9) for fingerspelled numbers.",
            "- Lowercase word with underscores for full-word signs (e.g. thank_you).",
            "- 'unknown' if no sign is visible or the gesture isn't in the list.",
            "- Do NOT explain. Do NOT add punctuation. Single token only.",
        )
    )
)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def build_single_frame_prompt() -> str:
    """Prompt for single-image sign recognition (fingerspelled letters etc.)."""
    # Domain priming, then the task instruction, then the shared output
    # contract — separated by blank lines, same layout the VLM was tuned on.
    instruction = (
        "Look at this image of a single signed gesture and identify which "
        "ASL sign or fingerspelled letter is shown."
    )
    return "\n\n".join((_DOMAIN_PRIME, instruction, _OUTPUT_CONTRACT))
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def build_multi_frame_prompt(n_frames: int) -> str:
    """Prompt for multi-image sign recognition (motion-dependent signs).

    Generates explicit 'Frame 1: ... Frame N: ...' markers so the VLM treats
    the inputs as a temporal sequence rather than independent samples.

    Raises:
        ValueError: if ``n_frames`` is below 2 — a "sequence" of one frame
            should use :func:`build_single_frame_prompt` instead.
    """
    if n_frames < 2:
        raise ValueError(
            f"multi-frame prompt requires n_frames >= 2, got {n_frames}"
        )
    # One marker line per frame, keeping the VLM aware of temporal order.
    marker_lines = []
    for idx in range(1, n_frames + 1):
        marker_lines.append(f"- Frame {idx}: image {idx} of {n_frames}")
    temporal_priming = (
        f"You are about to see {n_frames} images captured in temporal order, "
        "spaced evenly over a 1.5-second window. The images are frames from "
        "a single gesture. The meaning is the MOTION across the frames, not "
        "any single image."
    )
    task_instruction = (
        "Identify the ASL sign or fingerspelled letter being made across "
        "this sequence."
    )
    sections = (
        _DOMAIN_PRIME,
        temporal_priming,
        "\n".join(marker_lines),
        task_instruction,
        _OUTPUT_CONTRACT,
    )
    return "\n\n".join(sections)
|
signbridge/recognizer/vlm.py
CHANGED
|
@@ -29,28 +29,13 @@ import numpy as np
|
|
| 29 |
# Closed vocabulary the VLM is asked to choose from. Imported from the
|
| 30 |
# shared `signbridge.vocab` module so the recognizer and the trained
|
| 31 |
# classifier (`signbridge.recognizer.classifier`) can never drift.
|
| 32 |
-
from signbridge.vocab import VOCAB_PROMPT_LITERAL as _VLM_VOCAB
|
| 33 |
from signbridge.vocab import VOCAB_SET as _VLM_VOCAB_SET
|
| 34 |
|
| 35 |
logger = logging.getLogger(__name__)
|
| 36 |
|
| 37 |
DEFAULT_VLM_MODEL = os.getenv("SIGNBRIDGE_VLM_MODEL", "Qwen/Qwen2-VL-7B-Instruct")
|
| 38 |
|
| 39 |
-
_PROMPT = (
|
| 40 |
-
"You are an expert in American Sign Language (ASL). Look at this image of a "
|
| 41 |
-
"single signed gesture. Identify which ASL sign or fingerspelled letter is "
|
| 42 |
-
"being shown.\n\n"
|
| 43 |
-
"Reply with EXACTLY ONE token from this list, no other text, no quotes, no "
|
| 44 |
-
"explanation:\n"
|
| 45 |
-
f"{_VLM_VOCAB}\n\n"
|
| 46 |
-
"Rules:\n"
|
| 47 |
-
"- Single uppercase letter (A-Z) for fingerspelling letters.\n"
|
| 48 |
-
"- Single digit (0-9) for fingerspelled numbers.\n"
|
| 49 |
-
"- Lowercase word with underscores for full-word signs (e.g. thank_you).\n"
|
| 50 |
-
"- 'unknown' if no sign is visible or the gesture isn't in the list.\n"
|
| 51 |
-
"- Do NOT explain. Do NOT add punctuation. Single token only."
|
| 52 |
-
)
|
| 53 |
-
|
| 54 |
|
| 55 |
@lru_cache(maxsize=4)
|
| 56 |
def _build_client(provider: str, base_url: str, api_key: str, model: str) -> tuple[object | None, str]:
|
|
@@ -151,7 +136,7 @@ def recognize_sign_from_frame(frame: np.ndarray) -> tuple[str, float]:
|
|
| 151 |
{
|
| 152 |
"role": "user",
|
| 153 |
"content": [
|
| 154 |
-
{"type": "text", "text": _PROMPT},
|
| 155 |
{"type": "image_url", "image_url": {"url": data_url}},
|
| 156 |
],
|
| 157 |
}
|
|
|
|
| 29 |
# Closed vocabulary the VLM is asked to choose from. Imported from the
|
| 30 |
# shared `signbridge.vocab` module so the recognizer and the trained
|
| 31 |
# classifier (`signbridge.recognizer.classifier`) can never drift.
|
| 32 |
+
from signbridge.recognizer.prompts import build_single_frame_prompt
|
| 33 |
from signbridge.vocab import VOCAB_SET as _VLM_VOCAB_SET
|
| 34 |
|
| 35 |
logger = logging.getLogger(__name__)
|
| 36 |
|
| 37 |
DEFAULT_VLM_MODEL = os.getenv("SIGNBRIDGE_VLM_MODEL", "Qwen/Qwen2-VL-7B-Instruct")
|
| 38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
@lru_cache(maxsize=4)
|
| 41 |
def _build_client(provider: str, base_url: str, api_key: str, model: str) -> tuple[object | None, str]:
|
|
|
|
| 136 |
{
|
| 137 |
"role": "user",
|
| 138 |
"content": [
|
| 139 |
+
{"type": "text", "text": build_single_frame_prompt()},
|
| 140 |
{"type": "image_url", "image_url": {"url": data_url}},
|
| 141 |
],
|
| 142 |
}
|
tests/test_prompts.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for the prompt-builder module."""
|
| 2 |
+
|
| 3 |
+
from signbridge.recognizer.prompts import (
|
| 4 |
+
build_single_frame_prompt,
|
| 5 |
+
build_multi_frame_prompt,
|
| 6 |
+
)
|
| 7 |
+
from signbridge.vocab import VOCAB_SET # noqa: F401 — reserved for future vocab-coverage tests
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class TestSingleFramePrompt:
    """Contract checks for the single-frame prompt builder."""

    def test_returns_string(self):
        result = build_single_frame_prompt()
        assert isinstance(result, str)

    def test_mentions_asl(self):
        result = build_single_frame_prompt()
        assert "ASL" in result

    def test_includes_vocab_tokens(self):
        # Spot-check boundary tokens from each vocab category: letters,
        # digits, lexical signs, and the 'unknown' fallback.
        result = build_single_frame_prompt()
        for token in ("A", "Z", "0", "9", "hello", "thank_you", "unknown"):
            assert token in result, f"missing {token!r}"

    def test_demands_single_token_output(self):
        result = build_single_frame_prompt()
        phrasings = ("Single token only", "single token", "ONE token")
        assert any(phrase in result for phrase in phrasings)

    def test_includes_domain_priming(self):
        result = build_single_frame_prompt()
        assert "ASL" in result
        variants = ("fingerspelled", "fingerspelling")
        assert any(variant in result for variant in variants)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class TestMultiFramePrompt:
    """Contract checks for the multi-frame (temporal) prompt builder."""

    def test_returns_string(self):
        result = build_multi_frame_prompt(4)
        assert isinstance(result, str)

    def test_includes_frame_markers(self):
        result = build_multi_frame_prompt(4)
        for i in (1, 2, 3, 4):
            assert f"Frame {i}" in result, f"missing 'Frame {i}' marker"

    def test_mentions_motion(self):
        lowered = build_multi_frame_prompt(4).lower()
        assert "motion" in lowered or "across" in lowered

    def test_demands_single_token_output(self):
        result = build_multi_frame_prompt(4)
        phrasings = ("Single token only", "single token", "ONE token")
        assert any(phrase in result for phrase in phrasings)

    def test_includes_vocab(self):
        result = build_multi_frame_prompt(4)
        for token in ("A", "hello", "unknown"):
            assert token in result

    def test_n_frames_parameter(self):
        # The frame-marker section must track n_frames exactly.
        two_frames = build_multi_frame_prompt(2)
        four_frames = build_multi_frame_prompt(4)
        assert two_frames != four_frames
        assert "Frame 1" in two_frames
        assert "Frame 2" in two_frames
        assert "Frame 3" not in two_frames
        assert "Frame 4" in four_frames

    def test_too_few_frames_raises(self):
        import pytest

        with pytest.raises(ValueError):
            build_multi_frame_prompt(1)
        with pytest.raises(ValueError):
            build_multi_frame_prompt(0)
|