LucasLooTan committed
Commit 819f4c1 · 1 Parent(s): 3a0014b

feat: end-to-end smoke test + gold-set accuracy harness


- signbridge.scripts.smoke_test: runs recognizer + composer + TTS once,
reports per-stage status. Provider-agnostic (amd/openai/hf/none).
Confirms composer + TTS work even without API keys.
- signbridge.scripts.run_gold_set: scans tests/golden/<token>/*.{jpg,png}
and reports per-class + overall accuracy. Writes timestamped CSV.
Exits non-zero if overall < 75% (the V1 success criterion).

Both ready to run the moment AMD Dev Cloud credentials land in .env.
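
For reference, a minimal .env sketch using the variable names the smoke test reads (the endpoint and key values below are placeholders, not real credentials):

    SIGNBRIDGE_PROVIDER=amd
    AMD_DEV_CLOUD_BASE_URL=https://<your-amd-dev-cloud-endpoint>
    AMD_DEV_CLOUD_API_KEY=<your-api-key>
    # alternatively: SIGNBRIDGE_PROVIDER=openai with OPENAI_API_KEY set,
    # or SIGNBRIDGE_PROVIDER=hf with HF_TOKEN set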

signbridge/scripts/run_gold_set.py ADDED
@@ -0,0 +1,141 @@
+"""Accuracy harness: run the recognizer over a labelled sample folder.
+
+Folder layout expected:
+    tests/golden/
+        A/<any>.jpg|png      → expected token "A"
+        B/<any>.jpg|png      → expected token "B"
+        ...
+        hello/<any>.jpg|png  → expected token "hello"
+
+Each subdirectory name is the expected token. Every image inside is a sample.
+
+Output:
+    - per-class accuracy (correct / total)
+    - overall accuracy
+    - a CSV at tests/golden/results-<timestamp>.csv
+
+Usage:
+    python -m signbridge.scripts.run_gold_set
+    python -m signbridge.scripts.run_gold_set --root tests/golden
+"""
+
+from __future__ import annotations
+
+import argparse
+import csv
+import sys
+import time
+from collections import defaultdict
+from datetime import datetime, timezone
+from pathlib import Path
+
+import numpy as np
+from dotenv import load_dotenv
+from PIL import Image
+
+from signbridge.recognizer.vlm import recognize_sign_from_frame
+
+VALID_EXTS = {".jpg", ".jpeg", ".png", ".webp"}
+
+
+def _iter_samples(root: Path):
+    for cls_dir in sorted(p for p in root.iterdir() if p.is_dir()):
+        cls = cls_dir.name
+        for img_path in sorted(cls_dir.iterdir()):
+            if img_path.suffix.lower() in VALID_EXTS:
+                yield cls, img_path
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description="SignBridge accuracy harness")
+    parser.add_argument(
+        "--root",
+        type=Path,
+        default=Path("tests/golden"),
+        help="Root folder with one subdirectory per expected token",
+    )
+    parser.add_argument(
+        "--output",
+        type=Path,
+        default=None,
+        help="CSV output path (defaults to tests/golden/results-<ts>.csv)",
+    )
+    args = parser.parse_args()
+
+    load_dotenv()
+
+    if not args.root.exists() or not args.root.is_dir():
+        print(f"error: {args.root} not found or not a directory", file=sys.stderr)
+        print("create it with subdirectories named after expected tokens, e.g.:", file=sys.stderr)
+        print("  tests/golden/A/sample1.jpg", file=sys.stderr)
+        print("  tests/golden/hello/sample2.png", file=sys.stderr)
+        return 2
+
+    samples = list(_iter_samples(args.root))
+    if not samples:
+        print(f"no images found under {args.root}", file=sys.stderr)
+        return 2
+
+    out_path = args.output or args.root / f"results-{datetime.now(timezone.utc):%Y%m%dT%H%M%SZ}.csv"
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+
+    per_class_correct: dict[str, int] = defaultdict(int)
+    per_class_total: dict[str, int] = defaultdict(int)
+    rows: list[dict[str, str]] = []
+
+    print(f"running {len(samples)} samples against the configured provider…")
+    t_start = time.perf_counter()
+    for expected, path in samples:
+        per_class_total[expected] += 1
+        img = np.asarray(Image.open(path).convert("RGB"))
+        t0 = time.perf_counter()
+        predicted, confidence = recognize_sign_from_frame(img)
+        dt_ms = (time.perf_counter() - t0) * 1000
+        ok = predicted == expected
+        if ok:
+            per_class_correct[expected] += 1
+        rows.append(
+            {
+                "path": str(path),
+                "expected": expected,
+                "predicted": predicted,
+                "confidence": f"{confidence:.2f}",
+                "latency_ms": f"{dt_ms:.0f}",
+                "correct": "1" if ok else "0",
+            }
+        )
+        print(
+            f"  [{'✓' if ok else '✗'}] {expected:<10} → {predicted!r:<12} "
+            f"conf={confidence:.2f} {dt_ms:.0f}ms ({path.name})"
+        )
+
+    total_correct = sum(per_class_correct.values())
+    total = sum(per_class_total.values())
+    overall = total_correct / total if total else 0.0
+    elapsed = time.perf_counter() - t_start
+
+    print()
+    print("Per-class accuracy:")
+    for cls in sorted(per_class_total):
+        c = per_class_correct[cls]
+        n = per_class_total[cls]
+        print(f"  {cls:<12} {c}/{n} ({(c / n) * 100:.0f}%)" if n else f"  {cls:<12} 0/0")
+    print()
+    print(f"Overall: {total_correct}/{total} ({overall * 100:.1f}%)")
+    print(f"Total wall time: {elapsed:.1f}s (avg {(elapsed / total) * 1000:.0f}ms per sample)")
+
+    with out_path.open("w", newline="") as fh:
+        writer = csv.DictWriter(
+            fh,
+            fieldnames=["path", "expected", "predicted", "confidence", "latency_ms", "correct"],
+        )
+        writer.writeheader()
+        writer.writerows(rows)
+    print(f"\nCSV written to {out_path}")
+
+    # Exit non-zero if accuracy is below the V1 success criterion (75%).
+    return 0 if overall >= 0.75 else 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
signbridge/scripts/smoke_test.py ADDED
@@ -0,0 +1,147 @@
+"""End-to-end smoke test for the SignBridge inference path.
+
+Run AFTER you've filled in .env with provider credentials. Exercises:
+    - the provider configuration printout (which credentials are set)
+    - the VLM recognizer with a synthetic frame
+    - the LLM composer with a hand-crafted sign sequence
+    - the TTS pipeline
+
+Usage:
+    python -m signbridge.scripts.smoke_test
+    SIGNBRIDGE_PROVIDER=openai python -m signbridge.scripts.smoke_test
+"""
+
+from __future__ import annotations
+
+import argparse
+import os
+import sys
+import time
+from pathlib import Path
+
+import numpy as np
+from dotenv import load_dotenv
+from PIL import Image, ImageDraw
+
+from signbridge.composer.sentence import compose_sentence
+from signbridge.recognizer.vlm import recognize_sign_from_frame
+from signbridge.voice.tts import synthesize_speech
+
+
+def _make_synthetic_frame() -> np.ndarray:
+    """Create a 256x256 RGB image with a stylised pose silhouette.
+
+    Real recognition needs an actual hand/sign image. This synthetic frame
+    only confirms the API plumbing works end-to-end; the prediction is
+    expected to be 'unknown' (the VLM returning 'unknown' is the right
+    answer for a stick figure).
+    """
+    img = Image.new("RGB", (256, 256), color=(245, 245, 245))
+    d = ImageDraw.Draw(img)
+    # Stick figure: head + body + arms in "A" sign pose
+    d.ellipse((110, 30, 146, 66), fill=(220, 180, 140), outline="black", width=2)
+    d.line((128, 66, 128, 160), fill="black", width=4)
+    d.line((128, 90, 95, 130), fill="black", width=4)
+    d.line((128, 90, 161, 130), fill="black", width=4)
+    d.ellipse((85, 120, 105, 140), fill=(220, 180, 140), outline="black", width=2)
+    d.ellipse((151, 120, 171, 140), fill=(220, 180, 140), outline="black", width=2)
+    d.text((90, 200), "synthetic test frame", fill="black")
+    return np.asarray(img)
+
+
+def _print_provider_info() -> None:
+    provider = os.getenv("SIGNBRIDGE_PROVIDER", "amd")
+    print(f"  provider = {provider}")
+    if provider == "amd":
+        base = os.getenv("AMD_DEV_CLOUD_BASE_URL", "")
+        key = os.getenv("AMD_DEV_CLOUD_API_KEY", "")
+        print(f"  AMD_DEV_CLOUD_BASE_URL = {base or '(unset)'}")
+        print(f"  AMD_DEV_CLOUD_API_KEY  = {'set (' + str(len(key)) + ' chars)' if key else '(unset)'}")
+    elif provider == "openai":
+        print(f"  OPENAI_API_KEY = {'set' if os.getenv('OPENAI_API_KEY') else '(unset)'}")
+    elif provider == "hf":
+        print(f"  HF_TOKEN = {'set' if os.getenv('HF_TOKEN') else '(unset)'}")
+
+
+def _step(label: str) -> None:
+    print(f"\n── {label} ──")
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description="SignBridge end-to-end smoke test")
+    parser.add_argument(
+        "--text",
+        default="My name is Lucas. Hello.",
+        help="Text to synthesise via TTS",
+    )
+    parser.add_argument(
+        "--signs",
+        nargs="+",
+        default=["hello", "name", "L", "U", "C", "A", "S"],
+        help="Sign sequence to compose into a sentence",
+    )
+    parser.add_argument(
+        "--frame",
+        type=Path,
+        default=None,
+        help="Path to a real sign image (PNG/JPG). Default = synthetic frame.",
+    )
+    args = parser.parse_args()
+
+    load_dotenv()
+
+    _step("Provider config")
+    _print_provider_info()
+
+    _step("VLM recognizer (sign-frame → token)")
+    if args.frame:
+        img = np.asarray(Image.open(args.frame).convert("RGB"))
+        print(f"  using real frame: {args.frame} ({img.shape})")
+    else:
+        img = _make_synthetic_frame()
+        print(f"  using synthetic frame ({img.shape})")
+        print("  (a synthetic stick figure is unlikely to match an ASL sign;")
+        print("   the expected outcome is the VLM returning 'unknown' or empty,")
+        print("   which proves the call worked even when accuracy can't be measured.)")
+    t0 = time.perf_counter()
+    token, conf = recognize_sign_from_frame(img)
+    dt = time.perf_counter() - t0
+    print(f"  → token={token!r} confidence={conf:.2f} latency={dt:.2f}s")
+
+    _step("LLM composer (sign tokens → English sentence)")
+    print(f"  input signs: {args.signs}")
+    t0 = time.perf_counter()
+    sentence = compose_sentence(args.signs)
+    dt = time.perf_counter() - t0
+    print(f"  → sentence = {sentence!r} ({dt:.2f}s)")
+
+    _step("TTS (text → audio)")
+    print(f"  input text: {args.text!r}")
+    t0 = time.perf_counter()
+    audio_path = synthesize_speech(args.text)
+    dt = time.perf_counter() - t0
+    if audio_path:
+        size = Path(audio_path).stat().st_size
+        print(f"  → wrote {audio_path} ({size:,} bytes, {dt:.2f}s)")
+    else:
+        print("  → no audio (TTS unavailable)")
+
+    _step("Summary")
+    ok_recognize = bool(token)
+    ok_compose = bool(sentence)
+    ok_tts = bool(audio_path)
+    flags = {
+        "recognizer": "✓" if ok_recognize else "- (provider may be in stub mode; check creds)",
+        "composer": "✓" if ok_compose else "✗ composer failed",
+        "tts": "✓" if ok_tts else "✗ TTS failed",
+    }
+    for k, v in flags.items():
+        print(f"  {k:<10} {v}")
+
+    # Compose + TTS MUST work even with no provider (naive joiner + silent stub).
+    # The recognizer needs a real provider.
+    return 0 if (ok_compose and ok_tts) else 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())