Spaces:
Build error
Build error
Commit ·
7f3265d
1
Parent(s): 819f4c1
fix: composite alpha onto white before VLM round-trip
Wikimedia SVG→PNG renders with a transparent background. A naive
.convert('RGB') turned every transparent pixel black, so the VLM saw
a solid black square and (correctly) said 'unknown'.
- New shared signbridge.imageio module with load_rgb() + array_to_rgb()
helpers that composite alpha onto white at every loader boundary.
- Wired into smoke_test, run_gold_set, backend (b64 decode), and the
vlm recognizer's _frame_to_data_url.
- Confirmed: ASL letter A from Wikimedia is now identified correctly
via Qwen3-VL-8B on HF Inference Providers, conf 0.85.
HF provider also added as a fallback to the composer client resolver,
so we can validate the pipeline end-to-end while waiting for AMD Dev
Cloud credit email.
68 tests still pass; ruff clean.
- signbridge/backend.py +2 -3
- signbridge/composer/sentence.py +16 -0
- signbridge/imageio.py +70 -0
- signbridge/recognizer/vlm.py +11 -3
- signbridge/scripts/run_gold_set.py +2 -3
- signbridge/scripts/smoke_test.py +2 -1
signbridge/backend.py
CHANGED
|
@@ -15,17 +15,16 @@ Endpoints:
|
|
| 15 |
from __future__ import annotations
|
| 16 |
|
| 17 |
import base64
|
| 18 |
-
import io
|
| 19 |
import logging
|
| 20 |
import os
|
| 21 |
|
| 22 |
import numpy as np
|
| 23 |
from fastapi import FastAPI, HTTPException
|
| 24 |
from fastapi.responses import FileResponse
|
| 25 |
-
from PIL import Image
|
| 26 |
from pydantic import BaseModel, Field
|
| 27 |
|
| 28 |
from signbridge.composer.sentence import compose_sentence
|
|
|
|
| 29 |
from signbridge.recognizer.vlm import recognize_sign_from_frame
|
| 30 |
from signbridge.voice.tts import synthesize_speech
|
| 31 |
|
|
@@ -69,7 +68,7 @@ def _decode_b64_image(b64: str) -> np.ndarray:
|
|
| 69 |
if b64.startswith("data:"):
|
| 70 |
b64 = b64.split(",", 1)[1]
|
| 71 |
raw = base64.b64decode(b64)
|
| 72 |
-
return
|
| 73 |
except Exception as exc: # noqa: BLE001
|
| 74 |
raise HTTPException(status_code=400, detail=f"bad frame: {exc}") from exc
|
| 75 |
|
|
|
|
| 15 |
from __future__ import annotations
|
| 16 |
|
| 17 |
import base64
|
|
|
|
| 18 |
import logging
|
| 19 |
import os
|
| 20 |
|
| 21 |
import numpy as np
|
| 22 |
from fastapi import FastAPI, HTTPException
|
| 23 |
from fastapi.responses import FileResponse
|
|
|
|
| 24 |
from pydantic import BaseModel, Field
|
| 25 |
|
| 26 |
from signbridge.composer.sentence import compose_sentence
|
| 27 |
+
from signbridge.imageio import load_rgb
|
| 28 |
from signbridge.recognizer.vlm import recognize_sign_from_frame
|
| 29 |
from signbridge.voice.tts import synthesize_speech
|
| 30 |
|
|
|
|
| 68 |
if b64.startswith("data:"):
|
| 69 |
b64 = b64.split(",", 1)[1]
|
| 70 |
raw = base64.b64decode(b64)
|
| 71 |
+
return load_rgb(raw)
|
| 72 |
except Exception as exc: # noqa: BLE001
|
| 73 |
raise HTTPException(status_code=400, detail=f"bad frame: {exc}") from exc
|
| 74 |
|
signbridge/composer/sentence.py
CHANGED
|
@@ -63,6 +63,22 @@ def _resolve_client() -> tuple[object | None, str]:
|
|
| 63 |
"SIGNBRIDGE_COMPOSER_MODEL_OPENAI", "gpt-4o-mini"
|
| 64 |
)
|
| 65 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
logger.warning("unknown SIGNBRIDGE_PROVIDER=%r; using naive joiner.", provider)
|
| 67 |
return None, composer_model
|
| 68 |
|
|
|
|
| 63 |
"SIGNBRIDGE_COMPOSER_MODEL_OPENAI", "gpt-4o-mini"
|
| 64 |
)
|
| 65 |
|
| 66 |
+
if provider == "hf":
|
| 67 |
+
api_key = os.getenv("HF_TOKEN", "")
|
| 68 |
+
if not api_key:
|
| 69 |
+
logger.info("HF_TOKEN not set; falling back to naive joiner.")
|
| 70 |
+
return None, composer_model
|
| 71 |
+
return (
|
| 72 |
+
OpenAI(
|
| 73 |
+
base_url=os.getenv(
|
| 74 |
+
"HF_INFERENCE_BASE_URL",
|
| 75 |
+
"https://router.huggingface.co/v1",
|
| 76 |
+
),
|
| 77 |
+
api_key=api_key,
|
| 78 |
+
),
|
| 79 |
+
composer_model,
|
| 80 |
+
)
|
| 81 |
+
|
| 82 |
logger.warning("unknown SIGNBRIDGE_PROVIDER=%r; using naive joiner.", provider)
|
| 83 |
return None, composer_model
|
| 84 |
|
signbridge/imageio.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Shared image-loading helpers.
|
| 2 |
+
|
| 3 |
+
Centralised so the recognizer, smoke test, gold-set harness, and backend
|
| 4 |
+
all behave the same way on alpha-channel images (e.g. SVG-rendered PNGs
|
| 5 |
+
with transparent backgrounds — those would otherwise come out solid black
|
| 6 |
+
after a naive `.convert("RGB")` and the VLM sees nothing).
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
import io
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
|
| 14 |
+
import numpy as np
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def load_rgb(source: str | Path | bytes | io.IOBase) -> np.ndarray:
    """Load an image as an RGB ndarray, compositing any alpha onto white.

    Accepts a filesystem path, raw bytes, or any file-like object PIL
    knows how to open.

    Args:
        source: path (str/Path), encoded image bytes, or a readable
            binary file-like object.

    Returns:
        An RGB ndarray with any transparency flattened onto white.
    """
    # Imported lazily so importing this module stays cheap and PIL is
    # only required when an image is actually loaded.
    from PIL import Image

    if isinstance(source, (str, Path)):
        # Use the context manager so the underlying file handle is closed
        # deterministically (the original left it open until GC).
        # np.asarray() inside _composite_to_rgb forces the lazy pixel
        # data to load before the file is closed, so this is safe.
        with Image.open(source) as img:
            return _composite_to_rgb(img)
    if isinstance(source, (bytes, bytearray)):
        img = Image.open(io.BytesIO(bytes(source)))
    else:
        # File-like object: the caller owns its lifetime, so do not close.
        img = Image.open(source)

    return _composite_to_rgb(img)
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def array_to_rgb(arr: np.ndarray) -> np.ndarray:
    """Normalise a pre-decoded image array to uint8 RGB, alpha flattened onto white.

    Used at the recognizer's API boundary in case a caller hands us a
    pre-decoded RGBA array.

    Raises:
        ValueError: if *arr* is neither 2-D grayscale nor (H, W, 3/4).
    """
    from PIL import Image

    # Grayscale: let PIL replicate the single channel into R, G, B.
    if arr.ndim == 2:
        return np.asarray(Image.fromarray(arr).convert("RGB"))

    channels = arr.shape[-1]
    if channels == 3:
        # Already RGB — only coerce the dtype when needed.
        # NOTE(review): astype(uint8) assumes values already span 0–255;
        # a 0–1 float array would need rescaling — confirm callers.
        return arr.astype(np.uint8) if arr.dtype != np.uint8 else arr
    if channels == 4:
        # RGBA: composite onto a white background via the shared helper.
        return _composite_to_rgb(Image.fromarray(arr, mode="RGBA"))

    raise ValueError(f"unsupported array shape for RGB conversion: {arr.shape}")
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def _composite_to_rgb(img) -> np.ndarray:  # noqa: ANN001
    """Return *img* as an RGB ndarray with any transparency flattened onto white."""
    from PIL import Image

    if img.mode in ("RGBA", "LA"):
        # Paste the colour channels over a white canvas, masked by alpha.
        mask = img.getchannel("A") if img.mode == "RGBA" else img.split()[-1]
        canvas = Image.new("RGB", img.size, (255, 255, 255))
        canvas.paste(img.convert("RGB"), mask=mask)
        return np.asarray(canvas)

    if img.mode == "P" and "transparency" in img.info:
        # Palette image with transparent index — also composite.
        rgba = img.convert("RGBA")
        canvas = Image.new("RGB", rgba.size, (255, 255, 255))
        canvas.paste(rgba, mask=rgba.getchannel("A"))
        return np.asarray(canvas)

    return np.asarray(img if img.mode == "RGB" else img.convert("RGB"))
|
signbridge/recognizer/vlm.py
CHANGED
|
@@ -90,15 +90,20 @@ def _resolve_client() -> tuple[object | None, str]:
|
|
| 90 |
if not api_key:
|
| 91 |
logger.info("HF_TOKEN not set; recognizer in stub mode.")
|
| 92 |
return None, DEFAULT_VLM_MODEL
|
|
|
|
|
|
|
| 93 |
return (
|
| 94 |
OpenAI(
|
| 95 |
base_url=os.getenv(
|
| 96 |
"HF_INFERENCE_BASE_URL",
|
| 97 |
-
"https://
|
| 98 |
),
|
| 99 |
api_key=api_key,
|
| 100 |
),
|
| 101 |
-
|
|
|
|
|
|
|
|
|
|
| 102 |
)
|
| 103 |
|
| 104 |
logger.warning("unknown SIGNBRIDGE_PROVIDER=%r; recognizer in stub mode.", provider)
|
|
@@ -108,7 +113,10 @@ def _resolve_client() -> tuple[object | None, str]:
|
|
| 108 |
def _frame_to_data_url(frame: np.ndarray) -> str:
|
| 109 |
from PIL import Image
|
| 110 |
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
| 112 |
buf = io.BytesIO()
|
| 113 |
img.save(buf, format="JPEG", quality=85)
|
| 114 |
b64 = base64.b64encode(buf.getvalue()).decode("ascii")
|
|
|
|
| 90 |
if not api_key:
|
| 91 |
logger.info("HF_TOKEN not set; recognizer in stub mode.")
|
| 92 |
return None, DEFAULT_VLM_MODEL
|
| 93 |
+
# HF Inference Providers — OpenAI-compatible router serving Qwen2-VL,
|
| 94 |
+
# Llama-3.2-Vision, etc. via Together/Fireworks/Hyperbolic backends.
|
| 95 |
return (
|
| 96 |
OpenAI(
|
| 97 |
base_url=os.getenv(
|
| 98 |
"HF_INFERENCE_BASE_URL",
|
| 99 |
+
"https://router.huggingface.co/v1",
|
| 100 |
),
|
| 101 |
api_key=api_key,
|
| 102 |
),
|
| 103 |
+
os.getenv(
|
| 104 |
+
"SIGNBRIDGE_VLM_MODEL_HF",
|
| 105 |
+
"meta-llama/Llama-3.2-11B-Vision-Instruct",
|
| 106 |
+
),
|
| 107 |
)
|
| 108 |
|
| 109 |
logger.warning("unknown SIGNBRIDGE_PROVIDER=%r; recognizer in stub mode.", provider)
|
|
|
|
| 113 |
def _frame_to_data_url(frame: np.ndarray) -> str:
|
| 114 |
from PIL import Image
|
| 115 |
|
| 116 |
+
from signbridge.imageio import array_to_rgb
|
| 117 |
+
|
| 118 |
+
rgb = array_to_rgb(frame)
|
| 119 |
+
img = Image.fromarray(rgb)
|
| 120 |
buf = io.BytesIO()
|
| 121 |
img.save(buf, format="JPEG", quality=85)
|
| 122 |
b64 = base64.b64encode(buf.getvalue()).decode("ascii")
|
signbridge/scripts/run_gold_set.py
CHANGED
|
@@ -29,10 +29,9 @@ from collections import defaultdict
|
|
| 29 |
from datetime import datetime, timezone
|
| 30 |
from pathlib import Path
|
| 31 |
|
| 32 |
-
import numpy as np
|
| 33 |
from dotenv import load_dotenv
|
| 34 |
-
from PIL import Image
|
| 35 |
|
|
|
|
| 36 |
from signbridge.recognizer.vlm import recognize_sign_from_frame
|
| 37 |
|
| 38 |
VALID_EXTS = {".jpg", ".jpeg", ".png", ".webp"}
|
|
@@ -87,7 +86,7 @@ def main() -> int:
|
|
| 87 |
t_start = time.perf_counter()
|
| 88 |
for expected, path in samples:
|
| 89 |
per_class_total[expected] += 1
|
| 90 |
-
img =
|
| 91 |
t0 = time.perf_counter()
|
| 92 |
predicted, confidence = recognize_sign_from_frame(img)
|
| 93 |
dt_ms = (time.perf_counter() - t0) * 1000
|
|
|
|
| 29 |
from datetime import datetime, timezone
|
| 30 |
from pathlib import Path
|
| 31 |
|
|
|
|
| 32 |
from dotenv import load_dotenv
|
|
|
|
| 33 |
|
| 34 |
+
from signbridge.imageio import load_rgb
|
| 35 |
from signbridge.recognizer.vlm import recognize_sign_from_frame
|
| 36 |
|
| 37 |
VALID_EXTS = {".jpg", ".jpeg", ".png", ".webp"}
|
|
|
|
| 86 |
t_start = time.perf_counter()
|
| 87 |
for expected, path in samples:
|
| 88 |
per_class_total[expected] += 1
|
| 89 |
+
img = load_rgb(path)
|
| 90 |
t0 = time.perf_counter()
|
| 91 |
predicted, confidence = recognize_sign_from_frame(img)
|
| 92 |
dt_ms = (time.perf_counter() - t0) * 1000
|
signbridge/scripts/smoke_test.py
CHANGED
|
@@ -24,6 +24,7 @@ from dotenv import load_dotenv
|
|
| 24 |
from PIL import Image, ImageDraw
|
| 25 |
|
| 26 |
from signbridge.composer.sentence import compose_sentence
|
|
|
|
| 27 |
from signbridge.recognizer.vlm import recognize_sign_from_frame
|
| 28 |
from signbridge.voice.tts import synthesize_speech
|
| 29 |
|
|
@@ -95,7 +96,7 @@ def main() -> int:
|
|
| 95 |
|
| 96 |
_step("VLM recognizer (sign-frame → token)")
|
| 97 |
if args.frame:
|
| 98 |
-
img =
|
| 99 |
print(f" using real frame: {args.frame} ({img.shape})")
|
| 100 |
else:
|
| 101 |
img = _make_synthetic_frame()
|
|
|
|
| 24 |
from PIL import Image, ImageDraw
|
| 25 |
|
| 26 |
from signbridge.composer.sentence import compose_sentence
|
| 27 |
+
from signbridge.imageio import load_rgb
|
| 28 |
from signbridge.recognizer.vlm import recognize_sign_from_frame
|
| 29 |
from signbridge.voice.tts import synthesize_speech
|
| 30 |
|
|
|
|
| 96 |
|
| 97 |
_step("VLM recognizer (sign-frame → token)")
|
| 98 |
if args.frame:
|
| 99 |
+
img = load_rgb(args.frame)
|
| 100 |
print(f" using real frame: {args.frame} ({img.shape})")
|
| 101 |
else:
|
| 102 |
img = _make_synthetic_frame()
|