LucasLooTan committed on
Commit
7f3265d
·
1 Parent(s): 819f4c1

fix: composite alpha onto white before VLM round-trip

Browse files

Wikimedia SVG→PNG renders with a transparent background. A naive
.convert('RGB') turned every transparent pixel black, so the VLM saw
a solid black square and (correctly) said 'unknown'.

- New shared signbridge.imageio module with load_rgb() + array_to_rgb()
helpers that composite alpha onto white at every loader boundary.
- Wired into smoke_test, run_gold_set, backend (b64 decode), and the
vlm recognizer's _frame_to_data_url.
- Confirmed: ASL letter A from Wikimedia is now identified correctly
via Qwen3-VL-8B on HF Inference Providers, conf 0.85.

HF provider also added as a fallback to the composer client resolver,
so we can validate the pipeline end-to-end while waiting for AMD Dev
Cloud credit email.

68 tests still pass; ruff clean.

signbridge/backend.py CHANGED
@@ -15,17 +15,16 @@ Endpoints:
15
  from __future__ import annotations
16
 
17
  import base64
18
- import io
19
  import logging
20
  import os
21
 
22
  import numpy as np
23
  from fastapi import FastAPI, HTTPException
24
  from fastapi.responses import FileResponse
25
- from PIL import Image
26
  from pydantic import BaseModel, Field
27
 
28
  from signbridge.composer.sentence import compose_sentence
 
29
  from signbridge.recognizer.vlm import recognize_sign_from_frame
30
  from signbridge.voice.tts import synthesize_speech
31
 
@@ -69,7 +68,7 @@ def _decode_b64_image(b64: str) -> np.ndarray:
69
  if b64.startswith("data:"):
70
  b64 = b64.split(",", 1)[1]
71
  raw = base64.b64decode(b64)
72
- return np.asarray(Image.open(io.BytesIO(raw)).convert("RGB"))
73
  except Exception as exc: # noqa: BLE001
74
  raise HTTPException(status_code=400, detail=f"bad frame: {exc}") from exc
75
 
 
15
  from __future__ import annotations
16
 
17
  import base64
 
18
  import logging
19
  import os
20
 
21
  import numpy as np
22
  from fastapi import FastAPI, HTTPException
23
  from fastapi.responses import FileResponse
 
24
  from pydantic import BaseModel, Field
25
 
26
  from signbridge.composer.sentence import compose_sentence
27
+ from signbridge.imageio import load_rgb
28
  from signbridge.recognizer.vlm import recognize_sign_from_frame
29
  from signbridge.voice.tts import synthesize_speech
30
 
 
68
  if b64.startswith("data:"):
69
  b64 = b64.split(",", 1)[1]
70
  raw = base64.b64decode(b64)
71
+ return load_rgb(raw)
72
  except Exception as exc: # noqa: BLE001
73
  raise HTTPException(status_code=400, detail=f"bad frame: {exc}") from exc
74
 
signbridge/composer/sentence.py CHANGED
@@ -63,6 +63,22 @@ def _resolve_client() -> tuple[object | None, str]:
63
  "SIGNBRIDGE_COMPOSER_MODEL_OPENAI", "gpt-4o-mini"
64
  )
65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  logger.warning("unknown SIGNBRIDGE_PROVIDER=%r; using naive joiner.", provider)
67
  return None, composer_model
68
 
 
63
  "SIGNBRIDGE_COMPOSER_MODEL_OPENAI", "gpt-4o-mini"
64
  )
65
 
66
+ if provider == "hf":
67
+ api_key = os.getenv("HF_TOKEN", "")
68
+ if not api_key:
69
+ logger.info("HF_TOKEN not set; falling back to naive joiner.")
70
+ return None, composer_model
71
+ return (
72
+ OpenAI(
73
+ base_url=os.getenv(
74
+ "HF_INFERENCE_BASE_URL",
75
+ "https://router.huggingface.co/v1",
76
+ ),
77
+ api_key=api_key,
78
+ ),
79
+ composer_model,
80
+ )
81
+
82
  logger.warning("unknown SIGNBRIDGE_PROVIDER=%r; using naive joiner.", provider)
83
  return None, composer_model
84
 
signbridge/imageio.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Shared image-loading helpers.
2
+
3
+ Centralised so the recognizer, smoke test, gold-set harness, and backend
4
+ all behave the same way on alpha-channel images (e.g. SVG-rendered PNGs
5
+ with transparent backgrounds — those would otherwise come out solid black
6
+ after a naive `.convert("RGB")` and the VLM sees nothing).
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import io
12
+ from pathlib import Path
13
+
14
+ import numpy as np
15
+
16
+
17
def load_rgb(source: str | Path | bytes | io.IOBase) -> np.ndarray:
    """Decode *source* into an RGB ndarray, flattening any alpha onto white.

    Accepts a filesystem path (``str`` or ``Path``), raw encoded bytes, or
    any file-like object that PIL knows how to open.
    """
    from PIL import Image

    # Raw bytes need wrapping in a stream first; everything else —
    # paths and file-like objects — goes straight to Image.open.
    if isinstance(source, (bytes, bytearray)):
        opened = Image.open(io.BytesIO(bytes(source)))
    else:
        opened = Image.open(source)

    return _composite_to_rgb(opened)
33
+
34
+
35
def array_to_rgb(arr: np.ndarray) -> np.ndarray:
    """Coerce a decoded image array — (H,W), (H,W,3) or (H,W,4) — to RGB.

    RGBA input is composited onto a white background; grayscale input is
    expanded to three channels. Used at the recognizer's API boundary in
    case a caller hands us a pre-decoded RGBA array.

    Raises:
        ValueError: for any trailing-channel count other than 3 or 4.
    """
    from PIL import Image

    if arr.ndim == 2:
        # Grayscale: let PIL broadcast the single channel to RGB.
        return np.asarray(Image.fromarray(arr).convert("RGB"))

    channels = arr.shape[-1]
    if channels == 3:
        # Already RGB; just normalise the dtype.
        return arr.astype(np.uint8) if arr.dtype != np.uint8 else arr
    if channels == 4:
        # RGBA: flatten alpha onto white via the shared compositor.
        return _composite_to_rgb(Image.fromarray(arr, mode="RGBA"))

    raise ValueError(f"unsupported array shape for RGB conversion: {arr.shape}")
52
+
53
+
54
def _composite_to_rgb(img) -> np.ndarray:  # noqa: ANN001
    """Flatten any transparency in *img* onto a white background, as RGB pixels."""
    from PIL import Image

    if img.mode in ("RGBA", "LA"):
        canvas = Image.new("RGB", img.size, (255, 255, 255))
        # LA images have no named "A" channel, so take the last band there.
        mask = img.getchannel("A") if img.mode == "RGBA" else img.split()[-1]
        canvas.paste(img.convert("RGB"), mask=mask)
        return np.asarray(canvas)

    if img.mode == "P" and "transparency" in img.info:
        # Palette image with a transparent index — promote to RGBA so the
        # same white-composite path applies.
        rgba = img.convert("RGBA")
        canvas = Image.new("RGB", rgba.size, (255, 255, 255))
        canvas.paste(rgba, mask=rgba.getchannel("A"))
        return np.asarray(canvas)

    if img.mode != "RGB":
        img = img.convert("RGB")
    return np.asarray(img)
signbridge/recognizer/vlm.py CHANGED
@@ -90,15 +90,20 @@ def _resolve_client() -> tuple[object | None, str]:
90
  if not api_key:
91
  logger.info("HF_TOKEN not set; recognizer in stub mode.")
92
  return None, DEFAULT_VLM_MODEL
 
 
93
  return (
94
  OpenAI(
95
  base_url=os.getenv(
96
  "HF_INFERENCE_BASE_URL",
97
- "https://api-inference.huggingface.co/v1",
98
  ),
99
  api_key=api_key,
100
  ),
101
- DEFAULT_VLM_MODEL,
 
 
 
102
  )
103
 
104
  logger.warning("unknown SIGNBRIDGE_PROVIDER=%r; recognizer in stub mode.", provider)
@@ -108,7 +113,10 @@ def _resolve_client() -> tuple[object | None, str]:
108
  def _frame_to_data_url(frame: np.ndarray) -> str:
109
  from PIL import Image
110
 
111
- img = Image.fromarray(frame)
 
 
 
112
  buf = io.BytesIO()
113
  img.save(buf, format="JPEG", quality=85)
114
  b64 = base64.b64encode(buf.getvalue()).decode("ascii")
 
90
  if not api_key:
91
  logger.info("HF_TOKEN not set; recognizer in stub mode.")
92
  return None, DEFAULT_VLM_MODEL
93
+ # HF Inference Providers — OpenAI-compatible router serving Qwen2-VL,
94
+ # Llama-3.2-Vision, etc. via Together/Fireworks/Hyperbolic backends.
95
  return (
96
  OpenAI(
97
  base_url=os.getenv(
98
  "HF_INFERENCE_BASE_URL",
99
+ "https://router.huggingface.co/v1",
100
  ),
101
  api_key=api_key,
102
  ),
103
+ os.getenv(
104
+ "SIGNBRIDGE_VLM_MODEL_HF",
105
+ "meta-llama/Llama-3.2-11B-Vision-Instruct",
106
+ ),
107
  )
108
 
109
  logger.warning("unknown SIGNBRIDGE_PROVIDER=%r; recognizer in stub mode.", provider)
 
113
  def _frame_to_data_url(frame: np.ndarray) -> str:
114
  from PIL import Image
115
 
116
+ from signbridge.imageio import array_to_rgb
117
+
118
+ rgb = array_to_rgb(frame)
119
+ img = Image.fromarray(rgb)
120
  buf = io.BytesIO()
121
  img.save(buf, format="JPEG", quality=85)
122
  b64 = base64.b64encode(buf.getvalue()).decode("ascii")
signbridge/scripts/run_gold_set.py CHANGED
@@ -29,10 +29,9 @@ from collections import defaultdict
29
  from datetime import datetime, timezone
30
  from pathlib import Path
31
 
32
- import numpy as np
33
  from dotenv import load_dotenv
34
- from PIL import Image
35
 
 
36
  from signbridge.recognizer.vlm import recognize_sign_from_frame
37
 
38
  VALID_EXTS = {".jpg", ".jpeg", ".png", ".webp"}
@@ -87,7 +86,7 @@ def main() -> int:
87
  t_start = time.perf_counter()
88
  for expected, path in samples:
89
  per_class_total[expected] += 1
90
- img = np.asarray(Image.open(path).convert("RGB"))
91
  t0 = time.perf_counter()
92
  predicted, confidence = recognize_sign_from_frame(img)
93
  dt_ms = (time.perf_counter() - t0) * 1000
 
29
  from datetime import datetime, timezone
30
  from pathlib import Path
31
 
 
32
  from dotenv import load_dotenv
 
33
 
34
+ from signbridge.imageio import load_rgb
35
  from signbridge.recognizer.vlm import recognize_sign_from_frame
36
 
37
  VALID_EXTS = {".jpg", ".jpeg", ".png", ".webp"}
 
86
  t_start = time.perf_counter()
87
  for expected, path in samples:
88
  per_class_total[expected] += 1
89
+ img = load_rgb(path)
90
  t0 = time.perf_counter()
91
  predicted, confidence = recognize_sign_from_frame(img)
92
  dt_ms = (time.perf_counter() - t0) * 1000
signbridge/scripts/smoke_test.py CHANGED
@@ -24,6 +24,7 @@ from dotenv import load_dotenv
24
  from PIL import Image, ImageDraw
25
 
26
  from signbridge.composer.sentence import compose_sentence
 
27
  from signbridge.recognizer.vlm import recognize_sign_from_frame
28
  from signbridge.voice.tts import synthesize_speech
29
 
@@ -95,7 +96,7 @@ def main() -> int:
95
 
96
  _step("VLM recognizer (sign-frame → token)")
97
  if args.frame:
98
- img = np.asarray(Image.open(args.frame).convert("RGB"))
99
  print(f" using real frame: {args.frame} ({img.shape})")
100
  else:
101
  img = _make_synthetic_frame()
 
24
  from PIL import Image, ImageDraw
25
 
26
  from signbridge.composer.sentence import compose_sentence
27
+ from signbridge.imageio import load_rgb
28
  from signbridge.recognizer.vlm import recognize_sign_from_frame
29
  from signbridge.voice.tts import synthesize_speech
30
 
 
96
 
97
  _step("VLM recognizer (sign-frame → token)")
98
  if args.frame:
99
+ img = load_rgb(args.frame)
100
  print(f" using real frame: {args.frame} ({img.shape})")
101
  else:
102
  img = _make_synthetic_frame()