File size: 7,087 Bytes
18d028b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
"""Integration tests — exercise multi-step user flows end-to-end."""

from __future__ import annotations

import base64
import io

import numpy as np
import pytest
from fastapi.testclient import TestClient
from PIL import Image

from signbridge.backend import app
from signbridge.space import _capture_sign, _clear, _new_session, _speak


@pytest.fixture()
def client() -> TestClient:
    """Provide a ``TestClient`` wrapping the SignBridge backend ``app``."""
    http_client = TestClient(app)
    return http_client


def _frame(rgb: tuple[int, int, int] = (180, 200, 160), size: int = 96) -> np.ndarray:
    """Build a square, solid-colour ``uint8`` image of shape ``(size, size, 3)``.

    Every pixel carries the same ``rgb`` triple.
    """
    canvas = np.empty((size, size, 3), dtype=np.uint8)
    # Broadcast the colour triple over the whole canvas (same casting as np.full).
    np.copyto(canvas, rgb, casting="unsafe")
    return canvas


def _frame_b64(rgb: tuple[int, int, int] = (180, 200, 160), size: int = 96) -> str:
    """Return a solid-colour frame, JPEG-encoded at quality 80, as an ASCII base64 string."""
    jpeg_buffer = io.BytesIO()
    Image.fromarray(_frame(rgb, size)).save(jpeg_buffer, format="JPEG", quality=80)
    encoded = base64.b64encode(jpeg_buffer.getvalue())
    return encoded.decode("ascii")


class TestUserFlowFingerspell:
    """End-to-end flow: the user fingerspells L-U-C-A-S and then presses Speak."""

    def test_via_space_helpers(self, monkeypatch: pytest.MonkeyPatch) -> None:
        """Run the flow through the Space helpers with the VLM client stubbed out."""
        from signbridge.recognizer import vlm

        spelled = ["L", "U", "C", "A", "S"]
        # Every stubbed completion call hands out the next letter.
        pending = iter(spelled)

        # Minimal stand-ins exposing ``response.choices[0].message.content``.
        class _Message:
            def __init__(self, content: str) -> None:
                self.content = content

        class _Choice:
            def __init__(self, content: str) -> None:
                self.message = _Message(content)

        class _Resp:
            def __init__(self, c: str) -> None:
                self.choices = [_Choice(c)]

        def _create(**_: object) -> _Resp:
            return _Resp(next(pending))

        # Fake client reachable as ``client.chat.completions.create(...)``.
        class _Completions:
            create = staticmethod(_create)

        class _Chat:
            completions = _Completions

        class _FakeClient:
            chat = _Chat

        monkeypatch.setattr(vlm, "_resolve_client", lambda: (_FakeClient(), "test"))

        state = _new_session()
        for _ in spelled:
            _latest, _history, state = _capture_sign(_frame(), state)
        assert state.sign_history == ["L", "U", "C", "A", "S"]

        sentence, audio_path, state = _speak(state)
        # Assumes no composer API keys are configured here, so the naive joiner is used.
        assert "Lucas" in sentence
        # A (silent-stub) WAV path is expected to be returned.
        assert audio_path

    def test_via_backend_endpoints(self, client: TestClient) -> None:
        """Walk the same flow over HTTP: /recognize, then /compose, then /speak."""
        for _letter in "LUCAS":
            payload = {"frame": _frame_b64()}
            recognized = client.post("/recognize", json=payload)
            assert recognized.status_code == 200
            # Assumes no API keys are set: the token is empty yet the call succeeds.
            assert recognized.json()["token"] == ""

        # Compose from a manually curated sign sequence.
        composed = client.post("/compose", json={"signs": ["L", "U", "C", "A", "S"]})
        assert composed.status_code == 200
        assert "Lucas" in composed.json()["sentence"]

        # Speak a fixed sentence and expect a non-empty response body.
        spoken = client.post("/speak", json={"text": "My name is Lucas."})
        assert spoken.status_code == 200
        assert len(spoken.content) > 0


class TestClearResetsCleanly:
    """Capture signs, speak, then clear — the session must end up empty."""

    def test_full_round_trip(self, monkeypatch: pytest.MonkeyPatch) -> None:
        """Capture "hello" twice, speak, clear, and check every state field is reset."""
        from signbridge.recognizer import vlm

        # Minimal stand-ins exposing ``response.choices[0].message.content``.
        class _Message:
            content = "hello"

        class _Choice:
            message = _Message()

        class _Response:
            choices = [_Choice()]

        def _create(**_: object):
            return _Response()

        # Fake client reachable as ``client.chat.completions.create(...)``.
        class _Completions:
            create = staticmethod(_create)

        class _Chat:
            completions = _Completions

        class _FakeClient:
            chat = _Chat

        monkeypatch.setattr(vlm, "_resolve_client", lambda: (_FakeClient(), "test"))

        state = _new_session()
        for _ in range(2):
            _latest, _history, state = _capture_sign(_frame(), state)
        assert state.sign_history == ["hello", "hello"]

        sentence, audio, state = _speak(state)
        assert sentence
        assert audio

        cleared = _clear(state)
        latest, history, sentence_box, audio_out, state = cleared
        assert state.sign_history == []
        assert state.last_sentence == ""
        assert state.last_audio_path is None
        assert "no signs" in history.lower()


class TestEdgeCases:
    """Boundary and odd-input checks against the backend HTTP endpoints."""

    def test_huge_sign_sequence(self, client: TestClient) -> None:
        """200 fingerspelled letters must not crash /compose."""
        many_signs = list("ABCDEFGHIJ") * 20
        resp = client.post("/compose", json={"signs": many_signs})
        assert resp.status_code == 200
        # The composed sentence must be non-empty.
        assert resp.json()["sentence"]

    def test_unicode_in_compose(self, client: TestClient) -> None:
        """Synthetic non-ASCII tokens are accepted by /compose."""
        payload = {"signs": ["héllo", "wörld"]}
        resp = client.post("/compose", json=payload)
        assert resp.status_code == 200

    def test_speak_very_long_text(self, client: TestClient) -> None:
        """/speak copes with a 1000-character input."""
        long_text = "a " * 500
        resp = client.post("/speak", json={"text": long_text})
        assert resp.status_code == 200

    def test_recognize_jpeg_with_data_url_jpg(self, client: TestClient) -> None:
        """A data URL tagged ``image/jpg`` (rather than ``image/jpeg``) is still accepted."""
        data_url = "data:image/jpg;base64," + _frame_b64()
        resp = client.post("/recognize", json={"frame": data_url})
        # Slightly malformed data URL — expected to work via the tolerant decoder.
        assert resp.status_code == 200

    def test_recognize_png_frame(self, client: TestClient) -> None:
        """/recognize accepts a PNG-encoded frame as well as JPEG."""
        png_buffer = io.BytesIO()
        Image.fromarray(_frame()).save(png_buffer, format="PNG")
        encoded = base64.b64encode(png_buffer.getvalue()).decode("ascii")
        resp = client.post("/recognize", json={"frame": encoded})
        assert resp.status_code == 200

    def test_compose_with_only_punctuation_glosses(self, client: TestClient) -> None:
        """Lowercase one-character tokens are treated as glosses, not fingerspelling."""
        resp = client.post("/compose", json={"signs": ["a", "b"]})
        assert resp.status_code == 200
        # The naive joiner only treats UPPERCASE single letters as fingerspelling,
        # so lowercase 'a' / 'b' stay separate words joined by a space.
        composed = resp.json()["sentence"]
        assert composed == "A b."

    def test_health_after_recognize_failure(self, client: TestClient) -> None:
        """/healthz keeps responding after /recognize was sent an undecodable frame."""
        bad_payload = {"frame": "%%%bad%%%"}
        # The response is deliberately ignored; it is expected to be an error (400).
        client.post("/recognize", json=bad_payload)
        health = client.get("/healthz")
        assert health.status_code == 200


class TestBackendInfoEndpoint:
    """Checks that /info reports configuration taken from environment variables."""

    def test_info_reflects_env(self, client: TestClient, monkeypatch: pytest.MonkeyPatch) -> None:
        """Set provider and composer-model env vars, then read them back via /info."""
        overrides = (
            ("SIGNBRIDGE_PROVIDER", "openai"),
            ("SIGNBRIDGE_COMPOSER_MODEL", "meta-llama/Llama-3.1-8B-Instruct"),
        )
        for name, value in overrides:
            monkeypatch.setenv(name, value)

        resp = client.get("/info")
        assert resp.status_code == 200

        info = resp.json()
        assert info["provider"] == "openai"
        assert info["composer_model"].endswith("Llama-3.1-8B-Instruct")