mekosotto Claude Opus 4.7 (1M context) committed on
Commit
8cd7173
·
2 Parent(s): 26adc32fb3eff2

Merge branch 'feature/real-llm-rationale' into main

Browse files

Real LLM rationale: POST /explain/{bbb,eeg,mri} now returns
source="llm" against OpenRouter free-tier; template kept as
true outage fallback (NEUROBRIDGE_DISABLE_LLM=1 still forces it).

Key changes:
- 401 short-circuits with actionable WARNING (URLs to /keys + /settings/privacy)
- 400 advances to next model (was: bail to template)
- _DEFAULT_FREE_MODEL_CHAIN refreshed with verified-live OpenRouter IDs
- scripts/diagnose_openrouter.py: one-shot reachability probe
- Live integration test (gated by OPENROUTER_API_KEY + slow marker)
- caplog handler attached for non-propagating logger (regression detector)

192/192 tests pass; live verification: BBB/EEG/MRI all source=llm.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

scripts/diagnose_openrouter.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Probe OpenRouter for which free-tier model IDs are reachable today.

Reads OPENROUTER_API_KEY from .env (or process env). Issues a single
8-token chat completion against a candidate list and prints one line per
model: status (OK / HTTP-code / exception name) + a 30-char preview of
the response when OK.

Use:
    python scripts/diagnose_openrouter.py

Or to probe a custom list:
    python scripts/diagnose_openrouter.py google/gemma-2-9b-it:free meta-llama/llama-3.2-3b-instruct:free
"""
from __future__ import annotations

import os
import sys
from pathlib import Path

# Candidate list: well-known stable free-tier IDs as of 2026-Q2.
# Update by replacing this list — script is a probe, not a config source.
DEFAULT_CANDIDATES = [
    "google/gemma-2-9b-it:free",
    "google/gemini-2.0-flash-exp:free",
    "meta-llama/llama-3.2-3b-instruct:free",
    "meta-llama/llama-3.3-70b-instruct:free",
    "mistralai/mistral-7b-instruct:free",
    "qwen/qwen-2.5-72b-instruct:free",
    "deepseek/deepseek-r1:free",
    "deepseek/deepseek-chat:free",
    "nousresearch/hermes-3-llama-3.1-405b:free",
    "microsoft/phi-3-mini-128k-instruct:free",
]


def unquote(value: str) -> str:
    """Strip one matching pair of surrounding quotes, as python-dotenv does.

    Without this, a line like ``KEY="sk-..."`` in .env exports a key with
    literal quote characters, which the API rejects with a confusing 401.
    """
    if len(value) >= 2 and value[0] == value[-1] and value[0] in ("'", '"'):
        return value[1:-1]
    return value


def load_dotenv_file(env_path: Path) -> None:
    """Manually parse ``env_path`` into os.environ without python-dotenv
    (some envs choke on its frame-introspection in heredocs /
    non-stack-rooted callers).

    Blank lines, ``#`` comments, and lines without ``=`` are skipped.
    Existing process-env values win (``setdefault``), matching dotenv's
    default override behavior.
    """
    if not env_path.exists():
        return
    for raw in env_path.read_text().splitlines():
        line = raw.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, _, value = line.partition("=")
        os.environ.setdefault(key.strip(), unquote(value.strip()))


def main(argv: list[str]) -> None:
    """Probe each candidate model with a tiny completion; print one status line per model."""
    load_dotenv_file(Path(__file__).resolve().parent.parent / ".env")
    if not os.environ.get("OPENROUTER_API_KEY"):
        sys.exit("OPENROUTER_API_KEY not set (looked in env and .env)")

    candidates = argv or DEFAULT_CANDIDATES

    # Imported after the env load so the client construction below sees the
    # key, and the key check above runs first.
    from openai import (  # noqa: E402 (after env load)
        OpenAI, APIStatusError, APIConnectionError, RateLimitError, APITimeoutError,
    )

    client = OpenAI(
        base_url="https://openrouter.ai/api/v1",
        api_key=os.environ["OPENROUTER_API_KEY"],
        timeout=15.0,
    )

    for m in candidates:
        try:
            c = client.chat.completions.create(
                model=m,
                messages=[{"role": "user", "content": "Reply with the single word OK."}],
                max_tokens=8,
                temperature=0,
            )
            text = (c.choices[0].message.content or "").strip()
            print(f"  OK   {m} → {text[:30]!r}")
        # RateLimitError subclasses APIStatusError in openai v1 — it must be
        # caught FIRST or the generic status handler swallows every 429 and
        # the "(rate-limited)" annotation never prints.
        except RateLimitError:
            print(f"  429  {m} (rate-limited)")
        except APIStatusError as e:
            code = getattr(e, "status_code", "?")
            print(f"  {code:<5}{m}")
        # APITimeoutError subclasses APIConnectionError; both are transport
        # failures — switching models won't help, but the probe continues.
        except (APIConnectionError, APITimeoutError) as e:
            print(f"  CONN {m} ({type(e).__name__})")
        except Exception as e:  # noqa: BLE001 — probe must never die mid-list
            print(f"  ERR  {m} ({type(e).__name__}: {e})")


if __name__ == "__main__":
    main(sys.argv[1:])
src/llm/explainer.py CHANGED
@@ -58,18 +58,21 @@ _LLM_TEMPERATURE = 0.3
58
  # 5xx (upstream), we advance to the next model. Network/timeout errors fall
59
  # straight to the deterministic template — switching models won't help.
60
  # Override at runtime via OPENROUTER_FREE_MODELS (comma-separated). Model
61
- # availability on OpenRouter churns; an ID that 404s is skipped silently.
 
 
 
62
  _DEFAULT_FREE_MODEL_CHAIN: tuple[str, ...] = (
63
- "inclusionai/ling-2.6-1t:free", # ~1T flagship
64
- "nvidia/nemotron-3-super-120b-a12b:free", # 120B reasoning MoE
65
- "minimax/minimax-m2.5:free",
66
- "tencent/hy3-preview:free", # MoE + reasoning
67
- "google/gemma-4-31b-it:free",
68
- "google/gemma-4-26b-a4b-it:free",
69
- "nvidia/nemotron-3-nano-omni-30b-a3b-reasoning:free",
70
- "poolside/laguna-m.1:free",
71
- "poolside/laguna-xs.2:free",
72
- "meta-llama/llama-3.2-3b-instruct:free", # 3B last-resort
73
  )
74
 
75
 
@@ -302,6 +305,26 @@ def _llm_explain(payload: ExplainPayload, modality: str = "bbb") -> tuple[str, s
302
  continue
303
  except APIStatusError as e:
304
  status = getattr(e, "status_code", None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
  # 402 credits / 403 access / 404 retired-id / 5xx upstream → next.
306
  if status in (402, 403, 404) or (status is not None and 500 <= status < 600):
307
  logger.info("OpenRouter %s on %s; advancing to next free model.", status, model)
 
58
# 5xx (upstream), we advance to the next model. Network/timeout errors fall
# straight to the deterministic template — switching models won't help.
# Override at runtime via OPENROUTER_FREE_MODELS (comma-separated). Model
# availability on OpenRouter churns; verify with scripts/diagnose_openrouter.py.
# Last verified: 2026-05-02 via scripts/diagnose_openrouter.py.
# Entries marked "currently 429" have valid IDs but were quota-exhausted at
# probe time; kept because OpenRouter rate-limits are per-window and recover.
# Ordered roughly largest/most-capable first, with a small model as last resort.
_DEFAULT_FREE_MODEL_CHAIN: tuple[str, ...] = (
    "inclusionai/ling-2.6-1t:free",                        # ~1T flagship — verified OK, returns content
    "nvidia/nemotron-3-super-120b-a12b:free",              # 120B verified OK, returns content
    "minimax/minimax-m2.5:free",                           # MoE — verified OK, returns content
    "qwen/qwen3-next-80b-a3b-instruct:free",               # 80B currently 429 but valid id
    "google/gemma-4-31b-it:free",                          # 31B — currently 429 but valid id
    "google/gemma-4-26b-a4b-it:free",                      # 26B MoE — currently 429 but valid id
    "tencent/hy3-preview:free",                            # MoE preview — verified OK
    "nvidia/nemotron-3-nano-omni-30b-a3b-reasoning:free",  # 30B reasoning — verified OK
    "nvidia/nemotron-3-nano-30b-a3b:free",                 # 30B — verified OK
    "poolside/laguna-xs.2:free",                           # smallest — verified OK
)
77
 
78
 
 
305
  continue
306
  except APIStatusError as e:
307
  status = getattr(e, "status_code", None)
308
+ # 401 = unauthorized — the key is bad, no model in this chain
309
+ # will succeed. Surface a loud, actionable hint and bail.
310
+ if status == 401:
311
+ logger.warning(
312
+ "OpenRouter 401 unauthorized on %s. The OPENROUTER_API_KEY "
313
+ "is rejected — verify it is current at "
314
+ "https://openrouter.ai/keys and that free-model data-sharing "
315
+ "is enabled at https://openrouter.ai/settings/privacy. "
316
+ "Falling back to deterministic template.",
317
+ model,
318
+ )
319
+ return None
320
+ # 400 = malformed prompt for this specific model (e.g. it
321
+ # rejected our system role). Skip this model, try the next.
322
+ if status == 400:
323
+ logger.info(
324
+ "OpenRouter 400 on %s (likely prompt-shape mismatch); "
325
+ "advancing to next free model.", model,
326
+ )
327
+ continue
328
  # 402 credits / 403 access / 404 retired-id / 5xx upstream → next.
329
  if status in (402, 403, 404) or (status is not None and 500 <= status < 600):
330
  logger.info("OpenRouter %s on %s; advancing to next free model.", status, model)
tests/llm/test_explainer.py CHANGED
@@ -128,3 +128,219 @@ class TestModalityDispatch:
128
  # Should not raise; should produce a non-empty rationale
129
  assert result["source"] == "template"
130
  assert result["rationale"], "rationale must be non-empty"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  # Should not raise; should produce a non-empty rationale
129
  assert result["source"] == "template"
130
  assert result["rationale"], "rationale must be non-empty"
131
+
132
+
133
class TestAuthFailureShortCircuits:
    """A 401 from OpenRouter means the key is unauthorized — every model
    in the chain will fail the same way, so we must short-circuit instead
    of burning the full chain on every request."""

    @staticmethod
    def _patch_openai_client(monkeypatch, create_fn):
        # Shared stub wiring: a fake OpenAI client whose
        # chat.completions.create is `create_fn`.
        #
        # Must patch on the `openai` module — the explainer does
        # `from openai import OpenAI` *inside* the function (see
        # src/llm/explainer.py:269-275), so any module-level attribute
        # on `src.llm.explainer` would be a no-op.
        class _Completions:
            create = staticmethod(create_fn)

        class _Chat:
            completions = _Completions()

        class _Client:
            chat = _Chat()

            def __init__(self, **kwargs):
                pass

        monkeypatch.setattr("openai.OpenAI", _Client)

    def test_401_short_circuits_to_template_after_one_attempt(self, monkeypatch):
        from src.llm import explainer as ex
        from openai import APIStatusError
        import httpx

        monkeypatch.delenv("NEUROBRIDGE_DISABLE_LLM", raising=False)
        monkeypatch.setenv("OPENROUTER_API_KEY", "sk-or-v1-deliberately-bad")

        seen_models: list[str] = []

        def fail_unauthorized(**kwargs):
            seen_models.append(kwargs["model"])
            req = httpx.Request("POST", "https://openrouter.ai/api/v1/chat/completions")
            resp = httpx.Response(status_code=401, request=req)
            raise APIStatusError(message="No auth credentials found", response=resp, body={})

        self._patch_openai_client(monkeypatch, fail_unauthorized)

        out = ex._llm_explain(_payload(), modality="bbb")

        assert out is None, "401 must surface as a None return (caller falls back to template)"
        assert len(seen_models) == 1, f"401 must short-circuit; tried {len(seen_models)} models: {seen_models}"

    def test_explain_returns_template_source_on_401(self, monkeypatch):
        from src.llm import explainer as ex
        from openai import APIStatusError
        import httpx

        monkeypatch.delenv("NEUROBRIDGE_DISABLE_LLM", raising=False)
        monkeypatch.setenv("OPENROUTER_API_KEY", "sk-or-v1-deliberately-bad")

        def fail_unauthorized(**kwargs):
            req = httpx.Request("POST", "https://openrouter.ai/api/v1/chat/completions")
            raise APIStatusError(
                message="auth",
                response=httpx.Response(401, request=req),
                body={},
            )

        self._patch_openai_client(monkeypatch, fail_unauthorized)

        # Full public entry point: the None from _llm_explain must surface
        # as a template-sourced result with a non-empty rationale.
        result = ex.explain(_payload(), modality="bbb")

        assert result["source"] == "template"
        assert result["model"] is None
        assert result["rationale"], "rationale must never be empty"

    def test_400_advances_to_next_model_instead_of_short_circuiting(self, monkeypatch):
        """A 400 from one model is a prompt-shape mismatch with THAT model
        (some models reject system roles, etc.) — try the next, don't give up."""
        from src.llm import explainer as ex
        from openai import APIStatusError
        import httpx

        monkeypatch.delenv("NEUROBRIDGE_DISABLE_LLM", raising=False)
        monkeypatch.setenv("OPENROUTER_API_KEY", "sk-or-v1-anything")

        # Force a known multi-model chain so we can count attempts deterministically
        monkeypatch.setenv("OPENROUTER_FREE_MODELS", "model-a:free,model-b:free,model-c:free")

        seen_models: list[str] = []

        def fail_bad_request(**kwargs):
            seen_models.append(kwargs["model"])
            req = httpx.Request("POST", "https://openrouter.ai/api/v1/chat/completions")
            raise APIStatusError(
                message="bad request",
                response=httpx.Response(400, request=req),
                body={},
            )

        self._patch_openai_client(monkeypatch, fail_bad_request)

        out = ex._llm_explain(_payload(), modality="bbb")

        assert out is None, "all models 400'd → must return None for template fallback"
        assert seen_models == ["model-a:free", "model-b:free", "model-c:free"], (
            f"400 must advance to next model; got attempts={seen_models}"
        )
253
+
254
+
255
@pytest.mark.slow
@pytest.mark.skipif(
    not os.environ.get("OPENROUTER_API_KEY"),
    reason="OPENROUTER_API_KEY not set — skipping live LLM integration test",
)
@pytest.mark.skipif(
    os.environ.get("NEUROBRIDGE_DISABLE_LLM") == "1",
    reason="NEUROBRIDGE_DISABLE_LLM=1 — skipping live LLM integration test",
)
class TestLiveOpenRouterLLM:
    """End-to-end: hit a real OpenRouter free-tier model and assert
    `explain()` returns source='llm' with non-empty content. Skipped
    when no key is set or the kill-switch is on.

    Marked `slow` because it makes a real network round-trip
    (worst case ~80s if the entire chain is unreachable). Run with
    `pytest -m slow` or include it in the default suite by not passing
    `-m "not slow"`."""

    def test_bbb_explain_returns_llm_source_with_real_key(self, caplog) -> None:
        import logging
        from src.llm import explainer as ex

        # The explainer's logger has propagate=False (see src/core/logger.py),
        # so caplog's root-level handler never sees its records. Attach the
        # caplog handler directly to bypass propagation.
        ex.logger.addHandler(caplog.handler)
        try:
            with caplog.at_level(logging.INFO, logger="src.llm.explainer"):
                result = ex.explain(_payload(), modality="bbb")
        finally:
            # Always detach — leaking the handler would double-log records
            # into later tests in the same session.
            ex.logger.removeHandler(caplog.handler)

        # Flaky-network safety net: only skip when we have evidence the
        # template fallback fired due to transient infra (rate-limit,
        # 5xx, network). If the fallback fired silently — no infra-error
        # log line — that's a real regression we want to fail loud.
        if result["source"] == "template":
            log_text = " ".join(r.getMessage() for r in caplog.records)
            transient_signals = (
                "429", "OpenRouter 5", "OpenRouter 4",  # status-code log lines
                "connection error", "timeout",  # transport-error log lines
                "All free models exhausted",  # chain-end log line
            )
            had_infra_evidence = any(s.lower() in log_text.lower() for s in transient_signals)
            if not had_infra_evidence:
                pytest.fail(
                    "explain() fell back to template with NO infra-error log "
                    "line — this is a real regression, not a network blip. "
                    f"Captured logs: {log_text!r}"
                )
            pytest.skip(
                "All free models in the chain were rate-limited or unreachable "
                "at test time. Re-run later or run scripts/diagnose_openrouter.py."
            )

        assert result["source"] == "llm"
        assert result["model"] is not None and result["model"].endswith(":free"), (
            f"unexpected model id (must end with ':free' to ensure no paid model "
            f"snuck into the chain): {result['model']!r}"
        )
        assert result["rationale"].strip(), "LLM returned empty rationale"

        # Refusal/safety-filter sanity: catch the common patterns instead
        # of just one prefix. (Prefix match only — a rationale that merely
        # *contains* one of these mid-sentence still passes.)
        lowered = result["rationale"].lower()
        refusal_signals = (
            "i cannot",
            "i can't",
            "i'm sorry, but i",
            "i'm sorry, i can't",
            "as an ai",
            "as a language model",
            "i'm unable to",
            "i do not have the ability",
        )
        assert not any(lowered.startswith(s) for s in refusal_signals), (
            f"LLM refused (matched refusal pattern): {result['rationale']!r}"
        )

        # Positive on-topic assertion: the rationale must reference at least
        # one of the SHAP feature names from the payload, OR the verdict
        # word ("permeable" / "non-permeable"). A model that produced
        # off-topic small-talk would fail here.
        payload = _payload()
        feature_names = [f["feature"] for f in payload["top_features"]]
        verdict = payload["label_text"].lower()
        on_topic_anchors = [f.lower() for f in feature_names] + [verdict]
        assert any(anchor in lowered for anchor in on_topic_anchors), (
            f"rationale appears off-topic (no mention of SHAP features "
            f"{feature_names!r} or verdict {verdict!r}): {result['rationale']!r}"
        )