mekosotto Claude Opus 4.7 (1M context) committed on
Commit
8cd7173
·
2 Parent(s): 26adc32fb3eff2

Merge branch 'feature/real-llm-rationale' into main

Browse files

Real LLM rationale: POST /explain/{bbb,eeg,mri} now returns
source="llm" against OpenRouter free-tier; template kept as
true outage fallback (NEUROBRIDGE_DISABLE_LLM=1 still forces it).

Key changes:
- 401 short-circuits with actionable WARNING (URLs to /keys + /settings/privacy)
- 400 advances to next model (was: bail to template)
- _DEFAULT_FREE_MODEL_CHAIN refreshed with verified-live OpenRouter IDs
- scripts/diagnose_openrouter.py: one-shot reachability probe
- Live integration test (gated by OPENROUTER_API_KEY + slow marker)
- caplog handler attached for non-propagating logger (regression detector)

192/192 tests pass; live verification: BBB/EEG/MRI all source=llm.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

scripts/diagnose_openrouter.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Probe OpenRouter for which free-tier model IDs are reachable today.

Reads OPENROUTER_API_KEY from .env (or process env). Issues a single
8-token chat completion against a candidate list and prints one line per
model: status (OK / HTTP-code / exception name) + a 30-char preview of
the response when OK.

Use:
    python scripts/diagnose_openrouter.py

Or to probe a custom list:
    python scripts/diagnose_openrouter.py google/gemma-2-9b-it:free meta-llama/llama-3.2-3b-instruct:free
"""
from __future__ import annotations

import os
import sys
from pathlib import Path

# Candidate list: well-known stable free-tier IDs as of 2026-Q2.
# Update by replacing this list — script is a probe, not a config source.
DEFAULT_CANDIDATES = [
    "google/gemma-2-9b-it:free",
    "google/gemini-2.0-flash-exp:free",
    "meta-llama/llama-3.2-3b-instruct:free",
    "meta-llama/llama-3.3-70b-instruct:free",
    "mistralai/mistral-7b-instruct:free",
    "qwen/qwen-2.5-72b-instruct:free",
    "deepseek/deepseek-r1:free",
    "deepseek/deepseek-chat:free",
    "nousresearch/hermes-3-llama-3.1-405b:free",
    "microsoft/phi-3-mini-128k-instruct:free",
]


def unquote(value: str) -> str:
    """Strip one matching pair of surrounding quotes, as python-dotenv does.

    Without this, a line like ``KEY="sk-..."`` in .env exports a key with
    literal quote characters, which the API rejects with a confusing 401.
    """
    if len(value) >= 2 and value[0] == value[-1] and value[0] in ("'", '"'):
        return value[1:-1]
    return value


def load_dotenv_file(env_path: Path) -> None:
    """Manually parse ``env_path`` into os.environ without python-dotenv
    (some envs choke on its frame-introspection in heredocs /
    non-stack-rooted callers).

    Blank lines, ``#`` comments, and lines without ``=`` are skipped.
    Existing process-env values win (``setdefault``), matching dotenv's
    default override behavior.
    """
    if not env_path.exists():
        return
    for raw in env_path.read_text().splitlines():
        line = raw.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, _, value = line.partition("=")
        os.environ.setdefault(key.strip(), unquote(value.strip()))


def main(argv: list[str]) -> None:
    """Probe each candidate model with a tiny completion; print one status line per model."""
    load_dotenv_file(Path(__file__).resolve().parent.parent / ".env")
    if not os.environ.get("OPENROUTER_API_KEY"):
        sys.exit("OPENROUTER_API_KEY not set (looked in env and .env)")

    candidates = argv or DEFAULT_CANDIDATES

    # Imported after the env load so the client construction below sees the
    # key, and the key check above runs first.
    from openai import (  # noqa: E402 (after env load)
        OpenAI, APIStatusError, APIConnectionError, RateLimitError, APITimeoutError,
    )

    client = OpenAI(
        base_url="https://openrouter.ai/api/v1",
        api_key=os.environ["OPENROUTER_API_KEY"],
        timeout=15.0,
    )

    for m in candidates:
        try:
            c = client.chat.completions.create(
                model=m,
                messages=[{"role": "user", "content": "Reply with the single word OK."}],
                max_tokens=8,
                temperature=0,
            )
            text = (c.choices[0].message.content or "").strip()
            print(f"  OK   {m} → {text[:30]!r}")
        # RateLimitError subclasses APIStatusError in openai v1 — it must be
        # caught FIRST or the generic status handler swallows every 429 and
        # the "(rate-limited)" annotation never prints.
        except RateLimitError:
            print(f"  429  {m} (rate-limited)")
        except APIStatusError as e:
            code = getattr(e, "status_code", "?")
            print(f"  {code:<5}{m}")
        # APITimeoutError subclasses APIConnectionError; both are transport
        # failures — switching models won't help, but the probe continues.
        except (APIConnectionError, APITimeoutError) as e:
            print(f"  CONN {m} ({type(e).__name__})")
        except Exception as e:  # noqa: BLE001 — probe must never die mid-list
            print(f"  ERR  {m} ({type(e).__name__}: {e})")


if __name__ == "__main__":
    main(sys.argv[1:])
src/llm/explainer.py CHANGED
@@ -58,18 +58,21 @@ _LLM_TEMPERATURE = 0.3
58
  # 5xx (upstream), we advance to the next model. Network/timeout errors fall
59
  # straight to the deterministic template — switching models won't help.
60
  # Override at runtime via OPENROUTER_FREE_MODELS (comma-separated). Model
61
- # availability on OpenRouter churns; an ID that 404s is skipped silently.
 
 
 
62
  _DEFAULT_FREE_MODEL_CHAIN: tuple[str, ...] = (
63
- "inclusionai/ling-2.6-1t:free", # ~1T flagship
64
- "nvidia/nemotron-3-super-120b-a12b:free", # 120B reasoning MoE
65
- "minimax/minimax-m2.5:free",
66
- "tencent/hy3-preview:free", # MoE + reasoning
67
- "google/gemma-4-31b-it:free",
68
- "google/gemma-4-26b-a4b-it:free",
69
- "nvidia/nemotron-3-nano-omni-30b-a3b-reasoning:free",
70
- "poolside/laguna-m.1:free",
71
- "poolside/laguna-xs.2:free",
72
- "meta-llama/llama-3.2-3b-instruct:free", # 3B last-resort
73
  )
74
 
75
 
@@ -302,6 +305,26 @@ def _llm_explain(payload: ExplainPayload, modality: str = "bbb") -> tuple[str, s
302
  continue
303
  except APIStatusError as e:
304
  status = getattr(e, "status_code", None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
  # 402 credits / 403 access / 404 retired-id / 5xx upstream → next.
306
  if status in (402, 403, 404) or (status is not None and 500 <= status < 600):
307
  logger.info("OpenRouter %s on %s; advancing to next free model.", status, model)
 
58
# 5xx (upstream), we advance to the next model. Network/timeout errors fall
# straight to the deterministic template — switching models won't help.
# Override at runtime via OPENROUTER_FREE_MODELS (comma-separated). Model
# availability on OpenRouter churns; verify with scripts/diagnose_openrouter.py.
# Last verified: 2026-05-02 via scripts/diagnose_openrouter.py.
# Entries marked "currently 429" have valid IDs but were quota-exhausted at
# probe time; kept because OpenRouter rate-limits are per-window and recover.
# Ordered roughly largest/most-capable first, with a small model as last resort.
_DEFAULT_FREE_MODEL_CHAIN: tuple[str, ...] = (
    "inclusionai/ling-2.6-1t:free",                        # ~1T flagship — verified OK, returns content
    "nvidia/nemotron-3-super-120b-a12b:free",              # 120B verified OK, returns content
    "minimax/minimax-m2.5:free",                           # MoE — verified OK, returns content
    "qwen/qwen3-next-80b-a3b-instruct:free",               # 80B currently 429 but valid id
    "google/gemma-4-31b-it:free",                          # 31B — currently 429 but valid id
    "google/gemma-4-26b-a4b-it:free",                      # 26B MoE — currently 429 but valid id
    "tencent/hy3-preview:free",                            # MoE preview — verified OK
    "nvidia/nemotron-3-nano-omni-30b-a3b-reasoning:free",  # 30B reasoning — verified OK
    "nvidia/nemotron-3-nano-30b-a3b:free",                 # 30B — verified OK
    "poolside/laguna-xs.2:free",                           # smallest — verified OK
)
77
 
78
 
 
305
  continue
306
  except APIStatusError as e:
307
  status = getattr(e, "status_code", None)
308
+ # 401 = unauthorized — the key is bad, no model in this chain
309
+ # will succeed. Surface a loud, actionable hint and bail.
310
+ if status == 401:
311
+ logger.warning(
312
+ "OpenRouter 401 unauthorized on %s. The OPENROUTER_API_KEY "
313
+ "is rejected — verify it is current at "
314
+ "https://openrouter.ai/keys and that free-model data-sharing "
315
+ "is enabled at https://openrouter.ai/settings/privacy. "
316
+ "Falling back to deterministic template.",
317
+ model,
318
+ )
319
+ return None
320
+ # 400 = malformed prompt for this specific model (e.g. it
321
+ # rejected our system role). Skip this model, try the next.
322
+ if status == 400:
323
+ logger.info(
324
+ "OpenRouter 400 on %s (likely prompt-shape mismatch); "
325
+ "advancing to next free model.", model,
326
+ )
327
+ continue
328
  # 402 credits / 403 access / 404 retired-id / 5xx upstream → next.
329
  if status in (402, 403, 404) or (status is not None and 500 <= status < 600):
330
  logger.info("OpenRouter %s on %s; advancing to next free model.", status, model)
tests/llm/test_explainer.py CHANGED
@@ -128,3 +128,219 @@ class TestModalityDispatch:
128
  # Should not raise; should produce a non-empty rationale
129
  assert result["source"] == "template"
130
  assert result["rationale"], "rationale must be non-empty"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  # Should not raise; should produce a non-empty rationale
129
  assert result["source"] == "template"
130
  assert result["rationale"], "rationale must be non-empty"
131
+
132
+
133
class TestAuthFailureShortCircuits:
    """A 401 from OpenRouter means the key is unauthorized — every model
    in the chain will fail the same way, so we must short-circuit instead
    of burning the full chain on every request."""

    @staticmethod
    def _patch_openai_client(monkeypatch, create_fn):
        # Shared stub wiring: a fake OpenAI client whose
        # chat.completions.create is `create_fn`.
        #
        # Must patch on the `openai` module — the explainer does
        # `from openai import OpenAI` *inside* the function (see
        # src/llm/explainer.py:269-275), so any module-level attribute
        # on `src.llm.explainer` would be a no-op.
        class _Completions:
            create = staticmethod(create_fn)

        class _Chat:
            completions = _Completions()

        class _Client:
            chat = _Chat()

            def __init__(self, **kwargs):
                pass

        monkeypatch.setattr("openai.OpenAI", _Client)

    def test_401_short_circuits_to_template_after_one_attempt(self, monkeypatch):
        from src.llm import explainer as ex
        from openai import APIStatusError
        import httpx

        monkeypatch.delenv("NEUROBRIDGE_DISABLE_LLM", raising=False)
        monkeypatch.setenv("OPENROUTER_API_KEY", "sk-or-v1-deliberately-bad")

        seen_models: list[str] = []

        def fail_unauthorized(**kwargs):
            seen_models.append(kwargs["model"])
            req = httpx.Request("POST", "https://openrouter.ai/api/v1/chat/completions")
            resp = httpx.Response(status_code=401, request=req)
            raise APIStatusError(message="No auth credentials found", response=resp, body={})

        self._patch_openai_client(monkeypatch, fail_unauthorized)

        out = ex._llm_explain(_payload(), modality="bbb")

        assert out is None, "401 must surface as a None return (caller falls back to template)"
        assert len(seen_models) == 1, f"401 must short-circuit; tried {len(seen_models)} models: {seen_models}"

    def test_explain_returns_template_source_on_401(self, monkeypatch):
        from src.llm import explainer as ex
        from openai import APIStatusError
        import httpx

        monkeypatch.delenv("NEUROBRIDGE_DISABLE_LLM", raising=False)
        monkeypatch.setenv("OPENROUTER_API_KEY", "sk-or-v1-deliberately-bad")

        def fail_unauthorized(**kwargs):
            req = httpx.Request("POST", "https://openrouter.ai/api/v1/chat/completions")
            raise APIStatusError(
                message="auth",
                response=httpx.Response(401, request=req),
                body={},
            )

        self._patch_openai_client(monkeypatch, fail_unauthorized)

        # Full public entry point: the None from _llm_explain must surface
        # as a template-sourced result with a non-empty rationale.
        result = ex.explain(_payload(), modality="bbb")

        assert result["source"] == "template"
        assert result["model"] is None
        assert result["rationale"], "rationale must never be empty"

    def test_400_advances_to_next_model_instead_of_short_circuiting(self, monkeypatch):
        """A 400 from one model is a prompt-shape mismatch with THAT model
        (some models reject system roles, etc.) — try the next, don't give up."""
        from src.llm import explainer as ex
        from openai import APIStatusError
        import httpx

        monkeypatch.delenv("NEUROBRIDGE_DISABLE_LLM", raising=False)
        monkeypatch.setenv("OPENROUTER_API_KEY", "sk-or-v1-anything")

        # Force a known multi-model chain so we can count attempts deterministically
        monkeypatch.setenv("OPENROUTER_FREE_MODELS", "model-a:free,model-b:free,model-c:free")

        seen_models: list[str] = []

        def fail_bad_request(**kwargs):
            seen_models.append(kwargs["model"])
            req = httpx.Request("POST", "https://openrouter.ai/api/v1/chat/completions")
            raise APIStatusError(
                message="bad request",
                response=httpx.Response(400, request=req),
                body={},
            )

        self._patch_openai_client(monkeypatch, fail_bad_request)

        out = ex._llm_explain(_payload(), modality="bbb")

        assert out is None, "all models 400'd → must return None for template fallback"
        assert seen_models == ["model-a:free", "model-b:free", "model-c:free"], (
            f"400 must advance to next model; got attempts={seen_models}"
        )
253
+
254
+
255
@pytest.mark.slow
@pytest.mark.skipif(
    not os.environ.get("OPENROUTER_API_KEY"),
    reason="OPENROUTER_API_KEY not set — skipping live LLM integration test",
)
@pytest.mark.skipif(
    os.environ.get("NEUROBRIDGE_DISABLE_LLM") == "1",
    reason="NEUROBRIDGE_DISABLE_LLM=1 — skipping live LLM integration test",
)
class TestLiveOpenRouterLLM:
    """End-to-end: hit a real OpenRouter free-tier model and assert
    `explain()` returns source='llm' with non-empty content. Skipped
    when no key is set or the kill-switch is on.

    Marked `slow` because it makes a real network round-trip
    (worst case ~80s if the entire chain is unreachable). Run with
    `pytest -m slow` or include it in the default suite by not passing
    `-m "not slow"`."""

    def test_bbb_explain_returns_llm_source_with_real_key(self, caplog) -> None:
        import logging
        from src.llm import explainer as ex

        # The explainer's logger has propagate=False (see src/core/logger.py),
        # so caplog's root-level handler never sees its records. Attach the
        # caplog handler directly to bypass propagation.
        ex.logger.addHandler(caplog.handler)
        try:
            with caplog.at_level(logging.INFO, logger="src.llm.explainer"):
                result = ex.explain(_payload(), modality="bbb")
        finally:
            # Always detach — leaking the handler would double-log records
            # into later tests in the same session.
            ex.logger.removeHandler(caplog.handler)

        # Flaky-network safety net: only skip when we have evidence the
        # template fallback fired due to transient infra (rate-limit,
        # 5xx, network). If the fallback fired silently — no infra-error
        # log line — that's a real regression we want to fail loud.
        if result["source"] == "template":
            log_text = " ".join(r.getMessage() for r in caplog.records)
            transient_signals = (
                "429", "OpenRouter 5", "OpenRouter 4",  # status-code log lines
                "connection error", "timeout",  # transport-error log lines
                "All free models exhausted",  # chain-end log line
            )
            had_infra_evidence = any(s.lower() in log_text.lower() for s in transient_signals)
            if not had_infra_evidence:
                pytest.fail(
                    "explain() fell back to template with NO infra-error log "
                    "line — this is a real regression, not a network blip. "
                    f"Captured logs: {log_text!r}"
                )
            pytest.skip(
                "All free models in the chain were rate-limited or unreachable "
                "at test time. Re-run later or run scripts/diagnose_openrouter.py."
            )

        assert result["source"] == "llm"
        assert result["model"] is not None and result["model"].endswith(":free"), (
            f"unexpected model id (must end with ':free' to ensure no paid model "
            f"snuck into the chain): {result['model']!r}"
        )
        assert result["rationale"].strip(), "LLM returned empty rationale"

        # Refusal/safety-filter sanity: catch the common patterns instead
        # of just one prefix. (Prefix match only — a rationale that merely
        # *contains* one of these mid-sentence still passes.)
        lowered = result["rationale"].lower()
        refusal_signals = (
            "i cannot",
            "i can't",
            "i'm sorry, but i",
            "i'm sorry, i can't",
            "as an ai",
            "as a language model",
            "i'm unable to",
            "i do not have the ability",
        )
        assert not any(lowered.startswith(s) for s in refusal_signals), (
            f"LLM refused (matched refusal pattern): {result['rationale']!r}"
        )

        # Positive on-topic assertion: the rationale must reference at least
        # one of the SHAP feature names from the payload, OR the verdict
        # word ("permeable" / "non-permeable"). A model that produced
        # off-topic small-talk would fail here.
        payload = _payload()
        feature_names = [f["feature"] for f in payload["top_features"]]
        verdict = payload["label_text"].lower()
        on_topic_anchors = [f.lower() for f in feature_names] + [verdict]
        assert any(anchor in lowered for anchor in on_topic_anchors), (
            f"rationale appears off-topic (no mention of SHAP features "
            f"{feature_names!r} or verdict {verdict!r}): {result['rationale']!r}"
        )