"""Tests for src.llm.explainer.
The deterministic template path is exhaustively tested here. The LLM
path is exercised only by env-gated integration tests in
test_explainer_integration.py (NOT run in CI by default).
"""
from __future__ import annotations
import os
import pytest
from src.llm.explainer import ExplainPayload, explain
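
# The return contract these tests pin down, sketched here for orientation.
# This is inferred from the assertions in this file, not copied from
# src/llm/explainer.py; treat it as an assumption if the source diverges:
#
#     result = explain(payload, modality="bbb")  # modality defaults to the BBB path
#     result["rationale"]  # str, never empty
#     result["source"]     # "template" (deterministic) or "llm"
#     result["model"]      # None on the template path, a ":free" model id otherwise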


def _payload(**overrides) -> ExplainPayload:
"""Build a representative ExplainPayload; overrides win."""
base: ExplainPayload = {
"smiles": "CCO",
"label": 1,
"label_text": "permeable",
"confidence": 0.82,
"top_features": [
{"feature": "fp_341", "shap_value": 0.045},
{"feature": "fp_902", "shap_value": -0.031},
{"feature": "fp_77", "shap_value": 0.022},
],
"calibration": {"threshold": 0.80, "precision": 0.92, "support": 18},
"drift_z": 0.42,
"user_question": "Why was this molecule predicted as permeable?",
}
base.update(overrides)
return base
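
# Example: _payload(label=0, label_text="non-permeable", confidence=0.51)
# returns the same baseline dict with just those fields swapped; handy for
# probing one field at a time without rebuilding the whole payload.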


class TestTemplateExplain:
"""Day-7 T3A: deterministic-template path of the explainer."""
def test_template_path_is_deterministic(self, monkeypatch):
"""Same input → byte-identical rationale string. No randomness."""
monkeypatch.setenv("NEUROBRIDGE_DISABLE_LLM", "1")
out_a = explain(_payload())
out_b = explain(_payload())
assert out_a["rationale"] == out_b["rationale"]
assert out_a["source"] == "template"
assert out_b["source"] == "template"
assert out_a["model"] is None

    def test_template_includes_top_feature_names(self, monkeypatch):
"""Rationale must mention the SHAP features so jurors see attribution."""
monkeypatch.setenv("NEUROBRIDGE_DISABLE_LLM", "1")
result = explain(_payload())
for feat in ("fp_341", "fp_902", "fp_77"):
assert feat in result["rationale"], (
f"expected feature {feat!r} in rationale, got {result['rationale']!r}"
)

    def test_template_includes_label_text(self, monkeypatch):
"""The verdict word ('permeable' / 'non-permeable') must appear."""
monkeypatch.setenv("NEUROBRIDGE_DISABLE_LLM", "1")
result = explain(_payload(label=0, label_text="non-permeable"))
assert "non-permeable" in result["rationale"]

    def test_disable_flag_forces_template_even_with_key_set(self, monkeypatch):
"""NEUROBRIDGE_DISABLE_LLM=1 wins over OPENROUTER_API_KEY presence."""
monkeypatch.setenv("NEUROBRIDGE_DISABLE_LLM", "1")
monkeypatch.setenv("OPENROUTER_API_KEY", "sk-fake-not-used")
result = explain(_payload())
assert result["source"] == "template"
assert result["model"] is None


class TestEEGTemplate:
"""Day-8 T1A: deterministic EEG template path."""
def test_eeg_template_uses_pipeline_metrics(self, monkeypatch):
monkeypatch.setenv("NEUROBRIDGE_DISABLE_LLM", "1")
payload = {
"rows": 30,
"columns": 95,
"duration_sec": 4.32,
"mlflow_run_id": "abc12345",
"user_question": "Why were epochs dropped?",
}
result = explain(payload, modality="eeg")
assert result["source"] == "template"
assert result["model"] is None
rationale = result["rationale"]
assert "30" in rationale, "epoch count must appear"
assert "95" in rationale, "feature count must appear"
assert "4.3" in rationale, "duration must appear (1-decimal)"


class TestMRITemplate:
"""Day-8 T1A: deterministic MRI template path."""
def test_mri_template_uses_combat_metrics(self, monkeypatch):
monkeypatch.setenv("NEUROBRIDGE_DISABLE_LLM", "1")
payload = {
"site_gap_pre": 5.0004,
"site_gap_post": 0.0015,
"reduction_factor": 3290.0,
"n_subjects": 6,
"user_question": "Why does ComBat matter?",
}
result = explain(payload, modality="mri")
assert result["source"] == "template"
rationale = result["rationale"]
assert "5.00" in rationale or "5.0" in rationale, "pre-gap must appear"
assert "3290" in rationale or "3290×" in rationale, "reduction factor must appear"
assert "6" in rationale, "n_subjects must appear"


class TestModalityDispatch:
"""Day-8 T1A: explain(modality=…) routes to the right template."""
def test_unknown_modality_falls_back_to_bbb_template(self, monkeypatch):
"""Defensive: an unknown modality string degrades gracefully (warn + bbb-style template)."""
monkeypatch.setenv("NEUROBRIDGE_DISABLE_LLM", "1")
payload = {
"smiles": "CCO",
"label": 1,
"label_text": "permeable",
"confidence": 0.82,
"top_features": [{"feature": "fp_1", "shap_value": 0.05}],
}
result = explain(payload, modality="unknown_xyz")
# Should not raise; should produce a non-empty rationale
assert result["source"] == "template"
assert result["rationale"], "rationale must be non-empty"


class TestAuthFailureShortCircuits:
"""A 401 from OpenRouter means the key is unauthorized — every model
in the chain will fail the same way, so we must short-circuit instead
of burning the full chain on every request."""
def test_401_short_circuits_to_template_after_one_attempt(self, monkeypatch):
from src.llm import explainer as ex
from openai import APIStatusError
import httpx
monkeypatch.delenv("NEUROBRIDGE_DISABLE_LLM", raising=False)
monkeypatch.setenv("OPENROUTER_API_KEY", "sk-or-v1-deliberately-bad")
attempts: list[str] = []
def _raise_401(**kwargs):
attempts.append(kwargs["model"])
req = httpx.Request("POST", "https://openrouter.ai/api/v1/chat/completions")
resp = httpx.Response(status_code=401, request=req)
raise APIStatusError(message="No auth credentials found", response=resp, body={})
class _StubCompletions:
create = staticmethod(_raise_401)
class _StubChat:
completions = _StubCompletions()
class _StubClient:
chat = _StubChat()
def __init__(self, **kwargs):
pass
# Must patch on the `openai` module — the explainer does
# `from openai import OpenAI` *inside* the function (see
# src/llm/explainer.py:269-275), so any module-level attribute
# on `src.llm.explainer` would be a no-op.
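        # Paraphrased sketch of that pattern (hedged, not the exact source):
        #
        #     def _llm_explain(payload, modality):
        #         from openai import OpenAI   # name resolved at call time
        #         client = OpenAI(base_url=..., api_key=...)
        #
        # which is why only the attribute on the `openai` module is live here.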
monkeypatch.setattr("openai.OpenAI", _StubClient)
out = ex._llm_explain(_payload(), modality="bbb")
assert out is None, "401 must surface as a None return (caller falls back to template)"
assert len(attempts) == 1, f"401 must short-circuit; tried {len(attempts)} models: {attempts}"

    def test_explain_returns_template_source_on_401(self, monkeypatch):
from src.llm import explainer as ex
from openai import APIStatusError
import httpx
monkeypatch.delenv("NEUROBRIDGE_DISABLE_LLM", raising=False)
monkeypatch.setenv("OPENROUTER_API_KEY", "sk-or-v1-deliberately-bad")
def _raise_401(**kwargs):
req = httpx.Request("POST", "https://openrouter.ai/api/v1/chat/completions")
raise APIStatusError(
message="auth",
response=httpx.Response(401, request=req),
body={},
)
class _Comp:
create = staticmethod(_raise_401)
class _Chat:
completions = _Comp()
class _Client:
chat = _Chat()
def __init__(self, **kwargs):
pass
monkeypatch.setattr("openai.OpenAI", _Client)
result = ex.explain(_payload(), modality="bbb")
assert result["source"] == "template"
assert result["model"] is None
assert result["rationale"], "rationale must never be empty"

    def test_400_advances_to_next_model_instead_of_short_circuiting(self, monkeypatch):
"""A 400 from one model is a prompt-shape mismatch with THAT model
(some models reject system roles, etc.) — try the next, don't give up."""
from src.llm import explainer as ex
from openai import APIStatusError
import httpx
monkeypatch.delenv("NEUROBRIDGE_DISABLE_LLM", raising=False)
monkeypatch.setenv("OPENROUTER_API_KEY", "sk-or-v1-anything")
attempts: list[str] = []
# Force a known multi-model chain so we can count attempts deterministically
monkeypatch.setenv("OPENROUTER_FREE_MODELS", "model-a:free,model-b:free,model-c:free")
def _raise_400(**kwargs):
attempts.append(kwargs["model"])
req = httpx.Request("POST", "https://openrouter.ai/api/v1/chat/completions")
raise APIStatusError(
message="bad request",
response=httpx.Response(400, request=req),
body={},
)
class _Comp:
create = staticmethod(_raise_400)
class _Chat:
completions = _Comp()
class _Client:
chat = _Chat()
def __init__(self, **kwargs):
pass
monkeypatch.setattr("openai.OpenAI", _Client)
out = ex._llm_explain(_payload(), modality="bbb")
assert out is None, "all models 400'd → must return None for template fallback"
assert attempts == ["model-a:free", "model-b:free", "model-c:free"], (
f"400 must advance to next model; got attempts={attempts}"
)
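
    # Status-code policy pinned down by the tests above (summary, not source):
    #   401 → short-circuit after one attempt (the key is bad for every model)
    #   400 → advance to the next model in the chain (model-specific rejection)
    #   chain exhausted → _llm_explain returns None → explain() falls back to
    #   the deterministic template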


@pytest.mark.slow
@pytest.mark.skipif(
not os.environ.get("OPENROUTER_API_KEY"),
reason="OPENROUTER_API_KEY not set — skipping live LLM integration test",
)
@pytest.mark.skipif(
os.environ.get("NEUROBRIDGE_DISABLE_LLM") == "1",
reason="NEUROBRIDGE_DISABLE_LLM=1 — skipping live LLM integration test",
)
class TestLiveOpenRouterLLM:
    """End-to-end: hit a real OpenRouter free-tier model and assert
    `explain()` returns source='llm' with non-empty content. Skipped
    when no key is set or the kill-switch is on.

    Marked `slow` because it makes a real network round-trip
    (worst case ~80s if the entire chain is unreachable). Run with
    `pytest -m slow`, or include it in the default suite by not passing
    `-m "not slow"`.
    """
def test_bbb_explain_returns_llm_source_with_real_key(self, caplog):
import logging
from src.llm import explainer as ex
# The explainer's logger has propagate=False (see src/core/logger.py),
# so caplog's root-level handler never sees its records. Attach the
# caplog handler directly to bypass propagation.
ex.logger.addHandler(caplog.handler)
try:
with caplog.at_level(logging.INFO, logger="src.llm.explainer"):
result = ex.explain(_payload(), modality="bbb")
finally:
ex.logger.removeHandler(caplog.handler)
# Flaky-network safety net: only skip when we have evidence the
# template fallback fired due to transient infra (rate-limit,
# 5xx, network). If the fallback fired silently — no infra-error
# log line — that's a real regression we want to fail loud.
if result["source"] == "template":
log_text = " ".join(r.getMessage() for r in caplog.records)
transient_signals = (
"429", "OpenRouter 5", "OpenRouter 4", # status-code log lines
"connection error", "timeout", # transport-error log lines
"All free models exhausted", # chain-end log line
)
had_infra_evidence = any(s.lower() in log_text.lower() for s in transient_signals)
if not had_infra_evidence:
pytest.fail(
"explain() fell back to template with NO infra-error log "
"line — this is a real regression, not a network blip. "
f"Captured logs: {log_text!r}"
)
pytest.skip(
"All free models in the chain were rate-limited or unreachable "
"at test time. Re-run later or run scripts/diagnose_openrouter.py."
)
assert result["source"] == "llm"
assert result["model"] is not None and result["model"].endswith(":free"), (
f"unexpected model id (must end with ':free' to ensure no paid model "
f"snuck into the chain): {result['model']!r}"
)
assert result["rationale"].strip(), "LLM returned empty rationale"
# Refusal/safety-filter sanity: catch the common patterns instead
# of just one prefix.
lowered = result["rationale"].lower()
refusal_signals = (
"i cannot",
"i can't",
"i'm sorry, but i",
"i'm sorry, i can't",
"as an ai",
"as a language model",
"i'm unable to",
"i do not have the ability",
)
assert not any(lowered.startswith(s) for s in refusal_signals), (
f"LLM refused (matched refusal pattern): {result['rationale']!r}"
)
# Positive on-topic assertion: the rationale must reference at least
# one of the SHAP feature names from the payload, OR the verdict
# word ("permeable" / "non-permeable"). A model that produced
# off-topic small-talk would fail here.
payload = _payload()
feature_names = [f["feature"] for f in payload["top_features"]]
verdict = payload["label_text"].lower()
on_topic_anchors = [f.lower() for f in feature_names] + [verdict]
assert any(anchor in lowered for anchor in on_topic_anchors), (
f"rationale appears off-topic (no mention of SHAP features "
f"{feature_names!r} or verdict {verdict!r}): {result['rationale']!r}"
)
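
# Note: the `slow` marker used above must be registered with pytest, or
# collection emits PytestUnknownMarkWarning (and errors under
# --strict-markers). Sketch, assuming this repo configures pytest through
# pyproject.toml; adjust if it actually uses pytest.ini or setup.cfg:
#
#     [tool.pytest.ini_options]
#     markers = ["slow: tests that make a real network round-trip"]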