"""Tests for src.llm.explainer.
The deterministic template path is exhaustively tested here. The LLM
path is exercised only by env-gated integration tests in
test_explainer_integration.py (NOT run in CI by default).
"""
from __future__ import annotations
import os
import pytest
from src.llm.explainer import ExplainPayload, explain
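
# The return contract these tests pin down, sketched here for orientation.
# This is inferred from the assertions in this file, not copied from
# src/llm/explainer.py; treat it as an assumption if the source diverges:
#
#     result = explain(payload, modality="bbb")  # modality defaults to the BBB path
#     result["rationale"]  # str, never empty
#     result["source"]     # "template" (deterministic) or "llm"
#     result["model"]      # None on the template path, a ":free" model id otherwise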


def _payload(**overrides) -> ExplainPayload:
"""Build a representative ExplainPayload; overrides win."""
base: ExplainPayload = {
"smiles": "CCO",
"label": 1,
"label_text": "permeable",
"confidence": 0.82,
"top_features": [
{"feature": "fp_341", "shap_value": 0.045},
{"feature": "fp_902", "shap_value": -0.031},
{"feature": "fp_77", "shap_value": 0.022},
],
"calibration": {"threshold": 0.80, "precision": 0.92, "support": 18},
"drift_z": 0.42,
"user_question": "Why was this molecule predicted as permeable?",
}
base.update(overrides)
return base
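
# Example: _payload(label=0, label_text="non-permeable", confidence=0.51)
# returns the same baseline dict with just those fields swapped; handy for
# probing one field at a time without rebuilding the whole payload.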


class TestTemplateExplain:
"""Day-7 T3A: deterministic-template path of the explainer."""
def test_template_path_is_deterministic(self, monkeypatch):
"""Same input → byte-identical rationale string. No randomness."""
monkeypatch.setenv("NEUROBRIDGE_DISABLE_LLM", "1")
out_a = explain(_payload())
out_b = explain(_payload())
assert out_a["rationale"] == out_b["rationale"]
assert out_a["source"] == "template"
assert out_b["source"] == "template"
assert out_a["model"] is None

    def test_template_includes_top_feature_names(self, monkeypatch):
"""Rationale must mention the SHAP features so jurors see attribution."""
monkeypatch.setenv("NEUROBRIDGE_DISABLE_LLM", "1")
result = explain(_payload())
for feat in ("fp_341", "fp_902", "fp_77"):
assert feat in result["rationale"], (
f"expected feature {feat!r} in rationale, got {result['rationale']!r}"
)

    def test_template_includes_label_text(self, monkeypatch):
"""The verdict word ('permeable' / 'non-permeable') must appear."""
monkeypatch.setenv("NEUROBRIDGE_DISABLE_LLM", "1")
result = explain(_payload(label=0, label_text="non-permeable"))
assert "non-permeable" in result["rationale"]

    def test_disable_flag_forces_template_even_with_key_set(self, monkeypatch):
"""NEUROBRIDGE_DISABLE_LLM=1 wins over OPENROUTER_API_KEY presence."""
monkeypatch.setenv("NEUROBRIDGE_DISABLE_LLM", "1")
monkeypatch.setenv("OPENROUTER_API_KEY", "sk-fake-not-used")
result = explain(_payload())
assert result["source"] == "template"
assert result["model"] is None


class TestEEGTemplate:
"""Day-8 T1A: deterministic EEG template path."""
def test_eeg_template_uses_pipeline_metrics(self, monkeypatch):
monkeypatch.setenv("NEUROBRIDGE_DISABLE_LLM", "1")
payload = {
"rows": 30,
"columns": 95,
"duration_sec": 4.32,
"mlflow_run_id": "abc12345",
"user_question": "Why were epochs dropped?",
}
result = explain(payload, modality="eeg")
assert result["source"] == "template"
assert result["model"] is None
rationale = result["rationale"]
assert "30" in rationale, "epoch count must appear"
assert "95" in rationale, "feature count must appear"
assert "4.3" in rationale, "duration must appear (1-decimal)"


class TestMRITemplate:
"""Day-8 T1A: deterministic MRI template path."""
def test_mri_template_uses_combat_metrics(self, monkeypatch):
monkeypatch.setenv("NEUROBRIDGE_DISABLE_LLM", "1")
payload = {
"site_gap_pre": 5.0004,
"site_gap_post": 0.0015,
"reduction_factor": 3290.0,
"n_subjects": 6,
"user_question": "Why does ComBat matter?",
}
result = explain(payload, modality="mri")
assert result["source"] == "template"
rationale = result["rationale"]
assert "5.00" in rationale or "5.0" in rationale, "pre-gap must appear"
assert "3290" in rationale or "3290×" in rationale, "reduction factor must appear"
assert "6" in rationale, "n_subjects must appear"


class TestModalityDispatch:
"""Day-8 T1A: explain(modality=…) routes to the right template."""
def test_unknown_modality_falls_back_to_bbb_template(self, monkeypatch):
"""Defensive: an unknown modality string degrades gracefully (warn + bbb-style template)."""
monkeypatch.setenv("NEUROBRIDGE_DISABLE_LLM", "1")
payload = {
"smiles": "CCO",
"label": 1,
"label_text": "permeable",
"confidence": 0.82,
"top_features": [{"feature": "fp_1", "shap_value": 0.05}],
}
result = explain(payload, modality="unknown_xyz")
# Should not raise; should produce a non-empty rationale
assert result["source"] == "template"
assert result["rationale"], "rationale must be non-empty"


class TestAuthFailureShortCircuits:
"""A 401 from OpenRouter means the key is unauthorized — every model
in the chain will fail the same way, so we must short-circuit instead
of burning the full chain on every request."""
def test_401_short_circuits_to_template_after_one_attempt(self, monkeypatch):
from src.llm import explainer as ex
from openai import APIStatusError
import httpx
monkeypatch.delenv("NEUROBRIDGE_DISABLE_LLM", raising=False)
monkeypatch.setenv("OPENROUTER_API_KEY", "sk-or-v1-deliberately-bad")
attempts: list[str] = []
def _raise_401(**kwargs):
attempts.append(kwargs["model"])
req = httpx.Request("POST", "https://openrouter.ai/api/v1/chat/completions")
resp = httpx.Response(status_code=401, request=req)
raise APIStatusError(message="No auth credentials found", response=resp, body={})
class _StubCompletions:
create = staticmethod(_raise_401)
class _StubChat:
completions = _StubCompletions()
class _StubClient:
chat = _StubChat()
def __init__(self, **kwargs):
pass
# Must patch on the `openai` module — the explainer does
# `from openai import OpenAI` *inside* the function (see
# src/llm/explainer.py:269-275), so any module-level attribute
# on `src.llm.explainer` would be a no-op.
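        # Paraphrased sketch of that pattern (hedged, not the exact source):
        #
        #     def _llm_explain(payload, modality):
        #         from openai import OpenAI   # name resolved at call time
        #         client = OpenAI(base_url=..., api_key=...)
        #
        # which is why only the attribute on the `openai` module is live here.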
monkeypatch.setattr("openai.OpenAI", _StubClient)
out = ex._llm_explain(_payload(), modality="bbb")
assert out is None, "401 must surface as a None return (caller falls back to template)"
assert len(attempts) == 1, f"401 must short-circuit; tried {len(attempts)} models: {attempts}"

    def test_explain_returns_template_source_on_401(self, monkeypatch):
from src.llm import explainer as ex
from openai import APIStatusError
import httpx
monkeypatch.delenv("NEUROBRIDGE_DISABLE_LLM", raising=False)
monkeypatch.setenv("OPENROUTER_API_KEY", "sk-or-v1-deliberately-bad")
def _raise_401(**kwargs):
req = httpx.Request("POST", "https://openrouter.ai/api/v1/chat/completions")
raise APIStatusError(
message="auth",
response=httpx.Response(401, request=req),
body={},
)
class _Comp:
create = staticmethod(_raise_401)
class _Chat:
completions = _Comp()
class _Client:
chat = _Chat()
def __init__(self, **kwargs):
pass
monkeypatch.setattr("openai.OpenAI", _Client)
result = ex.explain(_payload(), modality="bbb")
assert result["source"] == "template"
assert result["model"] is None
assert result["rationale"], "rationale must never be empty"

    def test_400_advances_to_next_model_instead_of_short_circuiting(self, monkeypatch):
"""A 400 from one model is a prompt-shape mismatch with THAT model
(some models reject system roles, etc.) — try the next, don't give up."""
from src.llm import explainer as ex
from openai import APIStatusError
import httpx
monkeypatch.delenv("NEUROBRIDGE_DISABLE_LLM", raising=False)
monkeypatch.setenv("OPENROUTER_API_KEY", "sk-or-v1-anything")
attempts: list[str] = []
# Force a known multi-model chain so we can count attempts deterministically
monkeypatch.setenv("OPENROUTER_FREE_MODELS", "model-a:free,model-b:free,model-c:free")
def _raise_400(**kwargs):
attempts.append(kwargs["model"])
req = httpx.Request("POST", "https://openrouter.ai/api/v1/chat/completions")
raise APIStatusError(
message="bad request",
response=httpx.Response(400, request=req),
body={},
)
class _Comp:
create = staticmethod(_raise_400)
class _Chat:
completions = _Comp()
class _Client:
chat = _Chat()
def __init__(self, **kwargs):
pass
monkeypatch.setattr("openai.OpenAI", _Client)
out = ex._llm_explain(_payload(), modality="bbb")
assert out is None, "all models 400'd → must return None for template fallback"
assert attempts == ["model-a:free", "model-b:free", "model-c:free"], (
f"400 must advance to next model; got attempts={attempts}"
)
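
    # Status-code policy pinned down by the tests above (summary, not source):
    #   401 → short-circuit after one attempt (the key is bad for every model)
    #   400 → advance to the next model in the chain (model-specific rejection)
    #   chain exhausted → _llm_explain returns None → explain() falls back to
    #   the deterministic template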


@pytest.mark.slow
@pytest.mark.skipif(
not os.environ.get("OPENROUTER_API_KEY"),
reason="OPENROUTER_API_KEY not set — skipping live LLM integration test",
)
@pytest.mark.skipif(
os.environ.get("NEUROBRIDGE_DISABLE_LLM") == "1",
reason="NEUROBRIDGE_DISABLE_LLM=1 — skipping live LLM integration test",
)
class TestLiveOpenRouterLLM:
    """End-to-end: hit a real OpenRouter free-tier model and assert
    `explain()` returns source='llm' with non-empty content. Skipped
    when no key is set or the kill-switch is on.

    Marked `slow` because it makes a real network round-trip
    (worst case ~80s if the entire chain is unreachable). Run with
    `pytest -m slow`, or include it in the default suite by not passing
    `-m "not slow"`.
    """
def test_bbb_explain_returns_llm_source_with_real_key(self, caplog):
import logging
from src.llm import explainer as ex
# The explainer's logger has propagate=False (see src/core/logger.py),
# so caplog's root-level handler never sees its records. Attach the
# caplog handler directly to bypass propagation.
ex.logger.addHandler(caplog.handler)
try:
with caplog.at_level(logging.INFO, logger="src.llm.explainer"):
result = ex.explain(_payload(), modality="bbb")
finally:
ex.logger.removeHandler(caplog.handler)
# Flaky-network safety net: only skip when we have evidence the
# template fallback fired due to transient infra (rate-limit,
# 5xx, network). If the fallback fired silently — no infra-error
# log line — that's a real regression we want to fail loud.
if result["source"] == "template":
log_text = " ".join(r.getMessage() for r in caplog.records)
transient_signals = (
"429", "OpenRouter 5", "OpenRouter 4", # status-code log lines
"connection error", "timeout", # transport-error log lines
"All free models exhausted", # chain-end log line
)
had_infra_evidence = any(s.lower() in log_text.lower() for s in transient_signals)
if not had_infra_evidence:
pytest.fail(
"explain() fell back to template with NO infra-error log "
"line — this is a real regression, not a network blip. "
f"Captured logs: {log_text!r}"
)
pytest.skip(
"All free models in the chain were rate-limited or unreachable "
"at test time. Re-run later or run scripts/diagnose_openrouter.py."
)
assert result["source"] == "llm"
assert result["model"] is not None and result["model"].endswith(":free"), (
f"unexpected model id (must end with ':free' to ensure no paid model "
f"snuck into the chain): {result['model']!r}"
)
assert result["rationale"].strip(), "LLM returned empty rationale"
# Refusal/safety-filter sanity: catch the common patterns instead
# of just one prefix.
lowered = result["rationale"].lower()
refusal_signals = (
"i cannot",
"i can't",
"i'm sorry, but i",
"i'm sorry, i can't",
"as an ai",
"as a language model",
"i'm unable to",
"i do not have the ability",
)
assert not any(lowered.startswith(s) for s in refusal_signals), (
f"LLM refused (matched refusal pattern): {result['rationale']!r}"
)
# Positive on-topic assertion: the rationale must reference at least
# one of the SHAP feature names from the payload, OR the verdict
# word ("permeable" / "non-permeable"). A model that produced
# off-topic small-talk would fail here.
payload = _payload()
feature_names = [f["feature"] for f in payload["top_features"]]
verdict = payload["label_text"].lower()
on_topic_anchors = [f.lower() for f in feature_names] + [verdict]
assert any(anchor in lowered for anchor in on_topic_anchors), (
f"rationale appears off-topic (no mention of SHAP features "
f"{feature_names!r} or verdict {verdict!r}): {result['rationale']!r}"
)
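
# Note: the `slow` marker used above must be registered with pytest, or
# collection emits PytestUnknownMarkWarning (and errors under
# --strict-markers). Sketch, assuming this repo configures pytest through
# pyproject.toml; adjust if it actually uses pytest.ini or setup.cfg:
#
#     [tool.pytest.ini_options]
#     markers = ["slow: tests that make a real network round-trip"]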