Spaces:

mekosotto
/

hackathon

Running

mekosotto Claude Opus 4.7 (1M context) committed 8 days ago

Commit

e5c1c61

1 Parent(s): d69f171

feat(llm): explainer with deterministic template + OpenRouter fallback

- New module src/llm/explainer.py — single public entry point
explain(payload). Returns {rationale, source, model}. Never raises.
- Deterministic template (4 sentences: verdict, calibration if any,
top-3 SHAP, drift) is the source of truth for tests.
- LLM path: OpenRouter chat completions via openai==1.51.0 SDK,
model meta-llama/llama-3.2-3b-instruct:free, 8s timeout, 256 max
tokens, temperature 0.3. Gated by OPENROUTER_API_KEY presence and
NEUROBRIDGE_DISABLE_LLM=1 kill-switch.
- Fallback chain: env-disabled → no key → SDK ImportError → API error
→ empty/malformed response → all degrade to template, log WARNING,
source="template".
- 4 new tests: deterministic, top features included, label text
included, kill-switch overrides key.
- New pip dep: openai==1.51.0 (~600KB, transitive deps already present).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (5) hide show

requirements.txt +3 -0
src/llm/__init__.py +8 -0
src/llm/explainer.py +211 -0
tests/llm/__init__.py +0 -0
tests/llm/test_explainer.py +70 -0

requirements.txt CHANGED Viewed

@@ -39,3 +39,6 @@ httpx==0.27.2  # FastAPI test client
 # --- Frontend (B2B dashboard) ---
 streamlit==1.39.0

 # --- Frontend (B2B dashboard) ---
 streamlit==1.39.0
+# --- LLM provider (Day 7 explainer) ---
+openai==1.51.0  # OpenRouter SDK (Day-7 LLM explainer; deterministic-template fallback always available)

src/llm/__init__.py ADDED Viewed

	@@ -0,0 +1,8 @@

+"""LLM-backed natural-language explainers (Day 7).
+`explain()` is the ONLY public entry point. It guarantees a non-empty
+rationale every call: tries OpenRouter when available, falls back to a
+deterministic template otherwise. The deterministic path is the source
+of truth for tests; the LLM path is gated behind env config.
+"""
+from src.llm.explainer import ExplainPayload, ExplainResult, explain  # noqa: F401

src/llm/explainer.py ADDED Viewed

	@@ -0,0 +1,211 @@

+"""Natural-language rationale for a single BBB prediction.
+Public entry point: `explain(payload)`. Always returns a usable
+ExplainResult — never raises. Tries OpenRouter first when a key is set
+and the kill-switch is off; falls back to a deterministic template on
+any failure (network, auth, rate limit, malformed response).
+Test discipline: deterministic template path is the source of truth.
+LLM path is env-gated and exercised by integration tests only.
+"""
+from __future__ import annotations
+import os
+from typing import Any, TypedDict
+from src.core.logger import get_logger
+logger = get_logger(__name__)
+class FeatureRow(TypedDict):
+    """One feature-attribution row, read by the template and the LLM prompt."""
+    feature: str  # feature name, printed verbatim in the rationale
+    shap_value: float  # attribution value, rendered with an explicit sign and 3 decimals
+class CalibrationDict(TypedDict):
+    """Calibration statistics behind the optional calibration sentence."""
+    threshold: float  # fraction; multiplied by 100 and shown as the "≥N%" bin
+    precision: float  # fraction; multiplied by 100 and shown as "correct N% of the time"
+    support: int  # shown as n=...; the sentence is emitted only when support > 0
+class ExplainPayload(TypedDict, total=False):
+    """Input to `explain()`. Declared with total=False, so every key is optional."""
+    smiles: str  # read only when building the LLM prompt ("?" when absent)
+    label: int  # NOTE(review): not read anywhere in this module — presumably kept for callers
+    label_text: str  # verdict wording; the template falls back to "unknown" when absent
+    confidence: float  # fraction; rendered as a whole-number percentage, 0.0 when absent
+    top_features: list[FeatureRow]  # template uses the first 3 rows, the LLM prompt the first 5
+    calibration: CalibrationDict | None  # None or absent omits the calibration sentence
+    drift_z: float | None  # None or absent omits the drift sentence ("n/a" in the LLM prompt)
+    user_question: str  # optional; the LLM prompt substitutes a default instruction when empty
+class ExplainResult(TypedDict):
+    """Return value of `explain()`."""
+    rationale: str       # text from the LLM, or from the deterministic template
+    source: str          # "llm" | "template"
+    model: str | None    # llm model name when source="llm", else None
+# OpenRouter endpoint, passed as base_url to the OpenAI client.
+_OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"
+# Model requested for every completion; also reported back as ExplainResult["model"].
+_DEFAULT_MODEL = "meta-llama/llama-3.2-3b-instruct:free"
+# Timeout in seconds, handed to the OpenAI client constructor.
+_LLM_TIMEOUT_SECONDS = 8.0
+# max_tokens for the chat-completion request.
+_LLM_MAX_TOKENS = 256
+# Sampling temperature for the chat-completion request.
+_LLM_TEMPERATURE = 0.3
+def _should_use_llm() -> bool:
+    """Report whether the LLM path may be attempted.
+    True only when NEUROBRIDGE_DISABLE_LLM is not exactly "1" and
+    OPENROUTER_API_KEY is set to a non-empty value.
+    """
+    kill_switch_on = os.environ.get("NEUROBRIDGE_DISABLE_LLM") == "1"
+    key_present = bool(os.environ.get("OPENROUTER_API_KEY"))
+    return key_present and not kill_switch_on
+def _drift_interpretation(drift_z: float | None) -> str:
+    """Map a drift z-score to a short human-readable interpretation.
+    Bands are on the absolute value: below 1.0 is "within expected range",
+    below 2.0 is "mild distribution shift", anything else is a significant
+    shift. None and NaN both yield "drift unavailable".
+    """
+    import math  # function-scope import: the module's import block is left untouched
+    # NaN fails every `<` comparison, so without this guard it would fall
+    # through to the "retrain recommended" verdict.
+    if drift_z is None or math.isnan(drift_z):
+        return "drift unavailable"
+    mag = abs(drift_z)
+    if mag < 1.0:
+        return "within expected range"
+    if mag < 2.0:
+        return "mild distribution shift"
+    return "significant shift, retrain recommended"
+def _finite_float(value: Any) -> float | None:
+    """Return `value` as a finite float, or None when it cannot be one."""
+    import math  # function-scope import keeps this helper self-contained
+    try:
+        number = float(value)
+    except (TypeError, ValueError, OverflowError):
+        return None
+    return number if math.isfinite(number) else None
+def _template_explain(payload: ExplainPayload) -> str:
+    """Deterministic, jury-friendly rationale built from `payload`.
+    Emits up to four sentences: verdict, calibration, top-3 SHAP
+    attributions, drift. Output for a well-formed payload is unchanged.
+    Malformed optional parts (missing keys, non-numeric or non-finite
+    values) are skipped instead of raising, so the fallback path cannot
+    itself fail on bad data. A missing or unusable confidence renders as 0%.
+    """
+    label_text = payload.get("label_text") or "unknown"
+    confidence = _finite_float(payload.get("confidence"))
+    if confidence is None:
+        confidence = 0.0
+    # Sentence 1 (always present)
+    sentences = [
+        f"Predicted **{label_text}** with {confidence * 100:.0f}% confidence."
+    ]
+    # Sentence 2 (calibration, optional)
+    cal = payload.get("calibration")
+    if cal is not None:
+        try:
+            raw_cal = (cal["threshold"], cal["precision"], cal["support"])
+        except (KeyError, TypeError, IndexError):
+            raw_cal = None  # partial or non-mapping calibration: omit the sentence
+        if raw_cal is not None:
+            threshold, precision, support = (_finite_float(v) for v in raw_cal)
+            if (
+                threshold is not None
+                and precision is not None
+                and support is not None
+                and int(support) > 0
+            ):
+                sentences.append(
+                    f"Calibration: predictions in the ≥{threshold * 100:.0f}% bin are "
+                    f"correct {precision * 100:.0f}% of the time on held-out data "
+                    f"(n={int(support)})."
+                )
+    # Sentence 3 (top-3 SHAP features)
+    try:
+        head_rows = list(payload.get("top_features") or [])[:3]
+    except TypeError:
+        head_rows = []  # not iterable: treat as no attributions
+    feat_strs = []
+    for row in head_rows:
+        try:
+            name, raw_value = row["feature"], row["shap_value"]
+        except (KeyError, TypeError, IndexError):
+            continue  # malformed row: leave it out rather than fail
+        value = _finite_float(raw_value)
+        if value is not None:
+            feat_strs.append(f"{name} (Δ{value:+.3f})")
+    if feat_strs:
+        sentences.append(
+            f"Top SHAP attributions toward this label: {', '.join(feat_strs)}."
+        )
+    # Sentence 4 (drift, optional)
+    drift = _finite_float(payload.get("drift_z"))
+    if drift is not None:
+        interp = _drift_interpretation(drift)
+        sentences.append(
+            f"Drift signal: trailing-100 confidence median is "
+            f"{drift:+.2f}σ from training distribution ({interp})."
+        )
+    return " ".join(sentences)
+def _build_llm_prompt(payload: ExplainPayload) -> str:
+    """Render the payload and the user's question as one prompt string.
+    Lists at most five SHAP rows, shows "n/a" for a missing drift score,
+    and substitutes a default instruction when no question is supplied.
+    """
+    rows = payload.get("top_features") or []
+    bullets = [
+        f"  - {row['feature']}: Δ{float(row['shap_value']):+.3f}"
+        for row in rows[:5]
+    ]
+    feature_block = "\n".join(bullets) if bullets else "  - (none)"
+    raw_drift = payload.get("drift_z")
+    if raw_drift is None:
+        drift_text = "n/a"
+    else:
+        drift_text = f"{float(raw_drift):+.2f}"
+    question = payload.get("user_question")
+    if not question:
+        question = "Explain the prediction in 2-4 sentences."
+    instructions = (
+        "You are a clinical-ML explainer for a B2B blood-brain-barrier "
+        "permeability tool. Given the prediction details below, write a "
+        "2-4 sentence rationale a researcher could paste into a paper. "
+        "Use the SHAP attributions to justify the verdict. Mention drift "
+        "if abnormal. Avoid hedging; be specific about the numbers."
+    )
+    smiles_line = f"- SMILES: {payload.get('smiles', '?')}"
+    verdict_line = (
+        f"- Verdict: {payload.get('label_text', '?')} "
+        f"({float(payload.get('confidence', 0.0)) * 100:.0f}% confident)"
+    )
+    detail_lines = [
+        "Prediction:",
+        smiles_line,
+        verdict_line,
+        "- Top SHAP features (positive = pushed toward verdict):",
+        feature_block,
+        f"- Drift z-score: {drift_text}",
+        "",
+        f"User question: {question}",
+        "",
+        "Respond with the rationale only, no preamble.",
+    ]
+    return instructions + "\n\n" + "\n".join(detail_lines)
+def _llm_explain(payload: ExplainPayload) -> tuple[str, str] | None:
+    """Try the OpenRouter chat completion. Return (rationale, model) or None.
+    None means "use the template": the SDK is not importable, no API key
+    is set, building the prompt or the client failed, the request failed,
+    or the response carried no usable text. Every failure is logged at
+    WARNING and none is propagated.
+    """
+    try:
+        # Local import — keeps this dep optional at module load time.
+        from openai import OpenAI
+    except ImportError as e:
+        logger.warning("openai SDK not importable: %s", e)
+        return None
+    api_key = os.environ.get("OPENROUTER_API_KEY")
+    if not api_key:
+        return None
+    try:
+        # Prompt building and client construction live inside the guard too:
+        # a malformed payload or a constructor error must degrade to the
+        # template, not escape to the caller.
+        prompt = _build_llm_prompt(payload)
+        client = OpenAI(
+            base_url=_OPENROUTER_BASE_URL,
+            api_key=api_key,
+            timeout=_LLM_TIMEOUT_SECONDS,
+        )
+        # NOTE(review): the SDK appears to retry failed requests by default,
+        # so worst-case latency may exceed the timeout — confirm, and pass
+        # max_retries=0 to the client if a hard budget is required.
+        completion = client.chat.completions.create(
+            model=_DEFAULT_MODEL,
+            messages=[{"role": "user", "content": prompt}],
+            max_tokens=_LLM_MAX_TOKENS,
+            temperature=_LLM_TEMPERATURE,
+        )
+    except Exception as e:  # broad: APITimeoutError, APIConnectionError, RateLimitError, ...
+        logger.warning("LLM call failed (%s); falling back to template.", type(e).__name__)
+        return None
+    try:
+        text = completion.choices[0].message.content
+    except (AttributeError, IndexError, TypeError) as e:
+        logger.warning("LLM response malformed (%s); falling back to template.", e)
+        return None
+    # isinstance guard: a non-string content would otherwise raise on .strip().
+    if not isinstance(text, str) or not text.strip():
+        logger.warning("LLM returned empty rationale; falling back to template.")
+        return None
+    return text.strip(), _DEFAULT_MODEL
+def explain(payload: ExplainPayload) -> ExplainResult:
+    """Return a natural-language rationale for a BBB prediction.
+    Tries the LLM first when env-permitted. Any failure on that path,
+    including an unexpected exception, degrades to the deterministic
+    template with source="template" and model=None.
+    NOTE(review): the overall no-raise guarantee also rests on
+    `_template_explain` not raising for the given payload.
+    """
+    if _should_use_llm():
+        try:
+            llm_out = _llm_explain(payload)
+        except Exception as e:  # backstop for the "fall back on any failure" contract
+            logger.warning(
+                "LLM path raised (%s); falling back to template.", type(e).__name__
+            )
+            llm_out = None
+        if llm_out is not None:
+            rationale, model = llm_out
+            return ExplainResult(rationale=rationale, source="llm", model=model)
+        # else: fall through to template
+    return ExplainResult(
+        rationale=_template_explain(payload),
+        source="template",
+        model=None,
+    )

tests/llm/__init__.py ADDED Viewed

File without changes

tests/llm/test_explainer.py ADDED Viewed

	@@ -0,0 +1,70 @@

+"""Tests for src.llm.explainer.
+The deterministic template path is exhaustively tested here. The LLM
+path is exercised only by env-gated integration tests in
+test_explainer_integration.py (NOT run in CI by default).
+"""
+from __future__ import annotations
+import os
+import pytest
+from src.llm.explainer import ExplainPayload, explain
+def _payload(**overrides) -> ExplainPayload:
+    """Return a representative ExplainPayload, with `overrides` applied last."""
+    shap_rows = [
+        {"feature": "fp_341", "shap_value": 0.045},
+        {"feature": "fp_902", "shap_value": -0.031},
+        {"feature": "fp_77", "shap_value": 0.022},
+    ]
+    defaults: ExplainPayload = {
+        "smiles": "CCO",
+        "label": 1,
+        "label_text": "permeable",
+        "confidence": 0.82,
+        "top_features": shap_rows,
+        "calibration": {"threshold": 0.80, "precision": 0.92, "support": 18},
+        "drift_z": 0.42,
+        "user_question": "Why was this molecule predicted as permeable?",
+    }
+    # Later keys win in a dict merge, so overrides replace the defaults.
+    return {**defaults, **overrides}
+class TestTemplateExplain:
+    """Day-7 T3A: checks on the explainer's deterministic template path."""
+    def test_template_path_is_deterministic(self, monkeypatch):
+        """Two calls with equal payloads yield identical rationale text."""
+        monkeypatch.setenv("NEUROBRIDGE_DISABLE_LLM", "1")
+        first, second = explain(_payload()), explain(_payload())
+        assert first["rationale"] == second["rationale"]
+        assert first["source"] == "template"
+        assert second["source"] == "template"
+        assert first["model"] is None
+    def test_template_includes_top_feature_names(self, monkeypatch):
+        """Every SHAP feature name in the payload shows up in the rationale."""
+        monkeypatch.setenv("NEUROBRIDGE_DISABLE_LLM", "1")
+        rationale = explain(_payload())["rationale"]
+        expected_names = ("fp_341", "fp_902", "fp_77")
+        for feat in expected_names:
+            assert feat in rationale, (
+                f"expected feature {feat!r} in rationale, got {rationale!r}"
+            )
+    def test_template_includes_label_text(self, monkeypatch):
+        """The verdict wording passed in as label_text appears in the output."""
+        monkeypatch.setenv("NEUROBRIDGE_DISABLE_LLM", "1")
+        negative_case = _payload(label=0, label_text="non-permeable")
+        assert "non-permeable" in explain(negative_case)["rationale"]
+    def test_disable_flag_forces_template_even_with_key_set(self, monkeypatch):
+        """The kill-switch takes precedence over a configured API key."""
+        for name, value in (
+            ("NEUROBRIDGE_DISABLE_LLM", "1"),
+            ("OPENROUTER_API_KEY", "sk-fake-not-used"),
+        ):
+            monkeypatch.setenv(name, value)
+        outcome = explain(_payload())
+        assert outcome["source"] == "template"
+        assert outcome["model"] is None