test(llm): tighten live integration test (slow marker, log-gated skip, on-topic assertion)
Browse files- tests/llm/test_explainer.py +66 -22
tests/llm/test_explainer.py
CHANGED
|
@@ -252,44 +252,88 @@ class TestAuthFailureShortCircuits:
|
|
| 252 |
)
|
| 253 |
|
| 254 |
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
@_pytest.mark.skipif(
|
| 261 |
-
not _os.environ.get("OPENROUTER_API_KEY"),
|
| 262 |
reason="OPENROUTER_API_KEY not set — skipping live LLM integration test",
|
| 263 |
)
|
| 264 |
-
@
|
| 265 |
-
|
| 266 |
reason="NEUROBRIDGE_DISABLE_LLM=1 — skipping live LLM integration test",
|
| 267 |
)
|
| 268 |
class TestLiveOpenRouterLLM:
|
| 269 |
"""End-to-end: hit a real OpenRouter free-tier model and assert
|
| 270 |
`explain()` returns source='llm' with non-empty content. Skipped
|
| 271 |
-
when no key is set or the kill-switch is on.
|
| 272 |
|
| 273 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 274 |
from src.llm import explainer as ex
|
| 275 |
|
| 276 |
-
|
|
|
|
| 277 |
|
| 278 |
-
#
|
| 279 |
-
#
|
| 280 |
-
#
|
| 281 |
-
#
|
| 282 |
if result["source"] == "template":
|
| 283 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 284 |
"All free models in the chain were rate-limited or unreachable "
|
| 285 |
"at test time. Re-run later or run scripts/diagnose_openrouter.py."
|
| 286 |
)
|
| 287 |
|
| 288 |
assert result["source"] == "llm"
|
| 289 |
-
assert result["model"] is not None and result["model"].endswith(":free")
|
|
|
|
|
|
|
|
|
|
| 290 |
assert result["rationale"].strip(), "LLM returned empty rationale"
|
| 291 |
-
|
| 292 |
-
#
|
| 293 |
-
#
|
| 294 |
lowered = result["rationale"].lower()
|
| 295 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
)
|
| 253 |
|
| 254 |
|
| 255 |
+
@pytest.mark.slow
|
| 256 |
+
@pytest.mark.skipif(
|
| 257 |
+
not os.environ.get("OPENROUTER_API_KEY"),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 258 |
reason="OPENROUTER_API_KEY not set — skipping live LLM integration test",
|
| 259 |
)
|
| 260 |
+
@pytest.mark.skipif(
|
| 261 |
+
os.environ.get("NEUROBRIDGE_DISABLE_LLM") == "1",
|
| 262 |
reason="NEUROBRIDGE_DISABLE_LLM=1 — skipping live LLM integration test",
|
| 263 |
)
|
| 264 |
class TestLiveOpenRouterLLM:
|
| 265 |
"""End-to-end: hit a real OpenRouter free-tier model and assert
|
| 266 |
`explain()` returns source='llm' with non-empty content. Skipped
|
| 267 |
+
when no key is set or the kill-switch is on.
|
| 268 |
|
| 269 |
+
Marked `slow` because it makes a real network round-trip
|
| 270 |
+
(worst case ~80s if the entire chain is unreachable). Run with
|
| 271 |
+
`pytest -m slow` or include it in the default suite by not passing
|
| 272 |
+
`-m "not slow"`."""
|
| 273 |
+
|
| 274 |
+
def test_bbb_explain_returns_llm_source_with_real_key(self, caplog):
|
| 275 |
+
import logging
|
| 276 |
from src.llm import explainer as ex
|
| 277 |
|
| 278 |
+
with caplog.at_level(logging.INFO, logger="src.llm.explainer"):
|
| 279 |
+
result = ex.explain(_payload(), modality="bbb")
|
| 280 |
|
| 281 |
+
# Flaky-network safety net: only skip when we have evidence the
|
| 282 |
+
# template fallback fired due to transient infra (rate-limit,
|
| 283 |
+
# 5xx, network). If the fallback fired silently — no infra-error
|
| 284 |
+
# log line — that's a real regression we want to fail loud.
|
| 285 |
if result["source"] == "template":
|
| 286 |
+
log_text = " ".join(r.getMessage() for r in caplog.records)
|
| 287 |
+
transient_signals = (
|
| 288 |
+
"429", "OpenRouter 5", "OpenRouter 4", # status-code log lines
|
| 289 |
+
"connection error", "timeout", # transport-error log lines
|
| 290 |
+
"All free models exhausted", # chain-end log line
|
| 291 |
+
)
|
| 292 |
+
had_infra_evidence = any(s.lower() in log_text.lower() for s in transient_signals)
|
| 293 |
+
if not had_infra_evidence:
|
| 294 |
+
pytest.fail(
|
| 295 |
+
"explain() fell back to template with NO infra-error log "
|
| 296 |
+
"line — this is a real regression, not a network blip. "
|
| 297 |
+
f"Captured logs: {log_text!r}"
|
| 298 |
+
)
|
| 299 |
+
pytest.skip(
|
| 300 |
"All free models in the chain were rate-limited or unreachable "
|
| 301 |
"at test time. Re-run later or run scripts/diagnose_openrouter.py."
|
| 302 |
)
|
| 303 |
|
| 304 |
assert result["source"] == "llm"
|
| 305 |
+
assert result["model"] is not None and result["model"].endswith(":free"), (
|
| 306 |
+
f"unexpected model id (must end with ':free' to ensure no paid model "
|
| 307 |
+
f"snuck into the chain): {result['model']!r}"
|
| 308 |
+
)
|
| 309 |
assert result["rationale"].strip(), "LLM returned empty rationale"
|
| 310 |
+
|
| 311 |
+
# Refusal/safety-filter sanity: catch the common patterns instead
|
| 312 |
+
# of just one prefix.
|
| 313 |
lowered = result["rationale"].lower()
|
| 314 |
+
refusal_signals = (
|
| 315 |
+
"i cannot",
|
| 316 |
+
"i can't",
|
| 317 |
+
"i'm sorry, but i",
|
| 318 |
+
"i'm sorry, i can't",
|
| 319 |
+
"as an ai",
|
| 320 |
+
"as a language model",
|
| 321 |
+
"i'm unable to",
|
| 322 |
+
"i do not have the ability",
|
| 323 |
+
)
|
| 324 |
+
assert not any(lowered.startswith(s) for s in refusal_signals), (
|
| 325 |
+
f"LLM refused (matched refusal pattern): {result['rationale']!r}"
|
| 326 |
+
)
|
| 327 |
+
|
| 328 |
+
# Positive on-topic assertion: the rationale must reference at least
|
| 329 |
+
# one of the SHAP feature names from the payload, OR the verdict
|
| 330 |
+
# word ("permeable" / "non-permeable"). A model that produced
|
| 331 |
+
# off-topic small-talk would fail here.
|
| 332 |
+
payload = _payload()
|
| 333 |
+
feature_names = [f["feature"] for f in payload["top_features"]]
|
| 334 |
+
verdict = payload["label_text"].lower()
|
| 335 |
+
on_topic_anchors = [f.lower() for f in feature_names] + [verdict]
|
| 336 |
+
assert any(anchor in lowered for anchor in on_topic_anchors), (
|
| 337 |
+
f"rationale appears off-topic (no mention of SHAP features "
|
| 338 |
+
f"{feature_names!r} or verdict {verdict!r}): {result['rationale']!r}"
|
| 339 |
+
)
|