mekosotto committed on
Commit
b38c6b3
·
1 Parent(s): 32e1d40

test(llm): tighten live integration test (slow marker, log-gated skip, on-topic assertion)

Browse files
Files changed (1) hide show
  1. tests/llm/test_explainer.py +66 -22
tests/llm/test_explainer.py CHANGED
@@ -252,44 +252,88 @@ class TestAuthFailureShortCircuits:
252
  )
253
 
254
 
255
- import os as _os
256
-
257
- import pytest as _pytest
258
-
259
-
260
- @_pytest.mark.skipif(
261
- not _os.environ.get("OPENROUTER_API_KEY"),
262
  reason="OPENROUTER_API_KEY not set — skipping live LLM integration test",
263
  )
264
- @_pytest.mark.skipif(
265
- _os.environ.get("NEUROBRIDGE_DISABLE_LLM") == "1",
266
  reason="NEUROBRIDGE_DISABLE_LLM=1 — skipping live LLM integration test",
267
  )
268
  class TestLiveOpenRouterLLM:
269
  """End-to-end: hit a real OpenRouter free-tier model and assert
270
  `explain()` returns source='llm' with non-empty content. Skipped
271
- when no key is set or the kill-switch is on."""
272
 
273
- def test_bbb_explain_returns_llm_source_with_real_key(self):
 
 
 
 
 
 
274
  from src.llm import explainer as ex
275
 
276
- result = ex.explain(_payload(), modality="bbb")
 
277
 
278
- # If every model in the chain is rate-limited or unreachable RIGHT NOW
279
- # the result will fall back to template that's a flaky-network
280
- # condition, not a code bug. Surface it as an XFAIL-style assertion
281
- # message instead of a hard failure.
282
  if result["source"] == "template":
283
- _pytest.skip(
 
 
 
 
 
 
 
 
 
 
 
 
 
284
  "All free models in the chain were rate-limited or unreachable "
285
  "at test time. Re-run later or run scripts/diagnose_openrouter.py."
286
  )
287
 
288
  assert result["source"] == "llm"
289
- assert result["model"] is not None and result["model"].endswith(":free")
 
 
 
290
  assert result["rationale"].strip(), "LLM returned empty rationale"
291
- # Sanity: the rationale should mention SOMETHING about the prediction.
292
- # We do not assert on exact model wording (non-deterministic), but
293
- # we do assert it isn't a generic refusal/safety-filter response.
294
  lowered = result["rationale"].lower()
295
- assert not lowered.startswith("i cannot"), f"LLM refused: {result['rationale']!r}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
  )
253
 
254
 
255
+ @pytest.mark.slow
256
+ @pytest.mark.skipif(
257
+ not os.environ.get("OPENROUTER_API_KEY"),
 
 
 
 
258
  reason="OPENROUTER_API_KEY not set — skipping live LLM integration test",
259
  )
260
+ @pytest.mark.skipif(
261
+ os.environ.get("NEUROBRIDGE_DISABLE_LLM") == "1",
262
  reason="NEUROBRIDGE_DISABLE_LLM=1 — skipping live LLM integration test",
263
  )
264
  class TestLiveOpenRouterLLM:
265
  """End-to-end: hit a real OpenRouter free-tier model and assert
266
  `explain()` returns source='llm' with non-empty content. Skipped
267
+ when no key is set or the kill-switch is on.
268
 
269
+ Marked `slow` because it makes a real network round-trip
270
+ (worst case ~80s if the entire chain is unreachable). Run with
271
+ `pytest -m slow` or include it in the default suite by not passing
272
+ `-m "not slow"`."""
273
+
274
+ def test_bbb_explain_returns_llm_source_with_real_key(self, caplog):
275
+ import logging
276
  from src.llm import explainer as ex
277
 
278
+ with caplog.at_level(logging.INFO, logger="src.llm.explainer"):
279
+ result = ex.explain(_payload(), modality="bbb")
280
 
281
+ # Flaky-network safety net: only skip when we have evidence the
282
+ # template fallback fired due to transient infra (rate-limit,
283
+ # 5xx, network). If the fallback fired silently — with no infra-error
284
+ # log line — that's a real regression, and we want to fail loud.
285
  if result["source"] == "template":
286
+ log_text = " ".join(r.getMessage() for r in caplog.records)
287
+ transient_signals = (
288
+ "429", "OpenRouter 5", "OpenRouter 4", # status-code log lines
289
+ "connection error", "timeout", # transport-error log lines
290
+ "All free models exhausted", # chain-end log line
291
+ )
292
+ had_infra_evidence = any(s.lower() in log_text.lower() for s in transient_signals)
293
+ if not had_infra_evidence:
294
+ pytest.fail(
295
+ "explain() fell back to template with NO infra-error log "
296
+ "line — this is a real regression, not a network blip. "
297
+ f"Captured logs: {log_text!r}"
298
+ )
299
+ pytest.skip(
300
  "All free models in the chain were rate-limited or unreachable "
301
  "at test time. Re-run later or run scripts/diagnose_openrouter.py."
302
  )
303
 
304
  assert result["source"] == "llm"
305
+ assert result["model"] is not None and result["model"].endswith(":free"), (
306
+ f"unexpected model id (must end with ':free' to ensure no paid model "
307
+ f"snuck into the chain): {result['model']!r}"
308
+ )
309
  assert result["rationale"].strip(), "LLM returned empty rationale"
310
+
311
+ # Refusal/safety-filter sanity: catch the common patterns instead
312
+ # of just one prefix.
313
  lowered = result["rationale"].lower()
314
+ refusal_signals = (
315
+ "i cannot",
316
+ "i can't",
317
+ "i'm sorry, but i",
318
+ "i'm sorry, i can't",
319
+ "as an ai",
320
+ "as a language model",
321
+ "i'm unable to",
322
+ "i do not have the ability",
323
+ )
324
+ assert not any(lowered.startswith(s) for s in refusal_signals), (
325
+ f"LLM refused (matched refusal pattern): {result['rationale']!r}"
326
+ )
327
+
328
+ # Positive on-topic assertion: the rationale must reference at least
329
+ # one of the SHAP feature names from the payload, OR the verdict
330
+ # word ("permeable" / "non-permeable"). A model that produced
331
+ # off-topic small-talk would fail here.
332
+ payload = _payload()
333
+ feature_names = [f["feature"] for f in payload["top_features"]]
334
+ verdict = payload["label_text"].lower()
335
+ on_topic_anchors = [f.lower() for f in feature_names] + [verdict]
336
+ assert any(anchor in lowered for anchor in on_topic_anchors), (
337
+ f"rationale appears off-topic (no mention of SHAP features "
338
+ f"{feature_names!r} or verdict {verdict!r}): {result['rationale']!r}"
339
+ )