"""
Two deliberately weak baselines that produce structured, complementary failure patterns.
Model A — predict_always_1:
Predicts 1 for every input. Fails on all y=0 items (systematic bias toward positive).
Model B — keyword_heuristic:
Predicts 0 when the conclusion or statement contains explicit negation markers;
predicts 1 otherwise. Fails on affirmative conclusions that are false, and on
negated conclusions that are true.
Using two models lets MI(cluster, model_identity) be informative: if clusters
partially track which model failed rather than only which reasoning type failed,
that signals model-specific failure geometry.
Adapted from failure-induced-benchmarks/src/failure_geometry/embedding.py
(engineered heuristics, not trained weights).
"""
from __future__ import annotations
from typing import Any
_NEGATION_TOKENS = frozenset([
"not", "no ", "never", "cannot", "can't", "isn't", "aren't",
"doesn't", "don't", "didn't", "won't", "hasn't", "haven't",
"unpaid", "expired", "banned", "broken", "empty", "dead",
"missing", "disabled", "invalid", "locked", "closed",
])
_STATEMENT_MARKERS = ("statement:", "conclusion:", "statement :")
def _extract_statement(x: str) -> str:
"""Pull out the asserted part so the heuristic focuses there."""
lowered = x.lower()
for marker in _STATEMENT_MARKERS:
idx = lowered.rfind(marker)
if idx != -1:
return x[idx + len(marker):].strip()
return x
def predict_always_1(x: str) -> int:
    """Model A: ignore the input and always predict the positive class."""
    del x  # input-blind by construction; fails on every y=0 item
    return 1
def predict_keyword_heuristic(x: str) -> int:
    """Model B: predict 0 iff the focal statement carries a negation marker.

    The statement is isolated with `_extract_statement`, lowercased, and
    scanned for any `_NEGATION_TOKENS` substring; otherwise predict 1.
    """
    focus = _extract_statement(x).lower()
    negated = any(marker in focus for marker in _NEGATION_TOKENS)
    return 0 if negated else 1
# Registry of baseline models: short model id -> prediction callable
# (str input -> 0/1 label). `run_baselines` resolves model ids here.
BASELINES: dict[str, Any] = {
    "always_1": predict_always_1,
    "keyword_heuristic": predict_keyword_heuristic,
}
def run_baselines(
    dataset: list[dict[str, Any]],
    selected: list[str] | None = None,
) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
    """Evaluate each chosen baseline on every sample.

    Args:
        dataset: samples carrying at least "x", "y", and "reasoning_type".
        selected: baseline ids to run; a falsy value (None or an empty
            list) runs every registered baseline.

    Returns:
        (all_results, failures). Each record holds sample_id, x, y,
        reasoning_type, model_id, prediction, and is_correct; failures is
        the subset of records where the prediction missed the label.
    """
    chosen = selected or list(BASELINES)
    results: list[dict[str, Any]] = []
    misses: list[dict[str, Any]] = []
    for idx, item in enumerate(dataset):
        gold = int(item["y"])
        for name in chosen:
            guess = BASELINES[name](item["x"])
            rec = {
                "sample_id": idx,
                "x": item["x"],
                "y": gold,
                "reasoning_type": item["reasoning_type"],
                "model_id": name,
                "prediction": guess,
                "is_correct": guess == gold,
            }
            results.append(rec)
            if not rec["is_correct"]:
                misses.append(rec)
    return results, misses