| """ |
| Two deliberately weak baselines that produce structured, complementary failure patterns. |
| |
| Model A — predict_always_1: |
| Predicts 1 for every input. Fails on all y=0 items (systematic bias toward positive). |
| |
| Model B — keyword_heuristic: |
| Predicts 0 when the conclusion or statement contains explicit negation markers; |
| predicts 1 otherwise. Fails on affirmative conclusions that are false, and on |
| negated conclusions that are true. |
| |
| Using two models lets MI(cluster, model_identity) be informative: if clusters |
| partially track which model failed rather than only which reasoning type failed, |
| that signals model-specific failure geometry. |
| |
| Adapted from failure-induced-benchmarks/src/failure_geometry/embedding.py |
| (engineered heuristics, not trained weights). |
| """ |
|
|
from __future__ import annotations

from collections.abc import Callable
from typing import Any
|
|
|
|
_NEGATION_TOKENS = frozenset([
    # Grammatical negation cues. "no " keeps a trailing space so the bare
    # word "no" matches without also firing inside words such as "know" or
    # "enough" (matching is plain substring search; see
    # predict_keyword_heuristic).
    "not", "no ", "never", "cannot", "can't", "isn't", "aren't",
    "doesn't", "don't", "didn't", "won't", "hasn't", "haven't",
    # Lexical "negative state" cues that typically mark a false claim.
    "unpaid", "expired", "banned", "broken", "empty", "dead",
    "missing", "disabled", "invalid", "locked", "closed",
])
|
|
# Prompt markers whose trailing text carries the asserted claim; consumed by
# _extract_statement.
_STATEMENT_MARKERS = ("statement:", "conclusion:", "statement :")
|
|
|
|
def _extract_statement(x: str) -> str:
    """Pull out the asserted part so the heuristic focuses there."""
    lowered = x.lower()
    # Use the marker occurring last in the input, so a prompt such as
    # "Statement: ... Conclusion: ..." yields the conclusion rather than
    # whichever marker happens to come first in _STATEMENT_MARKERS.
    best_idx, best_len = -1, 0
    for marker in _STATEMENT_MARKERS:
        idx = lowered.rfind(marker)
        if idx > best_idx:
            best_idx, best_len = idx, len(marker)
    if best_idx != -1:
        return x[best_idx + best_len:].strip()
    return x
|
|
|
|
def predict_always_1(x: str) -> int:
    """Model A: constant positive prediction, regardless of the input."""
    return 1
|
|
|
|
def predict_keyword_heuristic(x: str) -> int:
    """Model B: predict 0 iff the extracted statement contains a negation cue."""
    statement = _extract_statement(x).lower()
    for token in _NEGATION_TOKENS:
        # Deliberately crude: plain substring match with no word boundaries,
        # so "not" also fires inside words such as "note".
        if token in statement:
            return 0
    return 1
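
# Illustrative behavior on hypothetical inputs (assuming the "Conclusion:"
# prompt format handled by _extract_statement):
#     predict_keyword_heuristic("Conclusion: the account is locked")  -> 0
#     predict_keyword_heuristic("Conclusion: the account is active")  -> 1
# Per the module docstring, the failure modes are complementary: this model
# answers 1 on affirmative-but-false conclusions and 0 on negated-but-true
# ones, while predict_always_1 fails on exactly the y=0 items.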
|
|
|
|
BASELINES: dict[str, Callable[[str], int]] = {
    "always_1": predict_always_1,
    "keyword_heuristic": predict_keyword_heuristic,
}
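
# The keys above double as the model_id values in result records; e.g.
# run_baselines(dataset, selected=["keyword_heuristic"]) scores Model B only.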
|
|
|
|
def run_baselines(
    dataset: list[dict[str, Any]],
    selected: list[str] | None = None,
) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
    """
    Run every selected baseline over the dataset.

    Returns (all_results, failures), where failures is the subset of
    all_results with is_correct == False. Each result record carries:
        sample_id, x, y, reasoning_type, model_id, prediction, is_correct
    """
    model_ids = selected if selected else list(BASELINES.keys())
    unknown = set(model_ids) - BASELINES.keys()
    if unknown:
        raise ValueError(f"unknown baseline(s): {sorted(unknown)}")

    all_results: list[dict[str, Any]] = []
    failures: list[dict[str, Any]] = []

    for sample_id, sample in enumerate(dataset):
        expected = int(sample["y"])
        for model_id in model_ids:
            fn = BASELINES[model_id]
            prediction = fn(sample["x"])
            is_correct = prediction == expected
            record = {
                "sample_id": sample_id,
                "x": sample["x"],
                "y": expected,
                "reasoning_type": sample["reasoning_type"],
                "model_id": model_id,
                "prediction": prediction,
                "is_correct": is_correct,
            }
            all_results.append(record)
            if not is_correct:
                failures.append(record)

    return all_results, failures
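

# --- Illustrative additions (not part of the original module) --------------
# The module docstring motivates MI(cluster, model_identity). Clustering is
# out of scope here, so the sketch below computes empirical MI between two
# paired label sequences in pure Python; the helper name, its signature, the
# demo items, and their reasoning_type labels are all our own illustrative
# choices, not an API or data the surrounding project defines.
import math
from collections import Counter


def _mutual_information(labels_a: list[Any], labels_b: list[Any]) -> float:
    """Empirical mutual information (in nats) between paired label lists."""
    n = len(labels_a)
    if n == 0 or n != len(labels_b):
        raise ValueError("label sequences must be non-empty and equal length")
    joint = Counter(zip(labels_a, labels_b))
    count_a = Counter(labels_a)
    count_b = Counter(labels_b)
    mi = 0.0
    for (a, b), c_ab in joint.items():
        # p(a,b) * log(p(a,b) / (p(a) * p(b))), with probabilities c / n.
        mi += (c_ab / n) * math.log((c_ab * n) / (count_a[a] * count_b[b]))
    return mi


if __name__ == "__main__":
    # Minimal smoke test on three hypothetical items following the record
    # schema documented in run_baselines ("x", "y", "reasoning_type").
    demo = [
        {"x": "Conclusion: the invoice was paid.", "y": 0,
         "reasoning_type": "affirmative_false"},
        {"x": "Conclusion: the account is not locked.", "y": 1,
         "reasoning_type": "negated_true"},
        {"x": "Conclusion: the door is open.", "y": 1,
         "reasoning_type": "affirmative_true"},
    ]
    results, failures = run_baselines(demo)
    for name in BASELINES:
        hits = [r["is_correct"] for r in results if r["model_id"] == name]
        print(f"{name}: {sum(hits)}/{len(hits)} correct")
    # Stand-in for MI(cluster, model_identity): MI between reasoning_type
    # and model_id over the failure records.
    if failures:
        mi = _mutual_information(
            [f["reasoning_type"] for f in failures],
            [f["model_id"] for f in failures],
        )
        print(f"MI(reasoning_type, model_id) over failures: {mi:.3f} nats")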
|
|