File size: 3,026 Bytes
09f4a33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
"""
Two deliberately weak baselines that produce structured, complementary failure patterns.

Model A — predict_always_1:
  Predicts 1 for every input. Fails on all y=0 items (systematic bias toward positive).

Model B — keyword_heuristic:
  Predicts 0 when the conclusion or statement contains explicit negation markers;
  predicts 1 otherwise. Fails on affirmative conclusions that are false, and on
  negated conclusions that are true.

Using two models lets MI(cluster, model_identity) be informative: if clusters
partially track which model failed rather than only which reasoning type failed,
that signals model-specific failure geometry.

Adapted from failure-induced-benchmarks/src/failure_geometry/embedding.py
(engineered heuristics, not trained weights).
"""

from __future__ import annotations

from typing import Any


# Surface cues for negation, matched by plain substring containment in
# predict_keyword_heuristic (no tokenization). Mixes grammatical negators
# ("not", "never", ...) with "negative-state" adjectives ("expired",
# "invalid", ...) the dataset appears to use to encode falsity.
# NOTE(review): "no " carries a trailing space so it matches the standalone
# word "no"; bare "not" will still fire inside words like "nothing" —
# presumably acceptable for a deliberately weak baseline.
_NEGATION_TOKENS = frozenset([
    "not", "no ", "never", "cannot", "can't", "isn't", "aren't",
    "doesn't", "don't", "didn't", "won't", "hasn't", "haven't",
    "unpaid", "expired", "banned", "broken", "empty", "dead",
    "missing", "disabled", "invalid", "locked", "closed",
])

# Lowercase prefixes that introduce the asserted clause inside an input;
# _extract_statement searches for these (case-insensitively) to isolate it.
_STATEMENT_MARKERS = ("statement:", "conclusion:", "statement :")


def _extract_statement(x: str) -> str:
    """Return the asserted clause of *x* so the heuristic focuses on it.

    Searches case-insensitively for each marker in ``_STATEMENT_MARKERS``
    (in order) and, at the first marker present, returns the stripped text
    after its last occurrence. Falls back to the whole input when no
    marker appears.
    """
    lowered = x.lower()
    for marker in _STATEMENT_MARKERS:
        # rpartition splits on the *last* occurrence, mirroring rfind.
        head, sep, _ = lowered.rpartition(marker)
        if sep:
            # Slice the original-cased string, not the lowered copy.
            return x[len(head) + len(sep):].strip()
    return x


def predict_always_1(x: str) -> int:  # noqa: ARG001
    """Constant baseline: label every input positive, ignoring *x*."""
    return 1


def predict_keyword_heuristic(x: str) -> int:
    """Predict 0 when the extracted statement contains any negation cue.

    The statement is isolated via ``_extract_statement``, lowercased, and
    scanned for every token in ``_NEGATION_TOKENS`` by substring match;
    absent any hit the prediction defaults to 1.
    """
    statement = _extract_statement(x).lower()
    hit = any(token in statement for token in _NEGATION_TOKENS)
    return 0 if hit else 1


# Registry mapping a model identifier to its prediction callable (str -> int);
# run_baselines looks models up here by the ids in `selected`.
BASELINES: dict[str, Any] = {
    "always_1": predict_always_1,
    "keyword_heuristic": predict_keyword_heuristic,
}


def run_baselines(
    dataset: list[dict[str, Any]],
    selected: list[str] | None = None,
) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
    """Evaluate every selected baseline on every sample.

    Args:
        dataset: Samples; each must carry "x", "y", and "reasoning_type".
        selected: Model ids (keys of ``BASELINES``) to run. None or an
            empty list runs all registered baselines.

    Returns:
        (all_results, failures). ``failures`` holds the same record
        objects as ``all_results``, filtered to incorrect predictions.
        Each record contains:
            sample_id, x, y, reasoning_type, model_id, prediction, is_correct
    """
    # A falsy `selected` (None or []) means "run everything registered".
    active = selected if selected else list(BASELINES.keys())
    results: list[dict[str, Any]] = []
    misses: list[dict[str, Any]] = []

    for idx, item in enumerate(dataset):
        truth = int(item["y"])
        for name in active:
            guess = BASELINES[name](item["x"])
            rec = {
                "sample_id": idx,
                "x": item["x"],
                "y": truth,
                "reasoning_type": item["reasoning_type"],
                "model_id": name,
                "prediction": guess,
                "is_correct": guess == truth,
            }
            results.append(rec)
            if guess != truth:
                misses.append(rec)

    return results, misses