""" Two deliberately weak baselines that produce structured, complementary failure patterns. Model A — predict_always_1: Predicts 1 for every input. Fails on all y=0 items (systematic bias toward positive). Model B — keyword_heuristic: Predicts 0 when the conclusion or statement contains explicit negation markers; predicts 1 otherwise. Fails on affirmative conclusions that are false, and on negated conclusions that are true. Using two models lets MI(cluster, model_identity) be informative: if clusters partially track which model failed rather than only which reasoning type failed, that signals model-specific failure geometry. Adapted from failure-induced-benchmarks/src/failure_geometry/embedding.py (engineered heuristics, not trained weights). """ from __future__ import annotations from typing import Any _NEGATION_TOKENS = frozenset([ "not", "no ", "never", "cannot", "can't", "isn't", "aren't", "doesn't", "don't", "didn't", "won't", "hasn't", "haven't", "unpaid", "expired", "banned", "broken", "empty", "dead", "missing", "disabled", "invalid", "locked", "closed", ]) _STATEMENT_MARKERS = ("statement:", "conclusion:", "statement :") def _extract_statement(x: str) -> str: """Pull out the asserted part so the heuristic focuses there.""" lowered = x.lower() for marker in _STATEMENT_MARKERS: idx = lowered.rfind(marker) if idx != -1: return x[idx + len(marker):].strip() return x def predict_always_1(x: str) -> int: # noqa: ARG001 return 1 def predict_keyword_heuristic(x: str) -> int: statement = _extract_statement(x).lower() for token in _NEGATION_TOKENS: if token in statement: return 0 return 1 BASELINES: dict[str, Any] = { "always_1": predict_always_1, "keyword_heuristic": predict_keyword_heuristic, } def run_baselines( dataset: list[dict[str, Any]], selected: list[str] | None = None, ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: """ Returns (all_results, failures). Each result record: sample_id, x, y, reasoning_type, model_id, prediction, is_correct """ model_ids = selected if selected else list(BASELINES.keys()) all_results: list[dict[str, Any]] = [] failures: list[dict[str, Any]] = [] for sample_id, sample in enumerate(dataset): expected = int(sample["y"]) for model_id in model_ids: fn = BASELINES[model_id] prediction = fn(sample["x"]) is_correct = prediction == expected record = { "sample_id": sample_id, "x": sample["x"], "y": expected, "reasoning_type": sample["reasoning_type"], "model_id": model_id, "prediction": prediction, "is_correct": is_correct, } all_results.append(record) if not is_correct: failures.append(record) return all_results, failures