File size: 3,026 Bytes
09f4a33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
"""
Two deliberately weak baselines that produce structured, complementary failure patterns.

Model A — predict_always_1:
  Predicts 1 for every input. Fails on all y=0 items (systematic bias toward positive).

Model B — keyword_heuristic:
  Predicts 0 when the conclusion or statement contains explicit negation markers;
  predicts 1 otherwise. Fails on affirmative conclusions that are false, and on
  negated conclusions that are true.

Using two models lets MI(cluster, model_identity) be informative: if clusters
partially track which model failed rather than only which reasoning type failed,
that signals model-specific failure geometry.

Adapted from failure-induced-benchmarks/src/failure_geometry/embedding.py
(engineered heuristics, not trained weights).
"""

from __future__ import annotations

from typing import Any


# Surface cues for negation, matched by plain substring containment in
# predict_keyword_heuristic (no tokenization). Mixes grammatical negators
# ("not", "never", ...) with "negative-state" adjectives ("expired",
# "invalid", ...) the dataset appears to use to encode falsity.
# NOTE(review): "no " carries a trailing space so it matches the standalone
# word "no"; bare "not" will still fire inside words like "nothing" —
# presumably acceptable for a deliberately weak baseline.
_NEGATION_TOKENS = frozenset([
    "not", "no ", "never", "cannot", "can't", "isn't", "aren't",
    "doesn't", "don't", "didn't", "won't", "hasn't", "haven't",
    "unpaid", "expired", "banned", "broken", "empty", "dead",
    "missing", "disabled", "invalid", "locked", "closed",
])

# Lowercase prefixes that introduce the asserted clause inside an input;
# _extract_statement searches for these (case-insensitively) to isolate it.
_STATEMENT_MARKERS = ("statement:", "conclusion:", "statement :")


def _extract_statement(x: str) -> str:
    """Return the asserted clause of *x* so the heuristic focuses on it.

    Searches case-insensitively for each marker in ``_STATEMENT_MARKERS``
    (in order) and, at the first marker present, returns the stripped text
    after its last occurrence. Falls back to the whole input when no
    marker appears.
    """
    lowered = x.lower()
    for marker in _STATEMENT_MARKERS:
        # rpartition splits on the *last* occurrence, mirroring rfind.
        head, sep, _ = lowered.rpartition(marker)
        if sep:
            # Slice the original-cased string, not the lowered copy.
            return x[len(head) + len(sep):].strip()
    return x


def predict_always_1(x: str) -> int:  # noqa: ARG001
    """Constant baseline: label every input positive, ignoring *x*."""
    return 1


def predict_keyword_heuristic(x: str) -> int:
    """Predict 0 when the extracted statement contains any negation cue.

    The statement is isolated via ``_extract_statement``, lowercased, and
    scanned for every token in ``_NEGATION_TOKENS`` by substring match;
    absent any hit the prediction defaults to 1.
    """
    statement = _extract_statement(x).lower()
    hit = any(token in statement for token in _NEGATION_TOKENS)
    return 0 if hit else 1


# Registry mapping a model identifier to its prediction callable (str -> int);
# run_baselines looks models up here by the ids in `selected`.
BASELINES: dict[str, Any] = {
    "always_1": predict_always_1,
    "keyword_heuristic": predict_keyword_heuristic,
}


def run_baselines(
    dataset: list[dict[str, Any]],
    selected: list[str] | None = None,
) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
    """Evaluate every selected baseline on every sample.

    Args:
        dataset: Samples; each must carry "x", "y", and "reasoning_type".
        selected: Model ids (keys of ``BASELINES``) to run. None or an
            empty list runs all registered baselines.

    Returns:
        (all_results, failures). ``failures`` holds the same record
        objects as ``all_results``, filtered to incorrect predictions.
        Each record contains:
            sample_id, x, y, reasoning_type, model_id, prediction, is_correct
    """
    # A falsy `selected` (None or []) means "run everything registered".
    active = selected if selected else list(BASELINES.keys())
    results: list[dict[str, Any]] = []
    misses: list[dict[str, Any]] = []

    for idx, item in enumerate(dataset):
        truth = int(item["y"])
        for name in active:
            guess = BASELINES[name](item["x"])
            rec = {
                "sample_id": idx,
                "x": item["x"],
                "y": truth,
                "reasoning_type": item["reasoning_type"],
                "model_id": name,
                "prediction": guess,
                "is_correct": guess == truth,
            }
            results.append(rec)
            if guess != truth:
                misses.append(rec)

    return results, misses