| """ |
| Two deliberately weak baselines that produce structured, complementary failure patterns. |
| |
| Model A — predict_always_1: |
| Predicts 1 for every input. Fails on all y=0 items (systematic bias toward positive). |
| |
| Model B — keyword_heuristic: |
| Predicts 0 when the conclusion or statement contains explicit negation markers; |
| predicts 1 otherwise. Fails on affirmative conclusions that are false, and on |
| negated conclusions that are true. |
| |
| Using two models lets MI(cluster, model_identity) be informative: if clusters |
| partially track which model failed rather than only which reasoning type failed, |
| that signals model-specific failure geometry. |
| |
| Adapted from failure-induced-benchmarks/src/failure_geometry/embedding.py |
| (engineered heuristics, not trained weights). |
| """ |
|
|
from __future__ import annotations

from collections.abc import Callable
from typing import Any
|
|
|
|
_NEGATION_TOKENS = frozenset([
    # Grammatical negation cues. "no " keeps a trailing space so the bare
    # word "no" matches without also firing inside words such as "know" or
    # "enough" (matching is plain substring search; see
    # predict_keyword_heuristic).
    "not", "no ", "never", "cannot", "can't", "isn't", "aren't",
    "doesn't", "don't", "didn't", "won't", "hasn't", "haven't",
    # Lexical "negative state" cues that typically mark a false claim.
    "unpaid", "expired", "banned", "broken", "empty", "dead",
    "missing", "disabled", "invalid", "locked", "closed",
])
|
|
# Prompt markers whose trailing text carries the asserted claim; consumed by
# _extract_statement.
_STATEMENT_MARKERS = ("statement:", "conclusion:", "statement :")
|
|
|
|
def _extract_statement(x: str) -> str:
    """Pull out the asserted part so the heuristic focuses there."""
    lowered = x.lower()
    # Use the marker occurring last in the input, so a prompt such as
    # "Statement: ... Conclusion: ..." yields the conclusion rather than
    # whichever marker happens to come first in _STATEMENT_MARKERS.
    best_idx, best_len = -1, 0
    for marker in _STATEMENT_MARKERS:
        idx = lowered.rfind(marker)
        if idx > best_idx:
            best_idx, best_len = idx, len(marker)
    if best_idx != -1:
        return x[best_idx + best_len:].strip()
    return x
|
|
|
|
def predict_always_1(x: str) -> int:
    """Model A: constant positive prediction, regardless of the input."""
    return 1
|
|
|
|
def predict_keyword_heuristic(x: str) -> int:
    """Model B: predict 0 iff the extracted statement contains a negation cue."""
    statement = _extract_statement(x).lower()
    for token in _NEGATION_TOKENS:
        # Deliberately crude: plain substring match with no word boundaries,
        # so "not" also fires inside words such as "note".
        if token in statement:
            return 0
    return 1
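
# Illustrative behavior on hypothetical inputs (assuming the "Conclusion:"
# prompt format handled by _extract_statement):
#     predict_keyword_heuristic("Conclusion: the account is locked")  -> 0
#     predict_keyword_heuristic("Conclusion: the account is active")  -> 1
# Per the module docstring, the failure modes are complementary: this model
# answers 1 on affirmative-but-false conclusions and 0 on negated-but-true
# ones, while predict_always_1 fails on exactly the y=0 items.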
|
|
|
|
BASELINES: dict[str, Callable[[str], int]] = {
    "always_1": predict_always_1,
    "keyword_heuristic": predict_keyword_heuristic,
}
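
# The keys above double as the model_id values in result records; e.g.
# run_baselines(dataset, selected=["keyword_heuristic"]) scores Model B only.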
|
|
|
|
def run_baselines(
    dataset: list[dict[str, Any]],
    selected: list[str] | None = None,
) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
    """
    Run every selected baseline over the dataset.

    Returns (all_results, failures), where failures is the subset of
    all_results with is_correct == False. Each result record carries:
        sample_id, x, y, reasoning_type, model_id, prediction, is_correct
    """
    model_ids = selected if selected else list(BASELINES.keys())
    unknown = set(model_ids) - BASELINES.keys()
    if unknown:
        raise ValueError(f"unknown baseline(s): {sorted(unknown)}")

    all_results: list[dict[str, Any]] = []
    failures: list[dict[str, Any]] = []

    for sample_id, sample in enumerate(dataset):
        expected = int(sample["y"])
        for model_id in model_ids:
            fn = BASELINES[model_id]
            prediction = fn(sample["x"])
            is_correct = prediction == expected
            record = {
                "sample_id": sample_id,
                "x": sample["x"],
                "y": expected,
                "reasoning_type": sample["reasoning_type"],
                "model_id": model_id,
                "prediction": prediction,
                "is_correct": is_correct,
            }
            all_results.append(record)
            if not is_correct:
                failures.append(record)

    return all_results, failures
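

# --- Illustrative additions (not part of the original module) --------------
# The module docstring motivates MI(cluster, model_identity). Clustering is
# out of scope here, so the sketch below computes empirical MI between two
# paired label sequences in pure Python; the helper name, its signature, the
# demo items, and their reasoning_type labels are all our own illustrative
# choices, not an API or data the surrounding project defines.
import math
from collections import Counter


def _mutual_information(labels_a: list[Any], labels_b: list[Any]) -> float:
    """Empirical mutual information (in nats) between paired label lists."""
    n = len(labels_a)
    if n == 0 or n != len(labels_b):
        raise ValueError("label sequences must be non-empty and equal length")
    joint = Counter(zip(labels_a, labels_b))
    count_a = Counter(labels_a)
    count_b = Counter(labels_b)
    mi = 0.0
    for (a, b), c_ab in joint.items():
        # p(a,b) * log(p(a,b) / (p(a) * p(b))), with probabilities c / n.
        mi += (c_ab / n) * math.log((c_ab * n) / (count_a[a] * count_b[b]))
    return mi


if __name__ == "__main__":
    # Minimal smoke test on three hypothetical items following the record
    # schema documented in run_baselines ("x", "y", "reasoning_type").
    demo = [
        {"x": "Conclusion: the invoice was paid.", "y": 0,
         "reasoning_type": "affirmative_false"},
        {"x": "Conclusion: the account is not locked.", "y": 1,
         "reasoning_type": "negated_true"},
        {"x": "Conclusion: the door is open.", "y": 1,
         "reasoning_type": "affirmative_true"},
    ]
    results, failures = run_baselines(demo)
    for name in BASELINES:
        hits = [r["is_correct"] for r in results if r["model_id"] == name]
        print(f"{name}: {sum(hits)}/{len(hits)} correct")
    # Stand-in for MI(cluster, model_identity): MI between reasoning_type
    # and model_id over the failure records.
    if failures:
        mi = _mutual_information(
            [f["reasoning_type"] for f in failures],
            [f["model_id"] for f in failures],
        )
        print(f"MI(reasoning_type, model_id) over failures: {mi:.3f} nats")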
|
|