""" ECHO ULTIMATE — 4 Baseline Agents. AlwaysFiftyAgent — uniform prior, maximum ignorance AlwaysHighAgent — typical LLM overconfidence HeuristicAgent — smart domain-aware rules, no learning TemperatureScaledAgent — post-hoc calibration (simulated) """ import json import logging import re from pathlib import Path from typing import Optional import numpy as np from config import cfg from env.parser import parse_response, ParseResult, format_prompt from env.reward import RewardHistory, compute_reward from core.metrics import compute_report, CalibrationReport logger = logging.getLogger(__name__) _TRICK_WORDS_RE = re.compile(r"\b(not|except|never|always|false|incorrect)\b", re.I) _CHOICE_RE = re.compile(r"choices?\s*:.*?[A-D]:", re.I | re.S) def _detect_domain(prompt: str) -> str: p = prompt.lower() if _CHOICE_RE.search(p): if any(w in p for w in ["atom", "force", "energy", "cell", "element", "chemical"]): return "science" if any(w in p for w in ["patient", "drug", "dose", "symptom", "surgery", "diagnosis"]): return "medical" return "logic" if any(w in p for w in ["print(", "def ", "return", "function", "algorithm", "code", "complexity"]): return "coding" if any(w in p for w in ["how many", "calculate", " + ", " - ", "×", "*", "divided", "percent", "%"]): return "math" if any(w in p for w in ["rhyme", "synonym", "literary", "poem", "metaphor"]): return "creative" return "factual" def _make_response(conf: int, answer: str = "") -> str: return cfg.CONFIDENCE_FORMAT.format(conf=conf, ans=answer) # ── AlwaysFiftyAgent ────────────────────────────────────────────────────────── class AlwaysFiftyAgent: """ Always outputs 50% confidence regardless of question. Represents: maximum-ignorance / uniform-prior baseline. Expected ECE: ~0.10-0.15 on mixed difficulty data. """ name = "AlwaysFifty" def __call__(self, prompt: str) -> str: domain = _detect_domain(prompt) ans = "A" if domain in ("logic", "science", "medical") else "" return _make_response(50, ans) def answer(self, question: str, domain: str = "factual") -> ParseResult: raw = _make_response(50, "A" if domain in ("logic","science","medical") else "") return parse_response(raw) # ── AlwaysHighAgent ─────────────────────────────────────────────────────────── class AlwaysHighAgent: """ Always outputs 90% confidence. Represents: typical untrained LLM overconfidence. Expected ECE: ~0.35-0.45 on mixed difficulty data. """ name = "AlwaysHigh" def __call__(self, prompt: str) -> str: domain = _detect_domain(prompt) ans = "A" if domain in ("logic", "science", "medical") else "" return _make_response(90, ans) def answer(self, question: str, domain: str = "factual") -> ParseResult: raw = _make_response(90, "A" if domain in ("logic","science","medical") else "") return parse_response(raw) # ── HeuristicAgent ──────────────────────────────────────────────────────────── class HeuristicAgent: """ Domain-aware heuristic rules. No learning involved. Expected ECE: ~0.18-0.25. """ name = "Heuristic" _BASE_CONF = { "math": 65, "logic": 35, "factual": 55, "science": 40, "medical": 30, "coding": 50, "creative": 40, } def _compute_confidence(self, question: str, domain: str) -> int: conf = self._BASE_CONF.get(domain, 50) q = question.lower() if domain == "math": ops = len(re.findall(r"[\+\-\*\/]", q)) if ops <= 1 and len(q) < 60: conf = 80 elif ops <= 2: conf = 60 else: conf = 40 elif domain in ("logic", "science", "medical"): choices = len(re.findall(r"\b[a-d]\b", q, re.I)) if choices >= 4: conf = 30 # 4 choices → 25% random baseline; say 30% elif "not" in q or "except" in q: conf = 25 elif domain == "factual": words = len(q.split()) conf = 70 if words <= 8 else (50 if words <= 14 else 35) elif domain == "coding": if "print(" in q and len(q) < 50: conf = 70 elif "complexity" in q: conf = 35 # Trick-word penalty if _TRICK_WORDS_RE.search(question): conf = max(10, conf - 15) return max(0, min(100, conf)) def __call__(self, prompt: str) -> str: domain = _detect_domain(prompt) # Extract just the question line lines = [l.strip() for l in prompt.split("\n") if l.strip()] question = next((l for l in reversed(lines) if l.startswith("Question:")), lines[-1]) question = re.sub(r"^Question:\s*", "", question) conf = self._compute_confidence(question, domain) ans = "A" if domain in ("logic", "science", "medical") else "" return _make_response(conf, ans) def answer(self, question: str, domain: str = "factual") -> ParseResult: conf = self._compute_confidence(question, domain) ans = "A" if domain in ("logic", "science", "medical") else "" return parse_response(_make_response(conf, ans)) # ── TemperatureScaledAgent ──────────────────────────────────────────────────── class TemperatureScaledAgent: """ Simulates post-hoc temperature scaling calibration. Applies a learned temperature T to logit-derived probabilities. Without real logits, we simulate by perturbing AlwaysHigh confidence through a sigmoid with learned temperature. Represents the best EXISTING calibration technique without RL. Shows that ECHO learns something temperature scaling cannot. """ name = "TempScaled" def __init__(self, temperature: float = 1.5) -> None: self.temperature = temperature self._base = AlwaysHighAgent() @staticmethod def _sigmoid(x: float) -> float: return 1.0 / (1.0 + np.exp(-x)) def _scale_confidence(self, raw_conf: int) -> int: """Apply temperature scaling to a raw confidence value.""" logit = np.log(raw_conf / 100.0 + 1e-9) - np.log(1 - raw_conf / 100.0 + 1e-9) scaled_prob = self._sigmoid(logit / self.temperature) return int(np.clip(round(scaled_prob * 100), 0, 100)) def __call__(self, prompt: str) -> str: domain = _detect_domain(prompt) base_conf = np.random.randint(70, 95) # simulate overconfident raw output scaled = self._scale_confidence(base_conf) ans = "A" if domain in ("logic", "science", "medical") else "" return _make_response(scaled, ans) def answer(self, question: str, domain: str = "factual") -> ParseResult: raw = self(f"Question: {question}") return parse_response(raw) # ── GPTBaseline ─────────────────────────────────────────────────────────────── class GPTBaseline: """ GPT-4o-mini calibration baseline using the OpenAI API. Asks the model to produce formatted output. Requires OPENAI_API_KEY environment variable. Skipped silently if key is not set or openai is not installed. """ name = "GPT-4o-mini" def __init__(self, api_key: str = None) -> None: import os self.api_key = api_key or os.getenv("OPENAI_API_KEY", "") self._available = bool(self.api_key) def __call__(self, prompt: str) -> str: if not self._available: return _make_response(70, "") try: from openai import OpenAI client = OpenAI(api_key=self.api_key) sys_msg = ( "You are an epistemically honest AI. Before answering, state your confidence.\n" "Required format: NUMBERYOUR ANSWER" ) response = client.chat.completions.create( model="gpt-4o-mini", messages=[ {"role": "system", "content": sys_msg}, {"role": "user", "content": prompt}, ], max_tokens=200, temperature=0.7, ) return response.choices[0].message.content or _make_response(70, "") except Exception as exc: logger.warning("GPTBaseline error: %s", exc) return _make_response(70, "") def answer(self, question: str, domain: str = "factual") -> ParseResult: raw = self(f"Question: {question}") return parse_response(raw) # ── Baseline evaluation ─────────────────────────────────────────────────────── ALL_BASELINES = { "always_fifty": AlwaysFiftyAgent(), "always_high": AlwaysHighAgent(), "heuristic": HeuristicAgent(), "temp_scaled": TemperatureScaledAgent(), } def run_baseline_evaluation( task_bank, n_episodes: int = 200, save_path: str = cfg.BASELINE_LOG, ) -> dict: """ Run all 4 baselines on the same n_episodes questions. Returns dict: agent_name → CalibrationReport """ from env.echo_env import EchoEnv results = {} for name, agent in ALL_BASELINES.items(): logger.info("Evaluating baseline: %s (%d episodes)…", name, n_episodes) history = RewardHistory() env = EchoEnv(task_bank=task_bank, reward_history=history, phase=3) confs, corrs = [], [] for ep in range(n_episodes): task = task_bank.get_batch(1, phase=3)[0] env._current_task = task env._episode_step = 0 prompt = format_prompt(task["question"], task["domain"], task["difficulty"]) try: action = agent(prompt) except Exception: action = _make_response(50, "") _, _, _, _, info = env.step(action) confs.append(info["parsed_confidence"]) corrs.append(info["was_correct"]) rep = compute_report(confs, corrs) results[name] = rep # Save JSON log Path(save_path).parent.mkdir(parents=True, exist_ok=True) with open(save_path, "w") as f: json.dump({k: v.to_dict() for k, v in results.items()}, f, indent=2) logger.info("Baseline log saved → %s", save_path) return results