Spaces:
Sleeping
Sleeping
| """ | |
| ECHO ULTIMATE — 4 Baseline Agents. | |
| AlwaysFiftyAgent — uniform prior, maximum ignorance | |
| AlwaysHighAgent — typical LLM overconfidence | |
| HeuristicAgent — smart domain-aware rules, no learning | |
| TemperatureScaledAgent — post-hoc calibration (simulated) | |
| """ | |
| import json | |
| import logging | |
| import re | |
| from pathlib import Path | |
| from typing import Optional | |
| import numpy as np | |
| from config import cfg | |
| from env.parser import parse_response, ParseResult, format_prompt | |
| from env.reward import RewardHistory, compute_reward | |
| from core.metrics import compute_report, CalibrationReport | |
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Whole-word, case-insensitive match for words that tend to invert or
# constrain a question ("which is NOT ...", "all EXCEPT ...").
_TRICK_WORDS_RE = re.compile(
    r"\b(not|except|never|always|false|incorrect)\b",
    re.IGNORECASE,
)

# Multiple-choice layout detector: "choice:" / "choices:" followed, possibly
# across several lines, by an option label such as "A:".
_CHOICE_RE = re.compile(
    r"choices?\s*:.*?[A-D]:",
    re.IGNORECASE | re.DOTALL,
)
def _detect_domain(prompt: str) -> str:
    """Guess the task domain of *prompt* from simple keyword cues.

    The prompt is lower-cased and tested with plain substring checks, in this
    order:

    1. If it looks like a multiple-choice prompt (``_CHOICE_RE`` matches), it
       is "science" or "medical" when a matching keyword is present, otherwise
       "logic".
    2. Otherwise "coding", "math" or "creative" when a keyword of that group
       is present (first group that matches wins).
    3. Otherwise "factual".

    Returns:
        One of "science", "medical", "logic", "coding", "math", "creative",
        "factual".
    """
    p = prompt.lower()

    if _CHOICE_RE.search(p):
        if any(w in p for w in ("atom", "force", "energy", "cell", "element", "chemical")):
            return "science"
        if any(w in p for w in ("patient", "drug", "dose", "symptom", "surgery", "diagnosis")):
            return "medical"
        return "logic"

    if any(w in p for w in ("print(", "def ", "return", "function", "algorithm", "code", "complexity")):
        return "coding"

    # "\u00d7" is the multiplication sign (×). The previous literal was a
    # mis-decoded uppercase Greek gamma, which could never match the
    # lower-cased prompt. The escape keeps the source safe from re-encoding.
    if any(w in p for w in ("how many", "calculate", " + ", " - ", "\u00d7", "*", "divided", "percent", "%")):
        return "math"

    if any(w in p for w in ("rhyme", "synonym", "literary", "poem", "metaphor")):
        return "creative"

    return "factual"
def _make_response(conf: int, answer: str = "") -> str:
    """Render a confidence/answer pair using ``cfg.CONFIDENCE_FORMAT``.

    The template is filled with ``str.format`` using the named fields
    ``conf`` (the confidence value) and ``ans`` (the answer text).
    """
    template = cfg.CONFIDENCE_FORMAT
    return template.format(ans=answer, conf=conf)
# ── AlwaysFiftyAgent ──────────────────────────────────────────────────────────
class AlwaysFiftyAgent:
    """
    Constant-confidence baseline: reports 50% for every question.
    Represents: maximum-ignorance / uniform-prior baseline.
    Expected ECE: ~0.10-0.15 on mixed difficulty data.
    """

    name = "AlwaysFifty"

    def __call__(self, prompt: str) -> str:
        """Return the formatted response for a full prompt.

        The domain is inferred from the prompt; multiple-choice style domains
        get the fixed guess "A", every other domain an empty answer.
        """
        if _detect_domain(prompt) in ("logic", "science", "medical"):
            return _make_response(50, "A")
        return _make_response(50, "")

    def answer(self, question: str, domain: str = "factual") -> ParseResult:
        """Return the parsed 50%-confidence response for an explicit *domain*.

        *question* is not inspected; only *domain* selects the answer text.
        """
        guess = ""
        if domain in ("logic", "science", "medical"):
            guess = "A"
        return parse_response(_make_response(50, guess))
# ── AlwaysHighAgent ───────────────────────────────────────────────────────────
class AlwaysHighAgent:
    """
    Constant-confidence baseline: reports 90% for every question.
    Represents: typical untrained LLM overconfidence.
    Expected ECE: ~0.35-0.45 on mixed difficulty data.
    """

    name = "AlwaysHigh"

    def __call__(self, prompt: str) -> str:
        """Return the formatted response for a full prompt.

        The domain is inferred from the prompt; multiple-choice style domains
        get the fixed guess "A", every other domain an empty answer.
        """
        if _detect_domain(prompt) in ("logic", "science", "medical"):
            return _make_response(90, "A")
        return _make_response(90, "")

    def answer(self, question: str, domain: str = "factual") -> ParseResult:
        """Return the parsed 90%-confidence response for an explicit *domain*.

        *question* is not inspected; only *domain* selects the answer text.
        """
        guess = ""
        if domain in ("logic", "science", "medical"):
            guess = "A"
        return parse_response(_make_response(90, guess))
# ── HeuristicAgent ────────────────────────────────────────────────────────────
class HeuristicAgent:
    """
    Domain-aware heuristic rules. No learning involved.
    Expected ECE: ~0.18-0.25.
    """

    name = "Heuristic"

    # Starting confidence (percent) per domain before question-specific rules.
    _BASE_CONF = {
        "math": 65,
        "logic": 35,
        "factual": 55,
        "science": 40,
        "medical": 30,
        "coding": 50,
        "creative": 40,
    }

    def _compute_confidence(self, question: str, domain: str) -> int:
        """Return a confidence in [0, 100] for *question* within *domain*.

        Starts from ``_BASE_CONF`` (50 for unknown domains), applies the
        domain-specific rule below, then subtracts 15 (floored at 10) when the
        question contains a trick word matched by ``_TRICK_WORDS_RE``.
        """
        conf = self._BASE_CONF.get(domain, 50)
        q = question.lower()

        if domain == "math":
            # Fewer arithmetic operators and a short question -> more confident.
            ops = len(re.findall(r"[\+\-\*\/]", q))
            if ops <= 1 and len(q) < 60:
                conf = 80
            elif ops <= 2:
                conf = 60
            else:
                conf = 40
        elif domain in ("logic", "science", "medical"):
            choices = len(re.findall(r"\b[a-d]\b", q, re.I))
            if choices >= 4:
                conf = 30  # 4 choices → 25% random baseline; say 30%
            elif "not" in q or "except" in q:
                conf = 25
        elif domain == "factual":
            # Shorter questions are treated as easier lookups.
            words = len(q.split())
            conf = 70 if words <= 8 else (50 if words <= 14 else 35)
        elif domain == "coding":
            if "print(" in q and len(q) < 50:
                conf = 70
            elif "complexity" in q:
                conf = 35

        # Trick-word penalty
        if _TRICK_WORDS_RE.search(question):
            conf = max(10, conf - 15)
        return max(0, min(100, conf))

    def __call__(self, prompt: str) -> str:
        """Return the formatted response for a full prompt.

        The domain is inferred from the whole prompt. The confidence rules see
        only the question line: the last line starting with "Question:" (prefix
        removed), or the last non-empty line when no such line exists. An empty
        or whitespace-only prompt yields an empty question instead of raising
        ``IndexError``.
        """
        domain = _detect_domain(prompt)
        # Extract just the question line
        lines = [ln.strip() for ln in prompt.split("\n") if ln.strip()]
        # Compute the fallback defensively: next() evaluates its default
        # eagerly, so indexing an empty list here would raise.
        fallback = lines[-1] if lines else ""
        question = next(
            (ln for ln in reversed(lines) if ln.startswith("Question:")),
            fallback,
        )
        question = re.sub(r"^Question:\s*", "", question)
        conf = self._compute_confidence(question, domain)
        ans = "A" if domain in ("logic", "science", "medical") else ""
        return _make_response(conf, ans)

    def answer(self, question: str, domain: str = "factual") -> ParseResult:
        """Return the parsed response for a bare *question* and explicit *domain*."""
        conf = self._compute_confidence(question, domain)
        ans = "A" if domain in ("logic", "science", "medical") else ""
        return parse_response(_make_response(conf, ans))
# ── TemperatureScaledAgent ────────────────────────────────────────────────────
class TemperatureScaledAgent:
    """
    Simulates post-hoc temperature scaling calibration.
    Applies a temperature T to logit-derived probabilities.
    Without real logits, we simulate by drawing an overconfident raw
    confidence and passing its logit through a sigmoid with temperature T.
    Represents the best EXISTING calibration technique without RL.
    Shows that ECHO learns something temperature scaling cannot.
    """

    name = "TempScaled"

    def __init__(self, temperature: float = 1.5) -> None:
        """Store the scaling temperature (values > 1 pull confidence toward 50%)."""
        self.temperature = temperature
        self._base = AlwaysHighAgent()

    @staticmethod
    def _sigmoid(x: float) -> float:
        """Logistic function 1 / (1 + e^-x).

        Must be a staticmethod: it takes no ``self``, yet is invoked as
        ``self._sigmoid(...)``. Without the decorator every call raised
        ``TypeError``.
        """
        return 1.0 / (1.0 + np.exp(-x))

    def _scale_confidence(self, raw_conf: int) -> int:
        """Apply temperature scaling to a raw confidence value.

        Converts the percentage to a logit (with a 1e-9 epsilon so 0 and 100
        stay finite), divides by ``self.temperature``, maps back through the
        sigmoid and returns an integer percentage clipped to [0, 100].
        """
        logit = np.log(raw_conf / 100.0 + 1e-9) - np.log(1 - raw_conf / 100.0 + 1e-9)
        scaled_prob = self._sigmoid(logit / self.temperature)
        return int(np.clip(round(scaled_prob * 100), 0, 100))

    def _simulated_confidence(self) -> int:
        """Draw an overconfident raw confidence in [70, 94] and temperature-scale it."""
        base_conf = np.random.randint(70, 95)  # simulate overconfident raw output
        return self._scale_confidence(base_conf)

    def __call__(self, prompt: str) -> str:
        """Return the formatted response for a full prompt (domain is inferred)."""
        domain = _detect_domain(prompt)
        scaled = self._simulated_confidence()
        ans = "A" if domain in ("logic", "science", "medical") else ""
        return _make_response(scaled, ans)

    def answer(self, question: str, domain: str = "factual") -> "ParseResult":
        """Return the parsed response for a bare *question* and explicit *domain*.

        The caller-supplied *domain* selects the answer text, as in the other
        rule-based agents. It was previously ignored and re-guessed from the
        question text.
        """
        scaled = self._simulated_confidence()
        ans = "A" if domain in ("logic", "science", "medical") else ""
        return parse_response(_make_response(scaled, ans))
# ── GPTBaseline ───────────────────────────────────────────────────────────────
class GPTBaseline:
    """
    GPT-4o-mini calibration baseline using the OpenAI API.
    Asks the model to produce <confidence><answer> formatted output.
    Requires OPENAI_API_KEY environment variable (or an explicit api_key).
    When no key is set, the openai package is missing, or a request fails,
    a fixed 70%-confidence empty answer is returned instead.
    """

    name = "GPT-4o-mini"

    def __init__(self, api_key: Optional[str] = None) -> None:
        """Store the API key; fall back to the OPENAI_API_KEY environment variable.

        No client is created and nothing is imported here; the client is built
        lazily on first use.
        """
        import os

        self.api_key = api_key or os.getenv("OPENAI_API_KEY", "")
        self._available = bool(self.api_key)
        self._client = None  # created on first successful _get_client()

    def _get_client(self):
        """Return the cached OpenAI client, creating it on first use.

        Raises:
            ImportError: if the ``openai`` package is not installed.
        """
        if self._client is None:
            from openai import OpenAI

            self._client = OpenAI(api_key=self.api_key)
        return self._client

    def __call__(self, prompt: str) -> str:
        """Send *prompt* to gpt-4o-mini and return the raw response text.

        Returns the fallback response (70% confidence, empty answer) when the
        baseline is unavailable, the API call fails, or the model returns
        empty content.
        """
        fallback = _make_response(70, "")
        if not self._available:
            return fallback

        try:
            client = self._get_client()
        except ImportError as exc:
            # Disable the baseline after the first failure so a missing
            # package is reported once rather than on every episode.
            self._available = False
            logger.warning("GPTBaseline disabled, openai is not installed: %s", exc)
            return fallback
        except Exception as exc:
            logger.warning("GPTBaseline error: %s", exc)
            return fallback

        sys_msg = (
            "You are an epistemically honest AI. Before answering, state your confidence.\n"
            "Required format: <confidence>NUMBER</confidence><answer>YOUR ANSWER</answer>"
        )
        # Deliberate best-effort boundary: any API or response-shape error
        # degrades to the fallback response instead of aborting an evaluation.
        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": sys_msg},
                    {"role": "user", "content": prompt},
                ],
                max_tokens=200,
                temperature=0.7,
            )
            return response.choices[0].message.content or fallback
        except Exception as exc:
            logger.warning("GPTBaseline error: %s", exc)
            return fallback

    def answer(self, question: str, domain: str = "factual") -> "ParseResult":
        """Ask the model a bare *question* and return the parsed response.

        *domain* is accepted for signature parity with the other agents and is
        not used.
        """
        raw = self(f"Question: {question}")
        return parse_response(raw)
# ── Baseline evaluation ───────────────────────────────────────────────────────
# Registry of the baselines evaluated by default: result key -> agent instance.
# Instances are created once, at import time.
ALL_BASELINES = dict(
    always_fifty=AlwaysFiftyAgent(),
    always_high=AlwaysHighAgent(),
    heuristic=HeuristicAgent(),
    temp_scaled=TemperatureScaledAgent(),
)
def run_baseline_evaluation(
    task_bank,
    n_episodes: int = 200,
    save_path: str = cfg.BASELINE_LOG,
) -> dict:
    """
    Run every agent in ALL_BASELINES on the same n_episodes questions.

    Tasks are drawn once from ``task_bank`` (phase 3) and replayed for each
    agent, so all baselines are scored on an identical question set.

    Args:
        task_bank: object providing ``get_batch(n, phase=...)``; each task is
            indexed by "question", "domain" and "difficulty".
        n_episodes: number of tasks each agent answers.
        save_path: JSON file the per-agent reports are written to; parent
            directories are created if missing.

    Returns dict: agent_name → CalibrationReport
    """
    from env.echo_env import EchoEnv

    # Sample once so every agent is evaluated on the same tasks.
    tasks = [task_bank.get_batch(1, phase=3)[0] for _ in range(n_episodes)]

    results = {}
    for name, agent in ALL_BASELINES.items():
        logger.info("Evaluating baseline: %s (%d episodes)…", name, n_episodes)
        history = RewardHistory()
        env = EchoEnv(task_bank=task_bank, reward_history=history, phase=3)
        confs, corrs = [], []
        failures = 0
        for task in tasks:
            # Inject the task directly instead of letting the env sample one.
            env._current_task = task
            env._episode_step = 0
            prompt = format_prompt(task["question"], task["domain"], task["difficulty"])
            try:
                action = agent(prompt)
            except Exception:
                # Best effort: a failing agent falls back to a neutral 50%
                # response. Record the first traceback so a broken baseline is
                # not mistaken for a real result.
                if failures == 0:
                    logger.exception(
                        "Baseline %s raised; falling back to 50%% confidence", name
                    )
                failures += 1
                action = _make_response(50, "")
            _, _, _, _, info = env.step(action)
            confs.append(info["parsed_confidence"])
            corrs.append(info["was_correct"])
        if failures:
            logger.warning(
                "Baseline %s failed on %d/%d episodes", name, failures, n_episodes
            )
        results[name] = compute_report(confs, corrs)

    # Save JSON log
    Path(save_path).parent.mkdir(parents=True, exist_ok=True)
    with open(save_path, "w", encoding="utf-8") as f:
        json.dump({k: v.to_dict() for k, v in results.items()}, f, indent=2)
    logger.info("Baseline log saved → %s", save_path)
    return results