echo-ultimate / core /baseline.py
Vikaspandey582003's picture
Upload folder using huggingface_hub
acb327b verified
"""
ECHO ULTIMATE β€” 4 Baseline Agents.
AlwaysFiftyAgent β€” uniform prior, maximum ignorance
AlwaysHighAgent β€” typical LLM overconfidence
HeuristicAgent β€” smart domain-aware rules, no learning
TemperatureScaledAgent β€” post-hoc calibration (simulated)
"""
import json
import logging
import re
from pathlib import Path
from typing import Optional
import numpy as np
from config import cfg
from env.parser import parse_response, ParseResult, format_prompt
from env.reward import RewardHistory, compute_reward
from core.metrics import compute_report, CalibrationReport
logger = logging.getLogger(__name__)
_TRICK_WORDS_RE = re.compile(r"\b(not|except|never|always|false|incorrect)\b", re.I)
_CHOICE_RE = re.compile(r"choices?\s*:.*?[A-D]:", re.I | re.S)
def _detect_domain(prompt: str) -> str:
p = prompt.lower()
if _CHOICE_RE.search(p):
if any(w in p for w in ["atom", "force", "energy", "cell", "element", "chemical"]):
return "science"
if any(w in p for w in ["patient", "drug", "dose", "symptom", "surgery", "diagnosis"]):
return "medical"
return "logic"
if any(w in p for w in ["print(", "def ", "return", "function", "algorithm", "code", "complexity"]):
return "coding"
if any(w in p for w in ["how many", "calculate", " + ", " - ", "Γ—", "*", "divided", "percent", "%"]):
return "math"
if any(w in p for w in ["rhyme", "synonym", "literary", "poem", "metaphor"]):
return "creative"
return "factual"
def _make_response(conf: int, answer: str = "") -> str:
return cfg.CONFIDENCE_FORMAT.format(conf=conf, ans=answer)
# ── AlwaysFiftyAgent ──────────────────────────────────────────────────────────
class AlwaysFiftyAgent:
"""
Always outputs 50% confidence regardless of question.
Represents: maximum-ignorance / uniform-prior baseline.
Expected ECE: ~0.10-0.15 on mixed difficulty data.
"""
name = "AlwaysFifty"
def __call__(self, prompt: str) -> str:
domain = _detect_domain(prompt)
ans = "A" if domain in ("logic", "science", "medical") else ""
return _make_response(50, ans)
def answer(self, question: str, domain: str = "factual") -> ParseResult:
raw = _make_response(50, "A" if domain in ("logic","science","medical") else "")
return parse_response(raw)
# ── AlwaysHighAgent ───────────────────────────────────────────────────────────
class AlwaysHighAgent:
"""
Always outputs 90% confidence.
Represents: typical untrained LLM overconfidence.
Expected ECE: ~0.35-0.45 on mixed difficulty data.
"""
name = "AlwaysHigh"
def __call__(self, prompt: str) -> str:
domain = _detect_domain(prompt)
ans = "A" if domain in ("logic", "science", "medical") else ""
return _make_response(90, ans)
def answer(self, question: str, domain: str = "factual") -> ParseResult:
raw = _make_response(90, "A" if domain in ("logic","science","medical") else "")
return parse_response(raw)
# ── HeuristicAgent ────────────────────────────────────────────────────────────
class HeuristicAgent:
"""
Domain-aware heuristic rules. No learning involved.
Expected ECE: ~0.18-0.25.
"""
name = "Heuristic"
_BASE_CONF = {
"math": 65,
"logic": 35,
"factual": 55,
"science": 40,
"medical": 30,
"coding": 50,
"creative": 40,
}
def _compute_confidence(self, question: str, domain: str) -> int:
conf = self._BASE_CONF.get(domain, 50)
q = question.lower()
if domain == "math":
ops = len(re.findall(r"[\+\-\*\/]", q))
if ops <= 1 and len(q) < 60:
conf = 80
elif ops <= 2:
conf = 60
else:
conf = 40
elif domain in ("logic", "science", "medical"):
choices = len(re.findall(r"\b[a-d]\b", q, re.I))
if choices >= 4:
conf = 30 # 4 choices β†’ 25% random baseline; say 30%
elif "not" in q or "except" in q:
conf = 25
elif domain == "factual":
words = len(q.split())
conf = 70 if words <= 8 else (50 if words <= 14 else 35)
elif domain == "coding":
if "print(" in q and len(q) < 50:
conf = 70
elif "complexity" in q:
conf = 35
# Trick-word penalty
if _TRICK_WORDS_RE.search(question):
conf = max(10, conf - 15)
return max(0, min(100, conf))
def __call__(self, prompt: str) -> str:
domain = _detect_domain(prompt)
# Extract just the question line
lines = [l.strip() for l in prompt.split("\n") if l.strip()]
question = next((l for l in reversed(lines) if l.startswith("Question:")), lines[-1])
question = re.sub(r"^Question:\s*", "", question)
conf = self._compute_confidence(question, domain)
ans = "A" if domain in ("logic", "science", "medical") else ""
return _make_response(conf, ans)
def answer(self, question: str, domain: str = "factual") -> ParseResult:
conf = self._compute_confidence(question, domain)
ans = "A" if domain in ("logic", "science", "medical") else ""
return parse_response(_make_response(conf, ans))
# ── TemperatureScaledAgent ────────────────────────────────────────────────────
class TemperatureScaledAgent:
"""
Simulates post-hoc temperature scaling calibration.
Applies a learned temperature T to logit-derived probabilities.
Without real logits, we simulate by perturbing AlwaysHigh confidence
through a sigmoid with learned temperature.
Represents the best EXISTING calibration technique without RL.
Shows that ECHO learns something temperature scaling cannot.
"""
name = "TempScaled"
def __init__(self, temperature: float = 1.5) -> None:
self.temperature = temperature
self._base = AlwaysHighAgent()
@staticmethod
def _sigmoid(x: float) -> float:
return 1.0 / (1.0 + np.exp(-x))
def _scale_confidence(self, raw_conf: int) -> int:
"""Apply temperature scaling to a raw confidence value."""
logit = np.log(raw_conf / 100.0 + 1e-9) - np.log(1 - raw_conf / 100.0 + 1e-9)
scaled_prob = self._sigmoid(logit / self.temperature)
return int(np.clip(round(scaled_prob * 100), 0, 100))
def __call__(self, prompt: str) -> str:
domain = _detect_domain(prompt)
base_conf = np.random.randint(70, 95) # simulate overconfident raw output
scaled = self._scale_confidence(base_conf)
ans = "A" if domain in ("logic", "science", "medical") else ""
return _make_response(scaled, ans)
def answer(self, question: str, domain: str = "factual") -> ParseResult:
raw = self(f"Question: {question}")
return parse_response(raw)
# ── GPTBaseline ───────────────────────────────────────────────────────────────
class GPTBaseline:
"""
GPT-4o-mini calibration baseline using the OpenAI API.
Asks the model to produce <confidence><answer> formatted output.
Requires OPENAI_API_KEY environment variable.
Skipped silently if key is not set or openai is not installed.
"""
name = "GPT-4o-mini"
def __init__(self, api_key: str = None) -> None:
import os
self.api_key = api_key or os.getenv("OPENAI_API_KEY", "")
self._available = bool(self.api_key)
def __call__(self, prompt: str) -> str:
if not self._available:
return _make_response(70, "")
try:
from openai import OpenAI
client = OpenAI(api_key=self.api_key)
sys_msg = (
"You are an epistemically honest AI. Before answering, state your confidence.\n"
"Required format: <confidence>NUMBER</confidence><answer>YOUR ANSWER</answer>"
)
response = client.chat.completions.create(
model="gpt-4o-mini",
messages=[
{"role": "system", "content": sys_msg},
{"role": "user", "content": prompt},
],
max_tokens=200,
temperature=0.7,
)
return response.choices[0].message.content or _make_response(70, "")
except Exception as exc:
logger.warning("GPTBaseline error: %s", exc)
return _make_response(70, "")
def answer(self, question: str, domain: str = "factual") -> ParseResult:
raw = self(f"Question: {question}")
return parse_response(raw)
# ── Baseline evaluation ───────────────────────────────────────────────────────
ALL_BASELINES = {
"always_fifty": AlwaysFiftyAgent(),
"always_high": AlwaysHighAgent(),
"heuristic": HeuristicAgent(),
"temp_scaled": TemperatureScaledAgent(),
}
def run_baseline_evaluation(
task_bank,
n_episodes: int = 200,
save_path: str = cfg.BASELINE_LOG,
) -> dict:
"""
Run all 4 baselines on the same n_episodes questions.
Returns dict: agent_name β†’ CalibrationReport
"""
from env.echo_env import EchoEnv
results = {}
for name, agent in ALL_BASELINES.items():
logger.info("Evaluating baseline: %s (%d episodes)…", name, n_episodes)
history = RewardHistory()
env = EchoEnv(task_bank=task_bank, reward_history=history, phase=3)
confs, corrs = [], []
for ep in range(n_episodes):
task = task_bank.get_batch(1, phase=3)[0]
env._current_task = task
env._episode_step = 0
prompt = format_prompt(task["question"], task["domain"], task["difficulty"])
try:
action = agent(prompt)
except Exception:
action = _make_response(50, "")
_, _, _, _, info = env.step(action)
confs.append(info["parsed_confidence"])
corrs.append(info["was_correct"])
rep = compute_report(confs, corrs)
results[name] = rep
# Save JSON log
Path(save_path).parent.mkdir(parents=True, exist_ok=True)
with open(save_path, "w") as f:
json.dump({k: v.to_dict() for k, v in results.items()}, f, indent=2)
logger.info("Baseline log saved β†’ %s", save_path)
return results