Spaces:

Vikaspandey582003
/

echo-ultimate

Sleeping

App Files Files Community

echo-ultimate / core /baseline.py

Vikaspandey582003

Upload folder using huggingface_hub

acb327b verified 13 days ago

raw

history blame contribute delete

11.2 kB

	"""
	ECHO ULTIMATE — 4 Baseline Agents.

	AlwaysFiftyAgent — uniform prior, maximum ignorance
	AlwaysHighAgent — typical LLM overconfidence
	HeuristicAgent — smart domain-aware rules, no learning
	TemperatureScaledAgent — post-hoc calibration (simulated)
	"""

	import json
	import logging
	import re
	from pathlib import Path
	from typing import Optional

	import numpy as np

	from config import cfg
	from env.parser import parse_response, ParseResult, format_prompt
	from env.reward import RewardHistory, compute_reward
	from core.metrics import compute_report, CalibrationReport

	logger = logging.getLogger(__name__)

	# Words that commonly invert or sharpen the meaning of a question
	# ("Which of these is NOT ..."). The alternation uses plain `|`; a
	# backslash-escaped pipe would only match a literal "|" character.
	_TRICK_WORDS_RE = re.compile(r"\b(not|except|never|always|false|incorrect)\b", re.I)

	# Detects a multiple-choice block: "choice(s)", optional whitespace, a colon,
	# then (lazily, across newlines thanks to re.S) the first "<letter A-D>:" label.
	_CHOICE_RE = re.compile(r"choices?\s*:.*?[A-D]:", re.I | re.S)


	def _detect_domain(prompt: str) -> str:
	    """Guess the task domain of *prompt* from keyword matches.

	    The prompt is lower-cased and searched by plain substring. Prompts that
	    contain a multiple-choice block (per ``_CHOICE_RE``) resolve to
	    ``"science"``, ``"medical"`` or ``"logic"``; all other prompts resolve to
	    ``"coding"``, ``"math"``, ``"creative"`` or, when nothing matches,
	    ``"factual"``. Earlier checks win over later ones.
	    """
	    text = prompt.lower()

	    def mentions(*needles: str) -> bool:
	        # True when any keyword occurs anywhere in the lower-cased prompt.
	        return any(needle in text for needle in needles)

	    if _CHOICE_RE.search(text):
	        if mentions("atom", "force", "energy", "cell", "element", "chemical"):
	            return "science"
	        if mentions("patient", "drug", "dose", "symptom", "surgery", "diagnosis"):
	            return "medical"
	        # Multiple-choice prompt with no science/medical vocabulary.
	        return "logic"

	    # Free-form prompts: the first matching row decides the domain.
	    keyword_table = (
	        ("coding", ("print(", "def ", "return", "function", "algorithm", "code", "complexity")),
	        ("math", ("how many", "calculate", " + ", " - ", "×", "*", "divided", "percent", "%")),
	        ("creative", ("rhyme", "synonym", "literary", "poem", "metaphor")),
	    )
	    for domain, keywords in keyword_table:
	        if mentions(*keywords):
	            return domain
	    return "factual"


	def _make_response(conf: int, answer: str = "") -> str:
	    """Render a confidence/answer pair with the configured response template.

	    Fills the ``conf`` and ``ans`` fields of ``cfg.CONFIDENCE_FORMAT``.
	    """
	    template = cfg.CONFIDENCE_FORMAT
	    return template.format(conf=conf, ans=answer)


	# ── AlwaysFiftyAgent ──────────────────────────────────────────────────────────

	class AlwaysFiftyAgent:
	    """
	    Baseline that reports 50% confidence for every question.

	    Represents: maximum-ignorance / uniform-prior baseline.
	    Expected ECE: ~0.10-0.15 on mixed difficulty data.
	    """
	    name = "AlwaysFifty"

	    def __call__(self, prompt: str) -> str:
	        """Return the formatted response for a raw prompt; the domain is inferred."""
	        # Guess "A" for logic/science/medical prompts, otherwise answer nothing.
	        if _detect_domain(prompt) in ("logic", "science", "medical"):
	            letter = "A"
	        else:
	            letter = ""
	        return _make_response(50, letter)

	    def answer(self, question: str, domain: str = "factual") -> ParseResult:
	        """Return the parsed response for a question whose domain the caller supplies."""
	        letter = "A" if domain in ("logic", "science", "medical") else ""
	        formatted = _make_response(50, letter)
	        return parse_response(formatted)


	# ── AlwaysHighAgent ───────────────────────────────────────────────────────────

	class AlwaysHighAgent:
	    """
	    Baseline that reports 90% confidence for every question.

	    Represents: typical untrained LLM overconfidence.
	    Expected ECE: ~0.35-0.45 on mixed difficulty data.
	    """
	    name = "AlwaysHigh"

	    def __call__(self, prompt: str) -> str:
	        """Format a 90%-confidence response, inferring the domain from the prompt."""
	        # Logic/science/medical prompts get the fixed guess "A".
	        if _detect_domain(prompt) in ("logic", "science", "medical"):
	            return _make_response(90, "A")
	        return _make_response(90, "")

	    def answer(self, question: str, domain: str = "factual") -> ParseResult:
	        """Parse a 90%-confidence response built for the caller-supplied domain."""
	        guess = ""
	        if domain in ("logic", "science", "medical"):
	            guess = "A"
	        formatted = _make_response(90, guess)
	        return parse_response(formatted)


	# ── HeuristicAgent ────────────────────────────────────────────────────────────

	class HeuristicAgent:
	    """
	    Domain-aware heuristic rules. No learning involved.
	    Expected ECE: ~0.18-0.25.
	    """
	    name = "Heuristic"

	    # Starting confidence (percent) per domain, before question-specific rules.
	    _BASE_CONF = {
	        "math": 65,
	        "logic": 35,
	        "factual": 55,
	        "science": 40,
	        "medical": 30,
	        "coding": 50,
	        "creative": 40,
	    }

	    def _compute_confidence(self, question: str, domain: str) -> int:
	        """Return a confidence percentage in [0, 100] for *question*.

	        Starts from the per-domain base value (50 for unknown domains),
	        applies the domain-specific rule below, then subtracts 15 (floored
	        at 10) when the question contains a trick word.
	        """
	        conf = self._BASE_CONF.get(domain, 50)
	        q = question.lower()

	        if domain == "math":
	            # Fewer arithmetic operator characters and shorter text → easier.
	            ops = len(re.findall(r"[\+\-\*\/]", q))
	            if ops <= 1 and len(q) < 60:
	                conf = 80
	            elif ops <= 2:
	                conf = 60
	            else:
	                conf = 40

	        elif domain in ("logic", "science", "medical"):
	            # Count standalone letters a-d as a proxy for listed choices.
	            choices = len(re.findall(r"\b[a-d]\b", q, re.I))
	            if choices >= 4:
	                conf = 30  # 4 choices → 25% random baseline; say 30%
	            elif "not" in q or "except" in q:
	                conf = 25

	        elif domain == "factual":
	            # Shorter questions are assumed to be simpler lookups.
	            words = len(q.split())
	            conf = 70 if words <= 8 else (50 if words <= 14 else 35)

	        elif domain == "coding":
	            if "print(" in q and len(q) < 50:
	                conf = 70
	            elif "complexity" in q:
	                conf = 35

	        # Trick-word penalty
	        if _TRICK_WORDS_RE.search(question):
	            conf = max(10, conf - 15)

	        return max(0, min(100, conf))

	    def __call__(self, prompt: str) -> str:
	        """Return the formatted response for a raw prompt.

	        The domain is inferred from the whole prompt. The confidence is
	        computed from the last line starting with ``"Question:"`` or, if
	        there is none, the last non-blank line. An empty or whitespace-only
	        prompt is treated as an empty question instead of raising
	        ``IndexError``.
	        """
	        domain = _detect_domain(prompt)
	        # Extract just the question line
	        lines = [ln.strip() for ln in prompt.split("\n") if ln.strip()]
	        # The fallback is evaluated eagerly, so guard against an empty list.
	        fallback = lines[-1] if lines else ""
	        question = next(
	            (ln for ln in reversed(lines) if ln.startswith("Question:")),
	            fallback,
	        )
	        question = re.sub(r"^Question:\s*", "", question)
	        conf = self._compute_confidence(question, domain)
	        ans = "A" if domain in ("logic", "science", "medical") else ""
	        return _make_response(conf, ans)

	    def answer(self, question: str, domain: str = "factual") -> ParseResult:
	        """Return the parsed response for a question whose domain the caller supplies."""
	        conf = self._compute_confidence(question, domain)
	        ans = "A" if domain in ("logic", "science", "medical") else ""
	        return parse_response(_make_response(conf, ans))


	# ── TemperatureScaledAgent ────────────────────────────────────────────────────

	class TemperatureScaledAgent:
	    """
	    Simulates post-hoc temperature scaling calibration.
	    Applies a learned temperature T to logit-derived probabilities.
	    Without real logits, we simulate by drawing a random overconfident
	    confidence value and passing it through a sigmoid with temperature T.

	    Represents the best EXISTING calibration technique without RL.
	    Shows that ECHO learns something temperature scaling cannot.
	    """
	    name = "TempScaled"

	    def __init__(self, temperature: float = 1.5) -> None:
	        """Store the scaling temperature (values > 1 pull confidence toward 50%)."""
	        self.temperature = temperature
	        # Not read by any method of this class; retained as in the original.
	        self._base = AlwaysHighAgent()

	    @staticmethod
	    def _sigmoid(x: float) -> float:
	        """Return the logistic function 1 / (1 + e^-x)."""
	        return 1.0 / (1.0 + np.exp(-x))

	    def _scale_confidence(self, raw_conf: int) -> int:
	        """Apply temperature scaling to a raw confidence value.

	        Converts the percentage to a logit, divides by ``self.temperature``,
	        maps back through a sigmoid, and returns an integer percentage
	        clamped to [0, 100].
	        """
	        # The 1e-9 epsilon keeps log() finite when raw_conf is 0 or 100.
	        logit = np.log(raw_conf / 100.0 + 1e-9) - np.log(1 - raw_conf / 100.0 + 1e-9)
	        scaled_prob = self._sigmoid(logit / self.temperature)
	        return int(np.clip(round(scaled_prob * 100), 0, 100))

	    def _sample_scaled_confidence(self) -> int:
	        """Draw a simulated raw confidence and return it temperature-scaled."""
	        # simulate overconfident raw output; the upper bound is exclusive (70..94)
	        base_conf = np.random.randint(70, 95)
	        return self._scale_confidence(base_conf)

	    def __call__(self, prompt: str) -> str:
	        """Return the formatted response for a raw prompt; the domain is inferred."""
	        domain = _detect_domain(prompt)
	        scaled = self._sample_scaled_confidence()
	        ans = "A" if domain in ("logic", "science", "medical") else ""
	        return _make_response(scaled, ans)

	    def answer(self, question: str, domain: str = "factual") -> ParseResult:
	        """Return the parsed response for a question whose domain the caller supplies.

	        The answer letter is chosen from the *domain* argument, matching the
	        ``answer`` methods of the other rule-based baselines, rather than
	        re-detecting the domain from the bare question text.
	        """
	        scaled = self._sample_scaled_confidence()
	        ans = "A" if domain in ("logic", "science", "medical") else ""
	        return parse_response(_make_response(scaled, ans))


	# ── GPTBaseline ───────────────────────────────────────────────────────────────

	class GPTBaseline:
	    """
	    GPT-4o-mini calibration baseline using the OpenAI API.
	    Asks the model to produce <confidence><answer> formatted output.
	    Requires OPENAI_API_KEY environment variable (or an explicit api_key).

	    When no key is set, the openai package cannot be imported, or an API
	    call fails, a fixed 70%-confidence response with an empty answer is
	    returned instead of raising.
	    """
	    name = "GPT-4o-mini"

	    def __init__(self, api_key: Optional[str] = None) -> None:
	        """Resolve the API key from the argument or the OPENAI_API_KEY env var."""
	        import os
	        self.api_key = api_key or os.getenv("OPENAI_API_KEY", "")
	        self._available = bool(self.api_key)
	        # OpenAI client, created on first use and then reused.
	        self._client = None

	    def _get_client(self):
	        """Return the cached OpenAI client, importing and creating it on first use."""
	        client = getattr(self, "_client", None)
	        if client is None:
	            # Imported lazily so the module loads without openai installed.
	            from openai import OpenAI
	            client = OpenAI(api_key=self.api_key)
	            self._client = client
	        return client

	    def __call__(self, prompt: str) -> str:
	        """Return the model's raw text for *prompt*, or the 70% fallback response."""
	        fallback = _make_response(70, "")
	        if not self._available:
	            return fallback
	        try:
	            client = self._get_client()
	            sys_msg = (
	                "You are an epistemically honest AI. Before answering, state your confidence.\n"
	                "Required format: <confidence>NUMBER</confidence><answer>YOUR ANSWER</answer>"
	            )
	            response = client.chat.completions.create(
	                model="gpt-4o-mini",
	                messages=[
	                    {"role": "system", "content": sys_msg},
	                    {"role": "user", "content": prompt},
	                ],
	                max_tokens=200,
	                temperature=0.7,
	            )
	            # An empty or None completion also falls back.
	            return response.choices[0].message.content or fallback
	        except ImportError as exc:
	            # A missing package will not fix itself mid-run: disable the agent
	            # so later calls skip straight to the fallback without re-logging.
	            self._available = False
	            logger.warning("GPTBaseline disabled, openai not importable: %s", exc)
	            return fallback
	        except Exception as exc:
	            # Best-effort baseline: any API/client failure yields the fallback.
	            logger.warning("GPTBaseline error: %s", exc)
	            return fallback

	    def answer(self, question: str, domain: str = "factual") -> ParseResult:
	        """Query the model with ``"Question: <question>"`` and parse the reply.

	        The *domain* argument is accepted for interface parity and is not used.
	        """
	        raw = self(f"Question: {question}")
	        return parse_response(raw)


	# ── Baseline evaluation ───────────────────────────────────────────────────────

	# Registry of the baselines evaluated by default, keyed by a short id.
	# Insertion order is the evaluation order. GPTBaseline is not registered here.
	ALL_BASELINES = dict(
	    always_fifty=AlwaysFiftyAgent(),
	    always_high=AlwaysHighAgent(),
	    heuristic=HeuristicAgent(),
	    temp_scaled=TemperatureScaledAgent(),
	)


	def run_baseline_evaluation(
	    task_bank,
	    n_episodes: int = 200,
	    save_path: str = cfg.BASELINE_LOG,
	) -> dict:
	    """
	    Run every agent in ALL_BASELINES on the same n_episodes questions.

	    The questions are drawn once from ``task_bank`` (phase 3) and replayed
	    for each agent so that the reports are directly comparable. Each agent
	    gets a fresh ``RewardHistory`` and ``EchoEnv``. An agent that raises on
	    a prompt is logged and scored with a 50%-confidence empty answer.

	    The reports are also written as JSON to ``save_path`` (parent
	    directories are created as needed).

	    Returns dict: agent_name → CalibrationReport
	    """
	    from env.echo_env import EchoEnv

	    # Sample once, one task per call, so all baselines face identical questions.
	    tasks = [task_bank.get_batch(1, phase=3)[0] for _ in range(n_episodes)]

	    results = {}
	    for name, agent in ALL_BASELINES.items():
	        logger.info("Evaluating baseline: %s (%d episodes)…", name, n_episodes)
	        history = RewardHistory()
	        env = EchoEnv(task_bank=task_bank, reward_history=history, phase=3)
	        confs, corrs = [], []

	        for task in tasks:
	            # Inject the task directly so the env scores this exact question.
	            env._current_task = task
	            env._episode_step = 0
	            prompt = format_prompt(task["question"], task["domain"], task["difficulty"])

	            try:
	                action = agent(prompt)
	            except Exception as exc:
	                # Keep the run going, but do not hide the failure.
	                logger.warning("Baseline %s failed on a prompt: %s", name, exc)
	                action = _make_response(50, "")

	            _, _, _, _, info = env.step(action)
	            confs.append(info["parsed_confidence"])
	            corrs.append(info["was_correct"])

	        results[name] = compute_report(confs, corrs)

	    # Save JSON log
	    out_path = Path(save_path)
	    out_path.parent.mkdir(parents=True, exist_ok=True)
	    with out_path.open("w", encoding="utf-8") as f:
	        json.dump({k: v.to_dict() for k, v in results.items()}, f, indent=2)
	    logger.info("Baseline log saved → %s", save_path)

	    return results