"""
ECHO ULTIMATE β€” 4 Baseline Agents.

AlwaysFiftyAgent         β€” uniform prior, maximum ignorance
AlwaysHighAgent          β€” typical LLM overconfidence
HeuristicAgent           β€” smart domain-aware rules, no learning
TemperatureScaledAgent   β€” post-hoc calibration (simulated)
"""

import json
import logging
import re
from pathlib import Path
from typing import Optional

import numpy as np

from config import cfg
from env.parser import parse_response, ParseResult, format_prompt
from env.reward import RewardHistory, compute_reward
from core.metrics import compute_report, CalibrationReport

logger = logging.getLogger(__name__)

_TRICK_WORDS_RE  = re.compile(r"\b(not|except|never|always|false|incorrect)\b", re.I)
_CHOICE_RE       = re.compile(r"choices?\s*:.*?[A-D]:", re.I | re.S)


def _detect_domain(prompt: str) -> str:
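    """
    Crude keyword router that maps a prompt to one of the ECHO domains.

    Illustrative behaviour (hypothetical prompts, following the rules below):
        "Choices: A: proton B: electron ... the atom"  -> "science"
        "How many days are in a leap year?"            -> "math"
        "Write a poem that rhymes about rain."         -> "creative"
    """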
    p = prompt.lower()
    if _CHOICE_RE.search(p):
        if any(w in p for w in ["atom", "force", "energy", "cell", "element", "chemical"]):
            return "science"
        if any(w in p for w in ["patient", "drug", "dose", "symptom", "surgery", "diagnosis"]):
            return "medical"
        return "logic"
    if any(w in p for w in ["print(", "def ", "return", "function", "algorithm", "code", "complexity"]):
        return "coding"
    if any(w in p for w in ["how many", "calculate", " + ", " - ", "Γ—", "*", "divided", "percent", "%"]):
        return "math"
    if any(w in p for w in ["rhyme", "synonym", "literary", "poem", "metaphor"]):
        return "creative"
    return "factual"


def _make_response(conf: int, answer: str = "") -> str:
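    """Render a (confidence, answer) pair with cfg.CONFIDENCE_FORMAT.

    Assumed to produce tagged output along the lines of
    "<confidence>{conf}</confidence><answer>{ans}</answer>" (the format
    GPTBaseline requests below), which parse_response() reads back.
    """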
    return cfg.CONFIDENCE_FORMAT.format(conf=conf, ans=answer)


# ── AlwaysFiftyAgent ──────────────────────────────────────────────

class AlwaysFiftyAgent:
    """
    Always outputs 50% confidence regardless of question.
    Represents: maximum-ignorance / uniform-prior baseline.
    Expected ECE: ~0.10-0.15 on mixed difficulty data.
    """
    name = "AlwaysFifty"

    def __call__(self, prompt: str) -> str:
        domain = _detect_domain(prompt)
        ans = "A" if domain in ("logic", "science", "medical") else ""
        return _make_response(50, ans)

    def answer(self, question: str, domain: str = "factual") -> ParseResult:
        raw = _make_response(50, "A" if domain in ("logic","science","medical") else "")
        return parse_response(raw)


# ── AlwaysHighAgent ───────────────────────────────────────────────

class AlwaysHighAgent:
    """
    Always outputs 90% confidence.
    Represents: typical untrained LLM overconfidence.
    Expected ECE: ~0.35-0.45 on mixed difficulty data.
    """
    name = "AlwaysHigh"

    def __call__(self, prompt: str) -> str:
        domain = _detect_domain(prompt)
        ans = "A" if domain in ("logic", "science", "medical") else ""
        return _make_response(90, ans)

    def answer(self, question: str, domain: str = "factual") -> ParseResult:
        raw = _make_response(90, "A" if domain in ("logic","science","medical") else "")
        return parse_response(raw)


# ── HeuristicAgent ────────────────────────────────────────────────

class HeuristicAgent:
    """
    Domain-aware heuristic rules. No learning involved.
    Expected ECE: ~0.18-0.25.
    """
    name = "Heuristic"

    _BASE_CONF = {
        "math":     65,
        "logic":    35,
        "factual":  55,
        "science":  40,
        "medical":  30,
        "coding":   50,
        "creative": 40,
    }

    def _compute_confidence(self, question: str, domain: str) -> int:
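        """
        Rule-based confidence for a (question, domain) pair.

        Worked example (hypothetical question): "What is 12 + 7?" in the "math"
        domain has one arithmetic operator and fewer than 60 characters, so the
        rules below give conf 80; it contains no trick words, so 80 is returned.
        """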
        conf = self._BASE_CONF.get(domain, 50)
        q = question.lower()

        if domain == "math":
            ops = len(re.findall(r"[\+\-\*\/]", q))
            if ops <= 1 and len(q) < 60:
                conf = 80
            elif ops <= 2:
                conf = 60
            else:
                conf = 40

        elif domain in ("logic", "science", "medical"):
            choices = len(re.findall(r"\b[a-d]\b", q, re.I))
            if choices >= 4:
                conf = 30    # 4 choices -> 25% random baseline; say 30%
            elif "not" in q or "except" in q:
                conf = 25

        elif domain == "factual":
            words = len(q.split())
            conf = 70 if words <= 8 else (50 if words <= 14 else 35)

        elif domain == "coding":
            if "print(" in q and len(q) < 50:
                conf = 70
            elif "complexity" in q:
                conf = 35

        # Trick-word penalty
        if _TRICK_WORDS_RE.search(question):
            conf = max(10, conf - 15)

        return max(0, min(100, conf))

    def __call__(self, prompt: str) -> str:
        domain = _detect_domain(prompt)
        # Extract just the question line
        lines = [l.strip() for l in prompt.split("\n") if l.strip()]
        question = next((l for l in reversed(lines) if l.startswith("Question:")), lines[-1])
        question = re.sub(r"^Question:\s*", "", question)
        conf = self._compute_confidence(question, domain)
        ans  = "A" if domain in ("logic", "science", "medical") else ""
        return _make_response(conf, ans)

    def answer(self, question: str, domain: str = "factual") -> ParseResult:
        conf = self._compute_confidence(question, domain)
        ans  = "A" if domain in ("logic", "science", "medical") else ""
        return parse_response(_make_response(conf, ans))


# ── TemperatureScaledAgent ────────────────────────────────────────

class TemperatureScaledAgent:
    """
    Simulates post-hoc temperature scaling calibration.
    Applies a learned temperature T to logit-derived probabilities.
    Without real logits, we simulate by perturbing AlwaysHigh confidence
    through a sigmoid with learned temperature.

    Represents the best EXISTING calibration technique without RL.
    Shows that ECHO learns something temperature scaling cannot.
    """
    name = "TempScaled"

    def __init__(self, temperature: float = 1.5) -> None:
        self.temperature = temperature
        self._base = AlwaysHighAgent()

    @staticmethod
    def _sigmoid(x: float) -> float:
        return 1.0 / (1.0 + np.exp(-x))

    def _scale_confidence(self, raw_conf: int) -> int:
        """Apply temperature scaling to a raw confidence value."""
        logit = np.log(raw_conf / 100.0 + 1e-9) - np.log(1 - raw_conf / 100.0 + 1e-9)
        scaled_prob = self._sigmoid(logit / self.temperature)
        return int(np.clip(round(scaled_prob * 100), 0, 100))

    def __call__(self, prompt: str) -> str:
        domain = _detect_domain(prompt)
        base_conf = np.random.randint(70, 95)   # simulate overconfident raw output
        scaled = self._scale_confidence(base_conf)
        ans    = "A" if domain in ("logic", "science", "medical") else ""
        return _make_response(scaled, ans)

    def answer(self, question: str, domain: str = "factual") -> ParseResult:
        raw = self(f"Question: {question}")
        return parse_response(raw)


# ── GPTBaseline ───────────────────────────────────────────────────

class GPTBaseline:
    """
    GPT-4o-mini calibration baseline using the OpenAI API.
    Asks the model to produce <confidence><answer> formatted output.
    Requires OPENAI_API_KEY environment variable.
    Skipped silently if key is not set or openai is not installed.
    """
    name = "GPT-4o-mini"

    def __init__(self, api_key: Optional[str] = None) -> None:
        import os
        self.api_key = api_key or os.getenv("OPENAI_API_KEY", "")
        self._available = bool(self.api_key)

    def __call__(self, prompt: str) -> str:
        if not self._available:
            return _make_response(70, "")
        try:
            from openai import OpenAI
            client = OpenAI(api_key=self.api_key)
            sys_msg = (
                "You are an epistemically honest AI. Before answering, state your confidence.\n"
                "Required format: <confidence>NUMBER</confidence><answer>YOUR ANSWER</answer>"
            )
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": sys_msg},
                    {"role": "user",   "content": prompt},
                ],
                max_tokens=200,
                temperature=0.7,
            )
            return response.choices[0].message.content or _make_response(70, "")
        except Exception as exc:
            logger.warning("GPTBaseline error: %s", exc)
            return _make_response(70, "")

    def answer(self, question: str, domain: str = "factual") -> ParseResult:
        raw = self(f"Question: {question}")
        return parse_response(raw)


# ── Baseline evaluation ───────────────────────────────────────────

ALL_BASELINES = {
    "always_fifty":  AlwaysFiftyAgent(),
    "always_high":   AlwaysHighAgent(),
    "heuristic":     HeuristicAgent(),
    "temp_scaled":   TemperatureScaledAgent(),
}
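
# Minimal usage sketch (hypothetical prompt; full evaluation goes through
# run_baseline_evaluation below):
#
#     raw    = ALL_BASELINES["heuristic"]("Question: What is 7 * 8?")
#     parsed = parse_response(raw)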


def run_baseline_evaluation(
    task_bank,
    n_episodes: int = 200,
    save_path: str = cfg.BASELINE_LOG,
) -> dict:
    """
    Run every baseline in ALL_BASELINES for n_episodes questions each
    (questions are drawn per agent from task_bank).
    Returns dict: agent_name -> CalibrationReport.
    """
    from env.echo_env import EchoEnv

    results = {}
    for name, agent in ALL_BASELINES.items():
        logger.info("Evaluating baseline: %s (%d episodes)…", name, n_episodes)
        history = RewardHistory()
        env     = EchoEnv(task_bank=task_bank, reward_history=history, phase=3)
        confs, corrs = [], []

        for ep in range(n_episodes):
            task = task_bank.get_batch(1, phase=3)[0]
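            # Hand the sampled task to the env directly so that step() scores the
            # agent's reply against exactly this question.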
            env._current_task  = task
            env._episode_step  = 0
            prompt = format_prompt(task["question"], task["domain"], task["difficulty"])

            try:
                action = agent(prompt)
            except Exception:
                action = _make_response(50, "")

            _, _, _, _, info = env.step(action)
            confs.append(info["parsed_confidence"])
            corrs.append(info["was_correct"])

        rep = compute_report(confs, corrs)
        results[name] = rep

    # Save JSON log
    Path(save_path).parent.mkdir(parents=True, exist_ok=True)
    with open(save_path, "w") as f:
        json.dump({k: v.to_dict() for k, v in results.items()}, f, indent=2)
    logger.info("Baseline log saved β†’ %s", save_path)

    return results