# Author: Akshay Babbar — chore: HF Space export (size filter) (commit 98a5a8c)
#!/usr/bin/env python3
"""
eval_all.py — Budget Router Consolidated Evaluator
====================================================
Runs heuristic + LLM (+ optional PPO) across all tasks and seeds.
Outputs a Markdown table + per-episode JSON to outputs/.
Usage:
# Quick (3 seeds, heuristic + LLM):
uv run python eval_all.py
# Full (10 seeds, all policies):
uv run python eval_all.py --seeds 10 --policies heuristic llm
# Heuristic only (no API needed):
uv run python eval_all.py --policies heuristic
# Specific tasks:
uv run python eval_all.py --tasks hard hard_multi --seeds 5
# Explicit fresh seed bucket:
uv run python eval_all.py --tasks hard_multi --seed-values "200,201,202"
Prerequisites:
export HF_TOKEN=<your_hf_token> # required for LLM policy
export API_BASE_URL=https://router.huggingface.co/v1 # default
export MODEL_NAME=Qwen/Qwen2.5-72B-Instruct # default
Output:
outputs/eval_results_<timestamp>.json — full per-episode data
outputs/eval_summary_<timestamp>.md — markdown table for README
"""
import json
import os
import sys
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional
import typer
# ── Add parent to path so we can import budget_router ──────────────────────
sys.path.insert(0, str(Path(__file__).parent))
from budget_router.environment import BudgetRouterEnv
from budget_router.models import Action, ActionType, Observation, TaskConfig
from budget_router.policies import heuristic_baseline_policy
from budget_router.reward import episode_metrics, grade_episode
from budget_router.tasks import EASY, HARD, HARD_MULTI, MEDIUM
from inference import LLMRouter
# ── Config ──────────────────────────────────────────────────────────────────
# Registry of evaluable scenarios, keyed by the task names accepted on the CLI.
TASKS: Dict[str, TaskConfig] = {
    "easy": EASY,
    "medium": MEDIUM,
    "hard": HARD,
    "hard_multi": HARD_MULTI,
}
# Named seed buckets: "dev" for iteration, "heldout" (disjoint range) for final reporting.
SEED_SETS: Dict[str, List[int]] = {
    "dev": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
    "heldout": [100, 101, 102, 103, 104, 105, 106, 107, 108, 109],
}
# API_KEY falls back to HF_TOKEN so either env var enables the LLM policy.
API_KEY = os.getenv("API_KEY") or os.getenv("HF_TOKEN")
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
# LLM_LOG_RAW: any of the listed truthy strings enables per-step raw-output logging.
LLM_LOG_RAW = (os.getenv("LLM_LOG_RAW") or "").strip().lower() in {"1", "true", "yes", "y", "on"}
# Max characters of raw LLM output echoed per logged step (longer output is truncated).
LLM_LOG_RAW_MAX_CHARS = int(os.getenv("LLM_LOG_RAW_MAX_CHARS") or "220")
def select_seeds(seed_set: str, seeds: int, seed_values: Optional[str] = None) -> List[int]:
    """Resolve the seed list for a run.

    An explicit ``seed_values`` string (comma- and/or space-separated integers)
    wins over the named bucket. Otherwise the first ``seeds`` entries of the
    named bucket are returned, clamped to at least one seed and at most the
    bucket size.

    Raises:
        ValueError: if ``seed_values`` contains no seeds, or ``seed_set`` is unknown.
    """
    if seed_values is not None:
        explicit = [int(token) for token in seed_values.replace(",", " ").split()]
        if not explicit:
            raise ValueError("No explicit seeds provided in --seed-values")
        return explicit
    bucket = SEED_SETS.get(seed_set)
    if bucket is None:
        raise ValueError(f"Unknown seed set: {seed_set}. Choose from: {list(SEED_SETS)}")
    count = max(1, min(seeds, len(bucket)))
    return bucket[:count]
def _single_line(value: str | None) -> str:
if not value:
return "null"
return str(value).replace("\n", " ").replace("\r", " ")
def _truncate(value: str | None, max_chars: int) -> str:
    """Flatten *value* to one line, strip it, and cap it at *max_chars* chars.

    Over-long strings are cut and suffixed with "..." so the result never
    exceeds *max_chars*. Fix: for caps of 3 or fewer characters there is no
    room for the ellipsis, so the string is hard-sliced instead — previously
    the "..." suffix could make the result longer than the requested cap.
    """
    s = _single_line(value).strip()
    if len(s) <= max_chars:
        return s
    if max_chars <= 3:
        # No room for an ellipsis; slice to the cap (never negative).
        return s[:max(0, max_chars)]
    return s[:max_chars - 3] + "..."
# ── Policies ────────────────────────────────────────────────────────────────
def _llm_choose_action(policy: LLMRouter, obs: Observation) -> str:
    """Ask the LLM router for its next action and return the action-type string."""
    return policy.choose_action(obs).action_type.value
def _heuristic(obs: Observation) -> str:
    """Return the heuristic baseline's action-type string for *obs*."""
    chosen = heuristic_baseline_policy(obs)
    return chosen.action_type.value
# ── Episode runner ───────────────────────────────────────────────────────────
def run_one_episode(
    task_name: str,
    task_cfg: TaskConfig,
    seed: int,
    policy_name: str,
    policy,  # None for heuristic, LLMRouter instance for "llm"
) -> Dict:
    """Roll out one full episode and return a flat per-episode result record.

    Steps a fresh ``BudgetRouterEnv`` until it reports ``done``; the action at
    each step comes from the heuristic baseline when ``policy_name`` is
    "heuristic" and from *policy* otherwise. Grader and metric scores are
    computed from the environment's internal history at episode end.
    """
    env = BudgetRouterEnv()
    if policy_name == "llm":
        # Give the LLM router a clean slate for this task before the rollout.
        policy.reset(task_name=task_name)
    obs = env.reset(seed=seed, scenario=task_cfg)

    step_rewards: List[float] = []
    chosen_actions: List[str] = []
    while not obs.done:
        action_str = _heuristic(obs) if policy_name == "heuristic" else _llm_choose_action(policy, obs)
        obs = env.step(Action(action_type=ActionType(action_str)))
        reward = float(obs.reward or 0.0)
        step_rewards.append(reward)
        chosen_actions.append(action_str)
        if policy_name == "llm" and LLM_LOG_RAW:
            # Optional per-step debug trace of the raw model output and parsed action.
            llm_raw = getattr(policy, "last_raw_output", None)
            llm_parsed = getattr(policy, "last_parsed_action", None)
            typer.echo(
                f"[LLM] step={env._internal.current_step} action={action_str} "
                f"reward={reward:+.2f} llm_raw={_truncate(llm_raw, max(20, LLM_LOG_RAW_MAX_CHARS))} "
                f"llm_parsed={_single_line(llm_parsed)}"
            )

    grader = grade_episode(env._internal.history)
    metrics = episode_metrics(env._internal.history)
    return {
        "task": task_name,
        "seed": seed,
        "policy": policy_name,
        "total_reward": round(sum(step_rewards), 4),
        "grader_score": round(grader["overall_score"], 4),
        "success_score": round(grader["success_score"], 4),
        "budget_score": round(grader["budget_score"], 4),
        "adaptation_score": round(grader["adaptation_score"], 4),
        "latency_score": round(grader["latency_score"], 4),
        "sla_score": round(grader["sla_score"], 4),
        "success_rate": round(metrics["success_rate"], 4),
        "steps": len(step_rewards),
        "actions": chosen_actions,
        "rewards": step_rewards,
    }
# ── Summary helpers ──────────────────────────────────────────────────────────
def _mean(vals: List[float]) -> float:
return round(sum(vals) / len(vals), 4) if vals else 0.0
def build_summary(results: List[Dict]) -> Dict:
    """Aggregate per-episode results into per-(task, policy) mean statistics.

    Returns a dict keyed "task|policy" whose values hold the 4-decimal mean of
    grader score, total reward, success rate and adaptation score, plus the
    episode count ``n``.
    """
    def avg(vals: List[float]) -> float:
        # Local equivalent of _mean: 4-decimal mean, 0.0 when empty.
        return round(sum(vals) / len(vals), 4) if vals else 0.0

    grouped: Dict = {}
    for episode in results:
        grouped.setdefault((episode["task"], episode["policy"]), []).append(episode)

    summary: Dict = {}
    for (task, pol), eps in grouped.items():
        summary[f"{task}|{pol}"] = {
            "grader_mean": avg([e["grader_score"] for e in eps]),
            "reward_mean": avg([e["total_reward"] for e in eps]),
            "success_rate": avg([e["success_rate"] for e in eps]),
            "adaptation": avg([e["adaptation_score"] for e in eps]),
            "n": len(eps),
        }
    return summary
def render_markdown_table(summary: Dict, policies: List[str], tasks: List[str]) -> str:
    """Render the run summary as a GitHub-flavored Markdown table.

    One row per task, one "<POLICY> Grader" column per policy (em-dash for
    missing task/policy pairs), and a Notes column that calls out the gap
    between the first two policies on the hard_multi task when positive.
    """
    task_labels = {"easy": "Easy", "medium": "Medium", "hard": "Hard", "hard_multi": "Hard_Multi"}
    header_cells = " | ".join(f"{p.upper()} Grader" for p in policies)
    # Header row plus the Markdown alignment row (Task + policies + Notes columns).
    rows = [
        f"| Task | {header_cells} | Notes |",
        "|" + "---|" * (len(policies) + 2),
    ]
    for task in tasks:
        cells = []
        for pol in policies:
            stats = summary.get(f"{task}|{pol}", {})
            if stats:
                cells.append(f"{stats['grader_mean']:.4f} (n={stats['n']})")
            else:
                cells.append("—")
        note = ""
        if task == "hard_multi" and len(policies) >= 2:
            first_key = f"{task}|{policies[0]}"
            second_key = f"{task}|{policies[1]}"
            if first_key in summary and second_key in summary:
                diff = summary[second_key]["grader_mean"] - summary[first_key]["grader_mean"]
                if diff > 0:
                    note = f"LLM +{diff*100:.1f} points vs heuristic"
        rows.append(f"| {task_labels.get(task, task)} | {' | '.join(cells)} | {note} |")
    return "\n".join(rows)
# ── CLI ──────────────────────────────────────────────────────────────────────
# Single-command Typer app; shell completion disabled for this one-shot script.
app = typer.Typer(add_completion=False)
@app.command()
def main(
    policies: List[str] = typer.Option(["heuristic", "llm"], help="Policies to run"),
    tasks: List[str] = typer.Option(["easy", "medium", "hard", "hard_multi"], help="Tasks"),
    seeds: int = typer.Option(3, help="Number of dev seeds (1-10, costs scale with LLM)"),
    seed_set: str = typer.Option("dev", help="Seed set: dev | heldout"),
    seed_values: Optional[str] = typer.Option(None, help="Explicit comma/space-separated seeds; overrides --seed-set/--seeds"),
    out_dir: Path = typer.Option(Path("outputs"), help="Output directory"),
) -> None:
    """Run Budget Router evaluation across policies, tasks, and seeds.

    Writes two timestamped artifacts to *out_dir*: a JSON file with full
    per-episode data plus the aggregated summary, and a Markdown summary table
    (also echoed to stdout). Exits with status 1 on invalid seeds/tasks or if
    no episode produced a result.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    # Resolve the seed list up front so a bad --seed-set/--seed-values fails fast.
    try:
        selected_seeds = select_seeds(seed_set=seed_set, seeds=seeds, seed_values=seed_values)
    except ValueError as e:
        typer.echo(str(e), err=True)
        raise typer.Exit(1) from e
    # Unknown task names are silently dropped; abort only if none remain.
    selected_tasks = {t: TASKS[t] for t in tasks if t in TASKS}
    if not selected_tasks:
        typer.echo(f"No valid tasks. Choose from: {list(TASKS)}", err=True)
        raise typer.Exit(1)
    # Build policy instances
    # Unavailable policies (missing API key, unwired PPO) are skipped with a
    # warning rather than aborting the whole run.
    policy_instances = {}
    for p in policies:
        if p == "heuristic":
            policy_instances["heuristic"] = None  # uses _heuristic() directly
        elif p == "llm":
            try:
                if not API_KEY:
                    raise RuntimeError("No API key found. Set HF_TOKEN or API_KEY env var.")
                policy_instances["llm"] = LLMRouter(
                    api_base_url=API_BASE_URL, model_name=MODEL_NAME, api_key=API_KEY
                )
                typer.echo(f"LLM policy: {MODEL_NAME} via {API_BASE_URL}")
            except RuntimeError as e:
                typer.echo(f"[WARN] LLM policy unavailable: {e} — skipping", err=True)
        elif p == "ppo":
            typer.echo("[WARN] PPO eval not yet wired in this script — run your train_ppo.py separately", err=True)
    all_results = []
    # Cartesian product: every instantiated policy x selected task x seed is one episode.
    total_episodes = len(policy_instances) * len(selected_tasks) * len(selected_seeds)
    done = 0
    for pol_name, pol_obj in policy_instances.items():
        for task_name, task_cfg in selected_tasks.items():
            for seed in selected_seeds:
                typer.echo(f"[{done+1}/{total_episodes}] {pol_name:10s} | {task_name:12s} | seed={seed} ...", nl=False)
                try:
                    result = run_one_episode(task_name, task_cfg, seed, pol_name, pol_obj)
                    all_results.append(result)
                    typer.echo(f" grader={result['grader_score']:.4f} reward={result['total_reward']:+.2f}")
                except Exception as e:
                    # Best-effort: keep evaluating remaining episodes on failure
                    # (e.g. a transient API error on one LLM episode).
                    typer.echo(f" ERROR: {e}", err=True)
                done += 1
    if not all_results:
        typer.echo("No results produced.", err=True)
        raise typer.Exit(1)
    # Save JSON
    json_path = out_dir / f"eval_results_{ts}.json"
    summary = build_summary(all_results)
    output = {"metadata": {"timestamp": ts, "policies": policies, "tasks": tasks, "seeds": selected_seeds}, "summary": summary, "episodes": all_results}
    json_path.write_text(json.dumps(output, indent=2))
    typer.echo(f"\nResults saved to {json_path}")
    # Save markdown table
    md_table = render_markdown_table(summary, list(policy_instances.keys()), list(selected_tasks.keys()))
    md_path = out_dir / f"eval_summary_{ts}.md"
    md_path.write_text(f"# Budget Router Evaluation — {ts}\n\n{md_table}\n")
    typer.echo(f"Markdown table saved to {md_path}")
    typer.echo(f"\n{md_table}")
# Script entry point: hand control to the Typer CLI app.
if __name__ == "__main__":
    app()