Spaces:
Sleeping
Sleeping
| """ | |
| Statistical evaluation: PPO vs Heuristic on Hard_Multi. | |
| This experiment produces the primary quality signal: | |
| - If PPO materially and consistently exceeds the heuristic on Hard_Multi, | |
| it demonstrates the environment contains a learnable signal that reactive | |
| policies cannot exploit — i.e., the environment is high quality. | |
| - If PPO merely ties or is worse, it suggests the 100k step budget is | |
| insufficient or the signal is too sparse. | |
| Usage: | |
| uv run python train/eval_hard_multi.py | |
| Output: | |
| Printed statistical report + per-seed breakdown. | |
| If improvements are significant, a README snippet is printed. | |
| """ | |
| from __future__ import annotations | |
| import math | |
| import statistics | |
| import sys | |
| from pathlib import Path | |
| import json, datetime | |
| # Ensure project root is on sys.path when running as a script | |
| sys.path.insert(0, str(Path(__file__).parent.parent)) | |
| from stable_baselines3 import PPO | |
| from train.gym_wrapper import BudgetRouterGymEnv | |
| from budget_router.environment import BudgetRouterEnv | |
| from budget_router.models import Action | |
| from budget_router.policies import heuristic_baseline_policy | |
| from budget_router.reward import grade_episode | |
| from budget_router.tasks import HARD_MULTI | |
# Checkpoint produced by train/train_ppo_hard_multi.py (100k-step PPO run).
MODEL_PATH = "trained_models/ppo_hard_multi_100k.zip"
# Evaluation seeds 0-9 — the dev seeds the README baseline number was measured on.
EVAL_SEEDS = list(range(10))
# NOTE(review): not referenced elsewhere in this file — appears to be kept as a
# documented reference point; verify before removing.
HEURISTIC_BASELINE_GRADER = 0.6094  # confirmed from README (dev seeds 0-9)
def _grader(history: list[dict]) -> float:
    """Return the episode's overall grader score as a plain float."""
    graded = grade_episode(history)
    return float(graded["overall_score"])
def _grader_breakdown(history: list[dict]) -> dict:
    """Grade an episode and return every sub-score rounded to 4 decimals."""
    rounded: dict = {}
    for name, value in grade_episode(history).items():
        rounded[name] = round(float(value), 4)
    return rounded
def eval_ppo(model: PPO, seeds: list[int]) -> tuple[list[float], list[dict]]:
    """Roll out the PPO policy deterministically on each seed and grade it.

    Returns (overall scores, full grader breakdowns), one entry per seed,
    printing a per-seed line as it goes.
    """
    all_scores: list[float] = []
    all_breakdowns: list[dict] = []
    for s in seeds:
        gym_env = BudgetRouterGymEnv(scenario=HARD_MULTI, seed=s)
        # NOTE(review): reaches through private attrs (_env, _internal) to get
        # the episode history for grading — confirm these stay stable.
        raw_env = gym_env._env
        observation, _ = gym_env.reset()
        finished = False
        while not finished:
            idx, _ = model.predict(observation, deterministic=True)
            observation, _, term, trunc, _ = gym_env.step(int(idx))
            finished = term or trunc
        breakdown = _grader_breakdown(raw_env._internal.history)
        all_scores.append(breakdown["overall_score"])
        all_breakdowns.append(breakdown)
        print(f" [PPO] seed={s:2d} overall={breakdown['overall_score']:.4f}"
              f" adapt={breakdown['adaptation_score']:.4f}"
              f" budget={breakdown['budget_score']:.4f}"
              f" success={breakdown['success_score']:.4f}")
    return all_scores, all_breakdowns
def eval_heuristic(seeds: list[int]) -> tuple[list[float], list[dict]]:
    """Roll out the reactive heuristic baseline on each seed and grade it.

    Returns (overall scores, full grader breakdowns), one entry per seed,
    printing a per-seed line as it goes.
    """
    all_scores: list[float] = []
    all_breakdowns: list[dict] = []
    for s in seeds:
        env = BudgetRouterEnv()
        obs = env.reset(seed=s, scenario=HARD_MULTI)
        # Step the heuristic policy until the episode reports done.
        while not obs.done:
            obs = env.step(heuristic_baseline_policy(obs))
        # NOTE(review): reads a private attribute for the episode history —
        # confirm it stays stable.
        breakdown = _grader_breakdown(env._internal.history)
        all_scores.append(breakdown["overall_score"])
        all_breakdowns.append(breakdown)
        print(f" [HEU] seed={s:2d} overall={breakdown['overall_score']:.4f}"
              f" adapt={breakdown['adaptation_score']:.4f}"
              f" budget={breakdown['budget_score']:.4f}"
              f" success={breakdown['success_score']:.4f}")
    return all_scores, all_breakdowns
| def _confidence_interval_95(values: list[float]) -> tuple[float, float]: | |
| """95% CI using t-distribution (small sample).""" | |
| n = len(values) | |
| mean = statistics.mean(values) | |
| if n < 2: | |
| return mean, mean | |
| se = statistics.stdev(values) / math.sqrt(n) | |
| # t-critical ≈ 2.262 for df=9 (n=10), 95% two-tailed | |
| t_crit = 2.262 | |
| margin = t_crit * se | |
| return mean - margin, mean + margin | |
def main() -> None:
    """Compare PPO against the heuristic baseline on Hard_Multi.

    Prints per-seed breakdowns, a statistical report (means, stdevs, 95% CIs,
    win rate, sub-score deltas) and a verdict, then writes a JSON summary to
    outputs/ppo_hard_multi_eval.json. Returns early if the checkpoint is
    missing.
    """
    if not Path(MODEL_PATH).exists():
        print(f"[eval] Model not found at {MODEL_PATH}. Run train/train_ppo_hard_multi.py first.")
        return
    print(f"[eval] Loading {MODEL_PATH}")
    model = PPO.load(MODEL_PATH)
    print("\n─── PPO agent (deterministic, Hard_Multi) ───")
    ppo_scores, ppo_breakdowns = eval_ppo(model, EVAL_SEEDS)
    print("\n─── Heuristic baseline (Hard_Multi) ───")
    heuristic_scores, heuristic_breakdowns = eval_heuristic(EVAL_SEEDS)
    # ── Statistics ──────────────────────────────────────────────────────────
    ppo_mean = statistics.mean(ppo_scores)
    ppo_std = statistics.stdev(ppo_scores)
    heu_mean = statistics.mean(heuristic_scores)
    heu_std = statistics.stdev(heuristic_scores)
    delta = ppo_mean - heu_mean
    delta_pct = (delta / heu_mean) * 100 if heu_mean > 0 else float("nan")
    ppo_lo, ppo_hi = _confidence_interval_95(ppo_scores)
    heu_lo, heu_hi = _confidence_interval_95(heuristic_scores)
    # Fix: count wins as an int first, so the "x/y" in the report cannot drift
    # via float rounding (int(win_rate * n) can come out one short with
    # binary floats).
    wins = sum(p > h for p, h in zip(ppo_scores, heuristic_scores))
    win_rate = wins / len(ppo_scores)
    # Sub-score deltas
    avg_adapt_ppo = statistics.mean(b["adaptation_score"] for b in ppo_breakdowns)
    avg_adapt_heu = statistics.mean(b["adaptation_score"] for b in heuristic_breakdowns)
    avg_budget_ppo = statistics.mean(b["budget_score"] for b in ppo_breakdowns)
    avg_budget_heu = statistics.mean(b["budget_score"] for b in heuristic_breakdowns)
    sign = "+" if delta >= 0 else ""
    print(f"""
══════════════════════════════════════════════════════════
HARD_MULTI: PPO vs Heuristic — Statistical Report
══════════════════════════════════════════════════════════
PPO grader: {ppo_mean:.4f} ± {ppo_std:.4f} 95% CI [{ppo_lo:.4f}, {ppo_hi:.4f}]
HEU grader: {heu_mean:.4f} ± {heu_std:.4f} 95% CI [{heu_lo:.4f}, {heu_hi:.4f}]
Delta: {sign}{delta:.4f} ({sign}{delta_pct:.1f}%)
Win rate: {win_rate:.0%} ({wins}/{len(ppo_scores)} seeds PPO wins)
── Sub-score breakdown ──
Adaptation: PPO={avg_adapt_ppo:.4f} HEU={avg_adapt_heu:.4f} Δ={avg_adapt_ppo-avg_adapt_heu:+.4f}
Budget: PPO={avg_budget_ppo:.4f} HEU={avg_budget_heu:.4f} Δ={avg_budget_ppo-avg_budget_heu:+.4f}
""")
    # ── Verdict ─────────────────────────────────────────────────────────────
    if ppo_lo > heu_hi:
        verdict = "✅ STRONG: PPO 95% CI is entirely above heuristic 95% CI — non-overlapping."
    elif ppo_mean > heu_mean and win_rate >= 0.70:
        verdict = f"✅ CLEAR: PPO wins {win_rate:.0%} of seeds with positive mean improvement."
    elif ppo_mean > heu_mean and win_rate >= 0.50:
        verdict = f"⚠️ MODERATE: PPO wins {win_rate:.0%} of seeds — improvement present but with variance."
    elif ppo_mean > heu_mean:
        verdict = f"⚠️ WEAK: Mean improvement is positive but PPO wins only {win_rate:.0%} of seeds."
    else:
        verdict = f"❌ NO IMPROVEMENT: PPO ({ppo_mean:.4f}) ≤ heuristic ({heu_mean:.4f}) — more training needed."
    print(f" VERDICT: {verdict}")
    print("══════════════════════════════════════════════════════════\n")
    if ppo_mean > heu_mean:
        print(" README snippet (paste into benchmark table):")
        print(f" Hard_Multi PPO row: {ppo_mean:.4f} (Δ {sign}{delta_pct:.1f}% vs heuristic)")
    # JSON-serializable summary including per-seed breakdowns
    episodes = [
        {"seed": seed, "ppo": p_bd, "heuristic": h_bd}
        for seed, p_bd, h_bd in zip(EVAL_SEEDS, ppo_breakdowns, heuristic_breakdowns)
    ]
    out = {
        "timestamp": datetime.datetime.now().strftime("%Y%m%d_%H%M%S"),
        "ppo_mean": ppo_mean,
        "ppo_std": ppo_std,
        "ppo_ci": [ppo_lo, ppo_hi],
        "heu_mean": heu_mean,
        "heu_std": heu_std,
        "heu_ci": [heu_lo, heu_hi],
        "delta": delta,
        "delta_pct": delta_pct,
        "win_rate": win_rate,
        "episodes": episodes,
    }
    # Fix: outputs/ may not exist on a fresh checkout — create it before writing.
    out_path = Path("outputs/ppo_hard_multi_eval.json")
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(out, indent=2))
# Script entry point: run the full PPO-vs-heuristic evaluation.
if __name__ == "__main__":
    main()