File size: 8,446 Bytes
98a5a8c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
"""
Statistical evaluation: PPO vs Heuristic on Hard_Multi.

This experiment produces the primary quality signal:
- If PPO materially and consistently exceeds the heuristic on Hard_Multi,
  it demonstrates the environment contains a learnable signal that reactive
  policies cannot exploit — i.e., the environment is high quality.

- If PPO merely ties or is worse, it suggests the 100k step budget is
  insufficient or the signal is too sparse.

Usage:
    uv run python train/eval_hard_multi.py

Output:
    Printed statistical report + per-seed breakdown.
    If improvements are significant, a README snippet is printed.
"""
from __future__ import annotations

import math
import statistics
import sys
from pathlib import Path
import json, datetime


# Ensure project root is on sys.path when running as a script
sys.path.insert(0, str(Path(__file__).parent.parent))

from stable_baselines3 import PPO

from train.gym_wrapper import BudgetRouterGymEnv
from budget_router.environment import BudgetRouterEnv
from budget_router.models import Action
from budget_router.policies import heuristic_baseline_policy
from budget_router.reward import grade_episode
from budget_router.tasks import HARD_MULTI

# Checkpoint written by train/train_ppo_hard_multi.py (run that script first).
MODEL_PATH = "trained_models/ppo_hard_multi_100k.zip"
# Evaluation seeds 0-9 — the same dev seeds the README baseline was measured on.
EVAL_SEEDS = list(range(10))
# NOTE(review): this constant is not referenced anywhere below — kept as a
# documented reference point for the heuristic's known grader score.
HEURISTIC_BASELINE_GRADER = 0.6094  # confirmed from README (dev seeds 0-9)


def _grader(history: list[dict]) -> float:
    """Return the overall grader score for one episode's action history."""
    graded = grade_episode(history)
    return float(graded["overall_score"])


def _grader_breakdown(history: list[dict]) -> dict:
    """Return every grader sub-score for one episode, rounded to 4 decimals."""
    return {name: round(float(score), 4)
            for name, score in grade_episode(history).items()}


def eval_ppo(model: PPO, seeds: list[int]) -> tuple[list[float], list[dict]]:
    """Roll the PPO model out deterministically on Hard_Multi for each seed.

    Returns a pair: (per-seed overall scores, per-seed grader breakdowns),
    printing one summary line per episode as it goes.
    """
    overall_scores: list[float] = []
    details: list[dict] = []
    for seed in seeds:
        gym_env = BudgetRouterGymEnv(scenario=HARD_MULTI, seed=seed)
        # Keep a handle on the wrapped env so we can grade its history later.
        raw_env = gym_env._env

        obs, _ = gym_env.reset()
        terminated = truncated = False
        while not (terminated or truncated):
            action_idx, _ = model.predict(obs, deterministic=True)
            obs, _, terminated, truncated, _ = gym_env.step(int(action_idx))

        bd = _grader_breakdown(raw_env._internal.history)
        overall_scores.append(bd["overall_score"])
        details.append(bd)
        print(f"  [PPO]  seed={seed:2d}  overall={bd['overall_score']:.4f}"
              f"  adapt={bd['adaptation_score']:.4f}"
              f"  budget={bd['budget_score']:.4f}"
              f"  success={bd['success_score']:.4f}")
    return overall_scores, details


def eval_heuristic(seeds: list[int]) -> tuple[list[float], list[dict]]:
    """Roll the reactive heuristic baseline out on Hard_Multi for each seed.

    Returns a pair: (per-seed overall scores, per-seed grader breakdowns),
    printing one summary line per episode as it goes.
    """
    overall_scores: list[float] = []
    details: list[dict] = []
    for seed in seeds:
        raw_env = BudgetRouterEnv()
        obs = raw_env.reset(seed=seed, scenario=HARD_MULTI)
        while not obs.done:
            obs = raw_env.step(heuristic_baseline_policy(obs))
        bd = _grader_breakdown(raw_env._internal.history)
        overall_scores.append(bd["overall_score"])
        details.append(bd)
        print(f"  [HEU]  seed={seed:2d}  overall={bd['overall_score']:.4f}"
              f"  adapt={bd['adaptation_score']:.4f}"
              f"  budget={bd['budget_score']:.4f}"
              f"  success={bd['success_score']:.4f}")
    return overall_scores, details


def _confidence_interval_95(values: list[float]) -> tuple[float, float]:
    """95% CI using t-distribution (small sample)."""
    n = len(values)
    mean = statistics.mean(values)
    if n < 2:
        return mean, mean
    se = statistics.stdev(values) / math.sqrt(n)
    # t-critical ≈ 2.262 for df=9 (n=10), 95% two-tailed
    t_crit = 2.262
    margin = t_crit * se
    return mean - margin, mean + margin


def main() -> None:
    """Evaluate PPO vs the heuristic on Hard_Multi and print a report.

    Loads the trained model (bails out with a hint if it is missing),
    rolls both policies out on EVAL_SEEDS, prints a statistical report
    with 95% CIs, win rate, sub-score deltas, a verdict, and — when the
    mean improves — a README snippet.  A JSON summary is written to
    outputs/ppo_hard_multi_eval.json.
    """
    if not Path(MODEL_PATH).exists():
        print(f"[eval] Model not found at {MODEL_PATH}. Run train/train_ppo_hard_multi.py first.")
        return

    print(f"[eval] Loading {MODEL_PATH}")
    model = PPO.load(MODEL_PATH)

    print("\n─── PPO agent (deterministic, Hard_Multi) ───")
    ppo_scores, ppo_breakdowns = eval_ppo(model, EVAL_SEEDS)

    print("\n─── Heuristic baseline (Hard_Multi) ───")
    heuristic_scores, heuristic_breakdowns = eval_heuristic(EVAL_SEEDS)

    # ── Statistics ──────────────────────────────────────────────────────────
    ppo_mean    = statistics.mean(ppo_scores)
    ppo_std     = statistics.stdev(ppo_scores)
    heu_mean    = statistics.mean(heuristic_scores)
    heu_std     = statistics.stdev(heuristic_scores)

    delta       = ppo_mean - heu_mean
    delta_pct   = (delta / heu_mean) * 100 if heu_mean > 0 else float("nan")

    ppo_lo, ppo_hi    = _confidence_interval_95(ppo_scores)
    heu_lo, heu_hi    = _confidence_interval_95(heuristic_scores)

    # Win rate: fraction of seeds where PPO > heuristic
    win_rate = sum(p > h for p, h in zip(ppo_scores, heuristic_scores)) / len(ppo_scores)

    # Sub-score deltas
    avg_adapt_ppo = statistics.mean(b["adaptation_score"] for b in ppo_breakdowns)
    avg_adapt_heu = statistics.mean(b["adaptation_score"] for b in heuristic_breakdowns)
    avg_budget_ppo = statistics.mean(b["budget_score"] for b in ppo_breakdowns)
    avg_budget_heu = statistics.mean(b["budget_score"] for b in heuristic_breakdowns)

    sign = "+" if delta >= 0 else ""

    print(f"""
══════════════════════════════════════════════════════════
  HARD_MULTI: PPO vs Heuristic — Statistical Report
══════════════════════════════════════════════════════════

  PPO  grader:  {ppo_mean:.4f} ± {ppo_std:.4f}   95% CI [{ppo_lo:.4f}, {ppo_hi:.4f}]
  HEU  grader:  {heu_mean:.4f} ± {heu_std:.4f}   95% CI [{heu_lo:.4f}, {heu_hi:.4f}]
  Delta:        {sign}{delta:.4f}  ({sign}{delta_pct:.1f}%)
  Win rate:     {win_rate:.0%}  ({int(win_rate*len(ppo_scores))}/{len(ppo_scores)} seeds PPO wins)

  ── Sub-score breakdown ──
  Adaptation:   PPO={avg_adapt_ppo:.4f}  HEU={avg_adapt_heu:.4f}  Δ={avg_adapt_ppo-avg_adapt_heu:+.4f}
  Budget:       PPO={avg_budget_ppo:.4f}  HEU={avg_budget_heu:.4f}  Δ={avg_budget_ppo-avg_budget_heu:+.4f}
""")

    # ── Verdict ──────────────────────────────────────────────────────────────
    if ppo_lo > heu_hi:
        verdict = "✅ STRONG: PPO 95% CI is entirely above heuristic 95% CI — non-overlapping."
    elif ppo_mean > heu_mean and win_rate >= 0.70:
        verdict = f"✅ CLEAR: PPO wins {win_rate:.0%} of seeds with positive mean improvement."
    elif ppo_mean > heu_mean and win_rate >= 0.50:
        verdict = f"⚠️  MODERATE: PPO wins {win_rate:.0%} of seeds — improvement present but with variance."
    elif ppo_mean > heu_mean:
        verdict = f"⚠️  WEAK: Mean improvement is positive but PPO wins only {win_rate:.0%} of seeds."
    else:
        verdict = f"❌ NO IMPROVEMENT: PPO ({ppo_mean:.4f}) ≤ heuristic ({heu_mean:.4f}) — more training needed."

    print(f"  VERDICT: {verdict}")
    print("══════════════════════════════════════════════════════════\n")

    if ppo_mean > heu_mean:
        print("  README snippet (paste into benchmark table):")
        # Fixed format string: the original was missing the opening " (" before
        # the delta, leaving an unmatched ")" in the printed snippet.
        print(f"  Hard_Multi PPO row: {ppo_mean:.4f} ({sign}{delta_pct:.1f}% vs heuristic)")

    # JSON-serializable summary including per-seed breakdowns
    episodes = []
    for seed, p_bd, h_bd in zip(EVAL_SEEDS, ppo_breakdowns, heuristic_breakdowns):
        episodes.append({
            "seed": seed,
            "ppo": p_bd,
            "heuristic": h_bd,
        })

    out = {
        "timestamp": datetime.datetime.now().strftime("%Y%m%d_%H%M%S"),
        "ppo_mean": ppo_mean,
        "ppo_std": ppo_std,
        "ppo_ci": [ppo_lo, ppo_hi],
        "heu_mean": heu_mean,
        "heu_std": heu_std,
        "heu_ci": [heu_lo, heu_hi],
        "delta": delta,
        "delta_pct": delta_pct,
        "win_rate": win_rate,
        "episodes": episodes,
    }
    # Ensure the output directory exists — write_text alone raises
    # FileNotFoundError on a fresh checkout without outputs/.
    out_path = Path("outputs/ppo_hard_multi_eval.json")
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(out, indent=2))


if __name__ == "__main__":
    main()