test-rl-hackathon-budget

Sleeping

test-rl-hackathon-budget / train /eval_ppo.py

Akshay Babbar

chore: HF Space export (size filter)

98a5a8c 12 days ago

3.43 kB

	"""
	Evaluate the trained PPO agent against the heuristic baseline.

	Usage:
	uv run python train/eval_ppo.py

	Loads trained_models/ppo_easy_50k.zip and runs 10 episodes (seeds 0-9),
	reporting per-episode and mean grader scores.
	"""
	from __future__ import annotations

	import statistics
	from pathlib import Path

	from stable_baselines3 import PPO

	from train.gym_wrapper import BudgetRouterGymEnv
	from budget_router.environment import BudgetRouterEnv
	from budget_router.models import Action, ActionType
	from budget_router.policies import heuristic_baseline_policy
	from budget_router.reward import grade_episode
	from budget_router.tasks import EASY

	# Checkpoint written by the training script; main() bails out if it is missing.
	MODEL_PATH: str = "trained_models/ppo_easy_50k.zip"

	# Development-set seeds 0-9, shared by the PPO and heuristic evaluations.
	EVAL_SEEDS: list[int] = list(range(0, 10))

	# Heuristic grader score quoted in the README; printed next to the measured
	# heuristic mean as the expected reference value.
	HEURISTIC_BASELINE: float = 0.7958


	def _grader_score_from_history(history: list[dict]) -> float:
	    """Grade a finished episode and return its overall score.

	    Passes the environment's recorded ``history`` to ``grade_episode`` and
	    returns the ``"overall_score"`` entry of the result, coerced to ``float``.
	    """
	    report = grade_episode(history)
	    return float(report["overall_score"])


	def eval_ppo(model: PPO, seeds: list[int]) -> list[float]:
	    """Roll out the PPO policy once per seed and grade every episode.

	    For each seed a fresh ``BudgetRouterGymEnv`` on the EASY scenario is
	    created, the model acts deterministically until the episode terminates or
	    is truncated, and the episode history is graded. One line per seed is
	    printed, and the scores are returned in the same order as ``seeds``.
	    """
	    results = []
	    for seed in seeds:
	        gym_env = BudgetRouterGymEnv(scenario=EASY, seed=seed)
	        # The grader needs the history kept by the wrapped BudgetRouterEnv,
	        # which the gym wrapper only exposes through its private attribute.
	        router_env = gym_env._env

	        observation, _ = gym_env.reset()
	        while True:
	            action_idx, _ = model.predict(observation, deterministic=True)
	            observation, _, terminated, truncated, _ = gym_env.step(int(action_idx))
	            if terminated or truncated:
	                break

	        score = _grader_score_from_history(router_env._internal.history)
	        results.append(score)
	        print(f" seed={seed:2d} grader={score:.4f}")
	    return results


	def eval_heuristic(seeds: list[int]) -> list[float]:
	    """Run the heuristic baseline policy once per seed and grade every episode.

	    For each seed a fresh ``BudgetRouterEnv`` is reset on the EASY scenario and
	    stepped with ``heuristic_baseline_policy`` until the observation reports
	    ``done``. The episode history is then graded.

	    Prints one line per seed in the same format as ``eval_ppo`` and returns the
	    scores in the same order as ``seeds``.
	    """
	    scores = []
	    for seed in seeds:
	        env = BudgetRouterEnv()
	        obs = env.reset(seed=seed, scenario=EASY)
	        while not obs.done:
	            obs = env.step(heuristic_baseline_policy(obs))
	        score = _grader_score_from_history(env._internal.history)
	        scores.append(score)
	        # Report each episode like eval_ppo does; otherwise the
	        # "[eval] Heuristic baseline:" heading printed by the caller is
	        # followed by no per-episode output at all.
	        print(f" seed={seed:2d} grader={score:.4f}")
	    return scores


	def main() -> None:
	    """Evaluate the saved PPO model against the heuristic and print a summary.

	    If the checkpoint at ``MODEL_PATH`` does not exist, a hint is printed and
	    the function returns without evaluating anything. Otherwise the model is
	    loaded, both policies are run over ``EVAL_SEEDS``, and the mean grader
	    scores, their difference, and a README-update verdict are printed.
	    """
	    if not Path(MODEL_PATH).exists():
	        print(f"[eval] Model not found at {MODEL_PATH}. Run train/train_ppo.py first.")
	        return

	    print(f"[eval] Loading {MODEL_PATH}")
	    model = PPO.load(MODEL_PATH)

	    print("\n[eval] PPO agent (deterministic):")
	    ppo_mean = statistics.mean(eval_ppo(model, EVAL_SEEDS))

	    print("\n[eval] Heuristic baseline:")
	    heuristic_mean = statistics.mean(eval_heuristic(EVAL_SEEDS))

	    print("\n── Results ──────────────────────────────────")
	    print(f" PPO mean grader score : {ppo_mean:.4f}")
	    print(f" Heuristic mean grader : {heuristic_mean:.4f} (expected ≈ {HEURISTIC_BASELINE})")

	    # The "+" format flag always emits an explicit sign, replacing the
	    # hand-built prefix.
	    delta = ppo_mean - heuristic_mean
	    print(f" Delta (PPO - heuristic): {delta:+.4f}")

	    # Single source for the threshold so the comparison and both messages
	    # cannot drift apart.
	    readme_threshold = 0.60
	    if ppo_mean > readme_threshold:
	        print(f" ✅ PPO > {readme_threshold:.2f} threshold — README update warranted.")
	    else:
	        # This branch also covers a mean exactly equal to the threshold,
	        # hence "≤" rather than "<".
	        print(f" ⚠️ PPO ≤ {readme_threshold:.2f} — keep scaffolding but skip README PPO row.")


	# Script entry point: run the evaluation only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == "__main__":
	main()