test-rl-hackathon-budget

Sleeping

test-rl-hackathon-budget / train /train_ppo_hard_multi.py

Akshay Babbar

chore: HF Space export (size filter)

98a5a8c 12 days ago

2.97 kB

	"""
	Train a PPO agent on the Hard_Multi scenario.

	This is the key experiment: Hard_Multi has a secondary provider cascade at step 10
	(Provider B degrades after A). A reactive heuristic cannot conserve budget in advance
	and scores ~0.6094. An RL agent with access to step_count + budget_remaining can
	learn anticipatory routing and should materially exceed the heuristic.

	Usage:
	uv run python train/train_ppo_hard_multi.py

	Output:
	trained_models/ppo_hard_multi_100k.zip — saved SB3 model
	trained_models/ppo_hard_multi_100k_tb/ — TensorBoard logs
	"""
	from __future__ import annotations

	import sys
	from pathlib import Path

	# Ensure project root is on sys.path when running as a script
	sys.path.insert(0, str(Path(__file__).parent.parent))

	import torch
	from stable_baselines3 import PPO
	from stable_baselines3.common.env_util import make_vec_env
	from stable_baselines3.common.callbacks import EvalCallback

	from train.gym_wrapper import BudgetRouterGymEnv
	from budget_router.tasks import HARD_MULTI

	# ── Config ──────────────────────────────────────────────────────────────────
	# Number of environment copies handed to make_vec_env for training.
	N_ENVS = 4
	# Total training timesteps; Hard_Multi needs more signal than Easy.
	TOTAL_STEPS = 100_000
	# Model checkpoint path (the save message in main() reports it with ".zip").
	SAVE_PATH = "trained_models/ppo_hard_multi_100k"
	# Directory reserved for TensorBoard event files.
	LOG_PATH = "trained_models/ppo_hard_multi_100k_tb"
	# Use the Apple-Silicon MPS backend when torch reports it, otherwise the CPU.
	if torch.backends.mps.is_available():
	    DEVICE = "mps"
	else:
	    DEVICE = "cpu"
	# ────────────────────────────────────────────────────────────────────────────


	def main() -> None:
	    """Train a PPO agent on the Hard_Multi scenario and save the model.

	    Builds ``N_ENVS`` training environments plus one separately seeded
	    evaluation environment, trains for ``TOTAL_STEPS`` timesteps with
	    periodic evaluation, then saves the model to ``SAVE_PATH``.
	    TensorBoard event files are written under ``LOG_PATH`` when the
	    ``tensorboard`` package is importable; otherwise logging is skipped
	    with a notice instead of aborting the run.
	    """
	    # Local import: only needed for the optional-dependency probe below.
	    from importlib.util import find_spec

	    print(f"[train:hard_multi] device={DEVICE} n_envs={N_ENVS} total_steps={TOTAL_STEPS:,}")
	    print("[train:hard_multi] Scenario: Provider A degrades step 0, Provider B degrades step 10")
	    print("[train:hard_multi] Heuristic baseline grader: 0.6094 (reactive, cannot conserve budget)")

	    train_env = make_vec_env(
	        lambda: BudgetRouterGymEnv(scenario=HARD_MULTI),
	        n_envs=N_ENVS,
	    )

	    # Evaluation uses its own fixed seed, independent of the training envs.
	    eval_env = BudgetRouterGymEnv(scenario=HARD_MULTI, seed=99)

	    eval_cb = EvalCallback(
	        eval_env,
	        # eval_freq counts callback calls (one per vectorized step), so
	        # dividing by N_ENVS evaluates roughly every 10k env timesteps.
	        eval_freq=max(10_000 // N_ENVS, 1),
	        n_eval_episodes=10,
	        verbose=1,
	    )

	    # LOG_PATH was previously defined but never handed to PPO, so the
	    # TensorBoard logs promised in the module docstring were never written.
	    # SB3 raises ImportError if a log dir is set while tensorboard is not
	    # installed, hence the probe before enabling it.
	    tensorboard_dir = LOG_PATH if find_spec("tensorboard") is not None else None
	    if tensorboard_dir is None:
	        print("[train:hard_multi] tensorboard not installed; TensorBoard logging disabled")

	    model = PPO(
	        policy="MlpPolicy",
	        env=train_env,
	        n_steps=512,
	        batch_size=64,
	        n_epochs=10,
	        gamma=0.99,
	        gae_lambda=0.95,
	        ent_coef=0.02,  # slightly higher entropy to encourage exploration on harder task
	        learning_rate=3e-4,
	        tensorboard_log=tensorboard_dir,
	        verbose=1,
	        device=DEVICE,
	    )

	    model.learn(
	        total_timesteps=TOTAL_STEPS,
	        callback=eval_cb,
	        progress_bar=True,
	    )

	    model.save(SAVE_PATH)
	    print(f"[train:hard_multi] Model saved → {SAVE_PATH}.zip")


	# Run training only when executed as a script, not when imported as a module.
	if __name__ == "__main__":
	    main()