"""
Train a PPO agent on the Hard_Multi scenario.

This is the key experiment: Hard_Multi has a secondary provider cascade at step 10
(Provider B degrades after A). A reactive heuristic cannot conserve budget in advance
and scores ~0.6094. An RL agent with access to step_count + budget_remaining can
learn anticipatory routing and should materially exceed the heuristic.

Usage:
    uv run python train/train_ppo_hard_multi.py

Output:
    trained_models/ppo_hard_multi_100k.zip   — saved SB3 model
    trained_models/ppo_hard_multi_100k_tb/   — TensorBoard logs
"""
from __future__ import annotations

import sys
from pathlib import Path

# Ensure project root is on sys.path when running as a script
sys.path.insert(0, str(Path(__file__).parent.parent))

import torch
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import EvalCallback

from train.gym_wrapper import BudgetRouterGymEnv
from budget_router.tasks import HARD_MULTI

# ── Config ──────────────────────────────────────────────────────────────────
# Number of environment copies requested from make_vec_env.
N_ENVS = 4

# Total timestep budget handed to PPO.learn().
# Hard_Multi needs more signal than Easy.
TOTAL_STEPS = 100_000

# Model checkpoint path (no extension) and the TensorBoard log directory.
SAVE_PATH = "trained_models/ppo_hard_multi_100k"
LOG_PATH = "trained_models/ppo_hard_multi_100k_tb"

# Use the MPS backend when torch reports it as available; otherwise use the CPU.
if torch.backends.mps.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"
# ────────────────────────────────────────────────────────────────────────────


def main() -> None:
    """Train a PPO agent on the Hard_Multi scenario and save the model.

    Creates ``N_ENVS`` training environments and one evaluation environment
    (constructed with ``seed=99``), trains for ``TOTAL_STEPS`` timesteps with
    periodic evaluation through ``EvalCallback``, and saves the model to
    ``SAVE_PATH``. TensorBoard event files are written under ``LOG_PATH``,
    which requires the ``tensorboard`` package to be installed.

    Both environments are closed on exit, including when training raises.
    """
    # Local import: only this function needs it.
    from contextlib import closing

    print(f"[train:hard_multi] device={DEVICE}  n_envs={N_ENVS}  total_steps={TOTAL_STEPS:,}")
    print("[train:hard_multi] Scenario: Provider A degrades step 0, Provider B degrades step 10")
    print("[train:hard_multi] Heuristic baseline grader: 0.6094  (reactive, cannot conserve budget)")

    # closing() guarantees env.close() runs even if construction of the second
    # environment, training, or saving fails part-way through.
    with closing(
        make_vec_env(
            lambda: BudgetRouterGymEnv(scenario=HARD_MULTI),
            n_envs=N_ENVS,
        )
    ) as train_env, closing(
        BudgetRouterGymEnv(scenario=HARD_MULTI, seed=99)
    ) as eval_env:
        eval_cb = EvalCallback(
            eval_env,
            # eval_freq is divided by N_ENVS so that evaluation happens roughly
            # every 10,000 environment timesteps across the vectorized envs.
            eval_freq=max(10_000 // N_ENVS, 1),
            n_eval_episodes=10,
            verbose=1,
        )

        model = PPO(
            policy="MlpPolicy",
            env=train_env,
            n_steps=512,
            batch_size=64,
            n_epochs=10,
            gamma=0.99,
            gae_lambda=0.95,
            ent_coef=0.02,          # slightly higher entropy to encourage exploration on harder task
            learning_rate=3e-4,
            # Previously LOG_PATH was defined but never passed, so no
            # TensorBoard logs were produced despite being advertised.
            tensorboard_log=LOG_PATH,
            verbose=1,
            device=DEVICE,
        )

        model.learn(
            total_timesteps=TOTAL_STEPS,
            callback=eval_cb,
            progress_bar=True,
        )

        model.save(SAVE_PATH)
        print(f"[train:hard_multi] Model saved → {SAVE_PATH}.zip")


# Run training only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()