"""
Replot GRPO curves from a Hugging Face Job log (logging every 5 steps).
Data source: sh4shv4t/parlay-sft-1-5b → GRPO, 80 steps, G=2, L4, 2026-04-26.
Run: python scripts/plot_grpo_hf_job_curves.py
Outputs: results/grpo_reward_curve.png, results/grpo_loss_curve.png, results/grpo_train_metrics.json
"""
from __future__ import annotations

import json
from pathlib import Path

import matplotlib

matplotlib.use("Agg")
import matplotlib.pyplot as plt

# Steps 5,10,...,80 (TRL logs every 5 steps)
STEPS = list(range(5, 81, 5))
# 'reward' and 'loss' from job log (mean reward is GRPO combined reward)
REWARDS = [
    -4.659,
    -4.604,
    -4.764,
    -9.603,
    -4.233,
    0.5344,
    0.8288,
    -4.509,
    0.675,
    -4.456,
    -9.468,
    -9.338,
    0.3431,
    0.4913,
    -4.505,
    -9.64,
]
LOSSES = [
    8.714e-05,
    0.0001001,
    4.062e-05,
    7.433e-05,
    0.0001185,
    0.0002067,
    0.0002253,
    5.42e-05,
    4.912e-05,
    0.0001332,
    0.0001032,
    4.481e-05,
    5.264e-05,
    0.0001981,
    0.0001187,
    7.517e-05,
]


def main() -> None:
    root = Path(__file__).resolve().parents[1]
    out_dir = root / "results"
    out_dir.mkdir(parents=True, exist_ok=True)

    records = [
        {"step": s, "reward": r, "loss": L} for s, r, L in zip(STEPS, REWARDS, LOSSES, strict=True)
    ]
    (out_dir / "grpo_train_metrics.json").write_text(
        json.dumps(
            {
                "meta": {
                    "source": "Hugging Face Job",
                    "sft_model": "sh4shv4t/parlay-sft-1-5b",
                    "grpo_steps": 80,
                    "grpo_g": 2,
                    "flavor": "l4x1",
                    "train_loss_final": 0.0001051,
                    "repo": "sh4shv4t/parlay-grpo-1-5b",
                },
                "points": records,
            },
            indent=2,
        ),
        encoding="utf-8",
    )

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(STEPS, REWARDS, color="#2e6f95", marker="o", ms=4, lw=1.5)
    ax.set_xlabel("Global step")
    ax.set_ylabel("Mean batch reward")
    ax.set_title("GRPO — mean reward (HF Job, 80 steps, G=2, L4)")
    ax.grid(True, alpha=0.3)
    ax.set_xlim(0, 85)
    fig.tight_layout()
    fig.savefig(out_dir / "grpo_reward_curve.png", dpi=120, bbox_inches="tight")
    plt.close(fig)

    fig2, ax2 = plt.subplots(figsize=(10, 5))
    ax2.plot(STEPS, LOSSES, color="#8b4513", marker="o", ms=4, lw=1.5)
    ax2.set_xlabel("Global step")
    ax2.set_ylabel("Policy loss")
    ax2.set_title("GRPO — training loss (HF Job, 80 steps, G=2, L4)")
    ax2.grid(True, alpha=0.3)
    ax2.set_xlim(0, 85)
    fig2.tight_layout()
    fig2.savefig(out_dir / "grpo_loss_curve.png", dpi=120, bbox_inches="tight")
    plt.close(fig2)

    print(f"Wrote {out_dir / 'grpo_reward_curve.png'}")
    print(f"Wrote {out_dir / 'grpo_loss_curve.png'}")
    print(f"Wrote {out_dir / 'grpo_train_metrics.json'}")


if __name__ == "__main__":
    main()