Spaces:

jampuramprem
/

AxiomForgeAI

Sleeping

App Files Files Community

AxiomForgeAI / scripts /plot_grpo_run.py

jampuramprem

Initial Space deployment

ec4ae03 12 days ago

raw

history blame contribute delete

16.4 kB

	#!/usr/bin/env python3
	"""
	Generate demo-quality plots from a completed (or in-progress) GRPO run.

	Usage
	-----
	# from the run output directory
	python scripts/plot_grpo_run.py checkpoints/grpo/<run_name>/metrics.jsonl

	# auto-discover the latest run
	python scripts/plot_grpo_run.py --latest

	# custom output directory
	python scripts/plot_grpo_run.py metrics.jsonl --out-dir plots/my_run

	Output
	------
	Six PNG files saved to <metrics_dir>/plots/ (or to --out-dir if given):

	01_training_objective.png – combined_score vs iteration (PRIMARY demo plot)
	02_reward_components.png – 4-panel breakdown: correct / PRM / SymPy / format
	03_training_dynamics.png – GRPO loss + batch reward + batch accuracy
	04_reward_vs_eval.png – training reward vs eval score on same axis
	05_component_area.png – stacked-area chart of the 4 weighted components
	06_summary_card.png – single-panel card: all key metrics in one view

	All figures use a clean dark-on-white academic style. They are saved at
	300 dpi so they look sharp in slides and posters.
	"""

	from __future__ import annotations

	import argparse
	import json
	import sys
	from pathlib import Path
	from typing import Any, Dict, List, Optional, Tuple

	import matplotlib
	matplotlib.use("Agg") # headless — no display needed on training servers
	import matplotlib.pyplot as plt
	import matplotlib.ticker as mtick
	import numpy as np


	# ── Style ────────────────────────────────────────────────────────────────────

	# Hex colour for every plotted series, keyed by a short series name.
	# The plot functions below look colours up here so a series keeps the
	# same colour in every figure.
	PALETTE: Dict[str, str] = {
	    "combined": "#2563EB",   # blue    -> combined training-objective score
	    "correct": "#16A34A",    # green   -> correctness
	    "prm": "#DC2626",        # red     -> PRM step quality
	    "sympy": "#D97706",      # amber   -> SymPy verification
	    "fmt": "#7C3AED",        # violet  -> format
	    "reward": "#0891B2",     # cyan    -> mean batch reward
	    "loss": "#64748B",       # slate   -> loss
	    "batch_acc": "#059669",  # emerald -> batch accuracy
	}

	# Global matplotlib defaults applied once at import time; every figure
	# created by this module inherits them.
	plt.rcParams.update({
	    # Resolution: on-screen figure dpi vs. dpi used when writing files.
	    "figure.dpi": 150,
	    "savefig.dpi": 300,
	    # Typography.
	    "font.family": "DejaVu Sans",
	    "axes.labelsize": 11,
	    "axes.titlesize": 13,
	    "legend.fontsize": 9,
	    "xtick.labelsize": 9,
	    "ytick.labelsize": 9,
	    # Frame: drop the top/right spines.
	    "axes.spines.top": False,
	    "axes.spines.right": False,
	    # Light dashed background grid.
	    "axes.grid": True,
	    "grid.alpha": 0.3,
	    "grid.linestyle": "--",
	})


	# ── Data loading ─────────────────────────────────────────────────────────────

	def _load(path: Path) -> List[Dict[str, Any]]:
	    """Read a JSON-Lines metrics file into a list of row dicts.

	    Blank lines are ignored.  A line that is not valid JSON (for example a
	    trailing line that is only partially written while a run is still in
	    progress) or whose JSON value is not an object is skipped with a warning
	    on stderr, so one bad line does not abort the whole plotting run.

	    Args:
	        path: Location of the JSONL file to read (opened as UTF-8).

	    Returns:
	        The parsed rows, in file order.

	    Raises:
	        OSError: If the file cannot be opened or read.
	    """
	    rows: List[Dict[str, Any]] = []
	    with path.open(encoding="utf-8") as fh:
	        for lineno, line in enumerate(fh, start=1):
	            line = line.strip()
	            if not line:
	                continue
	            try:
	                record = json.loads(line)
	            except json.JSONDecodeError as exc:
	                print(
	                    f"[plot] Skipping malformed line {lineno} in {path}: {exc}",
	                    file=sys.stderr,
	                )
	                continue
	            # Downstream code calls ``row.get(...)``, so only objects are usable.
	            if not isinstance(record, dict):
	                print(
	                    f"[plot] Skipping non-object line {lineno} in {path}",
	                    file=sys.stderr,
	                )
	                continue
	            rows.append(record)
	    return rows


	def _field(rows: List[Dict], key: str) -> Tuple[List[int], List[float]]:
	    """Return parallel ``(iterations, values)`` lists for rows with a usable *key*.

	    A row contributes only when its *key* value is present, non-empty,
	    convertible to ``float`` and not NaN, and its ``"iteration"`` entry exists
	    and is convertible to ``int``.  Both conversions are done before either
	    list is touched, so the two returned lists always have equal length.

	    Args:
	        rows: Metric rows (dicts) as loaded from the JSONL file.
	        key: Name of the metric to extract.

	    Returns:
	        A tuple ``(iters, vals)`` in row order.
	    """
	    iters: List[int] = []
	    vals: List[float] = []
	    for r in rows:
	        v = r.get(key)
	        if v is None or v == "":
	            continue
	        try:
	            # Convert both before appending: a failure on either must not
	            # leave one list longer than the other.
	            iteration = int(r["iteration"])
	            value = float(v)
	        except (KeyError, TypeError, ValueError, OverflowError):
	            continue
	        if np.isnan(value):
	            continue
	        iters.append(iteration)
	        vals.append(value)
	    return iters, vals


	# ── Individual plots ─────────────────────────────────────────────────────────

	def plot_training_objective(rows: List[Dict], out: Path) -> None:
	    """Plot 01: ``combined_score`` against iteration, saved to *out*.

	    Draws a single line with a light fill underneath and labels the first
	    and last data points with their values.  Nothing is written when no row
	    carries a usable ``combined_score``.

	    Args:
	        rows: Metric rows as loaded from the JSONL file.
	        out: Destination path of the PNG.
	    """
	    steps, scores = _field(rows, "combined_score")
	    if len(steps) == 0:
	        return

	    line_color = PALETTE["combined"]
	    fig, ax = plt.subplots(figsize=(9, 5))
	    ax.plot(
	        steps, scores,
	        color=line_color, linewidth=2.5,
	        marker="o", markersize=5,
	        label="Training-objective score",
	    )
	    ax.fill_between(steps, scores, alpha=0.12, color=line_color)

	    # Label the two endpoints (first, then last) with their numeric value.
	    for idx in (0, -1):
	        ax.annotate(
	            f"{scores[idx]:.3f}",
	            (steps[idx], scores[idx]),
	            textcoords="offset points",
	            xytext=(8, 6),
	            fontsize=8,
	            color=line_color,
	        )

	    title = (
	        "GRPO Training — Combined Reward Score\n"
	        "0.60 × correct + 0.15 × PRM + 0.15 × SymPy + 0.10 × format"
	    )
	    ax.set_title(title, fontsize=12)
	    ax.set_xlabel("Iteration")
	    ax.set_ylabel("Score (0 – 1)")
	    ax.set_ylim(0, 1.05)
	    # Values are fractions in [0, 1]; show the ticks as percentages.
	    ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1, decimals=0))
	    ax.legend(loc="lower right")

	    fig.tight_layout()
	    fig.savefig(out)
	    plt.close(fig)
	    print(f" saved {out.name}")


	def plot_reward_components(rows: List[Dict], out: Path) -> None:
	    """Plot 02: 2x2 grid with one panel per reward component, saved to *out*.

	    Each panel shows one metric over iterations and its title carries the
	    component weight plus the change between the first and last value
	    (``Δ``), formatted as a signed percentage.  A panel whose metric has no
	    usable data is hidden.

	    Args:
	        rows: Metric rows as loaded from the JSONL file.
	        out: Destination path of the PNG.
	    """
	    # (metric key, PALETTE key, panel title, weight label)
	    specs = [
	        ("correct_rate", "correct", "Correctness (gt_match)", "60 %"),
	        ("prm_mean", "prm", "PRM Step Quality", "15 %"),
	        ("sympy_mean", "sympy", "SymPy Verification", "15 %"),
	        ("format_mean", "fmt", "Format Compliance", "10 %"),
	    ]

	    fig, axes = plt.subplots(2, 2, figsize=(12, 7), sharex=False)
	    axes = axes.flatten()

	    for ax, (key, pal, title, weight) in zip(axes, specs):
	        xi, xv = _field(rows, key)
	        if not xi:
	            ax.set_visible(False)
	            continue
	        ax.plot(xi, xv, color=PALETTE[pal], linewidth=2,
	                marker="o", markersize=4)
	        ax.fill_between(xi, xv, alpha=0.12, color=PALETTE[pal])
	        ax.set_xlabel("Iteration")
	        ax.set_ylabel("Score")
	        ax.set_ylim(0, 1.05)
	        ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1, decimals=0))

	        # The "+" flag in the format spec already emits the sign for both
	        # positive and negative deltas; adding one by hand would double it.
	        delta = xv[-1] - xv[0]
	        ax.set_title(
	            f"{title} (weight {weight}) Δ={delta:+.1%}",
	            fontsize=10,
	        )

	    fig.suptitle("Reward Component Breakdown over Training", fontsize=13, y=1.01)
	    fig.tight_layout()
	    fig.savefig(out, bbox_inches="tight")
	    plt.close(fig)
	    print(f" saved {out.name}")


	def plot_training_dynamics(rows: List[Dict], out: Path) -> None:
	    """Plot 03: ``loss``, ``mean_reward`` and ``batch_accuracy`` stacked vertically.

	    The three panels share the x axis.  A panel whose metric has no usable
	    data is left empty; the figure is written to *out* either way.

	    Args:
	        rows: Metric rows as loaded from the JSONL file.
	        out: Destination path of the PNG.
	    """
	    # (metric key, PALETTE key, y label, panel title, is a 0..1 fraction)
	    panels = (
	        ("loss", "loss", "GRPO Loss", "Training Loss", False),
	        ("mean_reward", "reward", "Reward", "Mean Batch Reward", True),
	        ("batch_accuracy", "batch_acc", "Accuracy",
	         "Batch Accuracy (training rollouts)", True),
	    )

	    fig, axes = plt.subplots(3, 1, figsize=(10, 8), sharex=True)

	    for ax, (key, pal, ylabel, title, is_fraction) in zip(axes, panels):
	        steps, series = _field(rows, key)
	        if series:
	            color = PALETTE[pal]
	            ax.plot(steps, series, color=color, linewidth=1.8)
	            ax.fill_between(steps, series, alpha=0.1, color=color)
	            ax.set_ylabel(ylabel)
	            ax.set_title(title, fontsize=11)
	            if is_fraction:
	                # Fraction-valued metrics get a fixed range and percent ticks.
	                ax.set_ylim(0, 1.05)
	                ax.yaxis.set_major_formatter(
	                    mtick.PercentFormatter(xmax=1, decimals=0)
	                )
	            else:
	                # Loss panel: dashed reference line at zero.
	                ax.axhline(0, color="black", linewidth=0.8,
	                           linestyle="--", alpha=0.4)
	        ax.set_xlabel("Iteration")

	    fig.suptitle("GRPO Training Dynamics", fontsize=13)
	    fig.tight_layout()
	    fig.savefig(out)
	    plt.close(fig)
	    print(f" saved {out.name}")


	def plot_reward_vs_eval(rows: List[Dict], out: Path) -> None:
	    """Plot 04: overlay ``mean_reward`` and ``combined_score`` on one axis.

	    ``mean_reward`` is drawn as a thin filled line; ``combined_score`` as a
	    thicker line with diamond markers, each point labelled with its value.
	    Either series is drawn only when it has data; the figure is written to
	    *out* regardless.

	    Args:
	        rows: Metric rows as loaded from the JSONL file.
	        out: Destination path of the PNG.
	    """
	    reward_steps, reward_vals = _field(rows, "mean_reward")
	    eval_steps, eval_vals = _field(rows, "combined_score")
	    reward_color = PALETTE["reward"]
	    eval_color = PALETTE["combined"]

	    fig, ax = plt.subplots(figsize=(10, 5))

	    if reward_vals:
	        ax.plot(reward_steps, reward_vals, color=reward_color,
	                linewidth=1.4, alpha=0.7, label="Batch reward (training)")
	        ax.fill_between(reward_steps, reward_vals, alpha=0.06,
	                        color=reward_color)

	    if eval_vals:
	        ax.plot(eval_steps, eval_vals, color=eval_color, linewidth=2.5,
	                marker="D", markersize=6,
	                label="Eval score (held-out GSM8K)")
	        # Print the numeric value just above every eval marker.
	        for point in zip(eval_steps, eval_vals):
	            ax.annotate(
	                f"{point[1]:.3f}",
	                point,
	                textcoords="offset points",
	                xytext=(0, 8),
	                ha="center",
	                fontsize=7,
	                color=eval_color,
	            )

	    ax.set_title("Training Reward vs Held-Out Eval Score", fontsize=12)
	    ax.set_xlabel("Iteration")
	    ax.set_ylabel("Score (0 – 1)")
	    ax.set_ylim(0, 1.05)
	    ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1, decimals=0))
	    ax.legend()

	    fig.tight_layout()
	    fig.savefig(out)
	    plt.close(fig)
	    print(f" saved {out.name}")


	def plot_component_area(rows: List[Dict], out: Path) -> None:
	    """Plot 05: stacked area of the four weighted reward components.

	    The x axis is the sorted set of iterations that have a usable
	    ``combined_score``.  For each of those iterations every component value
	    is multiplied by its weight (0.60 / 0.15 / 0.15 / 0.10) and stacked; a
	    component with no usable value at an iteration contributes 0.  The
	    ``combined_score`` itself is overlaid as a dashed black line.

	    All series are keyed by iteration before plotting, so duplicate or
	    out-of-order iterations in the file cannot misalign the x and y arrays
	    (for a duplicated iteration the last usable value wins).

	    Nothing is written when no row carries a usable ``combined_score``.

	    Args:
	        rows: Metric rows as loaded from the JSONL file.
	        out: Destination path of the PNG.
	    """
	    ei, ev_combined = _field(rows, "combined_score")
	    if not ei:
	        return

	    # (PALETTE key, metric key, weight, legend label)
	    components = (
	        ("correct", "correct_rate", 0.60, "Correct (×0.60)"),
	        ("prm", "prm_mean", 0.15, "PRM (×0.15)"),
	        ("sympy", "sympy_mean", 0.15, "SymPy (×0.15)"),
	        ("fmt", "format_mean", 0.10, "Format (×0.10)"),
	    )

	    # Key everything by iteration so x and every y series share one index.
	    combined_by_iter = dict(zip(ei, ev_combined))
	    iters_sorted = sorted(combined_by_iter)
	    combined = [combined_by_iter[it] for it in iters_sorted]

	    stacked = []
	    for _, field, weight, _ in components:
	        comp_iters, comp_vals = _field(rows, field)
	        by_iter = dict(zip(comp_iters, comp_vals))
	        stacked.append([by_iter.get(it, 0.0) * weight for it in iters_sorted])

	    x = np.array(iters_sorted)
	    arr = np.array(stacked)

	    fig, ax = plt.subplots(figsize=(10, 5))
	    labels = [label for _, _, _, label in components]
	    colors = [PALETTE[pal] for pal, _, _, _ in components]
	    ax.stackplot(x, arr, labels=labels, colors=colors, alpha=0.75)

	    ax.plot(x, combined, color="black", linewidth=1.5,
	            linestyle="--", label="Combined score", zorder=5)

	    ax.set_xlabel("Iteration")
	    ax.set_ylabel("Weighted contribution to score")
	    ax.set_ylim(0, 1.0)
	    ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1, decimals=0))
	    ax.set_title("Contribution of Each Reward Component (Stacked)", fontsize=12)
	    ax.legend(loc="lower right", ncol=2)
	    fig.tight_layout()
	    fig.savefig(out)
	    plt.close(fig)
	    print(f" saved {out.name}")


	def plot_summary_card(rows: List[Dict], run_name: str, out: Path) -> None:
	    """Plot 06: 2x3 grid of the key metrics on one figure, saved to *out*.

	    Panels, in order: ``combined_score``, ``correct_rate``, ``prm_mean``,
	    ``sympy_mean``, ``format_mean`` and ``loss``.  Each metric is plotted
	    against the iterations of the rows that actually carry it, so metrics
	    logged on different rows cannot produce mismatched x/y lengths.  The last
	    value of each series is annotated; a panel with no usable data is hidden.

	    Args:
	        rows: Metric rows as loaded from the JSONL file.
	        run_name: Run label shown in the figure title.
	        out: Destination path of the PNG.
	    """
	    # (metric key, PALETTE key, panel title, is a 0..1 fraction)
	    panels = (
	        ("combined_score", "combined", "Training-Objective Score", True),
	        ("correct_rate", "correct", "Correctness Rate", True),
	        ("prm_mean", "prm", "PRM Step Quality", True),
	        ("sympy_mean", "sympy", "SymPy Verification", True),
	        ("format_mean", "fmt", "Format Compliance", True),
	        ("loss", "loss", "GRPO Loss", False),
	    )

	    fig, axes = plt.subplots(2, 3, figsize=(15, 8))

	    for ax, (key, pal, title, pct) in zip(axes.flatten(), panels):
	        iters, vals = _field(rows, key)
	        if not iters:
	            ax.set_visible(False)
	            continue
	        color = PALETTE[pal]
	        ax.plot(iters, vals, color=color, linewidth=2, marker="o", markersize=4)
	        ax.fill_between(iters, vals, alpha=0.12, color=color)
	        ax.set_title(title, fontsize=11, fontweight="bold")
	        ax.set_xlabel("Iteration", fontsize=9)
	        if pct:
	            # Fraction-valued metrics get a fixed range and percent ticks.
	            ax.set_ylim(0, 1.05)
	            ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1, decimals=0))
	        # Label the most recent value next to the final point.
	        ax.annotate(f"{vals[-1]:.3f}", (iters[-1], vals[-1]),
	                    textcoords="offset points", xytext=(6, 4),
	                    fontsize=8, color=color)

	    fig.suptitle(f"GRPO Training Summary — {run_name}", fontsize=14, fontweight="bold")
	    fig.tight_layout()
	    fig.savefig(out, bbox_inches="tight")
	    plt.close(fig)
	    print(f" saved {out.name}")


	# ── CLI ──────────────────────────────────────────────────────────────────────

	def find_latest_metrics() -> Optional[Path]:
	    """Locate the newest ``metrics.jsonl`` below ``checkpoints/grpo``.

	    The directory is resolved relative to the current working directory and
	    searched recursively; "newest" means the largest modification time.

	    Returns:
	        The path of the most recently modified match, or ``None`` when the
	        directory is missing or contains no ``metrics.jsonl``.
	    """
	    root = Path("checkpoints/grpo")
	    if not root.exists():
	        return None

	    newest: Optional[Path] = None
	    newest_mtime = 0.0
	    for candidate in root.rglob("metrics.jsonl"):
	        mtime = candidate.stat().st_mtime
	        # ">=" keeps the later-visited file on equal timestamps.
	        if newest is None or mtime >= newest_mtime:
	            newest, newest_mtime = candidate, mtime
	    return newest


	def generate_plots(metrics_path: Path, out_dir: Optional[Path] = None) -> Path:
	    """Render all six plots for one metrics file.

	    Args:
	        metrics_path: Path to the JSONL metrics file.
	        out_dir: Directory for the PNGs; when omitted, a ``plots`` directory
	            next to *metrics_path* is used.  It is created if missing.

	    Returns:
	        The directory the plots were written to, or the parent directory of
	        *metrics_path* when the file yielded no rows (nothing is plotted).
	    """
	    rows = _load(metrics_path)
	    run_dir = metrics_path.parent
	    if not rows:
	        print(f"[plot] No data in {metrics_path}", file=sys.stderr)
	        return run_dir

	    if not out_dir:
	        out_dir = run_dir / "plots"
	    out_dir.mkdir(parents=True, exist_ok=True)

	    # The run name is the name of the directory that holds the metrics file.
	    run_name = run_dir.name

	    print(f"[plot] Generating plots for run '{run_name}' ({len(rows)} iterations)")
	    print(f"[plot] Output → {out_dir}")

	    # Plots 01-05 share the (rows, out) signature.
	    simple_plots = (
	        (plot_training_objective, "01_training_objective.png"),
	        (plot_reward_components, "02_reward_components.png"),
	        (plot_training_dynamics, "03_training_dynamics.png"),
	        (plot_reward_vs_eval, "04_reward_vs_eval.png"),
	        (plot_component_area, "05_component_area.png"),
	    )
	    for draw, filename in simple_plots:
	        draw(rows, out_dir / filename)
	    # Plot 06 additionally needs the run name for its title.
	    plot_summary_card(rows, run_name, out_dir / "06_summary_card.png")

	    # Counts every PNG present in the directory, not only those just written.
	    png_count = sum(1 for _ in out_dir.glob("*.png"))
	    print(f"[plot] Done — {png_count} PNGs in {out_dir}")
	    return out_dir


	def main() -> None:
	    """Command-line entry point: parse arguments and generate the plots.

	    The metrics file comes from ``--latest`` (auto-discovery, which takes
	    precedence) or from the positional argument.  Exits with status 1 when
	    neither is given (after printing the help text), when auto-discovery
	    finds nothing, or when the chosen file does not exist.
	    """
	    parser = argparse.ArgumentParser(
	        description="Generate demo plots from a GRPO metrics.jsonl file."
	    )
	    parser.add_argument(
	        "metrics_jsonl",
	        nargs="?",
	        type=Path,
	        default=None,
	        help="Path to metrics.jsonl produced by run_grpo_training.py",
	    )
	    parser.add_argument(
	        "--latest",
	        action="store_true",
	        help="Auto-discover the most recent metrics.jsonl under checkpoints/grpo/",
	    )
	    parser.add_argument(
	        "--out-dir",
	        type=Path,
	        default=None,
	        help="Directory to write PNG files (default: <metrics_dir>/plots/)",
	    )
	    args = parser.parse_args()

	    # No source selected at all: show usage and bail out.
	    if not args.latest and not args.metrics_jsonl:
	        parser.print_help()
	        sys.exit(1)

	    if args.latest:
	        path = find_latest_metrics()
	        if path is None:
	            print("No metrics.jsonl found under checkpoints/grpo/", file=sys.stderr)
	            sys.exit(1)
	        print(f"[plot] Auto-selected {path}")
	    else:
	        path = args.metrics_jsonl

	    if not path.exists():
	        print(f"File not found: {path}", file=sys.stderr)
	        sys.exit(1)

	    generate_plots(path, args.out_dir)


	# Run the CLI only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()