#!/usr/bin/env python3
"""
AxiomForgeAI — Training Results Plots
======================================
Reads the metrics CSV from a GRPO training run and generates five focused plots
that tell the story of what improved, how self-play was earned, and why step-level
reasoning quality matters as much as final-answer accuracy.
All plots are saved to images/ as high-resolution PNGs.
Usage
-----
python scripts/plot_training_results.py
python scripts/plot_training_results.py --metrics logs/grpo/grpo_20260426_032827/metrics.csv
python scripts/plot_training_results.py --out images/
"""
from __future__ import annotations
import argparse
import csv
from pathlib import Path
from typing import Dict, List
import matplotlib
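# Select the non-interactive Agg backend so plots render headlessly (no display
# required); this happens before pyplot is imported.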
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.ticker  # explicit import; FormatStrFormatter is referenced via matplotlib.ticker below
import numpy as np
# ── Style ──────────────────────────────────────────────────────────────────────
PALETTE = {
"indigo": "#6366f1",
"pink": "#ec4899",
"cyan": "#06b6d4",
"amber": "#f59e0b",
"emerald": "#10b981",
"slate": "#94a3b8",
"red": "#ef4444",
"violet": "#8b5cf6",
"white": "#f8fafc",
"bg": "#0f172a",
"bg2": "#1e293b",
"gridline": "#1e293b",
}
plt.rcParams.update({
"figure.facecolor": PALETTE["bg"],
"axes.facecolor": PALETTE["bg"],
"axes.edgecolor": PALETTE["slate"],
"axes.labelcolor": PALETTE["white"],
"axes.titlecolor": PALETTE["white"],
"axes.titlesize": 13,
"axes.labelsize": 11,
"axes.grid": True,
"grid.color": "#1e293b",
"grid.linewidth": 0.8,
"xtick.color": PALETTE["slate"],
"ytick.color": PALETTE["slate"],
"xtick.labelsize": 9,
"ytick.labelsize": 9,
"legend.facecolor": "#1e293b",
"legend.edgecolor": PALETTE["slate"],
"legend.labelcolor": PALETTE["white"],
"legend.fontsize": 9,
"text.color": PALETTE["white"],
"font.family": "sans-serif",
"lines.linewidth": 2.0,
})
PHASE_COLORS = {
"GROUNDED_ONLY": ("#6366f120", "#6366f1"),
"SELFPLAY_RAMP": ("#10b98120", "#10b981"),
}
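# Each phase maps to (translucent background fill, solid edge color). The "20"
# suffix on the fill values is a hex alpha channel; phase_legend_patches below
# appends "40" to the edge color the same way.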
DPI = 160
IMAGES_DIR = Path("images")
DEFAULT_METRICS = (
"logs/grpo/grpo_20260426_032827/metrics.csv"
)
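# Columns this script reads from metrics.csv (names taken from the lookups below;
# a run's CSV may contain more): iteration, training_phase, mean_reward, std_reward,
# gt_match_rate, step_accuracy, batch_accuracy, lccp, sp_ratio, q_solvability,
# q_novelty, q_reward, skipped_groups, n_groups, plus eval_combined, eval_correct_rt,
# eval_step_acc, eval_lccp and eval_prm, which are only populated at eval checkpoints.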
# ── Helpers ────────────────────────────────────────────────────────────────────
def load_csv(path: str) -> List[Dict]:
    """Read the metrics CSV into a list of row dicts (all values kept as strings)."""
    with open(path, encoding="utf-8") as f:
        return [dict(r) for r in csv.DictReader(f)]
def f(row: Dict, key: str, default: float = float("nan")) -> float:
    """Parse row[key] as a float; return `default` when the value is missing, blank, or unparseable."""
    v = row.get(key, "")
    try:
        return float(v) if v != "" else default
    except (ValueError, TypeError):
        return default
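# Example (hypothetical row): f({"mean_reward": "0.8125"}, "mean_reward") -> 0.8125;
# f({"mean_reward": ""}, "mean_reward") -> nan; f({}, "missing", 0.0) -> 0.0.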
def moving_avg(values: List[float], w: int = 3) -> List[float]:
    """Trailing moving average over a window of `w` values, ignoring NaNs."""
    result = []
    for i in range(len(values)):
        lo = max(0, i - w + 1)
        chunk = [v for v in values[lo : i + 1] if not np.isnan(v)]
        result.append(float(np.mean(chunk)) if chunk else float("nan"))
    return result
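# Example: moving_avg([70.0, 80.0, 90.0], w=2) -> [70.0, 75.0, 85.0] (a trailing
# window, so early points average over fewer values; NaN entries are skipped).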
def shade_phases(ax, iters, phases):
"""Draw translucent background rectangles for each training phase."""
prev_phase, start = None, iters[0]
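    # Spans run between half-integer iteration boundaries so each phase's shading
    # is centered on its data points rather than starting at them.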
for it, ph in zip(iters, phases):
if ph != prev_phase:
if prev_phase is not None:
bg, _ = PHASE_COLORS.get(prev_phase, ("#ffffff10", "#ffffff"))
ax.axvspan(start - 0.5, it - 0.5, facecolor=bg, linewidth=0, zorder=0)
prev_phase, start = ph, it
if prev_phase is not None:
bg, _ = PHASE_COLORS.get(prev_phase, ("#ffffff10", "#ffffff"))
ax.axvspan(start - 0.5, iters[-1] + 0.5, facecolor=bg, linewidth=0, zorder=0)
def phase_legend_patches(phases):
seen = []
patches = []
for ph in phases:
if ph not in seen:
seen.append(ph)
_, edge = PHASE_COLORS.get(ph, ("#ffffff10", "#ffffff"))
label = ph.replace("_", " ").title()
patches.append(mpatches.Patch(facecolor=edge + "40", edgecolor=edge,
linewidth=1.2, label=label))
return patches
def annotate_transition(ax, x_iter, label, ypos=0.97, color="#94a3b8"):
ax.axvline(x=x_iter - 0.5, color=color, linewidth=1, linestyle="--", alpha=0.7)
ax.text(x_iter, ypos, label, transform=ax.get_xaxis_transform(),
fontsize=7.5, color=color, ha="left", va="top",
bbox=dict(facecolor=PALETTE["bg2"], edgecolor="none", pad=2))
def save(fig: plt.Figure, name: str, out: Path):
out.mkdir(parents=True, exist_ok=True)
path = out / name
fig.savefig(path, dpi=DPI, bbox_inches="tight", facecolor=fig.get_facecolor())
    print(f" ✓ {path}")
plt.close(fig)
# ══════════════════════════════════════════════════════════════════════════════
# PLOT 1 — Hero: Reasoning quality at evaluation checkpoints
# Shows four headline signals together (GSM8K accuracy, combined score, step
# accuracy, and LCCP) plus the PRM mean score as a dashed reference line.
# The message: the model doesn't just get more answers right — every step of
# the reasoning chain gets better.
# ══════════════════════════════════════════════════════════════════════════════
def plot_eval_quality(rows: List[Dict], out: Path):
eval_rows = [r for r in rows if r.get("eval_combined", "") != ""]
iters = [int(r["iteration"]) for r in eval_rows]
gsm8k_acc = [f(r, "eval_correct_rt") * 100 for r in eval_rows]
combined = [f(r, "eval_combined") * 100 for r in eval_rows]
step_acc = [f(r, "eval_step_acc") * 100 for r in eval_rows]
lccp = [f(r, "eval_lccp") * 100 for r in eval_rows]
prm = [f(r, "eval_prm") * 100 for r in eval_rows]
fig, ax = plt.subplots(figsize=(9, 5))
    fig.suptitle("Evaluation Quality Over Training — AxiomForgeAI",
fontsize=14, fontweight="bold", color=PALETTE["white"], y=1.01)
# --- lines
ax.plot(iters, gsm8k_acc, "o-", color=PALETTE["pink"], label="GSM8K Accuracy (final answer)", ms=7, zorder=5)
ax.plot(iters, combined, "s-", color=PALETTE["indigo"], label="Combined Score", ms=6, zorder=5)
ax.plot(iters, step_acc, "^-", color=PALETTE["cyan"], label="Step Accuracy (reasoning chain)", ms=6, zorder=5)
ax.plot(iters, lccp, "D-", color=PALETTE["emerald"], label="LCCP (chain integrity)", ms=6, zorder=5)
ax.plot(iters, prm, "v--", color=PALETTE["amber"], label="PRM Mean Score", ms=5, alpha=0.8, zorder=4)
# annotate best GSM8K
best_gsm = max(gsm8k_acc)
bi = gsm8k_acc.index(best_gsm)
ax.annotate(f" {best_gsm:.1f}%",
xy=(iters[bi], best_gsm), fontsize=9, color=PALETTE["pink"],
va="bottom", ha="left")
# annotate best combined
best_c = max(combined)
bci = combined.index(best_c)
ax.annotate(f" {best_c:.1f}",
xy=(iters[bci], best_c), fontsize=9, color=PALETTE["indigo"],
va="top", ha="left")
ax.set_xlabel("Training Iteration")
ax.set_ylabel("Score (%)")
ax.set_xticks(iters)
ax.set_ylim(78, 96)
ax.yaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter("%.0f%%"))
ax.legend(loc="lower right", framealpha=0.8)
    ax.set_title(
        "Four angles on quality — answer correctness, holistic score, per-step reasoning, and chain integrity",
        fontsize=9, color=PALETTE["slate"], pad=6,
    )
fig.tight_layout()
save(fig, "plot1_eval_quality.png", out)
# ══════════════════════════════════════════════════════════════════════════════
# PLOT 2 — Training Journey: full 30-iteration timeline with phase shading
# Shows mean reward, GT match rate, and step accuracy over every iteration.
# Phase backgrounds show when self-play unlocked and the curriculum ramped.
# ══════════════════════════════════════════════════════════════════════════════
def plot_training_journey(rows: List[Dict], out: Path):
iters = [int(r["iteration"]) for r in rows]
phases = [r["training_phase"] for r in rows]
mean_r = [f(r, "mean_reward") * 100 for r in rows]
gt_match = [f(r, "gt_match_rate") * 100 for r in rows]
step_acc = [f(r, "step_accuracy") * 100 for r in rows]
batch_acc = [f(r, "batch_accuracy") * 100 for r in rows]
ma_reward = moving_avg(mean_r, w=4)
ma_gt = moving_avg(gt_match, w=4)
ma_step = moving_avg(step_acc, w=4)
fig, ax = plt.subplots(figsize=(11, 5))
shade_phases(ax, iters, phases)
# raw (faint)
ax.plot(iters, mean_r, alpha=0.25, color=PALETTE["indigo"], linewidth=1)
ax.plot(iters, gt_match, alpha=0.25, color=PALETTE["pink"], linewidth=1)
ax.plot(iters, step_acc, alpha=0.25, color=PALETTE["cyan"], linewidth=1)
# smoothed (bold)
ax.plot(iters, ma_reward, color=PALETTE["indigo"], linewidth=2.5, label="Mean Reward (smooth)")
ax.plot(iters, ma_gt, color=PALETTE["pink"], linewidth=2.5, label="GT Match Rate (smooth)")
ax.plot(iters, ma_step, color=PALETTE["cyan"], linewidth=2.5, label="Step Accuracy (smooth)")
# self-play transition annotation
    sp_start = next((i for i, p in enumerate(phases) if p == "SELFPLAY_RAMP"), None)
    if sp_start is not None:
        annotate_transition(ax, iters[sp_start], "Self-play\nunlocked", ypos=0.98,
                            color=PALETTE["emerald"])
ax.set_xlabel("Training Iteration")
ax.set_ylabel("Score (%)")
ax.set_ylim(55, 105)
ax.yaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter("%.0f%%"))
ax.set_xticks(range(1, max(iters) + 1, 2))
    ax.set_title("30-Iteration GRPO Training Timeline | Faint = raw · Bold = 4-iter moving average",
fontsize=9, color=PALETTE["slate"], pad=6)
    fig.suptitle("Training Journey — Reward, GT Match & Step Accuracy",
fontsize=14, fontweight="bold", color=PALETTE["white"], y=1.01)
legend_patches = phase_legend_patches(phases)
h, l = ax.get_legend_handles_labels()
ax.legend(handles=h + legend_patches, loc="lower right", framealpha=0.8, ncol=2)
fig.tight_layout()
save(fig, "plot2_training_journey.png", out)
# ══════════════════════════════════════════════════════════════════════════════
# PLOT 3 — Self-Play Success: the curriculum earning its right to generate
# Shows the self-play ratio ramping up while question quality stays high.
# The headline: by iteration 30 more than 60% of training is model-generated,
# and those questions are 95-100% solvable and genuinely novel.
# ══════════════════════════════════════════════════════════════════════════════
def plot_selfplay_success(rows: List[Dict], out: Path):
sp_rows = [r for r in rows if f(r, "q_reward") > 0]
iters = [int(r["iteration"]) for r in sp_rows]
sp_rat = [f(r, "sp_ratio") * 100 for r in sp_rows]
q_sol = [f(r, "q_solvability") * 100 for r in sp_rows]
q_nov = [f(r, "q_novelty") * 100 for r in sp_rows]
q_rew = [f(r, "q_reward") * 100 for r in sp_rows]
fig, ax1 = plt.subplots(figsize=(10, 5))
ax2 = ax1.twinx()
ax2.tick_params(axis="y", labelcolor=PALETTE["slate"])
ax2.spines["right"].set_color(PALETTE["slate"])
# self-play ramp (left axis)
ax1.fill_between(iters, sp_rat, alpha=0.18, color=PALETTE["emerald"])
ax1.plot(iters, sp_rat, "o-", color=PALETTE["emerald"], ms=6,
label="Self-play ratio", linewidth=2.5)
ax1.set_ylabel("Self-play share of training (%)", color=PALETTE["emerald"])
ax1.tick_params(axis="y", labelcolor=PALETTE["emerald"])
ax1.set_ylim(0, 80)
# question quality (right axis)
ax2.plot(iters, q_sol, "s--", color=PALETTE["cyan"], ms=5, label="Solvability", linewidth=1.8)
ax2.plot(iters, q_nov, "^--", color=PALETTE["amber"], ms=5, label="Novelty", linewidth=1.8)
ax2.plot(iters, q_rew, "D--", color=PALETTE["pink"], ms=5, label="Q-Reward", linewidth=1.8)
ax2.set_ylabel("Question quality score (%)", color=PALETTE["slate"])
ax2.set_ylim(0, 115)
# merge legends
h1, l1 = ax1.get_legend_handles_labels()
h2, l2 = ax2.get_legend_handles_labels()
ax1.legend(h1 + h2, l1 + l2, loc="upper left", framealpha=0.8)
ax1.set_xlabel("Training Iteration")
ax1.set_xticks(iters)
ax1.yaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter("%.0f%%"))
ax2.yaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter("%.0f%%"))
# annotate final sp ratio
ax1.annotate(f" {sp_rat[-1]:.0f}% self-play\n by iter {iters[-1]}",
xy=(iters[-1], sp_rat[-1]), fontsize=9, color=PALETTE["emerald"],
va="center", ha="left")
    fig.suptitle("Self-Play Curriculum — The Model Earns Its Own Training Data",
fontsize=14, fontweight="bold", color=PALETTE["white"], y=1.01)
    ax1.set_title(
        "Self-play ratio ramps from 0 → 61% · Generated questions stay 93-100% solvable throughout",
        fontsize=9, color=PALETTE["slate"], pad=6,
    )
fig.tight_layout()
save(fig, "plot3_selfplay_success.png", out)
# ══════════════════════════════════════════════════════════════════════════════
# PLOT 4 — Reward Signal Tightening: mean ± std over 30 iterations
# As the policy learns what "good" looks like, the spread between the best
# and worst solutions in a group narrows. Lower variance = more consistent
# reasoning, not lucky guessing.
# ══════════════════════════════════════════════════════════════════════════════
def plot_reward_confidence(rows: List[Dict], out: Path):
iters = [int(r["iteration"]) for r in rows]
phases = [r["training_phase"] for r in rows]
mean_r = np.array([f(r, "mean_reward") for r in rows])
std_r = np.array([f(r, "std_reward") for r in rows])
skipped = np.array([f(r, "skipped_groups", 0) for r in rows])
n_grps = np.array([f(r, "n_groups", 1) for r in rows])
skip_rt = skipped / np.maximum(n_grps, 1) * 100
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(11, 7), sharex=True,
gridspec_kw={"height_ratios": [3, 1.2]})
    fig.suptitle("Reward Confidence — Mean ± Std & Skipped Groups Over 30 Iterations",
fontsize=14, fontweight="bold", color=PALETTE["white"], y=1.01)
shade_phases(ax1, iters, phases)
ax1.fill_between(iters, (mean_r - std_r) * 100, (mean_r + std_r) * 100,
alpha=0.20, color=PALETTE["indigo"])
ax1.plot(iters, mean_r * 100, color=PALETTE["indigo"], linewidth=2.5, label="Mean reward")
ax1.plot(iters, (mean_r - std_r) * 100, "--", color=PALETTE["slate"], linewidth=1,
             alpha=0.6, label="±1 std")
ax1.plot(iters, (mean_r + std_r) * 100, "--", color=PALETTE["slate"], linewidth=1,
alpha=0.6)
# highlight the two tight-cluster peaks
    for special_iter, label in [(11, "iter 11\nstd=0.098"), (22, "iter 22\nstd=0.124")]:
        if special_iter not in iters:
            continue
        si = iters.index(special_iter)
ax1.annotate(label,
xy=(special_iter, (mean_r[si] + std_r[si]) * 100),
xytext=(special_iter + 1, (mean_r[si] + std_r[si]) * 100 + 2),
fontsize=8, color=PALETTE["amber"],
arrowprops=dict(arrowstyle="->", color=PALETTE["amber"], lw=1.2))
ax1.set_ylabel("Reward (%)")
ax1.set_ylim(55, 115)
ax1.yaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter("%.0f%%"))
h1, l1 = ax1.get_legend_handles_labels()
ax1.legend(handles=h1 + phase_legend_patches(phases), framealpha=0.8, ncol=3)
# skip-rate bar chart (bottom panel)
shade_phases(ax2, iters, phases)
ax2.bar(iters, skip_rt, color=PALETTE["red"], alpha=0.7, width=0.7, label="Skipped groups %")
ax2.set_ylabel("Skipped\ngroups (%)")
ax2.set_xlabel("Training Iteration")
ax2.set_ylim(0, 75)
ax2.set_xticks(range(1, max(iters) + 1, 2))
ax2.yaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter("%.0f%%"))
ax2.legend(loc="upper right", framealpha=0.8)
fig.tight_layout()
save(fig, "plot4_reward_confidence.png", out)
# ══════════════════════════════════════════════════════════════════════════════
# PLOT 5 — Step-Level Reasoning Quality: train vs eval
# Breaks down the two signals that measure HOW the model thinks (not just
# whether it gets the final answer right): step accuracy and LCCP.
# Train lines are noisy; eval lines show clean upward trends.
# ══════════════════════════════════════════════════════════════════════════════
def plot_reasoning_quality(rows: List[Dict], out: Path):
iters = [int(r["iteration"]) for r in rows]
phases = [r["training_phase"] for r in rows]
# training
t_step = [f(r, "step_accuracy") * 100 for r in rows]
t_lccp = [f(r, "lccp") * 100 for r in rows]
t_gt = [f(r, "gt_match_rate") * 100 for r in rows]
# eval (only at checkpoint iters)
eval_rows = [r for r in rows if r.get("eval_combined", "") != ""]
e_iters = [int(r["iteration"]) for r in eval_rows]
e_step = [f(r, "eval_step_acc") * 100 for r in eval_rows]
e_lccp = [f(r, "eval_lccp") * 100 for r in eval_rows]
# moving averages
ma_step = moving_avg(t_step, w=4)
ma_lccp = moving_avg(t_lccp, w=4)
ma_gt = moving_avg(t_gt, w=4)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 5.5))
    fig.suptitle("Step-Level Reasoning Quality — Training vs Held-Out Evaluation",
fontsize=14, fontweight="bold", color=PALETTE["white"], y=1.01)
# ── LEFT: step accuracy ──
shade_phases(ax1, iters, phases)
ax1.plot(iters, t_step, alpha=0.2, color=PALETTE["cyan"], linewidth=1)
ax1.plot(iters, ma_step, color=PALETTE["cyan"], linewidth=2.5, label="Train step acc (smooth)")
ax1.plot(iters, t_gt, alpha=0.15, color=PALETTE["pink"], linewidth=1)
ax1.plot(iters, ma_gt, color=PALETTE["pink"], linewidth=2.5, label="Train GT match (smooth)")
ax1.plot(e_iters, e_step, "o-", color=PALETTE["white"], ms=8, linewidth=2,
label="Eval step accuracy", zorder=6)
# annotate eval start/end
ax1.annotate(f"{e_step[0]:.1f}%", xy=(e_iters[0], e_step[0]),
xytext=(e_iters[0] - 0.3, e_step[0] - 1.2), fontsize=8.5,
color=PALETTE["white"], ha="right")
ax1.annotate(f"{e_step[-1]:.1f}%", xy=(e_iters[-1], e_step[-1]),
xytext=(e_iters[-1] + 0.3, e_step[-1] + 0.5), fontsize=8.5,
color=PALETTE["white"])
ax1.annotate("", xy=(e_iters[-1], e_step[-1]),
xytext=(e_iters[0], e_step[0]),
arrowprops=dict(arrowstyle="->", color=PALETTE["cyan"], lw=1.5,
connectionstyle="arc3,rad=-0.3"))
    ax1.set_title("Step Accuracy — Did each reasoning step hold up?",
fontsize=9.5, color=PALETTE["slate"], pad=5)
ax1.set_xlabel("Training Iteration")
ax1.set_ylabel("Score (%)")
ax1.set_ylim(55, 105)
ax1.set_xticks(range(1, max(iters) + 1, 3))
ax1.yaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter("%.0f%%"))
ax1.legend(handles=ax1.get_legend_handles_labels()[0] + phase_legend_patches(phases),
framealpha=0.8, ncol=1, loc="lower right")
# ── RIGHT: LCCP ──
shade_phases(ax2, iters, phases)
ax2.plot(iters, t_lccp, alpha=0.2, color=PALETTE["emerald"], linewidth=1)
ax2.plot(iters, ma_lccp, color=PALETTE["emerald"], linewidth=2.5, label="Train LCCP (smooth)")
ax2.plot(e_iters, e_lccp, "o-", color=PALETTE["white"], ms=8, linewidth=2,
label="Eval LCCP", zorder=6)
ax2.annotate(f"{e_lccp[0]:.1f}%", xy=(e_iters[0], e_lccp[0]),
xytext=(e_iters[0] - 0.3, e_lccp[0] - 1.5), fontsize=8.5,
color=PALETTE["white"], ha="right")
ax2.annotate(f"{e_lccp[-1]:.1f}%", xy=(e_iters[-1], e_lccp[-1]),
xytext=(e_iters[-1] + 0.3, e_lccp[-1] + 0.5), fontsize=8.5,
color=PALETTE["white"])
# show LCCP delta
delta = e_lccp[-1] - e_lccp[0]
    ax2.text(0.97, 0.06,
             f"Eval LCCP Δ = {delta:+.2f}pp\n(iter {e_iters[0]} → {e_iters[-1]})",
             transform=ax2.transAxes, ha="right", va="bottom",
             fontsize=8.5, color=PALETTE["emerald"],
             bbox=dict(facecolor=PALETTE["bg2"], edgecolor=PALETTE["emerald"],
                       linewidth=0.8, pad=5))
    ax2.set_title("LCCP — Did the chain of reasoning stay correct until the first error?",
fontsize=9.5, color=PALETTE["slate"], pad=5)
ax2.set_xlabel("Training Iteration")
ax2.set_ylabel("LCCP (%)")
ax2.set_ylim(55, 100)
ax2.set_xticks(range(1, max(iters) + 1, 3))
ax2.yaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter("%.0f%%"))
ax2.legend(handles=ax2.get_legend_handles_labels()[0] + phase_legend_patches(phases),
framealpha=0.8, ncol=1, loc="lower right")
fig.tight_layout()
save(fig, "plot5_reasoning_quality.png", out)
# ══════════════════════════════════════════════════════════════════════════════
# Main
# ══════════════════════════════════════════════════════════════════════════════
def parse_args():
p = argparse.ArgumentParser(description="Generate AxiomForgeAI training plots")
p.add_argument("--metrics", default=DEFAULT_METRICS,
help=f"Path to metrics.csv (default: {DEFAULT_METRICS})")
p.add_argument("--out", default="images",
help="Output directory for PNGs (default: images/)")
return p.parse_args()
def main():
args = parse_args()
out = Path(args.out)
print(f"Loading metrics from : {args.metrics}")
print(f"Saving plots to : {out}/")
print()
rows = load_csv(args.metrics)
print(f"Loaded {len(rows)} iterations.\n")
print("Generating plots …")
plot_eval_quality(rows, out)
plot_training_journey(rows, out)
plot_selfplay_success(rows, out)
plot_reward_confidence(rows, out)
plot_reasoning_quality(rows, out)
    print(f"\n✅ All 5 plots saved to {out}/")
print("\nFiles:")
for p in sorted(out.glob("plot*.png")):
print(f" {p} ({p.stat().st_size // 1024} KB)")
if __name__ == "__main__":
main()