Spaces:
Sleeping
Sleeping
#!/usr/bin/env python3
"""
AxiomForgeAI — Training Results Plots
======================================
Reads the metrics CSV from a GRPO training run and generates five focused plots
that tell the story of what improved, how self-play was earned, and why step-level
reasoning quality matters as much as final-answer accuracy.

All plots are saved to images/ as high-resolution PNGs.

Usage
-----
    python scripts/plot_training_results.py
    python scripts/plot_training_results.py --metrics logs/grpo/grpo_20260426_032827/metrics.csv
    python scripts/plot_training_results.py --out images/
"""
from __future__ import annotations

import argparse
import csv
from pathlib import Path
from typing import Dict, List

import matplotlib

matplotlib.use("Agg")  # backend must be chosen before pyplot is imported
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import matplotlib.ticker  # referenced below as matplotlib.ticker.FormatStrFormatter
import numpy as np
# ── Style ─────────────────────────────────────────────────────────────────────
# Named hex colours shared by every plot (dark theme: light text on a
# near-black background).
PALETTE = dict(
    indigo="#6366f1",
    pink="#ec4899",
    cyan="#06b6d4",
    amber="#f59e0b",
    emerald="#10b981",
    slate="#94a3b8",
    red="#ef4444",
    violet="#8b5cf6",
    white="#f8fafc",
    bg="#0f172a",
    bg2="#1e293b",
    gridline="#1e293b",
)
# Global matplotlib defaults for the dark theme, grouped by the artist they
# affect. Applied once at import time so every figure created below inherits
# them.
_FIGURE_AND_AXES_STYLE = {
    "figure.facecolor": PALETTE["bg"],
    "axes.facecolor": PALETTE["bg"],
    "axes.edgecolor": PALETTE["slate"],
    "axes.labelcolor": PALETTE["white"],
    "axes.titlecolor": PALETTE["white"],
    "axes.titlesize": 13,
    "axes.labelsize": 11,
    "axes.grid": True,
}
_GRID_AND_TICK_STYLE = {
    "grid.color": PALETTE["gridline"],
    "grid.linewidth": 0.8,
    "xtick.color": PALETTE["slate"],
    "ytick.color": PALETTE["slate"],
    "xtick.labelsize": 9,
    "ytick.labelsize": 9,
}
_LEGEND_AND_TEXT_STYLE = {
    "legend.facecolor": PALETTE["bg2"],
    "legend.edgecolor": PALETTE["slate"],
    "legend.labelcolor": PALETTE["white"],
    "legend.fontsize": 9,
    "text.color": PALETTE["white"],
    "font.family": "sans-serif",
    "lines.linewidth": 2.0,
}
plt.rcParams.update(
    {**_FIGURE_AND_AXES_STYLE, **_GRID_AND_TICK_STYLE, **_LEGEND_AND_TEXT_STYLE}
)
# Per-phase colours as (translucent fill, solid edge). The fill is the edge
# colour with a two-digit hex alpha suffix appended.
PHASE_COLORS = {
    phase: (edge + "20", edge)
    for phase, edge in (
        ("GROUNDED_ONLY", "#6366f1"),
        ("SELFPLAY_RAMP", "#10b981"),
    )
}

# Resolution used when figures are written to disk.
DPI = 160

IMAGES_DIR = Path("images")

# Metrics file used when --metrics is not given on the command line.
DEFAULT_METRICS = "logs/grpo/grpo_20260426_032827/metrics.csv"
# ── Helpers ───────────────────────────────────────────────────────────────────
def load_csv(path: str) -> List[Dict]:
    """Read the CSV file at *path* and return one plain dict per data row.

    The first line of the file supplies the keys (``csv.DictReader``); every
    value is returned as the raw string from the file.

    Args:
        path: Location of the UTF-8 encoded CSV file.

    Returns:
        A list of ``{column_name: cell_text}`` dicts in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # newline="" hands line-ending handling to the csv module, as its
    # documentation requires; without it, CR/LF sequences embedded in quoted
    # fields are rewritten by universal-newline translation.
    with open(path, encoding="utf-8", newline="") as handle:
        return [dict(record) for record in csv.DictReader(handle)]
def f(row: Dict, key: str, default: float = float("nan")) -> float:
    """Return ``row[key]`` parsed as a float, or *default* when unusable.

    *default* is returned when the key is absent, the cell is the empty
    string, or ``float()`` rejects the value with ``ValueError`` or
    ``TypeError``.
    """
    raw = row.get(key, "")
    try:
        if raw != "":
            return float(raw)
    except (ValueError, TypeError):
        pass
    return default
def moving_avg(values: List[float], w: int = 3) -> List[float]:
    """Trailing moving average over *values* that ignores NaN entries.

    Position ``i`` of the result is the mean of the non-NaN entries among
    the last *w* values ending at ``i`` (fewer near the start). A window
    with no usable entries yields NaN.
    """
    smoothed = []
    for end in range(1, len(values) + 1):
        window = values[max(0, end - w):end]
        usable = [x for x in window if not np.isnan(x)]
        if usable:
            smoothed.append(float(np.mean(usable)))
        else:
            smoothed.append(float("nan"))
    return smoothed
def shade_phases(ax, iters, phases):
    """Draw translucent background rectangles for each training phase.

    Consecutive iterations with the same phase label are merged into one
    vertical span. Spans run from half a unit before the first iteration of
    a phase to half a unit before the first iteration of the next phase;
    the last span ends half a unit after the final iteration.

    Args:
        ax: Axes-like object providing ``axvspan``.
        iters: Iteration numbers, in plotting order.
        phases: Phase label for each entry of *iters*.
    """
    if len(iters) == 0:
        # Nothing to shade; the iters[0] lookup below would raise IndexError.
        return

    def _span(phase, left, right):
        # Phases missing from PHASE_COLORS fall back to a faint white fill.
        fill, _ = PHASE_COLORS.get(phase, ("#ffffff10", "#ffffff"))
        ax.axvspan(left, right, facecolor=fill, linewidth=0, zorder=0)

    prev_phase, start = None, iters[0]
    for it, ph in zip(iters, phases):
        if ph != prev_phase:
            if prev_phase is not None:
                _span(prev_phase, start - 0.5, it - 0.5)
            prev_phase, start = ph, it
    if prev_phase is not None:
        _span(prev_phase, start - 0.5, iters[-1] + 0.5)
def phase_legend_patches(phases):
    """Build one legend patch per distinct phase, in first-seen order.

    Each patch uses the phase's edge colour from ``PHASE_COLORS`` (white for
    unknown phases) with a ``"40"`` alpha suffix as fill, and a title-cased
    label with underscores replaced by spaces.
    """
    handles = []
    # dict.fromkeys de-duplicates while keeping first-occurrence order.
    for name in dict.fromkeys(phases):
        _, outline = PHASE_COLORS.get(name, ("#ffffff10", "#ffffff"))
        handles.append(
            mpatches.Patch(
                facecolor=outline + "40",
                edgecolor=outline,
                linewidth=1.2,
                label=name.replace("_", " ").title(),
            )
        )
    return handles
def annotate_transition(ax, x_iter, label, ypos=0.97, color="#94a3b8"):
    """Mark a phase boundary with a dashed vertical line and a text label.

    The line sits half a unit left of *x_iter*. The label is placed at
    *x_iter* in data x-coordinates and at *ypos* in the y-coordinate of the
    x-axis transform.
    """
    boundary = x_iter - 0.5
    ax.axvline(x=boundary, color=color, linewidth=1, linestyle="--", alpha=0.7)

    label_box = dict(facecolor=PALETTE["bg2"], edgecolor="none", pad=2)
    ax.text(
        x_iter,
        ypos,
        label,
        transform=ax.get_xaxis_transform(),
        fontsize=7.5,
        color=color,
        ha="left",
        va="top",
        bbox=label_box,
    )
def save(fig: plt.Figure, name: str, out: Path):
    """Write *fig* to ``out / name``, report the path, and close the figure.

    The output directory is created if needed. The figure is saved at the
    module-level ``DPI`` with a tight bounding box and its own face colour.
    """
    out.mkdir(parents=True, exist_ok=True)
    target = out / name
    save_options = {
        "dpi": DPI,
        "bbox_inches": "tight",
        "facecolor": fig.get_facecolor(),
    }
    fig.savefig(target, **save_options)
    print(f" β {target}")
    plt.close(fig)
# ──────────────────────────────────────────────────────────────────────────────
# PLOT 1 — Hero: Reasoning quality at evaluation checkpoints
# Shows five signals together: GSM8K accuracy, combined score, step accuracy,
# LCCP, and the PRM mean score. The message: the model doesn't just get more
# answers right — every step of the reasoning chain gets better.
# ──────────────────────────────────────────────────────────────────────────────
def plot_eval_quality(rows: List[Dict], out: Path):
    """Plot the evaluation metrics recorded at checkpoint iterations.

    A row counts as an evaluation row when its ``eval_combined`` cell is
    present and non-empty. Five series are drawn against the iteration
    number, each scaled by 100: ``eval_correct_rt``, ``eval_combined``,
    ``eval_step_acc``, ``eval_lccp`` and ``eval_prm``. The highest GSM8K and
    combined values are annotated. The figure is written to
    ``out / "plot1_eval_quality.png"``.

    Args:
        rows: Metric rows as returned by ``load_csv``.
        out: Output directory.
    """
    eval_rows = [r for r in rows if r.get("eval_combined") not in ("", None)]
    if not eval_rows:
        # Without evaluation rows there is nothing to draw, and the
        # best-value annotations below would fail on empty sequences.
        print(" (no evaluation rows in metrics; skipping plot1_eval_quality.png)")
        return

    iters = [int(r["iteration"]) for r in eval_rows]
    gsm8k_acc = [f(r, "eval_correct_rt") * 100 for r in eval_rows]
    combined = [f(r, "eval_combined") * 100 for r in eval_rows]
    step_acc = [f(r, "eval_step_acc") * 100 for r in eval_rows]
    lccp = [f(r, "eval_lccp") * 100 for r in eval_rows]
    prm = [f(r, "eval_prm") * 100 for r in eval_rows]

    fig, ax = plt.subplots(figsize=(9, 5))
    fig.suptitle("Evaluation Quality Over Training β AxiomForgeAI",
                 fontsize=14, fontweight="bold", color=PALETTE["white"], y=1.01)

    # --- lines
    ax.plot(iters, gsm8k_acc, "o-", color=PALETTE["pink"],
            label="GSM8K Accuracy (final answer)", ms=7, zorder=5)
    ax.plot(iters, combined, "s-", color=PALETTE["indigo"],
            label="Combined Score", ms=6, zorder=5)
    ax.plot(iters, step_acc, "^-", color=PALETTE["cyan"],
            label="Step Accuracy (reasoning chain)", ms=6, zorder=5)
    ax.plot(iters, lccp, "D-", color=PALETTE["emerald"],
            label="LCCP (chain integrity)", ms=6, zorder=5)
    ax.plot(iters, prm, "v--", color=PALETTE["amber"],
            label="PRM Mean Score", ms=5, alpha=0.8, zorder=4)

    def _annotate_best(series, template, color, va):
        # Built-in max() gives order-dependent results when NaN is present
        # (missing cells parse to NaN), so pick the peak with a NaN-aware
        # argmax and skip the annotation when no value is usable.
        values = np.asarray(series, dtype=float)
        if np.isnan(values).all():
            return
        best = int(np.nanargmax(values))
        ax.annotate(template.format(values[best]),
                    xy=(iters[best], values[best]), fontsize=9, color=color,
                    va=va, ha="left")

    _annotate_best(gsm8k_acc, " {:.1f}%", PALETTE["pink"], "bottom")
    _annotate_best(combined, " {:.1f}", PALETTE["indigo"], "top")

    ax.set_xlabel("Training Iteration")
    ax.set_ylabel("Score (%)")
    ax.set_xticks(iters)
    ax.set_ylim(78, 96)
    ax.yaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter("%.0f%%"))
    ax.legend(loc="lower right", framealpha=0.8)
    ax.set_title(
        "Four angles on quality β answer correctness, holistic score, per-step reasoning, and chain integrity",
        fontsize=9, color=PALETTE["slate"], pad=6,
    )
    fig.tight_layout()
    save(fig, "plot1_eval_quality.png", out)
# ──────────────────────────────────────────────────────────────────────────────
# PLOT 2 — Training Journey: full 30-iteration timeline with phase shading
# Shows mean reward, GT match rate, and step accuracy over every iteration.
# Phase backgrounds show when self-play unlocked and the curriculum ramped.
# ──────────────────────────────────────────────────────────────────────────────
def plot_training_journey(rows: List[Dict], out: Path):
    """Plot reward, GT match rate and step accuracy for every iteration.

    Each series is scaled by 100 and drawn twice: faintly as raw values and
    boldly as a 4-iteration trailing moving average. Training phases are
    shaded in the background, and the first ``SELFPLAY_RAMP`` iteration is
    marked when the run contains one. The figure is written to
    ``out / "plot2_training_journey.png"``.

    Args:
        rows: Metric rows as returned by ``load_csv``; each must carry
            ``iteration`` and ``training_phase``.
        out: Output directory.
    """
    iters = [int(r["iteration"]) for r in rows]
    phases = [r["training_phase"] for r in rows]
    mean_r = [f(r, "mean_reward") * 100 for r in rows]
    gt_match = [f(r, "gt_match_rate") * 100 for r in rows]
    step_acc = [f(r, "step_accuracy") * 100 for r in rows]
    ma_reward = moving_avg(mean_r, w=4)
    ma_gt = moving_avg(gt_match, w=4)
    ma_step = moving_avg(step_acc, w=4)

    fig, ax = plt.subplots(figsize=(11, 5))
    shade_phases(ax, iters, phases)

    # raw (faint)
    ax.plot(iters, mean_r, alpha=0.25, color=PALETTE["indigo"], linewidth=1)
    ax.plot(iters, gt_match, alpha=0.25, color=PALETTE["pink"], linewidth=1)
    ax.plot(iters, step_acc, alpha=0.25, color=PALETTE["cyan"], linewidth=1)

    # smoothed (bold)
    ax.plot(iters, ma_reward, color=PALETTE["indigo"], linewidth=2.5,
            label="Mean Reward (smooth)")
    ax.plot(iters, ma_gt, color=PALETTE["pink"], linewidth=2.5,
            label="GT Match Rate (smooth)")
    ax.plot(iters, ma_step, color=PALETTE["cyan"], linewidth=2.5,
            label="Step Accuracy (smooth)")

    # Self-play transition annotation. A run that never reaches the
    # self-play phase has no transition to mark; supplying a default to
    # next() avoids StopIteration in that case.
    sp_start = next(
        (i for i, p in enumerate(phases) if p == "SELFPLAY_RAMP"), None
    )
    if sp_start is not None:
        annotate_transition(ax, iters[sp_start], "Self-play\nunlocked", ypos=0.98,
                            color=PALETTE["emerald"])

    ax.set_xlabel("Training Iteration")
    ax.set_ylabel("Score (%)")
    ax.set_ylim(55, 105)
    ax.yaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter("%.0f%%"))
    ax.set_xticks(range(1, max(iters) + 1, 2))
    ax.set_title("30-Iteration GRPO Training Timeline | Faint = raw Β· Bold = 4-iter moving average",
                 fontsize=9, color=PALETTE["slate"], pad=6)
    fig.suptitle("Training Journey β Reward, GT Match & Step Accuracy",
                 fontsize=14, fontweight="bold", color=PALETTE["white"], y=1.01)

    # Line handles first, then one patch per phase.
    line_handles, _ = ax.get_legend_handles_labels()
    ax.legend(handles=line_handles + phase_legend_patches(phases),
              loc="lower right", framealpha=0.8, ncol=2)
    fig.tight_layout()
    save(fig, "plot2_training_journey.png", out)
# ──────────────────────────────────────────────────────────────────────────────
# PLOT 3 — Self-Play Success: the curriculum earning its right to generate
# Shows the self-play ratio ramping up while question quality stays high.
# The headline: by iteration 30 more than 60% of training is model-generated,
# and those questions are 95-100% solvable and genuinely novel.
# ──────────────────────────────────────────────────────────────────────────────
def plot_selfplay_success(rows: List[Dict], out: Path):
    """Plot the self-play ratio against generated-question quality.

    Only rows whose ``q_reward`` parses to a value greater than zero are
    used. The left axis shows ``sp_ratio``; the right axis shows
    ``q_solvability``, ``q_novelty`` and ``q_reward``; all are scaled by
    100. The figure is written to ``out / "plot3_selfplay_success.png"``.

    Args:
        rows: Metric rows as returned by ``load_csv``.
        out: Output directory.
    """
    sp_rows = [r for r in rows if f(r, "q_reward") > 0]
    if not sp_rows:
        # No iteration produced self-play questions: nothing to draw, and
        # the final-ratio annotation below would index an empty list.
        print(" (no self-play rows in metrics; skipping plot3_selfplay_success.png)")
        return

    iters = [int(r["iteration"]) for r in sp_rows]
    sp_rat = [f(r, "sp_ratio") * 100 for r in sp_rows]
    q_sol = [f(r, "q_solvability") * 100 for r in sp_rows]
    q_nov = [f(r, "q_novelty") * 100 for r in sp_rows]
    q_rew = [f(r, "q_reward") * 100 for r in sp_rows]

    fig, ax1 = plt.subplots(figsize=(10, 5))
    ax2 = ax1.twinx()
    ax2.tick_params(axis="y", labelcolor=PALETTE["slate"])
    ax2.spines["right"].set_color(PALETTE["slate"])

    # self-play ramp (left axis)
    ax1.fill_between(iters, sp_rat, alpha=0.18, color=PALETTE["emerald"])
    ax1.plot(iters, sp_rat, "o-", color=PALETTE["emerald"], ms=6,
             label="Self-play ratio", linewidth=2.5)
    ax1.set_ylabel("Self-play share of training (%)", color=PALETTE["emerald"])
    ax1.tick_params(axis="y", labelcolor=PALETTE["emerald"])
    ax1.set_ylim(0, 80)

    # question quality (right axis)
    ax2.plot(iters, q_sol, "s--", color=PALETTE["cyan"], ms=5,
             label="Solvability", linewidth=1.8)
    ax2.plot(iters, q_nov, "^--", color=PALETTE["amber"], ms=5,
             label="Novelty", linewidth=1.8)
    ax2.plot(iters, q_rew, "D--", color=PALETTE["pink"], ms=5,
             label="Q-Reward", linewidth=1.8)
    ax2.set_ylabel("Question quality score (%)", color=PALETTE["slate"])
    ax2.set_ylim(0, 115)

    # merge the legends of both axes into one box
    h1, l1 = ax1.get_legend_handles_labels()
    h2, l2 = ax2.get_legend_handles_labels()
    ax1.legend(h1 + h2, l1 + l2, loc="upper left", framealpha=0.8)

    ax1.set_xlabel("Training Iteration")
    ax1.set_xticks(iters)
    ax1.yaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter("%.0f%%"))
    ax2.yaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter("%.0f%%"))

    # annotate final sp ratio
    ax1.annotate(f" {sp_rat[-1]:.0f}% self-play\n by iter {iters[-1]}",
                 xy=(iters[-1], sp_rat[-1]), fontsize=9, color=PALETTE["emerald"],
                 va="center", ha="left")

    fig.suptitle("Self-Play Curriculum β The Model Earns Its Own Training Data",
                 fontsize=14, fontweight="bold", color=PALETTE["white"], y=1.01)
    ax1.set_title(
        "Self-play ratio ramps from 0 β 61% Β· Generated questions stay 93-100% solvable throughout",
        fontsize=9, color=PALETTE["slate"], pad=6,
    )
    fig.tight_layout()
    save(fig, "plot3_selfplay_success.png", out)
# ──────────────────────────────────────────────────────────────────────────────
# PLOT 4 — Reward Signal Tightening: mean ± std over 30 iterations
# As the policy learns what "good" looks like, the spread between the best
# and worst solutions in a group narrows. Lower variance = more consistent
# reasoning, not lucky guessing.
# ──────────────────────────────────────────────────────────────────────────────
def plot_reward_confidence(rows: List[Dict], out: Path):
    """Plot mean reward with a one-standard-deviation band and skip rate.

    The top panel shows ``mean_reward`` with a band of ``std_reward`` on
    either side (both scaled by 100) over shaded training phases. The bottom
    panel shows ``skipped_groups / n_groups`` as a percentage bar chart. The
    figure is written to ``out / "plot4_reward_confidence.png"``.

    Args:
        rows: Metric rows as returned by ``load_csv``; each must carry
            ``iteration`` and ``training_phase``.
        out: Output directory.
    """
    iters = [int(r["iteration"]) for r in rows]
    phases = [r["training_phase"] for r in rows]
    mean_r = np.array([f(r, "mean_reward") for r in rows])
    std_r = np.array([f(r, "std_reward") for r in rows])
    skipped = np.array([f(r, "skipped_groups", 0) for r in rows])
    n_grps = np.array([f(r, "n_groups", 1) for r in rows])
    # Clamp the denominator to at least 1 so a zero group count cannot
    # divide by zero.
    skip_rt = skipped / np.maximum(n_grps, 1) * 100

    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(11, 7), sharex=True,
                                   gridspec_kw={"height_ratios": [3, 1.2]})
    fig.suptitle("Reward Confidence β Mean Β± Std & Skipped Groups Over 30 Iterations",
                 fontsize=14, fontweight="bold", color=PALETTE["white"], y=1.01)

    shade_phases(ax1, iters, phases)
    lower = (mean_r - std_r) * 100
    upper = (mean_r + std_r) * 100
    ax1.fill_between(iters, lower, upper, alpha=0.20, color=PALETTE["indigo"])
    ax1.plot(iters, mean_r * 100, color=PALETTE["indigo"], linewidth=2.5,
             label="Mean reward")
    ax1.plot(iters, lower, "--", color=PALETTE["slate"], linewidth=1,
             alpha=0.6, label="Β±1 std")
    ax1.plot(iters, upper, "--", color=PALETTE["slate"], linewidth=1,
             alpha=0.6)

    # Highlight the two tight-cluster peaks. The iteration numbers and label
    # text are fixed values, so a run that does not contain one of these
    # iterations simply gets no callout instead of raising ValueError from
    # iters.index().
    for special_iter, label in [(11, "iter 11\nstd=0.098"), (22, "iter 22\nstd=0.124")]:
        if special_iter not in iters:
            continue
        si = iters.index(special_iter)
        ax1.annotate(label,
                     xy=(special_iter, (mean_r[si] + std_r[si]) * 100),
                     xytext=(special_iter + 1, (mean_r[si] + std_r[si]) * 100 + 2),
                     fontsize=8, color=PALETTE["amber"],
                     arrowprops=dict(arrowstyle="->", color=PALETTE["amber"], lw=1.2))

    ax1.set_ylabel("Reward (%)")
    ax1.set_ylim(55, 115)
    ax1.yaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter("%.0f%%"))
    h1, _ = ax1.get_legend_handles_labels()
    ax1.legend(handles=h1 + phase_legend_patches(phases), framealpha=0.8, ncol=3)

    # skip-rate bar chart (bottom panel)
    shade_phases(ax2, iters, phases)
    ax2.bar(iters, skip_rt, color=PALETTE["red"], alpha=0.7, width=0.7,
            label="Skipped groups %")
    ax2.set_ylabel("Skipped\ngroups (%)")
    ax2.set_xlabel("Training Iteration")
    ax2.set_ylim(0, 75)
    ax2.set_xticks(range(1, max(iters) + 1, 2))
    ax2.yaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter("%.0f%%"))
    ax2.legend(loc="upper right", framealpha=0.8)
    fig.tight_layout()
    save(fig, "plot4_reward_confidence.png", out)
# ──────────────────────────────────────────────────────────────────────────────
# PLOT 5 — Step-Level Reasoning Quality: train vs eval
# Breaks down the two signals that measure HOW the model thinks (not just
# whether it gets the final answer right): step accuracy and LCCP.
# Train lines are noisy; eval lines show clean upward trends.
# ──────────────────────────────────────────────────────────────────────────────
def plot_reasoning_quality(rows: List[Dict], out: Path):
    """Plot step accuracy and LCCP for training versus evaluation.

    The left panel shows training ``step_accuracy`` and ``gt_match_rate``
    (raw and 4-iteration moving average) with ``eval_step_acc`` overlaid.
    The right panel shows training ``lccp`` with ``eval_lccp`` overlaid and
    a box giving the change in eval LCCP between the first and last
    evaluation rows. All values are scaled by 100. Evaluation rows are
    those with a non-empty ``eval_combined`` cell. The figure is written to
    ``out / "plot5_reasoning_quality.png"``.

    Args:
        rows: Metric rows as returned by ``load_csv``; each must carry
            ``iteration`` and ``training_phase``.
        out: Output directory.
    """
    iters = [int(r["iteration"]) for r in rows]
    phases = [r["training_phase"] for r in rows]

    # training
    t_step = [f(r, "step_accuracy") * 100 for r in rows]
    t_lccp = [f(r, "lccp") * 100 for r in rows]
    t_gt = [f(r, "gt_match_rate") * 100 for r in rows]

    # eval (only at checkpoint iters)
    eval_rows = [r for r in rows if r.get("eval_combined", "") != ""]
    e_iters = [int(r["iteration"]) for r in eval_rows]
    e_step = [f(r, "eval_step_acc") * 100 for r in eval_rows]
    e_lccp = [f(r, "eval_lccp") * 100 for r in eval_rows]
    # The first/last annotations index e_step and e_lccp, so they are only
    # drawn when at least one evaluation row exists.
    has_eval = bool(eval_rows)

    # moving averages
    ma_step = moving_avg(t_step, w=4)
    ma_lccp = moving_avg(t_lccp, w=4)
    ma_gt = moving_avg(t_gt, w=4)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 5.5))
    fig.suptitle("Step-Level Reasoning Quality β Training vs Held-Out Evaluation",
                 fontsize=14, fontweight="bold", color=PALETTE["white"], y=1.01)

    # -- LEFT: step accuracy --
    shade_phases(ax1, iters, phases)
    ax1.plot(iters, t_step, alpha=0.2, color=PALETTE["cyan"], linewidth=1)
    ax1.plot(iters, ma_step, color=PALETTE["cyan"], linewidth=2.5,
             label="Train step acc (smooth)")
    ax1.plot(iters, t_gt, alpha=0.15, color=PALETTE["pink"], linewidth=1)
    ax1.plot(iters, ma_gt, color=PALETTE["pink"], linewidth=2.5,
             label="Train GT match (smooth)")
    ax1.plot(e_iters, e_step, "o-", color=PALETTE["white"], ms=8, linewidth=2,
             label="Eval step accuracy", zorder=6)
    if has_eval:
        # annotate eval start/end, joined by a curved arrow
        ax1.annotate(f"{e_step[0]:.1f}%", xy=(e_iters[0], e_step[0]),
                     xytext=(e_iters[0] - 0.3, e_step[0] - 1.2), fontsize=8.5,
                     color=PALETTE["white"], ha="right")
        ax1.annotate(f"{e_step[-1]:.1f}%", xy=(e_iters[-1], e_step[-1]),
                     xytext=(e_iters[-1] + 0.3, e_step[-1] + 0.5), fontsize=8.5,
                     color=PALETTE["white"])
        ax1.annotate("", xy=(e_iters[-1], e_step[-1]),
                     xytext=(e_iters[0], e_step[0]),
                     arrowprops=dict(arrowstyle="->", color=PALETTE["cyan"], lw=1.5,
                                     connectionstyle="arc3,rad=-0.3"))
    ax1.set_title("Step Accuracy β Did each reasoning step hold up?",
                  fontsize=9.5, color=PALETTE["slate"], pad=5)
    ax1.set_xlabel("Training Iteration")
    ax1.set_ylabel("Score (%)")
    ax1.set_ylim(55, 105)
    ax1.set_xticks(range(1, max(iters) + 1, 3))
    ax1.yaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter("%.0f%%"))
    ax1.legend(handles=ax1.get_legend_handles_labels()[0] + phase_legend_patches(phases),
               framealpha=0.8, ncol=1, loc="lower right")

    # -- RIGHT: LCCP --
    shade_phases(ax2, iters, phases)
    ax2.plot(iters, t_lccp, alpha=0.2, color=PALETTE["emerald"], linewidth=1)
    ax2.plot(iters, ma_lccp, color=PALETTE["emerald"], linewidth=2.5,
             label="Train LCCP (smooth)")
    ax2.plot(e_iters, e_lccp, "o-", color=PALETTE["white"], ms=8, linewidth=2,
             label="Eval LCCP", zorder=6)
    if has_eval:
        ax2.annotate(f"{e_lccp[0]:.1f}%", xy=(e_iters[0], e_lccp[0]),
                     xytext=(e_iters[0] - 0.3, e_lccp[0] - 1.5), fontsize=8.5,
                     color=PALETTE["white"], ha="right")
        ax2.annotate(f"{e_lccp[-1]:.1f}%", xy=(e_iters[-1], e_lccp[-1]),
                     xytext=(e_iters[-1] + 0.3, e_lccp[-1] + 0.5), fontsize=8.5,
                     color=PALETTE["white"])
        # Show the LCCP delta. The "+" format flag emits the sign that
        # matches the value; a literal "+" prefix would print "+-x.xx" for
        # a regression.
        delta = e_lccp[-1] - e_lccp[0]
        ax2.text(0.97, 0.06,
                 f"Eval LCCP Ξ = {delta:+.2f}pp\n(iter {e_iters[0]} β {e_iters[-1]})",
                 transform=ax2.transAxes, ha="right", va="bottom",
                 fontsize=8.5, color=PALETTE["emerald"],
                 bbox=dict(facecolor=PALETTE["bg2"], edgecolor=PALETTE["emerald"],
                           linewidth=0.8, pad=5))
    ax2.set_title("LCCP β Did the chain of reasoning stay correct until the first error?",
                  fontsize=9.5, color=PALETTE["slate"], pad=5)
    ax2.set_xlabel("Training Iteration")
    ax2.set_ylabel("LCCP (%)")
    ax2.set_ylim(55, 100)
    ax2.set_xticks(range(1, max(iters) + 1, 3))
    ax2.yaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter("%.0f%%"))
    ax2.legend(handles=ax2.get_legend_handles_labels()[0] + phase_legend_patches(phases),
               framealpha=0.8, ncol=1, loc="lower right")
    fig.tight_layout()
    save(fig, "plot5_reasoning_quality.png", out)
# ──────────────────────────────────────────────────────────────────────────────
# Main
# ──────────────────────────────────────────────────────────────────────────────
def parse_args():
    """Parse the command-line options ``--metrics`` and ``--out``.

    Returns:
        The ``argparse.Namespace`` with ``metrics`` (path to the CSV,
        defaulting to ``DEFAULT_METRICS``) and ``out`` (output directory,
        defaulting to ``"images"``).
    """
    parser = argparse.ArgumentParser(description="Generate AxiomForgeAI training plots")
    option_table = (
        ("--metrics", DEFAULT_METRICS,
         f"Path to metrics.csv (default: {DEFAULT_METRICS})"),
        ("--out", "images",
         "Output directory for PNGs (default: images/)"),
    )
    for flag, fallback, help_text in option_table:
        parser.add_argument(flag, default=fallback, help=help_text)
    return parser.parse_args()
def main():
    """Load the metrics CSV, render all five plots, and list the PNG files."""
    args = parse_args()
    out = Path(args.out)
    print(f"Loading metrics from : {args.metrics}")
    print(f"Saving plots to : {out}/")
    print()

    rows = load_csv(args.metrics)
    print(f"Loaded {len(rows)} iterations.\n")

    print("Generating plots β¦")
    # Rendered in numbered order so the console output matches the filenames.
    plotters = (
        plot_eval_quality,
        plot_training_journey,
        plot_selfplay_success,
        plot_reward_confidence,
        plot_reasoning_quality,
    )
    for render in plotters:
        render(rows, out)

    print(f"\nβ All 5 plots saved to {out}/")
    print("\nFiles:")
    for png in sorted(out.glob("plot*.png")):
        size_kb = png.stat().st_size // 1024
        print(f" {png} ({size_kb} KB)")


if __name__ == "__main__":
    main()