forgeenv source snapshot for training job

a15535e verified 13 days ago

4.12 kB

	"""Matplotlib plotting helpers — produces the 3 PNGs that go into the README.

	Plots:
	1. baseline_vs_trained.png — bar chart of mean reward with per-episode points overlaid
	2. training_reward_curve.png — moving-average reward over episodes
	3. success_by_category.png — per-primitive-type success rate

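	All plots are rendered at 100 dpi on a 6x4 in canvas (7x4 in for the per-category
	chart) and saved with a tight bounding box, so the PNGs are approximately 600x400 px
	(700x400 px for the per-category chart). Every plot labels its value axis, and all
	colours come from the shared PALETTE (intended to be colour-blind-safe).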
	"""
	from __future__ import annotations

	from pathlib import Path
	from typing import Iterable

	import matplotlib

	matplotlib.use("Agg")
	import matplotlib.pyplot as plt # noqa: E402

	# Shared colour table for every plot in this module (hex RGB strings).
	PALETTE: dict[str, str] = {
	    # Bar colour for the baseline policy in the baseline-vs-trained chart.
	    "baseline": "#888888",
	    # Bar colour for the trained policy; also used for the per-category bars.
	    "trained": "#1F77B4",
	    # Colour of the smoothed reward line (keyed "ema", although the line
	    # drawn with it is a simple moving average).
	    "ema": "#D62728",
	    # Colour of the raw per-episode reward line.
	    "raw": "#1F77B4",
	}


	def _moving_average(values: list[float], window: int = 10) -> list[float]:
	if not values:
	return []
	out: list[float] = []
	cumsum = 0.0
	for i, v in enumerate(values):
	cumsum += v
	if i >= window:
	cumsum -= values[i - window]
	out.append(cumsum / min(i + 1, window))
	return out


	def plot_baseline_vs_trained(
	    baseline_rewards: list[float],
	    trained_rewards: list[float],
	    out_path: str | Path,
	    title: str = "ForgeEnv: Baseline vs Trained (50 eval episodes)",
	) -> str:
	    """Save a bar chart of mean reward with a per-episode strip overlay.

	    Two bars show the mean reward of the baseline and trained runs; the
	    individual episode rewards are drawn as small dots next to each bar.

	    Args:
	        baseline_rewards: Per-episode rewards of the baseline run. An empty
	            sequence yields a mean of 0 and no dots.
	        trained_rewards: Per-episode rewards of the trained run. An empty
	            sequence yields a mean of 0 and no dots.
	        out_path: Destination image path; missing parent directories are
	            created.
	        title: Plot title. The default mentions 50 eval episodes, so pass a
	            custom title when the episode count differs.

	    Returns:
	        ``out_path`` as a string.
	    """
	    out_path = Path(out_path)
	    out_path.parent.mkdir(parents=True, exist_ok=True)
	    fig, ax = plt.subplots(figsize=(6, 4), dpi=100)
	    # Close the figure even if drawing or saving fails, so it does not stay
	    # registered with pyplot.
	    try:
	        # max(1, len) keeps the mean defined (as 0) for an empty run.
	        means = [
	            sum(baseline_rewards) / max(1, len(baseline_rewards)),
	            sum(trained_rewards) / max(1, len(trained_rewards)),
	        ]
	        labels = ["Baseline (no-op)", "Trained (GRPO)"]
	        colors = [PALETTE["baseline"], PALETTE["trained"]]
	        bars = ax.bar(labels, means, color=colors, width=0.5, alpha=0.85)
	        ax.bar_label(bars, fmt="%.2f", padding=3)

	        # Categorical bars sit at x = 0 and x = 1; offset the dots slightly
	        # to the right so they do not cover the bar's value label.
	        for x, rewards in zip([0, 1], [baseline_rewards, trained_rewards]):
	            if rewards:
	                xs = [x + 0.18] * len(rewards)
	                ax.scatter(xs, rewards, s=8, color="black", alpha=0.4, zorder=3)

	        ax.set_ylabel("Visible verifier reward")
	        ax.set_title(title)
	        ax.grid(axis="y", linestyle=":", alpha=0.5)
	        # Start the y-axis at 0 unless a mean or an episode reward is
	        # negative. Unpacking (rather than list concatenation) accepts any
	        # iterable of rewards, not only lists.
	        ax.set_ylim(bottom=min(0, *means, *baseline_rewards, *trained_rewards))
	        fig.tight_layout()
	        fig.savefig(out_path, dpi=100, bbox_inches="tight")
	    finally:
	        plt.close(fig)
	    return str(out_path)


	def plot_reward_curve(
	    rewards: list[float],
	    out_path: str | Path,
	    window: int = 10,
	    title: str = "ForgeEnv: Repair Agent reward over training",
	) -> str:
	    """Save a line plot of per-episode reward with a moving-average overlay.

	    Episodes are numbered from 1 on the x-axis. The raw rewards are drawn as
	    a faint line; when ``rewards`` is non-empty, the moving average computed
	    by ``_moving_average`` is drawn on top of it.

	    Args:
	        rewards: Reward obtained in each training episode, in order.
	        out_path: Destination image path; missing parent directories are
	            created.
	        window: Window size passed to ``_moving_average`` and shown in the
	            legend; should be a positive integer.
	        title: Plot title.

	    Returns:
	        ``out_path`` as a string.
	    """
	    out_path = Path(out_path)
	    out_path.parent.mkdir(parents=True, exist_ok=True)
	    fig, ax = plt.subplots(figsize=(6, 4), dpi=100)
	    # Close the figure even if smoothing, drawing or saving fails, so it
	    # does not stay registered with pyplot.
	    try:
	        xs = list(range(1, len(rewards) + 1))
	        ax.plot(
	            xs,
	            rewards,
	            color=PALETTE["raw"],
	            alpha=0.35,
	            linewidth=1.0,
	            label="Per-episode",
	        )
	        if rewards:
	            ax.plot(
	                xs,
	                _moving_average(rewards, window=window),
	                color=PALETTE["ema"],
	                linewidth=2.0,
	                label=f"Moving avg (w={window})",
	            )
	        ax.set_xlabel("Episode")
	        ax.set_ylabel("Visible verifier reward")
	        ax.set_title(title)
	        ax.legend(loc="lower right")
	        ax.grid(linestyle=":", alpha=0.4)
	        fig.tight_layout()
	        fig.savefig(out_path, dpi=100, bbox_inches="tight")
	    finally:
	        plt.close(fig)
	    return str(out_path)


	def plot_success_rate_by_category(
	    by_category: dict[str, list[bool]],
	    out_path: str | Path,
	    title: str = "ForgeEnv: Repair success by primitive type",
	) -> str:
	    """Save a horizontal bar chart of the success rate for each category.

	    Args:
	        by_category: Maps a category name to its list of per-attempt
	            outcomes (``True`` for success). Bars are drawn in the mapping's
	            iteration order; a category with no outcomes gets a rate of 0.
	        out_path: Destination image path; missing parent directories are
	            created.
	        title: Plot title.

	    Returns:
	        ``out_path`` as a string.
	    """
	    out_path = Path(out_path)
	    out_path.parent.mkdir(parents=True, exist_ok=True)
	    fig, ax = plt.subplots(figsize=(7, 4), dpi=100)
	    # Close the figure even if drawing or saving fails, so it does not stay
	    # registered with pyplot.
	    try:
	        cats = list(by_category)
	        # sum() over booleans counts the successes; max(1, len) keeps the
	        # rate defined (as 0) for a category with no outcomes.
	        rates = [
	            sum(by_category[c]) / max(1, len(by_category[c])) for c in cats
	        ]
	        bars = ax.barh(cats, rates, color=PALETTE["trained"], alpha=0.85)
	        ax.bar_label(bars, fmt="%.2f", padding=3)
	        # Extend slightly past 1.0 so the label of a 100% bar stays visible.
	        ax.set_xlim(0, 1.05)
	        ax.set_xlabel("Success rate (held-out: executed_cleanly)")
	        ax.set_title(title)
	        ax.grid(axis="x", linestyle=":", alpha=0.4)
	        fig.tight_layout()
	        fig.savefig(out_path, dpi=100, bbox_inches="tight")
	    finally:
	        plt.close(fig)
	    return str(out_path)