# immunoorg-v3 / scripts/make_hackathon_training_figure.py
# Uploaded via huggingface_hub (commit 6ac8669, verified) by hirann.
#!/usr/bin/env python3
"""
Fast evidence PNG for judges (no GPU, ~1–3 min).
Produces evidence_grpo_training.png with:
- Real per-step rewards from a short heuristic episode (env rollout).
- A second panel pointing to Colab for GRPO loss / full training curves.
This does NOT fabricate GRPO loss. It shows real environment signal + where to
find training curves (ImmunoOrg_Training_Colab.ipynb Step 4b).
"""
from __future__ import annotations
import sys
from pathlib import Path
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
REPO = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(REPO))
from immunoorg.environment import ImmunoOrgEnvironment # noqa: E402
from immunoorg.models import ( # noqa: E402
ActionType,
DiagnosticAction,
ImmunoAction,
StrategicAction,
TacticalAction,
)
def _heuristic(obs, env):
    """Tiny heuristic: same spirit as demo (contain + progress).

    Picks one action per phase from the observation:
    detection -> scan logs on a compromised (else first visible) node,
    containment -> isolate a compromised node, rca -> identify silo,
    refactor -> reduce bureaucracy, anything else -> measure org latency.

    NOTE(review): *env* is accepted but never read here — presumably kept
    for signature parity with other policies; confirm before removing.
    """
    phase = obs.current_phase.value
    visible = obs.visible_nodes
    # Nodes that are compromised but not yet isolated — the containment targets.
    hot = [node for node in visible if node.compromised and not node.isolated]

    if phase == "detection":
        if hot:
            scan_target = hot[0].id
        elif visible:
            scan_target = visible[0].id
        else:
            scan_target = ""
        return ImmunoAction(
            action_type=ActionType.TACTICAL,
            tactical_action=TacticalAction.SCAN_LOGS,
            target=scan_target,
            reasoning="evidence script",
        )

    if phase == "containment" and hot:
        return ImmunoAction(
            action_type=ActionType.TACTICAL,
            tactical_action=TacticalAction.ISOLATE_NODE,
            target=hot[0].id,
            reasoning="evidence script",
        )

    if phase == "rca":
        return ImmunoAction(
            action_type=ActionType.DIAGNOSTIC,
            diagnostic_action=DiagnosticAction.IDENTIFY_SILO,
            reasoning="evidence script",
        )

    if phase == "refactor":
        return ImmunoAction(
            action_type=ActionType.STRATEGIC,
            strategic_action=StrategicAction.REDUCE_BUREAUCRACY,
            target="dept-management",
            reasoning="evidence script",
        )

    # Fallback (also containment with nothing to isolate): cheap diagnostic probe.
    return ImmunoAction(
        action_type=ActionType.DIAGNOSTIC,
        diagnostic_action=DiagnosticAction.MEASURE_ORG_LATENCY,
        reasoning="evidence script",
    )
def main() -> None:
    """Roll out one heuristic episode and save evidence_grpo_training.png.

    Top panel: real cumulative reward from a short environment rollout.
    Bottom panel: a pointer to the Colab notebook that produces the actual
    GRPO loss curves (this script deliberately does not fabricate them).
    """
    steps, cumulative = _run_episode(max_steps=35)
    out = REPO / "evidence_grpo_training.png"
    _render_figure(steps, cumulative, out)
    print(f"Wrote {out}")


def _run_episode(max_steps: int = 35) -> tuple[list[int], list[float]]:
    """Run the heuristic policy once; return (1-based step indices, cumulative rewards).

    Fixed difficulty/seed so the resulting figure is reproducible.
    """
    env = ImmunoOrgEnvironment(difficulty=1, seed=42)
    obs = env.reset()
    steps: list[int] = []
    cumulative: list[float] = []
    total = 0.0
    for t in range(max_steps):
        action = _heuristic(obs, env)
        obs, reward, done = env.step(action)
        total += float(reward)
        steps.append(t + 1)
        cumulative.append(total)
        if done:
            break
    return steps, cumulative


def _render_figure(steps: list[int], cumulative: list[float], out: Path) -> None:
    """Render the two-panel dark-theme evidence figure and write it to *out*."""
    # GitHub-dark palette: background, panel, text, grid/spines.
    dark, card, text, grid = "#0d1117", "#161b22", "#c9d1d9", "#30363d"
    fig, (ax1, ax2) = plt.subplots(
        2, 1, figsize=(10, 7), dpi=140, height_ratios=[2.2, 1.0]
    )
    fig.patch.set_facecolor(dark)
    for ax in (ax1, ax2):
        ax.set_facecolor(card)
        ax.tick_params(colors=text)
        for spine in ax.spines.values():
            spine.set_color(grid)

    # Top panel: the real environment signal.
    ax1.plot(
        steps, cumulative, color="#3fb950", lw=2, marker="o", ms=3,
        label="cumulative reward",
    )
    ax1.set_xlabel("env step", color=text)
    ax1.set_ylabel("cumulative episode reward", color=text)
    ax1.set_title(
        "Real env rollout — heuristic policy (difficulty 1)\n"
        "GRPO in Colab learns policies that improve rewards in this same simulator",
        color=text,
        fontsize=11,
    )
    ax1.grid(True, color=grid, alpha=0.5, linestyle="--")
    ax1.legend(facecolor=card, edgecolor=grid, labelcolor=text)

    # Bottom panel: plain-text pointer to the notebook with the loss curves.
    ax2.axis("off")
    msg = (
        "GRPO training loss + logged rewards\n"
        "────────────────────────────────────\n"
        "Open: ImmunoOrg_Training_Colab.ipynb\n"
        "→ Runtime → Run all (GPU)\n"
        "→ Step 4b saves evidence_grpo_training.png\n"
        "\n"
        "This file’s top panel is a real environment signal;\n"
        "the notebook adds the optimizer loss curves judges ask for."
    )
    ax2.text(
        0.04,
        0.96,
        msg,
        transform=ax2.transAxes,
        va="top",
        ha="left",
        fontsize=10,
        color=text,
        family="monospace",
    )

    fig.tight_layout()
    fig.savefig(out, bbox_inches="tight", facecolor=dark)
    plt.close(fig)  # release the figure explicitly (good hygiene even in a one-shot script)
# Script entry point: generate the evidence figure when run directly.
if __name__ == "__main__":
    main()