"""
Generate the "training-evidence" reward chart without a GPU.
This script runs the Random and Heuristic policies through the 5
elite scenario families implemented in ``training/scenario_hooks.py``
(basic_containment / rag_grounding / executive_alignment /
silo_breaker / stealth_adaptive) and renders a single PNG showing:
- Per-family mean episode reward (Random vs Heuristic), with error bars.
- A "lift" annotation showing how much the heuristic policy beats
random in each family, i.e. the reward signal the GRPO trainer
has to climb.
The PNG is saved as ``evidence_scenario_rewards.png`` at repo root.
The README references it as an honest reward-curve substitute when
no GPU run is available.
Run:
python scripts/generate_training_evidence.py
"""
from __future__ import annotations
import random
import sys
from pathlib import Path
from statistics import mean, stdev
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
# Make repo root importable (script is run as `python scripts/...`).
REPO_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(REPO_ROOT))
from immunoorg.environment import ImmunoOrgEnvironment # noqa: E402
from immunoorg.models import ( # noqa: E402
ActionType,
DiagnosticAction,
ImmunoAction,
StrategicAction,
TacticalAction,
)
from training.dataset_generator import DatasetConfig, DatasetGenerator # noqa: E402
from training.scenario_hooks import ( # noqa: E402
apply_scenario_hooks,
attach_hooks,
training_step_penalty,
)
# ─── Policies ─────────────────────────────────────────────────────────────────
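# Both policies share the signature ``policy(seed, obs, env) -> ImmunoAction``;
# ``run_scenario`` below calls them once per environment step with a
# deterministic per-step seed (scenario seed + step index).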
def random_policy(seed, obs, env):
    """Uniform-random baseline: sample an action type, then a random action.

    ``seed`` is an int (scenario seed + step index), so every rollout is
    reproducible. ``env`` is unused but kept for the shared policy signature.
    """
    rng = random.Random(seed)
    atype = rng.choice([ActionType.TACTICAL, ActionType.STRATEGIC, ActionType.DIAGNOSTIC])
    target = obs.visible_nodes[0].id if obs.visible_nodes else ""
    if atype == ActionType.TACTICAL:
        return ImmunoAction(
            action_type=atype,
            tactical_action=rng.choice(list(TacticalAction)),
            target=target,
            reasoning="random",
        )
    if atype == ActionType.STRATEGIC:
        return ImmunoAction(
            action_type=atype,
            strategic_action=rng.choice(list(StrategicAction)),
            target="dept-security",
            reasoning="random",
        )
    return ImmunoAction(
        action_type=atype,
        diagnostic_action=rng.choice(list(DiagnosticAction)),
        reasoning="random",
    )
def heuristic_policy(_seed, obs, env):
    """Phase-aware heuristic that exercises the right "winning-tier" feature
    for each scenario family (RAG forensics, no-isolate alignment, etc.)."""
phase = obs.current_phase.value
nodes = obs.visible_nodes
compromised = [n for n in nodes if n.compromised and not n.isolated]
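    # ``_active_scenario_hooks`` is assumed to be set on the env by
    # ``attach_hooks`` (see run_scenario below); default to {} when absent.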
hooks = getattr(env, "_active_scenario_hooks", {}) or {}
# RAG-grounding family: prefer SNAPSHOT_FORENSICS -> DEPLOY_PATCH chain.
if hooks.get("inject_rag_best_mitigation") and phase in ("detection", "containment"):
if compromised:
return ImmunoAction(
action_type=ActionType.TACTICAL,
tactical_action=TacticalAction.SNAPSHOT_FORENSICS,
target=compromised[0].id,
reasoning="RAG: capture forensic snapshot before patching the rootkit.",
)
# Executive alignment: never isolate; deploy IDS / patches instead.
if hooks.get("board_uptime_no_isolate") and phase == "containment":
target = compromised[0].id if compromised else (nodes[0].id if nodes else "")
return ImmunoAction(
action_type=ActionType.TACTICAL,
tactical_action=TacticalAction.DEPLOY_PATCH,
target=target,
reasoning="Board uptime directive: patch instead of isolating.",
)
# Silo-breaker: stop trying to isolate; do org refactor.
if hooks.get("force_denials_on_isolate") and phase in ("containment", "rca", "refactor"):
return ImmunoAction(
action_type=ActionType.STRATEGIC,
strategic_action=StrategicAction.ESTABLISH_DEVSECOPS,
target="dept-security",
secondary_target="dept-engineering",
reasoning="Approver keeps denying; restructure org to remove the bottleneck.",
)
# Stealth: prefer multi-step investigation (vuln scan + trace).
if hooks.get("stealthy_initial_attack") and phase == "detection":
return ImmunoAction(
action_type=ActionType.DIAGNOSTIC,
diagnostic_action=DiagnosticAction.VULNERABILITY_SCAN,
reasoning="Stealth attack: deeper scan before tactical action.",
)
# Default phase-appropriate heuristic.
if phase == "detection":
target = compromised[0].id if compromised else (nodes[0].id if nodes else "")
return ImmunoAction(
action_type=ActionType.TACTICAL,
tactical_action=TacticalAction.SCAN_LOGS,
target=target,
reasoning="Detection: scan for indicators.",
)
if phase == "containment":
if compromised:
return ImmunoAction(
action_type=ActionType.TACTICAL,
tactical_action=TacticalAction.ISOLATE_NODE,
target=compromised[0].id,
reasoning="Isolate the compromised node.",
)
return ImmunoAction(
action_type=ActionType.DIAGNOSTIC,
diagnostic_action=DiagnosticAction.TIMELINE_RECONSTRUCT,
reasoning="Reconstruct timeline.",
)
if phase == "rca":
return ImmunoAction(
action_type=ActionType.DIAGNOSTIC,
diagnostic_action=DiagnosticAction.IDENTIFY_SILO,
reasoning="Find the org silo behind the failure.",
)
if phase == "refactor":
return ImmunoAction(
action_type=ActionType.STRATEGIC,
strategic_action=StrategicAction.REDUCE_BUREAUCRACY,
target="dept-management",
reasoning="Reduce approval latency.",
)
return ImmunoAction(
action_type=ActionType.DIAGNOSTIC,
diagnostic_action=DiagnosticAction.MEASURE_ORG_LATENCY,
reasoning="Validate org improvements.",
)
# ─── Scenario rollouts ────────────────────────────────────────────────────────
def run_scenario(scenario, policy_fn, max_steps=60):
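    """Roll one scenario to termination under ``policy_fn`` and return the
    undiscounted episode return (env reward plus ``training_step_penalty``)."""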
env = ImmunoOrgEnvironment(
difficulty=int(scenario["difficulty"]),
seed=int(scenario["seed"]),
)
hooks = scenario.get("hooks") or {}
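    # Assumed hook contract: ``attach_hooks`` registers the hook config before
    # reset, and ``apply_scenario_hooks`` mutates the freshly reset state.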
attach_hooks(env, hooks)
obs = env.reset()
apply_scenario_hooks(env, hooks)
total_reward = 0.0
for step in range(min(max_steps, env.state.max_steps)):
action = policy_fn(scenario["seed"] + step, obs, env)
obs, reward, done = env.step(action)
total_reward += float(reward) + float(training_step_penalty(env, action))
if done:
break
return total_reward
def main():
print("Generating elite scenario mix (50 scenarios = 10 per family)...")
gen = DatasetGenerator(
DatasetConfig(
dataset_type="elite",
output_dir="training_runs/_evidence",
verbose=False,
compress_output=False,
)
)
scenarios = gen.generate_elite_scenario_mix_dataset(total=50)
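    # Each scenario dict is assumed to carry ``family``, ``difficulty``,
    # ``seed`` and optionally ``hooks``: the keys run_scenario relies on.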
families = [
"basic_containment",
"rag_grounding",
"executive_alignment",
"silo_breaker",
"stealth_adaptive",
]
by_family = {f: [s for s in scenarios if s["family"] == f] for f in families}
rewards = {pol: {f: [] for f in families} for pol in ("random", "heuristic")}
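    # rewards[policy][family] -> list of per-episode returns, one per scenario.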
for fam, scs in by_family.items():
print(f"\n--- {fam} ({len(scs)} scenarios) ---")
for sc in scs:
r_rand = run_scenario(sc, random_policy)
r_heur = run_scenario(sc, heuristic_policy)
rewards["random"][fam].append(r_rand)
rewards["heuristic"][fam].append(r_heur)
        print(
            f"  random    mean={mean(rewards['random'][fam]):+.2f} ± {stdev(rewards['random'][fam]):.2f}"
        )
        print(
            f"  heuristic mean={mean(rewards['heuristic'][fam]):+.2f} ± {stdev(rewards['heuristic'][fam]):.2f}"
        )
# ─── Plot ─────────────────────────────────────────────────────────────
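    # The palette approximates GitHub's dark theme so the PNG blends into a
    # dark-mode README.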
DARK_BG, CARD_BG = "#0d1117", "#161b22"
TEXT, GRID = "#c9d1d9", "#30363d"
COLOR_RAND, COLOR_HEUR = "#f78166", "#3fb950"
fig, ax = plt.subplots(figsize=(11, 5))
fig.patch.set_facecolor(DARK_BG)
ax.set_facecolor(CARD_BG)
x = list(range(len(families)))
width = 0.36
rand_means = [mean(rewards["random"][f]) for f in families]
rand_stds = [stdev(rewards["random"][f]) for f in families]
heur_means = [mean(rewards["heuristic"][f]) for f in families]
heur_stds = [stdev(rewards["heuristic"][f]) for f in families]
    ax.bar(
[i - width / 2 for i in x],
rand_means,
width,
yerr=rand_stds,
capsize=4,
color=COLOR_RAND,
alpha=0.85,
label="Random policy (untrained baseline)",
edgecolor="white",
linewidth=0.6,
)
    ax.bar(
[i + width / 2 for i in x],
heur_means,
width,
yerr=heur_stds,
capsize=4,
color=COLOR_HEUR,
alpha=0.85,
label="Heuristic policy (gold standard target for GRPO)",
edgecolor="white",
linewidth=0.6,
)
    # Lift annotations: heuristic mean minus random mean for each family,
    # placed just above the taller of the two error bars.
for i, fam in enumerate(families):
lift = heur_means[i] - rand_means[i]
y_top = max(heur_means[i] + heur_stds[i], rand_means[i] + rand_stds[i]) + 0.4
ax.annotate(
f"+{lift:.2f}" if lift >= 0 else f"{lift:.2f}",
xy=(i, y_top),
ha="center",
color=COLOR_HEUR if lift >= 0 else COLOR_RAND,
fontsize=10,
fontweight="bold",
)
ax.set_xticks(x)
ax.set_xticklabels(
[f.replace("_", "\n") for f in families], color=TEXT, fontsize=9
)
ax.set_ylabel("mean episode reward (over 10 scenarios)", color=TEXT, fontsize=10)
    ax.set_title(
        "Reward signal across the 5 elite scenario families\n"
        "(each pair shows the lift the GRPO trainer has to climb)",
        color=TEXT,
        fontsize=12,
    )
ax.tick_params(colors=TEXT, labelsize=9)
for spine in ax.spines.values():
spine.set_edgecolor(GRID)
ax.grid(True, color=GRID, linewidth=0.6, linestyle="--", alpha=0.6, axis="y")
leg = ax.legend(loc="lower right", fontsize=9)
for txt in leg.get_texts():
txt.set_color(TEXT)
leg.get_frame().set_facecolor(CARD_BG)
leg.get_frame().set_edgecolor(GRID)
plt.tight_layout()
out = REPO_ROOT / "evidence_scenario_rewards.png"
plt.savefig(out, dpi=160, bbox_inches="tight", facecolor=DARK_BG)
print(f"\nSaved: {out}")
if __name__ == "__main__":
main()