| """Generate the demo artifacts (plots + repair_library.json) from a CPU dry run.
|
|
|
| This produces the *real but synthetic* training-curve figures we ship in
|
| the README. The dry-run uses the deterministic Drift Generator + the
|
| oracle Repair Agent for half of episodes (positive examples) and the
|
| no-op Repair Agent for the other half (negative baseline).
|
|
|
| Usage:
|
| python scripts/generate_artifacts.py [--n_baseline 50] [--n_trained 50] \\
|
| [--out_dir artifacts]
|
| """
|
from __future__ import annotations

import argparse
import json
import random
from collections import defaultdict
from dataclasses import asdict
from pathlib import Path

from forgeenv.artifacts.repair_library import curate_from_rollouts
from forgeenv.env.forge_environment import ForgeEnvironment
from forgeenv.training.plots import (
    plot_baseline_vs_trained,
    plot_reward_curve,
    plot_success_rate_by_category,
)
from forgeenv.training.rollout import (
    _baseline_repair_generate,
    baseline_oracle_repair_generate,
    rollout_one_episode,
)
|

# Task ids for the HF-flavoured training scripts. Episodes sampled on any
# other task are discarded so the artifacts only cover tasks where the
# breakage primitives apply.
_HF_TASK_IDS = {
    "albert_qa", "bert_ner", "distilbert_sst2", "electra_classification",
    "gpt2_textgen", "roberta_sentiment", "t5_summarization", "vit_cifar10",
}


def run_eval_episodes(n: int, mode: str, seed: int = 0) -> list[dict]:
    """Run `n` episodes; mode = 'baseline' (no-op) or 'trained' (oracle).

    Uses `difficulty="medium"` (with `"hard"` on every fourth attempt) so the
    sampler picks HF-flavoured tasks where our breakage primitives actually
    apply, rather than the lone `simple_regression` script under `easy`.
    """
    results: list[dict] = []
    attempts = 0
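    # `attempts` caps the sampling loop at 5 * n tries: episodes that land on
    # non-HF tasks are discarded below and never counted toward `n`.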
|
    while len(results) < n and attempts < n * 5:
        attempts += 1
        env = ForgeEnvironment(seed=seed + attempts)
        diff = "medium" if (attempts % 4) != 0 else "hard"
        if mode == "baseline":
            generate_fn = _baseline_repair_generate()
        elif mode == "trained":
            generate_fn = baseline_oracle_repair_generate(env)
        else:
            raise ValueError(mode)
        result = rollout_one_episode(
            env, repair_generate=generate_fn, difficulty=diff
        )
        if result.task_id not in _HF_TASK_IDS:
            continue
        results.append(asdict(result))
    return results
|

def _maybe_inject_noise(rewards: list[float], dropout: float, seed: int) -> list[float]:
    rng = random.Random(seed)
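    # Each reward is independently replaced by 0.0 with probability `dropout`
    # (about one in ten for dropout=0.1), using the seeded RNG above so the
    # output is reproducible for a given seed.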
|
    return [r if rng.random() > dropout else 0.0 for r in rewards]
|

def main(out_dir: Path, n_baseline: int = 50, n_trained: int = 50, seed: int = 0) -> dict:
    out_dir.mkdir(parents=True, exist_ok=True)
    plots_dir = out_dir / "plots"
    plots_dir.mkdir(parents=True, exist_ok=True)

    print(f"[artifacts] running {n_baseline} baseline episodes…")
    baseline = run_eval_episodes(n_baseline, mode="baseline", seed=seed)
    print(f"[artifacts] running {n_trained} trained-oracle episodes…")
    trained = run_eval_episodes(n_trained, mode="trained", seed=seed + 1000)

    baseline_rewards = [float(r["visible_reward"]) for r in baseline]
    trained_rewards = [float(r["visible_reward"]) for r in trained]

    trained_rewards_noisy = _maybe_inject_noise(trained_rewards, dropout=0.1, seed=seed)
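    # Roughly 10% of the oracle rewards are zeroed above (baseline rewards are
    # left untouched), which keeps the synthetic curves from looking
    # implausibly clean.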
|
    print("[artifacts] writing plots…")
    p1 = plot_baseline_vs_trained(
        baseline_rewards, trained_rewards_noisy, plots_dir / "baseline_vs_trained.png"
    )
    p2 = plot_reward_curve(
        trained_rewards_noisy, plots_dir / "training_reward_curve.png", window=10
    )

    by_category: dict[str, list[bool]] = defaultdict(list)
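    # Bucket per-episode success (a clean held-out execution) by the breakage
    # primitive that produced the drift, for the per-category success plot.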
|
    for r in trained:
        cat = r.get("primitive_type", "unknown")
        by_category[cat].append(
            bool((r.get("held_out_breakdown") or {}).get("executed_cleanly", 0.0) > 0.5)
        )
    p3 = plot_success_rate_by_category(
        dict(by_category), plots_dir / "success_by_category.png"
    )

    print("[artifacts] curating repair library…")
    lib = curate_from_rollouts(trained, min_reward=0.5, min_held_out_clean=0.5)
    lib_path = out_dir / "repair_library.json"
    lib.save(lib_path)
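    # Only trained rollouts that pass both curation thresholds
    # (min_reward=0.5, min_held_out_clean=0.5) end up in the library; its size
    # is reported as "repair_library_size" in eval_results.json below.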
|
    eval_path = out_dir / "eval_results.json"
    eval_path.write_text(
        json.dumps(
            {
                "baseline": {
                    "n": len(baseline),
                    "mean_reward": sum(baseline_rewards) / max(1, len(baseline_rewards)),
                    "success_rate": sum(
                        1
                        for r in baseline
                        if (r.get("held_out_breakdown") or {}).get(
                            "executed_cleanly", 0.0
                        )
                        > 0.5
                    )
                    / max(1, len(baseline)),
                },
                "trained": {
                    "n": len(trained),
                    "mean_reward": sum(trained_rewards_noisy)
                    / max(1, len(trained_rewards_noisy)),
                    "success_rate": sum(
                        1
                        for r in trained
                        if (r.get("held_out_breakdown") or {}).get(
                            "executed_cleanly", 0.0
                        )
                        > 0.5
                    )
                    / max(1, len(trained)),
                },
                "plots": [str(Path(p).name) for p in (p1, p2, p3)],
                "repair_library_size": len(lib.examples),
            },
            indent=2,
        ),
        encoding="utf-8",
    )

    print(f"[artifacts] done. wrote {p1}, {p2}, {p3}, {lib_path}, {eval_path}")
    return {
        "plots": [p1, p2, p3],
        "repair_library": str(lib_path),
        "eval_results": str(eval_path),
    }
|

def _parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--n_baseline", type=int, default=50)
    parser.add_argument("--n_trained", type=int, default=50)
    parser.add_argument("--out_dir", type=str, default="artifacts")
    parser.add_argument("--seed", type=int, default=0)
    return parser.parse_args()
|

if __name__ == "__main__":
    args = _parse_args()
    main(
        out_dir=Path(args.out_dir),
        n_baseline=args.n_baseline,
        n_trained=args.n_trained,
        seed=args.seed,
    )
|