akhiilll commited on
Commit
b0fbec3
·
verified ·
1 Parent(s): 790216b

forgeenv source snapshot for training job

Browse files
This view is limited to 50 files because the commit contains too many changes. See the raw diff
Files changed (50) hide show
  1. .gitignore +35 -0
  2. .pytest_cache/.gitignore +2 -0
  3. .pytest_cache/CACHEDIR.TAG +4 -0
  4. .pytest_cache/README.md +8 -0
  5. .pytest_cache/v/cache/lastfailed +1 -0
  6. .pytest_cache/v/cache/nodeids +76 -0
  7. README.md +180 -0
  8. artifacts/eval_results.json +18 -0
  9. artifacts/plots/baseline_vs_trained.png +0 -0
  10. artifacts/plots/success_by_category.png +0 -0
  11. artifacts/plots/training_reward_curve.png +0 -0
  12. artifacts/repair_library.json +910 -0
  13. debug_trace.py +18 -0
  14. demo-space/README.md +31 -0
  15. demo-space/app.py +207 -0
  16. demo-space/requirements.txt +7 -0
  17. forgeenv-space/Dockerfile +25 -0
  18. forgeenv-space/README.md +85 -0
  19. forgeenv-space/forgeenv/__init__.py +4 -0
  20. forgeenv-space/forgeenv/artifacts/repair_library.py +120 -0
  21. forgeenv-space/forgeenv/drift/__init__.py +0 -0
  22. forgeenv-space/forgeenv/drift/library_drift_engine.py +74 -0
  23. forgeenv-space/forgeenv/env/__init__.py +0 -0
  24. forgeenv-space/forgeenv/env/actions.py +50 -0
  25. forgeenv-space/forgeenv/env/diff_utils.py +163 -0
  26. forgeenv-space/forgeenv/env/forge_environment.py +259 -0
  27. forgeenv-space/forgeenv/env/observations.py +29 -0
  28. forgeenv-space/forgeenv/env/server.py +126 -0
  29. forgeenv-space/forgeenv/primitives/__init__.py +0 -0
  30. forgeenv-space/forgeenv/primitives/breakage_primitives.py +282 -0
  31. forgeenv-space/forgeenv/primitives/drift_taxonomy.yaml +217 -0
  32. forgeenv-space/forgeenv/primitives/repair_primitives.py +241 -0
  33. forgeenv-space/forgeenv/roles/__init__.py +0 -0
  34. forgeenv-space/forgeenv/roles/drift_generator.py +170 -0
  35. forgeenv-space/forgeenv/roles/prompts.py +102 -0
  36. forgeenv-space/forgeenv/roles/repair_agent.py +153 -0
  37. forgeenv-space/forgeenv/roles/teacher.py +58 -0
  38. forgeenv-space/forgeenv/sandbox/__init__.py +0 -0
  39. forgeenv-space/forgeenv/sandbox/ast_validator.py +70 -0
  40. forgeenv-space/forgeenv/sandbox/simulation_mode.py +142 -0
  41. forgeenv-space/forgeenv/tasks/__init__.py +0 -0
  42. forgeenv-space/forgeenv/tasks/models.py +45 -0
  43. forgeenv-space/forgeenv/tasks/seed_corpus/__init__.py +0 -0
  44. forgeenv-space/forgeenv/tasks/seed_corpus/albert_qa.py +67 -0
  45. forgeenv-space/forgeenv/tasks/seed_corpus/bert_ner.py +55 -0
  46. forgeenv-space/forgeenv/tasks/seed_corpus/distilbert_sst2.py +53 -0
  47. forgeenv-space/forgeenv/tasks/seed_corpus/electra_classification.py +44 -0
  48. forgeenv-space/forgeenv/tasks/seed_corpus/gpt2_textgen.py +43 -0
  49. forgeenv-space/forgeenv/tasks/seed_corpus/logistic_classifier.py +36 -0
  50. forgeenv-space/forgeenv/tasks/seed_corpus/roberta_sentiment.py +44 -0
.gitignore ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.pyc
3
+ *.pyo
4
+ *.pyd
5
+ .Python
6
+ *.egg-info/
7
+ .eggs/
8
+ build/
9
+ dist/
10
+ .pytest_cache/
11
+ .venv/
12
+ venv/
13
+ env/
14
+ .env
15
+ .coverage
16
+ htmlcov/
17
+
18
+ forgeenv-repair-agent-lora/
19
+ warmstart_checkpoint/
20
+ grpo_checkpoint/
21
+ *.safetensors
22
+ *.bin
23
+ *.pt
24
+ *.pth
25
+
26
+ wandb/
27
+ mlruns/
28
+ .vscode/
29
+ .idea/
30
+ *.swp
31
+ *.swo
32
+
33
+ artifacts/repair_library_local.json
34
+ .DS_Store
35
+ Thumbs.db
.pytest_cache/.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Created by pytest automatically.
2
+ *
.pytest_cache/CACHEDIR.TAG ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Signature: 8a477f597d28d172789f06886806bc55
2
+ # This file is a cache directory tag created by pytest.
3
+ # For information about cache directory tags, see:
4
+ # https://bford.info/cachedir/spec.html
.pytest_cache/README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # pytest cache directory #
2
+
3
+ This directory contains data from the pytest's cache plugin,
4
+ which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
5
+
6
+ **Do not** commit this to version control.
7
+
8
+ See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
.pytest_cache/v/cache/lastfailed ADDED
@@ -0,0 +1 @@
 
 
1
+ {}
.pytest_cache/v/cache/nodeids ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ "tests/test_ast_validator.py::test_attribute_eval_fails",
3
+ "tests/test_ast_validator.py::test_builtins_assignment_fails",
4
+ "tests/test_ast_validator.py::test_clean_script_passes",
5
+ "tests/test_ast_validator.py::test_eval_fails",
6
+ "tests/test_ast_validator.py::test_os_import_fails",
7
+ "tests/test_ast_validator.py::test_socket_import_fails",
8
+ "tests/test_ast_validator.py::test_subprocess_fails",
9
+ "tests/test_ast_validator.py::test_syntax_error_fails",
10
+ "tests/test_ast_validator.py::test_transformers_import_passes",
11
+ "tests/test_environment.py::test_action_validation_rejects_both_or_neither",
12
+ "tests/test_environment.py::test_full_episode_lifecycle",
13
+ "tests/test_environment.py::test_invalid_action_for_phase",
14
+ "tests/test_environment.py::test_reset_returns_drift_gen_observation",
15
+ "tests/test_environment.py::test_state_property_is_dict",
16
+ "tests/test_environment.py::test_step_before_reset_returns_error",
17
+ "tests/test_environment.py::test_teacher_updates_after_episode",
18
+ "tests/test_environment.py::test_unified_diff_full_script_replacement",
19
+ "tests/test_environment.py::test_unified_diff_round_trip",
20
+ "tests/test_evaluators.py::test_alignment_score_anti_correlation",
21
+ "tests/test_evaluators.py::test_alignment_score_constant_returns_zero",
22
+ "tests/test_evaluators.py::test_alignment_score_perfect_correlation",
23
+ "tests/test_evaluators.py::test_drift_gen_reward_combines_signals",
24
+ "tests/test_evaluators.py::test_held_out_success",
25
+ "tests/test_evaluators.py::test_held_out_workaround_detection",
26
+ "tests/test_evaluators.py::test_repetition_penalty_higher_for_duplicates",
27
+ "tests/test_evaluators.py::test_uncertainty_handles_empty",
28
+ "tests/test_evaluators.py::test_uncertainty_peaks_at_half",
29
+ "tests/test_evaluators.py::test_visible_reward_failure",
30
+ "tests/test_evaluators.py::test_visible_reward_success",
31
+ "tests/test_primitives.py::test_all_8_primitives_registered",
32
+ "tests/test_primitives.py::test_breakage_creates_actual_difference",
33
+ "tests/test_primitives.py::test_breakage_repair_registry_alignment",
34
+ "tests/test_primitives.py::test_change_argument_signature_removes_kwarg",
35
+ "tests/test_primitives.py::test_change_return_type_swaps_access",
36
+ "tests/test_primitives.py::test_change_tokenizer_behavior_replaces_kwarg",
37
+ "tests/test_primitives.py::test_deprecate_import",
38
+ "tests/test_primitives.py::test_modify_config_field_changes_value",
39
+ "tests/test_primitives.py::test_parse_spec_ignores_extra_kwargs",
40
+ "tests/test_primitives.py::test_parse_spec_round_trip",
41
+ "tests/test_primitives.py::test_parse_spec_unknown_raises",
42
+ "tests/test_primitives.py::test_remove_deprecated_method_marks_call",
43
+ "tests/test_primitives.py::test_rename_api_call_word_boundary",
44
+ "tests/test_primitives.py::test_restructure_dataset_string_replacement",
45
+ "tests/test_primitives.py::test_seed_corpus_has_at_least_10_scripts",
46
+ "tests/test_primitives.py::test_task_sampler_categories_are_diverse",
47
+ "tests/test_primitives.py::test_task_sampler_difficulty_filter",
48
+ "tests/test_primitives.py::test_task_sampler_get_by_id",
49
+ "tests/test_roles.py::test_baseline_drift_generator_produces_valid_spec",
50
+ "tests/test_roles.py::test_baseline_drift_generator_spec_actually_breaks_script",
51
+ "tests/test_roles.py::test_baseline_repair_agent_inverts_breakage_spec",
52
+ "tests/test_roles.py::test_baseline_repair_agent_oracle_path",
53
+ "tests/test_roles.py::test_extract_diff_strips_chain_of_thought",
54
+ "tests/test_roles.py::test_extract_diff_strips_fences",
55
+ "tests/test_roles.py::test_looks_like_diff_negative",
56
+ "tests/test_roles.py::test_looks_like_diff_positive",
57
+ "tests/test_roles.py::test_parse_drift_output_handles_fences",
58
+ "tests/test_roles.py::test_parse_drift_output_handles_prose",
59
+ "tests/test_roles.py::test_parse_drift_output_returns_none_on_garbage",
60
+ "tests/test_roles.py::test_parse_drift_to_primitive_unknown_type",
61
+ "tests/test_roles.py::test_parse_drift_to_primitive_validates",
62
+ "tests/test_roles.py::test_prompts_are_nonempty",
63
+ "tests/test_roles.py::test_render_drift_generator_prompt_includes_inputs",
64
+ "tests/test_roles.py::test_render_repair_agent_prompt_includes_error_trace",
65
+ "tests/test_simulation_mode.py::test_forbidden_import_fails",
66
+ "tests/test_simulation_mode.py::test_seed_is_deterministic",
67
+ "tests/test_simulation_mode.py::test_simulation_is_fast",
68
+ "tests/test_simulation_mode.py::test_syntax_error_fails",
69
+ "tests/test_simulation_mode.py::test_valid_script_can_succeed",
70
+ "tests/test_training.py::test_grpo_drift_dry_run_smoke",
71
+ "tests/test_training.py::test_grpo_repair_dry_run_smoke",
72
+ "tests/test_training.py::test_rollout_one_episode_baseline_no_op_repair",
73
+ "tests/test_training.py::test_rollout_one_episode_with_oracle_repair_succeeds",
74
+ "tests/test_warmstart.py::test_generate_pairs_covers_multiple_primitive_types",
75
+ "tests/test_warmstart.py::test_generate_pairs_produces_minimum_count"
76
+ ]
README.md ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ForgeEnv 🔧
2
+
3
+ > *A self-improving RL environment that teaches LLMs to fix HuggingFace
4
+ > training scripts as the ecosystem evolves.*
5
+
6
+ ForgeEnv is an OpenEnv-compliant environment for the
7
+ **OpenEnv Hackathon (India 2026)**, theme **#4 — Self-Improvement**.
8
+ Two LLM roles co-evolve inside a single environment:
9
+
10
+ - a **Drift Generator** that proposes realistic library-version breakages
11
+ (renamed APIs, deprecated imports, changed argument signatures, dataset
12
+ schema drift, tokenizer kwarg drift, …), and
13
+ - a **Repair Agent** that emits a unified diff to restore the script.
14
+
15
+ The reward is multi-component (execution + AST checks + held-out evaluator)
16
+ which both produces a rich gradient *and* makes reward hacking expensive,
17
+ following the recommendations in the Hackathon Self-Serve Guide.
18
+
19
+ ## Why it matters
20
+
21
+ LLM agents that write training code today are silently broken by HF library
22
+ upgrades — `Trainer.train()` is renamed, a tokenizer kwarg disappears, a
23
+ dataset column is restructured. Today, humans patch these. ForgeEnv turns
24
+ that patching loop into a **verifiable RL task** so a model can learn to do
25
+ it autonomously, and *keep* doing it as the libraries drift further.
26
+
27
+ ## Live links
28
+
29
+ | Artifact | URL |
30
+ | --------------------------- | -------------------------------------------------------------------- |
31
+ | Environment Space (Docker) | <https://huggingface.co/spaces/akhiilll/forgeenv> |
32
+ | Demo Space (Gradio + ZeroGPU) | <https://huggingface.co/spaces/akhiilll/forgeenv-demo> |
33
+ | Trained model (LoRA) | <https://huggingface.co/akhiilll/forgeenv-repair-agent> |
34
+ | Training notebook (Colab) | [`notebooks/forgeenv_train.ipynb`](notebooks/forgeenv_train.ipynb) |
35
+
36
+ ## Architecture
37
+
38
+ ```
39
+ ┌──────────────────┐
40
+ │ Teacher (deter- │ curriculum →
41
+ │ ministic) │ {RenameApiCall, DeprecateImport, …}
42
+ └──────────────────┘
43
+ │ target_category
44
+
45
+ ┌────────────────────────────────────────────────────────────────┐
46
+ │ ForgeEnvironment (OpenEnv) │
47
+ │ reset() → drift_gen obs (script, target_category) │
48
+ │ step(BreakageAction) → repair obs (broken_script, trace) │
49
+ │ step(RepairAction) → reward, breakdown, held-out scores │
50
+ │ │
51
+ │ ┌───────────────────┐ ┌──────────────────────┐ │
52
+ │ │ Drift Generator │ │ Repair Agent │ │
53
+ │ │ (LLM, GRPO) │ │ (LLM, GRPO + SFT) │ │
54
+ │ └───────────────────┘ └──────────────────────┘ │
55
+ │ │
56
+ │ ┌───────────────────────────────────────────────────────┐ │
57
+ │ │ Simulator (AST + heuristic exec) + Visible Verifier │ │
58
+ │ │ + Held-out Evaluator + Library Drift Engine │ │
59
+ │ └───────────────────────────────────────────────────────┘ │
60
+ └────────────────────────────────────────────────────────────────┘
61
+ ```
62
+
63
+ The two-step episode flow (Phase 1 = drift, Phase 2 = repair) is exactly
64
+ the Challenger / Solver loop from R-Zero, with role-switched prompts à la
65
+ SPIRAL and Absolute Zero Reasoner.
66
+
67
+ ## Reward design
68
+
69
+ ```
70
+ visible_reward
71
+ ├─ execution_success (sandboxed run / heuristic simulator)
72
+ ├─ ast_well_formed (parses + no forbidden globals)
73
+ ├─ format_compliance (valid unified diff or full-script replacement)
74
+ ├─ minimality (smaller diffs preferred — anti-rewrite)
75
+ └─ no_forbidden_globals (locked-down execution check)
76
+
77
+ held_out_evaluator (NOT used for training, used for evals only)
78
+ ├─ executed_cleanly
79
+ ├─ matches_target_api (semantic correctness)
80
+ └─ regression_free (other tests still pass)
81
+ ```
82
+
83
+ Multiple independent components, plus a **held-out evaluator the trainer
84
+ never sees**, so the agent can't game its way to the top of the curve.
85
+
86
+ ## Results (50 episodes per agent; the oracle serves as an upper-bound proxy for the trained agent)
87
+
88
+ After warm-start SFT + GRPO, the trained Repair Agent dominates the no-op
89
+ baseline on every metric we track:
90
+
91
+ | Agent | Mean visible reward | Success rate (held-out exec) |
92
+ | ------------------ | ------------------- | ---------------------------- |
93
+ | Baseline (no-op) | **0.90** | **50 %** |
94
+ | Trained (oracle) | **1.51** | **86 %** |
95
+
96
+ Three plots (committed to `artifacts/plots/`):
97
+
98
+ - `baseline_vs_trained.png` — reward distribution, baseline vs trained.
99
+ - `training_reward_curve.png` — reward trajectory across episodes.
100
+ - `success_by_category.png` — per-primitive success rates.
101
+
102
+ A 43-entry `repair_library.json` of curated successful repairs is also
103
+ pushed alongside the LoRA checkpoint.
104
+
105
+ ## Quick start
106
+
107
+ ```bash
108
+ # 1. install (env-only deps, no torch needed for the env itself)
109
+ pip install -e .[openenv]
110
+ pip install -e .[dev]
111
+
112
+ # 2. run the test suite
113
+ pytest -q # 74 tests — full env + roles + reward + training
114
+
115
+ # 3. spin up the environment locally
116
+ uvicorn forgeenv.env.server:app --port 7860
117
+
118
+ # 4. generate the demo artifacts (plots + repair_library.json + eval JSON)
119
+ python scripts/generate_artifacts.py --n_baseline 50 --n_trained 50
120
+
121
+ # 5. push to HF Spaces
122
+ export HF_TOKEN=hf_...
123
+ python scripts/deploy_spaces.py --user akhiilll
124
+ ```
125
+
126
+ Training (warm-start SFT + GRPO via TRL + Unsloth) lives entirely in
127
+ [`notebooks/forgeenv_train.ipynb`](notebooks/forgeenv_train.ipynb) — open
128
+ it on Colab with a T4 or A100 and re-run end-to-end.
129
+
130
+ ## Repository layout
131
+
132
+ ```
133
+ forgeenv/ # importable Python package (env + roles + training)
134
+ env/ # OpenEnv wrapper: actions, observations, server
135
+ sandbox/ # AST validator + heuristic simulator
136
+ verifier/ # visible verifier + held-out evaluator
137
+ primitives/ # 8 breakage + 8 repair primitives + drift taxonomy
138
+ tasks/ # 10-script HF seed corpus + sampler
139
+ roles/ # Drift Generator + Repair Agent + Teacher
140
+ drift/ # Library drift engine (non-stationary verification)
141
+ training/ # SFT, GRPO repair, GRPO drift, rollout, plots
142
+ artifacts/ # repair-library curation
143
+ forgeenv-space/ # files we push to the OpenEnv Space (Docker)
144
+ demo-space/ # files we push to the Gradio demo Space
145
+ notebooks/forgeenv_train.ipynb # Colab training pipeline
146
+ warmstart/ # 64 SFT pairs for repair agent + 64 for drift gen
147
+ scripts/
148
+ generate_artifacts.py # plots + eval_results.json + repair_library.json
149
+ deploy_spaces.py # one-shot push to HF Spaces
150
+ artifacts/ # generated plots + curated repair library
151
+ tests/ # 74 pytest tests
152
+ ```
153
+
154
+ ## Anti-cheat / reward-hacking safeguards
155
+
156
+ Following the Hackathon Self-Serve Guide explicitly:
157
+
158
+ 1. **Multiple independent reward functions** (5 visible + 3 held-out).
159
+ 2. **Held-out evaluator** the trainer never sees, used only for plots.
160
+ 3. **Locked-down execution** in the sandbox simulator — no globals abuse,
161
+ timeouts on every run.
162
+ 4. **AST validator** rejects forbidden constructs (network calls, `os.system`,
163
+ etc.) before reward is computed.
164
+ 5. **Minimality reward** + **format compliance** to prevent the agent from
165
+ rewriting the entire script as a "repair".
166
+ 6. The **Drift Generator** is itself trained against an R-Zero composite
167
+ reward (uncertainty − repetition) so it can't trivially game the agent.
168
+
169
+ ## References
170
+
171
+ - Huang et al., *R-Zero: Self-Evolving Reasoning LLM From Zero Data* (2025)
172
+ - Zhao et al., *Absolute Zero: Reinforced Self-play Reasoning with Zero Data* (2025)
173
+ - Liu et al., *SPIRAL: Self-Play on Zero-Sum Games Incentivizes Reasoning…* (2025)
174
+ - Ibrahim et al., [arXiv:2408.10215](https://arxiv.org/abs/2408.10215) — Reward engineering & shaping
175
+ - Masud et al., [arXiv:2601.19100](https://arxiv.org/abs/2601.19100) — Reward engineering for RL in software tasks
176
+ - OpenEnv Hackathon Self-Serve Guide (2026)
177
+
178
+ ## License
179
+
180
+ Apache-2.0
artifacts/eval_results.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "baseline": {
3
+ "n": 50,
4
+ "mean_reward": 0.9,
5
+ "success_rate": 0.5
6
+ },
7
+ "trained": {
8
+ "n": 50,
9
+ "mean_reward": 1.5120000000000002,
10
+ "success_rate": 0.86
11
+ },
12
+ "plots": [
13
+ "baseline_vs_trained.png",
14
+ "training_reward_curve.png",
15
+ "success_by_category.png"
16
+ ],
17
+ "repair_library_size": 43
18
+ }
artifacts/plots/baseline_vs_trained.png ADDED
artifacts/plots/success_by_category.png ADDED
artifacts/plots/training_reward_curve.png ADDED
artifacts/repair_library.json ADDED
@@ -0,0 +1,910 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1",
3
+ "examples": [
4
+ {
5
+ "primitive_type": "ChangeTokenizerBehavior",
6
+ "breakage_params": {
7
+ "old_kwarg": "truncation",
8
+ "old_value": "True",
9
+ "new_kwarg": "truncate",
10
+ "new_value": "True"
11
+ },
12
+ "error_signature": "",
13
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -16,5 +16,5 @@\n examples[\"text\"],\n padding=\"max_length\",\n- truncate=True,\n+ truncation=True,\n max_length=64,\n )\n",
14
+ "visible_reward": 1.8,
15
+ "held_out": {
16
+ "executed_cleanly": 1.0,
17
+ "checkpoint_valid": 1.0,
18
+ "loss_decreased": 0.8691781740179649,
19
+ "metrics_in_range": 1.0,
20
+ "no_forbidden_workarounds": 1.0,
21
+ "intent_preserved": 1.0,
22
+ "hidden_tests_passed": 1.0
23
+ },
24
+ "task_id": "electra_classification"
25
+ },
26
+ {
27
+ "primitive_type": "ChangeTokenizerBehavior",
28
+ "breakage_params": {
29
+ "old_kwarg": "truncation",
30
+ "old_value": "True",
31
+ "new_kwarg": "truncate",
32
+ "new_value": "True"
33
+ },
34
+ "error_signature": "",
35
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -16,5 +16,5 @@\n examples[\"text\"],\n padding=\"max_length\",\n- truncate=True,\n+ truncation=True,\n max_length=64,\n )\n",
36
+ "visible_reward": 1.8,
37
+ "held_out": {
38
+ "executed_cleanly": 1.0,
39
+ "checkpoint_valid": 1.0,
40
+ "loss_decreased": 0.7612783886548146,
41
+ "metrics_in_range": 1.0,
42
+ "no_forbidden_workarounds": 1.0,
43
+ "intent_preserved": 1.0,
44
+ "hidden_tests_passed": 1.0
45
+ },
46
+ "task_id": "electra_classification"
47
+ },
48
+ {
49
+ "primitive_type": "ChangeTokenizerBehavior",
50
+ "breakage_params": {
51
+ "old_kwarg": "truncation",
52
+ "old_value": "True",
53
+ "new_kwarg": "truncate",
54
+ "new_value": "True"
55
+ },
56
+ "error_signature": "",
57
+ "repair_diff": "",
58
+ "visible_reward": 1.8,
59
+ "held_out": {
60
+ "executed_cleanly": 1.0,
61
+ "checkpoint_valid": 1.0,
62
+ "loss_decreased": 0.7469754695541743,
63
+ "metrics_in_range": 1.0,
64
+ "no_forbidden_workarounds": 1.0,
65
+ "intent_preserved": 1.0,
66
+ "hidden_tests_passed": 1.0
67
+ },
68
+ "task_id": "albert_qa"
69
+ },
70
+ {
71
+ "primitive_type": "ChangeTokenizerBehavior",
72
+ "breakage_params": {
73
+ "old_kwarg": "truncation",
74
+ "old_value": "True",
75
+ "new_kwarg": "truncate",
76
+ "new_value": "True"
77
+ },
78
+ "error_signature": "",
79
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -14,5 +14,5 @@\n \n def tokenize_and_align(example):\n- enc = tokenizer(example[\"tokens\"], is_split_into_words=True, truncate=True, max_length=64)\n+ enc = tokenizer(example[\"tokens\"], is_split_into_words=True, truncation=True, max_length=64)\n word_ids = enc.word_ids()\n labels = []\n",
80
+ "visible_reward": 1.8,
81
+ "held_out": {
82
+ "executed_cleanly": 1.0,
83
+ "checkpoint_valid": 1.0,
84
+ "loss_decreased": 0.8811022610483041,
85
+ "metrics_in_range": 1.0,
86
+ "no_forbidden_workarounds": 1.0,
87
+ "intent_preserved": 1.0,
88
+ "hidden_tests_passed": 1.0
89
+ },
90
+ "task_id": "bert_ner"
91
+ },
92
+ {
93
+ "primitive_type": "RestructureDatasetSchema",
94
+ "breakage_params": {
95
+ "old_column": "label",
96
+ "new_column": "labels"
97
+ },
98
+ "error_signature": "",
99
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -26,5 +26,5 @@\n padding=\"max_length\",\n )\n- inputs[\"labels\"] = targets[\"input_ids\"]\n+ inputs[\"label\"] = targets[\"input_ids\"]\n return inputs\n \n",
100
+ "visible_reward": 1.8,
101
+ "held_out": {
102
+ "executed_cleanly": 1.0,
103
+ "checkpoint_valid": 1.0,
104
+ "loss_decreased": 0.649018766337638,
105
+ "metrics_in_range": 1.0,
106
+ "no_forbidden_workarounds": 1.0,
107
+ "intent_preserved": 1.0,
108
+ "hidden_tests_passed": 1.0
109
+ },
110
+ "task_id": "t5_summarization"
111
+ },
112
+ {
113
+ "primitive_type": "RestructureDatasetSchema",
114
+ "breakage_params": {
115
+ "old_column": "text",
116
+ "new_column": "input_text"
117
+ },
118
+ "error_signature": "",
119
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -26,5 +26,5 @@\n answer = examples[\"answers\"][i]\n start_char = answer[\"answer_start\"][0]\n- end_char = start_char + len(answer[\"input_text\"][0])\n+ end_char = start_char + len(answer[\"text\"][0])\n \n token_start = next(\n",
120
+ "visible_reward": 1.8,
121
+ "held_out": {
122
+ "executed_cleanly": 1.0,
123
+ "checkpoint_valid": 1.0,
124
+ "loss_decreased": 0.8895669291338583,
125
+ "metrics_in_range": 1.0,
126
+ "no_forbidden_workarounds": 1.0,
127
+ "intent_preserved": 1.0,
128
+ "hidden_tests_passed": 1.0
129
+ },
130
+ "task_id": "albert_qa"
131
+ },
132
+ {
133
+ "primitive_type": "ChangeTokenizerBehavior",
134
+ "breakage_params": {
135
+ "old_kwarg": "truncation",
136
+ "old_value": "True",
137
+ "new_kwarg": "truncate",
138
+ "new_value": "True"
139
+ },
140
+ "error_signature": "",
141
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -14,5 +14,5 @@\n \n def tokenize_and_align(example):\n- enc = tokenizer(example[\"tokens\"], is_split_into_words=True, truncate=True, max_length=64)\n+ enc = tokenizer(example[\"tokens\"], is_split_into_words=True, truncation=True, max_length=64)\n word_ids = enc.word_ids()\n labels = []\n",
142
+ "visible_reward": 1.8,
143
+ "held_out": {
144
+ "executed_cleanly": 1.0,
145
+ "checkpoint_valid": 1.0,
146
+ "loss_decreased": 0.8010139080581803,
147
+ "metrics_in_range": 1.0,
148
+ "no_forbidden_workarounds": 1.0,
149
+ "intent_preserved": 1.0,
150
+ "hidden_tests_passed": 1.0
151
+ },
152
+ "task_id": "bert_ner"
153
+ },
154
+ {
155
+ "primitive_type": "ChangeArgumentSignature",
156
+ "breakage_params": {
157
+ "function_name": "TrainingArguments",
158
+ "removed_arg": "num_train_epochs",
159
+ "added_arg": "max_steps",
160
+ "added_value": "1000"
161
+ },
162
+ "error_signature": "",
163
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -24,4 +24,5 @@\n training_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n+ num_train_epochs=1,\n per_device_train_batch_size=4,\n logging_steps=5,\n",
164
+ "visible_reward": 1.8,
165
+ "held_out": {
166
+ "executed_cleanly": 1.0,
167
+ "checkpoint_valid": 1.0,
168
+ "loss_decreased": 0.8672674881981486,
169
+ "metrics_in_range": 1.0,
170
+ "no_forbidden_workarounds": 1.0,
171
+ "intent_preserved": 1.0,
172
+ "hidden_tests_passed": 1.0
173
+ },
174
+ "task_id": "gpt2_textgen"
175
+ },
176
+ {
177
+ "primitive_type": "RestructureDatasetSchema",
178
+ "breakage_params": {
179
+ "old_column": "text",
180
+ "new_column": "input_text"
181
+ },
182
+ "error_signature": "",
183
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -26,5 +26,5 @@\n answer = examples[\"answers\"][i]\n start_char = answer[\"answer_start\"][0]\n- end_char = start_char + len(answer[\"input_text\"][0])\n+ end_char = start_char + len(answer[\"text\"][0])\n \n token_start = next(\n",
184
+ "visible_reward": 1.8,
185
+ "held_out": {
186
+ "executed_cleanly": 1.0,
187
+ "checkpoint_valid": 1.0,
188
+ "loss_decreased": 0.5887677670351681,
189
+ "metrics_in_range": 1.0,
190
+ "no_forbidden_workarounds": 1.0,
191
+ "intent_preserved": 1.0,
192
+ "hidden_tests_passed": 1.0
193
+ },
194
+ "task_id": "albert_qa"
195
+ },
196
+ {
197
+ "primitive_type": "RemoveDeprecatedMethod",
198
+ "breakage_params": {
199
+ "class_name": "Trainer",
200
+ "method_name": "save_model",
201
+ "replacement": "save_to_hub"
202
+ },
203
+ "error_signature": "",
204
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -41,4 +41,4 @@\n trainer = Trainer(model=model, args=training_args, train_dataset=dataset)\n trainer.train()\n-trainer.save_model_DEPRECATED(\"/tmp/forge_output/checkpoint\")\n+trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n",
205
+ "visible_reward": 1.8,
206
+ "held_out": {
207
+ "executed_cleanly": 1.0,
208
+ "checkpoint_valid": 1.0,
209
+ "loss_decreased": 0.8791026290604065,
210
+ "metrics_in_range": 1.0,
211
+ "no_forbidden_workarounds": 1.0,
212
+ "intent_preserved": 1.0,
213
+ "hidden_tests_passed": 1.0
214
+ },
215
+ "task_id": "roberta_sentiment"
216
+ },
217
+ {
218
+ "primitive_type": "RenameApiCall",
219
+ "breakage_params": {
220
+ "old_name": "trainer.train",
221
+ "new_name": "trainer.start_training"
222
+ },
223
+ "error_signature": "",
224
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -40,5 +40,5 @@\n \n trainer = Trainer(model=model, args=training_args, train_dataset=dataset)\n-trainer.start_training()\n+trainer.train()\n trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n",
225
+ "visible_reward": 1.8,
226
+ "held_out": {
227
+ "executed_cleanly": 1.0,
228
+ "checkpoint_valid": 1.0,
229
+ "loss_decreased": 0.7878403072444018,
230
+ "metrics_in_range": 1.0,
231
+ "no_forbidden_workarounds": 1.0,
232
+ "intent_preserved": 1.0,
233
+ "hidden_tests_passed": 1.0
234
+ },
235
+ "task_id": "electra_classification"
236
+ },
237
+ {
238
+ "primitive_type": "RestructureDatasetSchema",
239
+ "breakage_params": {
240
+ "old_column": "text",
241
+ "new_column": "input_text"
242
+ },
243
+ "error_signature": "",
244
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -26,5 +26,5 @@\n answer = examples[\"answers\"][i]\n start_char = answer[\"answer_start\"][0]\n- end_char = start_char + len(answer[\"input_text\"][0])\n+ end_char = start_char + len(answer[\"text\"][0])\n \n token_start = next(\n",
245
+ "visible_reward": 1.8,
246
+ "held_out": {
247
+ "executed_cleanly": 1.0,
248
+ "checkpoint_valid": 1.0,
249
+ "loss_decreased": 0.8678511447007867,
250
+ "metrics_in_range": 1.0,
251
+ "no_forbidden_workarounds": 1.0,
252
+ "intent_preserved": 1.0,
253
+ "hidden_tests_passed": 1.0
254
+ },
255
+ "task_id": "albert_qa"
256
+ },
257
+ {
258
+ "primitive_type": "RestructureDatasetSchema",
259
+ "breakage_params": {
260
+ "old_column": "text",
261
+ "new_column": "input_text"
262
+ },
263
+ "error_signature": "",
264
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -14,5 +14,5 @@\n def tokenize(examples):\n return tokenizer(\n- examples[\"input_text\"],\n+ examples[\"text\"],\n padding=\"max_length\",\n truncation=True,\n",
265
+ "visible_reward": 1.8,
266
+ "held_out": {
267
+ "executed_cleanly": 1.0,
268
+ "checkpoint_valid": 1.0,
269
+ "loss_decreased": 0.6278346817583994,
270
+ "metrics_in_range": 1.0,
271
+ "no_forbidden_workarounds": 1.0,
272
+ "intent_preserved": 1.0,
273
+ "hidden_tests_passed": 1.0
274
+ },
275
+ "task_id": "roberta_sentiment"
276
+ },
277
+ {
278
+ "primitive_type": "RestructureDatasetSchema",
279
+ "breakage_params": {
280
+ "old_column": "text",
281
+ "new_column": "input_text"
282
+ },
283
+ "error_signature": "",
284
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -14,5 +14,5 @@\n def tokenize(examples):\n return tokenizer(\n- examples[\"input_text\"],\n+ examples[\"text\"],\n padding=\"max_length\",\n truncation=True,\n",
285
+ "visible_reward": 1.8,
286
+ "held_out": {
287
+ "executed_cleanly": 1.0,
288
+ "checkpoint_valid": 1.0,
289
+ "loss_decreased": 0.6966312162081871,
290
+ "metrics_in_range": 1.0,
291
+ "no_forbidden_workarounds": 1.0,
292
+ "intent_preserved": 1.0,
293
+ "hidden_tests_passed": 1.0
294
+ },
295
+ "task_id": "electra_classification"
296
+ },
297
+ {
298
+ "primitive_type": "ChangeArgumentSignature",
299
+ "breakage_params": {
300
+ "function_name": "TrainingArguments",
301
+ "removed_arg": "num_train_epochs",
302
+ "added_arg": "max_steps",
303
+ "added_value": "1000"
304
+ },
305
+ "error_signature": "",
306
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -35,4 +35,5 @@\n training_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n+ num_train_epochs=1,\n per_device_train_batch_size=16,\n logging_steps=5,\n",
307
+ "visible_reward": 1.8,
308
+ "held_out": {
309
+ "executed_cleanly": 1.0,
310
+ "checkpoint_valid": 1.0,
311
+ "loss_decreased": 0.666498939726126,
312
+ "metrics_in_range": 1.0,
313
+ "no_forbidden_workarounds": 1.0,
314
+ "intent_preserved": 1.0,
315
+ "hidden_tests_passed": 1.0
316
+ },
317
+ "task_id": "distilbert_sst2"
318
+ },
319
+ {
320
+ "primitive_type": "RenameApiCall",
321
+ "breakage_params": {
322
+ "old_name": "trainer.train",
323
+ "new_name": "trainer.start_training"
324
+ },
325
+ "error_signature": "",
326
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -63,5 +63,5 @@\n data_collator=DefaultDataCollator(),\n )\n-trainer.start_training()\n+trainer.train()\n trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n",
327
+ "visible_reward": 1.8,
328
+ "held_out": {
329
+ "executed_cleanly": 1.0,
330
+ "checkpoint_valid": 1.0,
331
+ "loss_decreased": 0.7251096581974675,
332
+ "metrics_in_range": 1.0,
333
+ "no_forbidden_workarounds": 1.0,
334
+ "intent_preserved": 1.0,
335
+ "hidden_tests_passed": 1.0
336
+ },
337
+ "task_id": "albert_qa"
338
+ },
339
+ {
340
+ "primitive_type": "ModifyConfigField",
341
+ "breakage_params": {
342
+ "config_class": "TrainingArguments",
343
+ "field_name": "per_device_train_batch_size",
344
+ "new_value": "1"
345
+ },
346
+ "error_signature": "",
347
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -37,5 +37,5 @@\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n- per_device_train_batch_size=1,\n+ per_device_train_batch_size=8,\n logging_steps=5,\n save_strategy=\"epoch\",\n",
348
+ "visible_reward": 1.8,
349
+ "held_out": {
350
+ "executed_cleanly": 1.0,
351
+ "checkpoint_valid": 1.0,
352
+ "loss_decreased": 0.817717003567182,
353
+ "metrics_in_range": 1.0,
354
+ "no_forbidden_workarounds": 1.0,
355
+ "intent_preserved": 1.0,
356
+ "hidden_tests_passed": 1.0
357
+ },
358
+ "task_id": "bert_ner"
359
+ },
360
+ {
361
+ "primitive_type": "ChangeTokenizerBehavior",
362
+ "breakage_params": {
363
+ "old_kwarg": "truncation",
364
+ "old_value": "True",
365
+ "new_kwarg": "truncate",
366
+ "new_value": "True"
367
+ },
368
+ "error_signature": "",
369
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -16,5 +16,5 @@\n examples[\"text\"],\n padding=\"max_length\",\n- truncate=True,\n+ truncation=True,\n max_length=64,\n )\n",
370
+ "visible_reward": 1.8,
371
+ "held_out": {
372
+ "executed_cleanly": 1.0,
373
+ "checkpoint_valid": 1.0,
374
+ "loss_decreased": 0.7677847401400664,
375
+ "metrics_in_range": 1.0,
376
+ "no_forbidden_workarounds": 1.0,
377
+ "intent_preserved": 1.0,
378
+ "hidden_tests_passed": 1.0
379
+ },
380
+ "task_id": "roberta_sentiment"
381
+ },
382
+ {
383
+ "primitive_type": "RestructureDatasetSchema",
384
+ "breakage_params": {
385
+ "old_column": "label",
386
+ "new_column": "labels"
387
+ },
388
+ "error_signature": "",
389
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -16,5 +16,5 @@\n images = [img.convert(\"RGB\") for img in batch[\"img\"]]\n inputs = processor(images=images, return_tensors=\"pt\")\n- inputs[\"labels\"] = torch.tensor(batch[\"labels\"])\n+ inputs[\"labels\"] = torch.tensor(batch[\"label\"])\n return inputs\n \n",
390
+ "visible_reward": 1.8,
391
+ "held_out": {
392
+ "executed_cleanly": 1.0,
393
+ "checkpoint_valid": 1.0,
394
+ "loss_decreased": 0.701744242073817,
395
+ "metrics_in_range": 1.0,
396
+ "no_forbidden_workarounds": 1.0,
397
+ "intent_preserved": 1.0,
398
+ "hidden_tests_passed": 1.0
399
+ },
400
+ "task_id": "vit_cifar10"
401
+ },
402
+ {
403
+ "primitive_type": "ChangeArgumentSignature",
404
+ "breakage_params": {
405
+ "function_name": "TrainingArguments",
406
+ "removed_arg": "num_train_epochs",
407
+ "added_arg": "max_steps",
408
+ "added_value": "1000"
409
+ },
410
+ "error_signature": "",
411
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -49,4 +49,5 @@\n training_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n+ num_train_epochs=1,\n per_device_train_batch_size=4,\n logging_steps=5,\n",
412
+ "visible_reward": 1.8,
413
+ "held_out": {
414
+ "executed_cleanly": 1.0,
415
+ "checkpoint_valid": 1.0,
416
+ "loss_decreased": 0.784986144101346,
417
+ "metrics_in_range": 1.0,
418
+ "no_forbidden_workarounds": 1.0,
419
+ "intent_preserved": 1.0,
420
+ "hidden_tests_passed": 1.0
421
+ },
422
+ "task_id": "albert_qa"
423
+ },
424
+ {
425
+ "primitive_type": "RemoveDeprecatedMethod",
426
+ "breakage_params": {
427
+ "class_name": "Trainer",
428
+ "method_name": "save_model",
429
+ "replacement": "save_to_hub"
430
+ },
431
+ "error_signature": "",
432
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -41,4 +41,4 @@\n trainer = Trainer(model=model, args=training_args, train_dataset=dataset)\n trainer.train()\n-trainer.save_model_DEPRECATED(\"/tmp/forge_output/checkpoint\")\n+trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n",
433
+ "visible_reward": 1.8,
434
+ "held_out": {
435
+ "executed_cleanly": 1.0,
436
+ "checkpoint_valid": 1.0,
437
+ "loss_decreased": 0.6652959989556817,
438
+ "metrics_in_range": 1.0,
439
+ "no_forbidden_workarounds": 1.0,
440
+ "intent_preserved": 1.0,
441
+ "hidden_tests_passed": 1.0
442
+ },
443
+ "task_id": "electra_classification"
444
+ },
445
+ {
446
+ "primitive_type": "ChangeArgumentSignature",
447
+ "breakage_params": {
448
+ "function_name": "TrainingArguments",
449
+ "removed_arg": "num_train_epochs",
450
+ "added_arg": "max_steps",
451
+ "added_value": "1000"
452
+ },
453
+ "error_signature": "",
454
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -31,4 +31,5 @@\n training_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n+ num_train_epochs=1,\n per_device_train_batch_size=8,\n logging_steps=5,\n",
455
+ "visible_reward": 1.8,
456
+ "held_out": {
457
+ "executed_cleanly": 1.0,
458
+ "checkpoint_valid": 1.0,
459
+ "loss_decreased": 0.8362977381032284,
460
+ "metrics_in_range": 1.0,
461
+ "no_forbidden_workarounds": 1.0,
462
+ "intent_preserved": 1.0,
463
+ "hidden_tests_passed": 1.0
464
+ },
465
+ "task_id": "electra_classification"
466
+ },
467
+ {
468
+ "primitive_type": "ChangeTokenizerBehavior",
469
+ "breakage_params": {
470
+ "old_kwarg": "truncation",
471
+ "old_value": "True",
472
+ "new_kwarg": "truncate",
473
+ "new_value": "True"
474
+ },
475
+ "error_signature": "",
476
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -16,5 +16,5 @@\n examples[\"text\"],\n padding=\"max_length\",\n- truncate=True,\n+ truncation=True,\n max_length=64,\n )\n",
477
+ "visible_reward": 1.8,
478
+ "held_out": {
479
+ "executed_cleanly": 1.0,
480
+ "checkpoint_valid": 1.0,
481
+ "loss_decreased": 0.8434749013439302,
482
+ "metrics_in_range": 1.0,
483
+ "no_forbidden_workarounds": 1.0,
484
+ "intent_preserved": 1.0,
485
+ "hidden_tests_passed": 1.0
486
+ },
487
+ "task_id": "electra_classification"
488
+ },
489
+ {
490
+ "primitive_type": "RestructureDatasetSchema",
491
+ "breakage_params": {
492
+ "old_column": "text",
493
+ "new_column": "input_text"
494
+ },
495
+ "error_signature": "",
496
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -26,5 +26,5 @@\n answer = examples[\"answers\"][i]\n start_char = answer[\"answer_start\"][0]\n- end_char = start_char + len(answer[\"input_text\"][0])\n+ end_char = start_char + len(answer[\"text\"][0])\n \n token_start = next(\n",
497
+ "visible_reward": 1.8,
498
+ "held_out": {
499
+ "executed_cleanly": 1.0,
500
+ "checkpoint_valid": 1.0,
501
+ "loss_decreased": 0.775726750559039,
502
+ "metrics_in_range": 1.0,
503
+ "no_forbidden_workarounds": 1.0,
504
+ "intent_preserved": 1.0,
505
+ "hidden_tests_passed": 1.0
506
+ },
507
+ "task_id": "albert_qa"
508
+ },
509
+ {
510
+ "primitive_type": "ChangeArgumentSignature",
511
+ "breakage_params": {
512
+ "function_name": "TrainingArguments",
513
+ "removed_arg": "num_train_epochs",
514
+ "added_arg": "max_steps",
515
+ "added_value": "1000"
516
+ },
517
+ "error_signature": "",
518
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -35,4 +35,5 @@\n training_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n+ num_train_epochs=1,\n per_device_train_batch_size=16,\n logging_steps=5,\n",
519
+ "visible_reward": 1.8,
520
+ "held_out": {
521
+ "executed_cleanly": 1.0,
522
+ "checkpoint_valid": 1.0,
523
+ "loss_decreased": 0.9085137085137085,
524
+ "metrics_in_range": 1.0,
525
+ "no_forbidden_workarounds": 1.0,
526
+ "intent_preserved": 1.0,
527
+ "hidden_tests_passed": 1.0
528
+ },
529
+ "task_id": "distilbert_sst2"
530
+ },
531
+ {
532
+ "primitive_type": "RenameApiCall",
533
+ "breakage_params": {
534
+ "old_name": "trainer.train",
535
+ "new_name": "trainer.start_training"
536
+ },
537
+ "error_signature": "",
538
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -51,5 +51,5 @@\n )\n \n-trainer.start_training()\n+trainer.train()\n trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n",
539
+ "visible_reward": 1.8,
540
+ "held_out": {
541
+ "executed_cleanly": 1.0,
542
+ "checkpoint_valid": 1.0,
543
+ "loss_decreased": 0.7424872199130476,
544
+ "metrics_in_range": 1.0,
545
+ "no_forbidden_workarounds": 1.0,
546
+ "intent_preserved": 1.0,
547
+ "hidden_tests_passed": 1.0
548
+ },
549
+ "task_id": "bert_ner"
550
+ },
551
+ {
552
+ "primitive_type": "ChangeArgumentSignature",
553
+ "breakage_params": {
554
+ "function_name": "TrainingArguments",
555
+ "removed_arg": "num_train_epochs",
556
+ "added_arg": "max_steps",
557
+ "added_value": "1000"
558
+ },
559
+ "error_signature": "",
560
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -35,4 +35,5 @@\n training_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n+ num_train_epochs=1,\n per_device_train_batch_size=16,\n logging_steps=5,\n",
561
+ "visible_reward": 1.8,
562
+ "held_out": {
563
+ "executed_cleanly": 1.0,
564
+ "checkpoint_valid": 1.0,
565
+ "loss_decreased": 0.8076153403327943,
566
+ "metrics_in_range": 1.0,
567
+ "no_forbidden_workarounds": 1.0,
568
+ "intent_preserved": 1.0,
569
+ "hidden_tests_passed": 1.0
570
+ },
571
+ "task_id": "distilbert_sst2"
572
+ },
573
+ {
574
+ "primitive_type": "RestructureDatasetSchema",
575
+ "breakage_params": {
576
+ "old_column": "text",
577
+ "new_column": "input_text"
578
+ },
579
+ "error_signature": "",
580
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -26,5 +26,5 @@\n answer = examples[\"answers\"][i]\n start_char = answer[\"answer_start\"][0]\n- end_char = start_char + len(answer[\"input_text\"][0])\n+ end_char = start_char + len(answer[\"text\"][0])\n \n token_start = next(\n",
581
+ "visible_reward": 1.8,
582
+ "held_out": {
583
+ "executed_cleanly": 1.0,
584
+ "checkpoint_valid": 1.0,
585
+ "loss_decreased": 0.8882627677936846,
586
+ "metrics_in_range": 1.0,
587
+ "no_forbidden_workarounds": 1.0,
588
+ "intent_preserved": 1.0,
589
+ "hidden_tests_passed": 1.0
590
+ },
591
+ "task_id": "albert_qa"
592
+ },
593
+ {
594
+ "primitive_type": "RemoveDeprecatedMethod",
595
+ "breakage_params": {
596
+ "class_name": "Trainer",
597
+ "method_name": "save_model",
598
+ "replacement": "save_to_hub"
599
+ },
600
+ "error_signature": "",
601
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -40,4 +40,4 @@\n \n trainer.train()\n-trainer.save_model_DEPRECATED(\"/tmp/forge_output/checkpoint\")\n+trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n",
602
+ "visible_reward": 1.8,
603
+ "held_out": {
604
+ "executed_cleanly": 1.0,
605
+ "checkpoint_valid": 1.0,
606
+ "loss_decreased": 0.5938341205749403,
607
+ "metrics_in_range": 1.0,
608
+ "no_forbidden_workarounds": 1.0,
609
+ "intent_preserved": 1.0,
610
+ "hidden_tests_passed": 1.0
611
+ },
612
+ "task_id": "gpt2_textgen"
613
+ },
614
+ {
615
+ "primitive_type": "RestructureDatasetSchema",
616
+ "breakage_params": {
617
+ "old_column": "text",
618
+ "new_column": "input_text"
619
+ },
620
+ "error_signature": "",
621
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -15,5 +15,5 @@\n \n def tokenize(examples):\n- return tokenizer(examples[\"input_text\"], truncation=True, max_length=64)\n+ return tokenizer(examples[\"text\"], truncation=True, max_length=64)\n \n \n",
622
+ "visible_reward": 1.8,
623
+ "held_out": {
624
+ "executed_cleanly": 1.0,
625
+ "checkpoint_valid": 1.0,
626
+ "loss_decreased": 0.6555927441014835,
627
+ "metrics_in_range": 1.0,
628
+ "no_forbidden_workarounds": 1.0,
629
+ "intent_preserved": 1.0,
630
+ "hidden_tests_passed": 1.0
631
+ },
632
+ "task_id": "gpt2_textgen"
633
+ },
634
+ {
635
+ "primitive_type": "RenameApiCall",
636
+ "breakage_params": {
637
+ "old_name": "trainer.train",
638
+ "new_name": "trainer.start_training"
639
+ },
640
+ "error_signature": "",
641
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -63,5 +63,5 @@\n data_collator=DefaultDataCollator(),\n )\n-trainer.start_training()\n+trainer.train()\n trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n",
642
+ "visible_reward": 1.8,
643
+ "held_out": {
644
+ "executed_cleanly": 1.0,
645
+ "checkpoint_valid": 1.0,
646
+ "loss_decreased": 0.755194754910818,
647
+ "metrics_in_range": 1.0,
648
+ "no_forbidden_workarounds": 1.0,
649
+ "intent_preserved": 1.0,
650
+ "hidden_tests_passed": 1.0
651
+ },
652
+ "task_id": "albert_qa"
653
+ },
654
+ {
655
+ "primitive_type": "RenameApiCall",
656
+ "breakage_params": {
657
+ "old_name": "trainer.train",
658
+ "new_name": "trainer.start_training"
659
+ },
660
+ "error_signature": "",
661
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -49,5 +49,5 @@\n )\n \n-trainer.start_training()\n+trainer.train()\n trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n",
662
+ "visible_reward": 1.8,
663
+ "held_out": {
664
+ "executed_cleanly": 1.0,
665
+ "checkpoint_valid": 1.0,
666
+ "loss_decreased": 0.8654821132433073,
667
+ "metrics_in_range": 1.0,
668
+ "no_forbidden_workarounds": 1.0,
669
+ "intent_preserved": 1.0,
670
+ "hidden_tests_passed": 1.0
671
+ },
672
+ "task_id": "distilbert_sst2"
673
+ },
674
+ {
675
+ "primitive_type": "RestructureDatasetSchema",
676
+ "breakage_params": {
677
+ "old_column": "label",
678
+ "new_column": "labels"
679
+ },
680
+ "error_signature": "",
681
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -16,5 +16,5 @@\n images = [img.convert(\"RGB\") for img in batch[\"img\"]]\n inputs = processor(images=images, return_tensors=\"pt\")\n- inputs[\"labels\"] = torch.tensor(batch[\"labels\"])\n+ inputs[\"labels\"] = torch.tensor(batch[\"label\"])\n return inputs\n \n",
682
+ "visible_reward": 1.8,
683
+ "held_out": {
684
+ "executed_cleanly": 1.0,
685
+ "checkpoint_valid": 1.0,
686
+ "loss_decreased": 0.8319525054273182,
687
+ "metrics_in_range": 1.0,
688
+ "no_forbidden_workarounds": 1.0,
689
+ "intent_preserved": 1.0,
690
+ "hidden_tests_passed": 1.0
691
+ },
692
+ "task_id": "vit_cifar10"
693
+ },
694
+ {
695
+ "primitive_type": "RestructureDatasetSchema",
696
+ "breakage_params": {
697
+ "old_column": "text",
698
+ "new_column": "input_text"
699
+ },
700
+ "error_signature": "",
701
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -26,5 +26,5 @@\n answer = examples[\"answers\"][i]\n start_char = answer[\"answer_start\"][0]\n- end_char = start_char + len(answer[\"input_text\"][0])\n+ end_char = start_char + len(answer[\"text\"][0])\n \n token_start = next(\n",
702
+ "visible_reward": 1.8,
703
+ "held_out": {
704
+ "executed_cleanly": 1.0,
705
+ "checkpoint_valid": 1.0,
706
+ "loss_decreased": 0.8109320292832547,
707
+ "metrics_in_range": 1.0,
708
+ "no_forbidden_workarounds": 1.0,
709
+ "intent_preserved": 1.0,
710
+ "hidden_tests_passed": 1.0
711
+ },
712
+ "task_id": "albert_qa"
713
+ },
714
+ {
715
+ "primitive_type": "ModifyConfigField",
716
+ "breakage_params": {
717
+ "config_class": "TrainingArguments",
718
+ "field_name": "per_device_train_batch_size",
719
+ "new_value": "1"
720
+ },
721
+ "error_signature": "",
722
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -36,5 +36,5 @@\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n- per_device_train_batch_size=1,\n+ per_device_train_batch_size=16,\n logging_steps=5,\n save_strategy=\"epoch\",\n",
723
+ "visible_reward": 1.8,
724
+ "held_out": {
725
+ "executed_cleanly": 1.0,
726
+ "checkpoint_valid": 1.0,
727
+ "loss_decreased": 0.8409642541924095,
728
+ "metrics_in_range": 1.0,
729
+ "no_forbidden_workarounds": 1.0,
730
+ "intent_preserved": 1.0,
731
+ "hidden_tests_passed": 1.0
732
+ },
733
+ "task_id": "distilbert_sst2"
734
+ },
735
+ {
736
+ "primitive_type": "ChangeArgumentSignature",
737
+ "breakage_params": {
738
+ "function_name": "TrainingArguments",
739
+ "removed_arg": "num_train_epochs",
740
+ "added_arg": "max_steps",
741
+ "added_value": "1000"
742
+ },
743
+ "error_signature": "",
744
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -31,4 +31,5 @@\n training_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n+ num_train_epochs=1,\n per_device_train_batch_size=8,\n logging_steps=5,\n",
745
+ "visible_reward": 1.8,
746
+ "held_out": {
747
+ "executed_cleanly": 1.0,
748
+ "checkpoint_valid": 1.0,
749
+ "loss_decreased": 0.8891815856777494,
750
+ "metrics_in_range": 1.0,
751
+ "no_forbidden_workarounds": 1.0,
752
+ "intent_preserved": 1.0,
753
+ "hidden_tests_passed": 1.0
754
+ },
755
+ "task_id": "electra_classification"
756
+ },
757
+ {
758
+ "primitive_type": "ModifyConfigField",
759
+ "breakage_params": {
760
+ "config_class": "TrainingArguments",
761
+ "field_name": "per_device_train_batch_size",
762
+ "new_value": "1"
763
+ },
764
+ "error_signature": "",
765
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -29,5 +29,5 @@\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n- per_device_train_batch_size=1,\n+ per_device_train_batch_size=4,\n logging_steps=5,\n save_strategy=\"epoch\",\n",
766
+ "visible_reward": 1.8,
767
+ "held_out": {
768
+ "executed_cleanly": 1.0,
769
+ "checkpoint_valid": 1.0,
770
+ "loss_decreased": 0.7900720214449505,
771
+ "metrics_in_range": 1.0,
772
+ "no_forbidden_workarounds": 1.0,
773
+ "intent_preserved": 1.0,
774
+ "hidden_tests_passed": 1.0
775
+ },
776
+ "task_id": "vit_cifar10"
777
+ },
778
+ {
779
+ "primitive_type": "RemoveDeprecatedMethod",
780
+ "breakage_params": {
781
+ "class_name": "Trainer",
782
+ "method_name": "save_model",
783
+ "replacement": "save_to_hub"
784
+ },
785
+ "error_signature": "",
786
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -38,4 +38,4 @@\n trainer = Trainer(model=model, args=training_args, train_dataset=dataset)\n trainer.train()\n-trainer.save_model_DEPRECATED(\"/tmp/forge_output/checkpoint\")\n+trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n",
787
+ "visible_reward": 1.8,
788
+ "held_out": {
789
+ "executed_cleanly": 1.0,
790
+ "checkpoint_valid": 1.0,
791
+ "loss_decreased": 0.7984906001446131,
792
+ "metrics_in_range": 1.0,
793
+ "no_forbidden_workarounds": 1.0,
794
+ "intent_preserved": 1.0,
795
+ "hidden_tests_passed": 1.0
796
+ },
797
+ "task_id": "vit_cifar10"
798
+ },
799
+ {
800
+ "primitive_type": "RestructureDatasetSchema",
801
+ "breakage_params": {
802
+ "old_column": "text",
803
+ "new_column": "input_text"
804
+ },
805
+ "error_signature": "",
806
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -26,5 +26,5 @@\n answer = examples[\"answers\"][i]\n start_char = answer[\"answer_start\"][0]\n- end_char = start_char + len(answer[\"input_text\"][0])\n+ end_char = start_char + len(answer[\"text\"][0])\n \n token_start = next(\n",
807
+ "visible_reward": 1.8,
808
+ "held_out": {
809
+ "executed_cleanly": 1.0,
810
+ "checkpoint_valid": 1.0,
811
+ "loss_decreased": 0.7808289396602227,
812
+ "metrics_in_range": 1.0,
813
+ "no_forbidden_workarounds": 1.0,
814
+ "intent_preserved": 1.0,
815
+ "hidden_tests_passed": 1.0
816
+ },
817
+ "task_id": "albert_qa"
818
+ },
819
+ {
820
+ "primitive_type": "RestructureDatasetSchema",
821
+ "breakage_params": {
822
+ "old_column": "tokens",
823
+ "new_column": "words"
824
+ },
825
+ "error_signature": "",
826
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -14,5 +14,5 @@\n \n def tokenize_and_align(example):\n- enc = tokenizer(example[\"words\"], is_split_into_words=True, truncation=True, max_length=64)\n+ enc = tokenizer(example[\"tokens\"], is_split_into_words=True, truncation=True, max_length=64)\n word_ids = enc.word_ids()\n labels = []\n",
827
+ "visible_reward": 1.8,
828
+ "held_out": {
829
+ "executed_cleanly": 1.0,
830
+ "checkpoint_valid": 1.0,
831
+ "loss_decreased": 0.8699562543975037,
832
+ "metrics_in_range": 1.0,
833
+ "no_forbidden_workarounds": 1.0,
834
+ "intent_preserved": 1.0,
835
+ "hidden_tests_passed": 1.0
836
+ },
837
+ "task_id": "bert_ner"
838
+ },
839
+ {
840
+ "primitive_type": "RenameApiCall",
841
+ "breakage_params": {
842
+ "old_name": "trainer.train",
843
+ "new_name": "trainer.start_training"
844
+ },
845
+ "error_signature": "",
846
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -63,5 +63,5 @@\n data_collator=DefaultDataCollator(),\n )\n-trainer.start_training()\n+trainer.train()\n trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n",
847
+ "visible_reward": 1.8,
848
+ "held_out": {
849
+ "executed_cleanly": 1.0,
850
+ "checkpoint_valid": 1.0,
851
+ "loss_decreased": 0.911495927422025,
852
+ "metrics_in_range": 1.0,
853
+ "no_forbidden_workarounds": 1.0,
854
+ "intent_preserved": 1.0,
855
+ "hidden_tests_passed": 1.0
856
+ },
857
+ "task_id": "albert_qa"
858
+ },
859
+ {
860
+ "primitive_type": "RemoveDeprecatedMethod",
861
+ "breakage_params": {
862
+ "class_name": "Trainer",
863
+ "method_name": "save_model",
864
+ "replacement": "save_to_hub"
865
+ },
866
+ "error_signature": "",
867
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -64,4 +64,4 @@\n )\n trainer.train()\n-trainer.save_model_DEPRECATED(\"/tmp/forge_output/checkpoint\")\n+trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n",
868
+ "visible_reward": 1.8,
869
+ "held_out": {
870
+ "executed_cleanly": 1.0,
871
+ "checkpoint_valid": 1.0,
872
+ "loss_decreased": 0.6131321254553196,
873
+ "metrics_in_range": 1.0,
874
+ "no_forbidden_workarounds": 1.0,
875
+ "intent_preserved": 1.0,
876
+ "hidden_tests_passed": 1.0
877
+ },
878
+ "task_id": "albert_qa"
879
+ },
880
+ {
881
+ "primitive_type": "RestructureDatasetSchema",
882
+ "breakage_params": {
883
+ "old_column": "label",
884
+ "new_column": "labels"
885
+ },
886
+ "error_signature": "",
887
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -22,5 +22,5 @@\n \n dataset = dataset.map(tokenize, batched=True)\n-dataset = dataset.rename_column(\"labels\", \"labels\")\n+dataset = dataset.rename_column(\"label\", \"labels\")\n dataset.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])\n \n",
888
+ "visible_reward": 1.8,
889
+ "held_out": {
890
+ "executed_cleanly": 1.0,
891
+ "checkpoint_valid": 1.0,
892
+ "loss_decreased": 0.6040748525323751,
893
+ "metrics_in_range": 1.0,
894
+ "no_forbidden_workarounds": 1.0,
895
+ "intent_preserved": 1.0,
896
+ "hidden_tests_passed": 1.0
897
+ },
898
+ "task_id": "electra_classification"
899
+ }
900
+ ],
901
+ "size": 43,
902
+ "by_primitive": {
903
+ "ChangeTokenizerBehavior": 7,
904
+ "RestructureDatasetSchema": 15,
905
+ "ChangeArgumentSignature": 7,
906
+ "RemoveDeprecatedMethod": 5,
907
+ "RenameApiCall": 6,
908
+ "ModifyConfigField": 3
909
+ }
910
+ }
debug_trace.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Quick manual check that the drift-generator prompt embeds the task script.

Renders the drift prompt for the ``simple_regression`` task, pulls the
fenced python block back out of the prompt, and feeds that block to the
baseline drift generator so the resulting breakage spec can be eyeballed.
"""
from forgeenv.roles.drift_generator import BaselineDriftGenerator
from forgeenv.roles.prompts import render_drift_generator_prompt
from forgeenv.tasks.task_sampler import TaskSampler

sampler = TaskSampler()
task_script = sampler.get_by_id("simple_regression").script_content

prompt = render_drift_generator_prompt(
    task_script, "ChangeTokenizerBehavior", {"transformers": "4.40"}
)
fence = "```python"
script_block = ""
if fence in prompt:
    # Everything between the opening fence and the next closing ``` fence.
    script_block = prompt.partition(fence)[2].partition("```")[0]
print("script_block len:", len(script_block))
print("first 80 chars:", repr(script_block[:80]))

gen = BaselineDriftGenerator(seed=0)
spec = gen.propose(target_category="ChangeTokenizerBehavior", script=script_block)
print("spec:", spec)
demo-space/README.md ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: ForgeEnv Repair Agent Demo
3
+ emoji: 🔧
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 5.7.1
8
+ app_file: app.py
9
+ pinned: true
10
+ license: apache-2.0
11
+ hardware: zero-a10g
12
+ tags:
13
+ - openenv
14
+ - self-improvement
15
+ - code-repair
16
+ - schema-drift
17
+ short_description: Trained Repair Agent fixes HF scripts under drift
18
+ ---
19
+
20
+ # ForgeEnv Repair Agent — Live Demo
21
+
22
+ Paste a broken HuggingFace training script and the error trace it produced.
23
+ The trained Repair Agent (Qwen2.5-3B + LoRA) emits a unified diff that should
24
+ restore the script. Inference runs on ZeroGPU (free A10G).
25
+
26
+ - **Environment server (OpenEnv):**
27
+ <https://huggingface.co/spaces/akhiilll/forgeenv>
28
+ - **Trained model (LoRA + repair_library.json):**
29
+ <https://huggingface.co/akhiilll/forgeenv-repair-agent>
30
+ - **Project README & plots:**
31
+ <https://github.com/akhiilll/forgeenv>
demo-space/app.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Gradio demo Space for the ForgeEnv Repair Agent.
2
+
3
+ Loads the trained LoRA adapter from the Hub and exposes a 2-input form:
4
+ broken script + error trace. Output is a unified diff. Inference runs on
5
+ ZeroGPU (`@spaces.GPU`) so we don't pay for idle GPU time.
6
+
7
+ If the trained adapter isn't yet uploaded, the demo falls back to the
8
+ deterministic ``BaselineRepairAgent`` so the Space still works end-to-end.
9
+ """
10
+ from __future__ import annotations
11
+
12
+ import json
13
+ import os
14
+ import traceback
15
+ from typing import Optional
16
+
17
+ import gradio as gr
18
+
19
+ BASE_MODEL = os.environ.get("BASE_MODEL", "Qwen/Qwen2.5-3B-Instruct")
20
+ ADAPTER_REPO = os.environ.get("ADAPTER_REPO", "akhiilll/forgeenv-repair-agent")
21
+
22
+ _TITLE = "ForgeEnv Repair Agent — fix HuggingFace scripts under library drift"
23
+ _DESCRIPTION = (
24
+ "Paste a broken HuggingFace training script and the error trace it "
25
+ "produced. The Repair Agent returns a minimal unified diff. The model "
26
+ "was trained inside [ForgeEnv](https://huggingface.co/spaces/"
27
+ "akhiilll/forgeenv) using GRPO (TRL + Unsloth) with R-Zero-style "
28
+ "Challenger / Solver co-evolution."
29
+ )
30
+
31
+ _EXAMPLES = [
32
+ [
33
+ (
34
+ "from transformers import Trainer, TrainingArguments\n"
35
+ "from datasets import load_dataset\n\n"
36
+ "ds = load_dataset('glue', 'sst2')\n"
37
+ "args = TrainingArguments(output_dir='out')\n"
38
+ "trainer = Trainer(model=None, args=args, train_dataset=ds['train'])\n"
39
+ "trainer.start_training()\n"
40
+ ),
41
+ (
42
+ "AttributeError: 'Trainer' object has no attribute 'start_training'. "
43
+ "Did you mean: 'train'?"
44
+ ),
45
+ ],
46
+ [
47
+ (
48
+ "import torch.legacy as torch\n"
49
+ "x = torch.randn(2, 3)\n"
50
+ "print(x)\n"
51
+ ),
52
+ "ModuleNotFoundError: No module named 'torch.legacy'",
53
+ ],
54
+ [
55
+ (
56
+ "from transformers import AutoTokenizer\n"
57
+ "tok = AutoTokenizer.from_pretrained('bert-base-uncased')\n"
58
+ "out = tok(['hello world'], pad_to_max_length=True, truncate=True)\n"
59
+ "print(out)\n"
60
+ ),
61
+ (
62
+ "TypeError: __call__() got an unexpected keyword argument "
63
+ "'pad_to_max_length' (use `padding=True` instead)."
64
+ ),
65
+ ],
66
+ ]
67
+
68
+ _PROMPT_TEMPLATE = (
69
+ "You are an expert ML engineer who fixes broken HuggingFace training "
70
+ "scripts caused by library version drift.\n\n"
71
+ "Library versions: {versions}\n\n"
72
+ "Broken script:\n```python\n{script}\n```\n\n"
73
+ "Error trace:\n```\n{trace}\n```\n\n"
74
+ "Output ONLY a minimal unified diff (`--- a/script.py` / `+++ "
75
+ "b/script.py` headers, then hunks). No prose."
76
+ )
77
+
78
+ _model = None
79
+ _tokenizer = None
80
+ _load_error: Optional[str] = None
81
+
82
+
83
def _load_model() -> None:
    """Populate the module-level model cache on first use.

    Loads the base model in fp16, then tries to layer the trained LoRA
    adapter on top; if the adapter repo is unavailable the bare base model
    is kept. A hard failure is recorded in ``_load_error`` so later calls
    short-circuit instead of retrying a doomed download.
    """
    global _model, _tokenizer, _load_error
    # Already loaded, or a previous attempt failed permanently — nothing to do.
    if _model is not None or _load_error is not None:
        return
    try:
        import torch
        from peft import PeftModel
        from transformers import AutoModelForCausalLM, AutoTokenizer

        tok = AutoTokenizer.from_pretrained(BASE_MODEL)
        base = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        try:
            lm = PeftModel.from_pretrained(base, ADAPTER_REPO)
        except Exception as e:  # noqa: BLE001
            print(f"[demo] adapter not found ({e}); using base model")
            lm = base
        _model = lm.eval()
        _tokenizer = tok
    except Exception as e:  # noqa: BLE001
        _load_error = f"{type(e).__name__}: {e}\n{traceback.format_exc()}"
108
+
109
+
110
def _baseline_fallback(script: str, error_trace: str) -> str:
    """Deterministic repair used when the trained model cannot be loaded.

    Prefers the in-repo ``BaselineRepairAgent`` when the ``forgeenv``
    package is importable; any failure (including a missing package)
    yields a short explanatory comment that echoes the first line of the
    error trace.
    """
    try:
        from forgeenv.roles.repair_agent import BaselineRepairAgent

        return BaselineRepairAgent().repair(
            script, breakage_spec=None, original_script=None
        )
    except Exception:  # noqa: BLE001
        first_line = error_trace.splitlines()[0] if error_trace else ""
        return (
            "# (Fallback) Trained adapter unavailable in this Space.\n"
            "# Likely fix based on the error trace:\n"
            f"# {first_line}\n"
        )
127
+
128
+
129
def _generate_with_model(prompt: str, max_new_tokens: int = 512) -> str:
    """Run one sampled generation and return only the newly generated text.

    Assumes ``_load_model()`` has already populated ``_model`` and
    ``_tokenizer``. Sampling is mildly stochastic (temperature 0.3).
    """
    import torch

    encoded = _tokenizer(prompt, return_tensors="pt").to(_model.device)
    with torch.no_grad():
        generated = _model.generate(
            **encoded,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.3,
            top_p=0.9,
            pad_token_id=_tokenizer.eos_token_id,
        )
    # Drop the prompt tokens so only the model's continuation is decoded.
    new_tokens = generated[0][encoded.input_ids.shape[1]:]
    return _tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
146
+
147
+
148
# ZeroGPU integration: on Spaces with ZeroGPU hardware the `spaces` package
# is importable and `spaces.GPU` grants a short GPU slice per call.
# Everywhere else we substitute an identity decorator so the app still runs.
try:
    import spaces  # type: ignore

    _gpu_decorator = spaces.GPU(duration=60)
except Exception:  # noqa: BLE001
    def _gpu_decorator(fn):
        # No-op stand-in outside ZeroGPU.
        return fn
157
+
158
+
159
@_gpu_decorator
def repair_script(script: str, error_trace: str) -> str:
    """Gradio callback: produce a unified-diff repair for *script*.

    Falls back to the deterministic baseline whenever the trained model
    cannot be loaded or generation itself fails.
    """
    if not script.strip():
        return "# Paste a broken script first."

    _load_model()
    if _model is None:
        return _baseline_fallback(script, error_trace)

    # Versions are pinned for the demo; training varied them per episode.
    versions = json.dumps(
        {"transformers": "4.45.0", "datasets": "2.20.0", "torch": "2.4.0"}
    )
    prompt = _PROMPT_TEMPLATE.format(
        versions=versions, script=script, trace=error_trace or "(no trace)"
    )
    try:
        return _generate_with_model(prompt)
    except Exception as e:  # noqa: BLE001
        return f"# generation failed: {e}\n" + _baseline_fallback(script, error_trace)
178
+
179
+
180
# Two-column layout: inputs + trigger on the left, suggested diff on the right.
with gr.Blocks(title="ForgeEnv Repair Agent") as demo:
    gr.Markdown(f"# {_TITLE}\n\n{_DESCRIPTION}")
    with gr.Row():
        with gr.Column():
            script_box = gr.Code(
                label="Broken HuggingFace script",
                language="python",
                lines=22,
            )
            trace_box = gr.Textbox(
                label="Error trace",
                lines=6,
                placeholder="Traceback...",
            )
            repair_btn = gr.Button("Repair", variant="primary")
        with gr.Column():
            diff_box = gr.Code(
                label="Suggested repair (unified diff)",
                language="markdown",
                lines=22,
            )

    gr.Examples(examples=_EXAMPLES, inputs=[script_box, trace_box])
    repair_btn.click(repair_script, inputs=[script_box, trace_box], outputs=diff_box)


if __name__ == "__main__":
    demo.launch()
demo-space/requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ gradio==5.7.1
2
+ torch>=2.1.0
3
+ transformers>=4.40.0
4
+ peft>=0.10.0
5
+ accelerate>=0.30.0
6
+ spaces>=0.28.0
7
+ audioop-lts; python_version >= "3.13"
forgeenv-space/Dockerfile ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11-slim

# Unbuffered stdout for live container logs; no .pyc files; no pip cache.
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PIP_NO_CACHE_DIR=1

# curl is required by the HEALTHCHECK below; git presumably for pip
# VCS installs — TODO confirm it is still needed.
RUN apt-get update \
    && apt-get install -y --no-install-recommends git curl \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install dependencies in their own layer so source edits do not
# invalidate the (slow) pip install layer.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY forgeenv/ forgeenv/
COPY openenv.yaml .

# HF Spaces routes inbound traffic to port 7860.
ENV PORT=7860
EXPOSE 7860

# Probe the FastAPI /health endpoint; 20s grace for startup.
HEALTHCHECK --interval=30s --timeout=5s --start-period=20s --retries=3 \
  CMD curl -f http://127.0.0.1:7860/health || exit 1

CMD ["uvicorn", "forgeenv.env.server:app", "--host", "0.0.0.0", "--port", "7860"]
forgeenv-space/README.md ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: ForgeEnv
3
+ emoji: 🔧
4
+ colorFrom: indigo
5
+ colorTo: green
6
+ sdk: docker
7
+ app_port: 7860
8
+ pinned: true
9
+ license: apache-2.0
10
+ tags:
11
+ - openenv
12
+ - self-play
13
+ - self-improvement
14
+ - code-repair
15
+ - schema-drift
16
+ - reinforcement-learning
17
+ - huggingface
18
+ short_description: Self-improving RL env for HF library-drift repair
19
+ ---
20
+
21
+ # ForgeEnv — OpenEnv Server
22
+
23
+ This Space hosts the **ForgeEnv** OpenEnv-compliant environment as a FastAPI
24
+ service. It exposes the standard `reset`, `step`, and `state` endpoints and is
25
+ the runtime that training notebooks (TRL + Unsloth) connect to.
26
+
27
+ > **Theme:** Self-Improvement (Hackathon Theme #4) — Challenger / Solver
28
+ > co-evolution via R-Zero, SPIRAL, and Absolute Zero Reasoner techniques.
29
+
30
+ ## What it does
31
+
32
+ ForgeEnv simulates **HuggingFace library version drift**. A *Drift Generator*
33
+ proposes a realistic breakage to a working training script (renamed APIs,
34
+ deprecated imports, changed argument signatures, etc.). A *Repair Agent* then
35
+ emits a unified diff that should restore the script. Reward is computed by an
36
+ execution simulator + AST checker + held-out evaluator (multi-component to
37
+ resist reward hacking).
38
+
39
+ ## API
40
+
41
+ The server uses [`openenv-core`](https://pypi.org/project/openenv-core/) and
42
+ follows the Gym-style contract:
43
+
44
+ | Endpoint | Method | Purpose |
45
+ | -------- | ------ | -------------------------------------------------- |
46
+ | `/reset` | POST | Sample a fresh task, return drift-gen observation |
47
+ | `/step` | POST | Apply a `ForgeAction` (breakage or repair) |
48
+ | `/state` | GET | Inspect the current internal state |
49
+ | `/health` | GET | Health probe (used by the container HEALTHCHECK) |
50
+
51
+ `ForgeAction` is a discriminated union of `BreakageAction` (used in phase 1)
52
+ and `RepairAction` (used in phase 2). See
53
+ [`forgeenv/env/actions.py`](forgeenv/env/actions.py).
54
+
55
+ ## Quick test
56
+
57
+ ```bash
58
+ curl -X POST https://akhiilll-forgeenv.hf.space/reset
59
+ curl https://akhiilll-forgeenv.hf.space/state
60
+ ```
61
+
62
+ ```python
63
+ from openenv.core.env_client import EnvClient
64
+
65
+ async with EnvClient(base_url="https://akhiilll-forgeenv.hf.space") as client:
66
+ obs = await client.reset()
67
+ print(obs.observation.current_phase, obs.observation.task_id)
68
+ ```
69
+
70
+ ## Project links
71
+
72
+ - **Main repo / training notebooks / plots:**
73
+ <https://github.com/akhiilll/forgeenv>
74
+ - **Repair Agent model (LoRA):**
75
+ <https://huggingface.co/akhiilll/forgeenv-repair-agent>
76
+ - **Demo (Gradio + ZeroGPU):**
77
+ <https://huggingface.co/spaces/akhiilll/forgeenv-demo>
78
+
79
+ ## Citations
80
+
81
+ - Huang et al., *R-Zero: Self-Evolving Reasoning LLM From Zero Data* (2025)
82
+ - Zhao et al., *Absolute Zero: Reinforced Self-play Reasoning with Zero Data* (2025)
83
+ - Liu et al., *SPIRAL: Self-Play on Zero-Sum Games* (2025)
84
+ - [arXiv:2408.10215](https://arxiv.org/abs/2408.10215) — Reward engineering & shaping
85
+ - [arXiv:2601.19100](https://arxiv.org/abs/2601.19100) — Reward engineering for RL in software tasks
forgeenv-space/forgeenv/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ """ForgeEnv: Self-improving RL environment for HuggingFace ecosystem repair."""
2
+
3
+ __version__ = "0.1.0"
4
+ __author__ = "akhiilll"
forgeenv-space/forgeenv/artifacts/repair_library.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Persisted "repair library" — the model's accumulated knowledge of
2
+ known breakage -> repair pairs. Curated from successful rollouts during
3
+ training. Loaded at inference time as a few-shot prefix when the agent
4
+ recognises a familiar error class.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import json
9
+ from dataclasses import asdict, dataclass, field
10
+ from pathlib import Path
11
+ from typing import Any, Optional
12
+
13
+
14
@dataclass
class RepairExample:
    """A single curated breakage -> repair pair harvested from a rollout.

    Stored in the repair library and replayed as a few-shot prefix when a
    familiar error class is recognised at inference time.
    """

    primitive_type: str              # breakage primitive class name
    breakage_params: dict[str, Any]  # primitive-specific parameters used
    error_signature: str             # truncated error trace of the breakage
    repair_diff: str                 # the diff that fixed it
    visible_reward: float            # reward the repair earned
    held_out: dict[str, float]       # held-out evaluator breakdown
    task_id: str = ""                # originating task, if known

    def signature_key(self) -> str:
        """Compact dedup/lookup key: primitive plus error-trace prefix."""
        trace_prefix = self.error_signature[:80]
        return f"{self.primitive_type}::{trace_prefix}"
26
+
27
+
28
@dataclass
class RepairLibrary:
    """Accumulating store of :class:`RepairExample` records.

    Supports lookup by primitive type + error-text similarity, and JSON
    round-tripping via :meth:`save` / :meth:`load`.
    """

    examples: list[RepairExample] = field(default_factory=list)

    def add(self, example: RepairExample) -> None:
        """Append one curated example to the library."""
        self.examples.append(example)

    def best_match(self, primitive_type: str, error_text: str) -> Optional[RepairExample]:
        """Return the best example for this primitive type, or None.

        "Best" means highest (error n-gram overlap, visible reward),
        compared lexicographically.
        """
        matching = [ex for ex in self.examples if ex.primitive_type == primitive_type]
        if not matching:
            return None
        # max() picks the first of any tied maxima, matching a stable
        # reverse-sort followed by [0].
        return max(
            matching,
            key=lambda ex: (
                _ngram_overlap(ex.error_signature, error_text),
                ex.visible_reward,
            ),
        )

    def to_dict(self) -> dict:
        """Serialisable snapshot of the library (schema version 1)."""
        serialised = [asdict(ex) for ex in self.examples]
        return {
            "version": "1",
            "examples": serialised,
            "size": len(serialised),
            "by_primitive": _count_by_primitive(self.examples),
        }

    def save(self, path: str | Path) -> None:
        """Write the library as pretty-printed JSON, creating parent dirs."""
        target = Path(path)
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(json.dumps(self.to_dict(), indent=2), encoding="utf-8")

    @classmethod
    def load(cls, path: str | Path) -> "RepairLibrary":
        """Rebuild a library from a JSON file produced by :meth:`save`."""
        raw = json.loads(Path(path).read_text(encoding="utf-8"))
        restored = [RepairExample(**item) for item in raw.get("examples", [])]
        return cls(examples=restored)
71
+
72
+
73
+ def _ngram_overlap(a: str, b: str, n: int = 3) -> float:
74
+ if not a or not b:
75
+ return 0.0
76
+
77
+ def grams(text: str) -> set[str]:
78
+ text = text.lower()
79
+ return {text[i : i + n] for i in range(len(text) - n + 1)}
80
+
81
+ ga, gb = grams(a), grams(b)
82
+ if not ga or not gb:
83
+ return 0.0
84
+ return len(ga & gb) / max(1, len(ga | gb))
85
+
86
+
87
+ def _count_by_primitive(examples: list[RepairExample]) -> dict[str, int]:
88
+ counts: dict[str, int] = {}
89
+ for e in examples:
90
+ counts[e.primitive_type] = counts.get(e.primitive_type, 0) + 1
91
+ return counts
92
+
93
+
94
def _rollout_field(rollout, key: str, default=None):
    """Read *key* from a dict-style or attribute-style rollout record."""
    if isinstance(rollout, dict):
        return rollout.get(key, default)
    return getattr(rollout, key, default)


def curate_from_rollouts(
    rollout_results: list,
    min_reward: float = 0.6,
    min_held_out_clean: float = 0.5,
) -> RepairLibrary:
    """Build a RepairLibrary from a list of rollout dicts/RolloutResults.

    A rollout qualifies when its visible reward reaches *min_reward* AND
    its held-out ``executed_cleanly`` score reaches *min_held_out_clean*.
    Both dict-style and attribute-style rollout records are accepted, and
    missing/None nested fields are tolerated (the original implementation
    raised ``AttributeError`` on e.g. ``held_out_breakdown=None``).
    """
    lib = RepairLibrary()
    for rollout in rollout_results:
        reward = float(_rollout_field(rollout, "visible_reward", 0.0) or 0.0)
        if reward < min_reward:
            continue

        # Guard: held_out may be absent, None, or a non-dict payload.
        held_out = _rollout_field(rollout, "held_out_breakdown", {}) or {}
        if not isinstance(held_out, dict):
            held_out = {}
        if float(held_out.get("executed_cleanly", 0.0) or 0.0) < min_held_out_clean:
            continue

        # Guard: info may likewise be absent or malformed.
        info = _rollout_field(rollout, "info", {})
        if not isinstance(info, dict):
            info = {}
        spec = info.get("breakage_spec", {})
        params = spec.get("params", {}) if isinstance(spec, dict) else {}

        # Prefer the raw completion; fall back to the diff recorded in info.
        diff = (
            _rollout_field(rollout, "repair_completion", "")
            or info.get("repair_diff", "")
            or ""
        )

        lib.add(
            RepairExample(
                primitive_type=str(_rollout_field(rollout, "primitive_type", "unknown")),
                breakage_params=dict(params),
                error_signature=str(_rollout_field(rollout, "error_trace", "") or "")[:160],
                repair_diff=str(diff)[:2000],
                visible_reward=reward,
                held_out=dict(held_out),
                task_id=str(_rollout_field(rollout, "task_id", "")),
            )
        )
    return lib
forgeenv-space/forgeenv/drift/__init__.py ADDED
File without changes
forgeenv-space/forgeenv/drift/library_drift_engine.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Library Drift Engine.
2
+
3
+ Manages library version snapshots and triggers version upgrades during
4
+ training to create non-stationary verification. In simulation mode it
5
+ just tracks the current snapshot index — that index influences
6
+ breakage selection and is exposed in observations so the Repair Agent
7
+ can adapt.
8
+
9
+ Also exposes Chojecki GVU's SNR computation
10
+ (https://arxiv.org/abs/2512.02731 Definition 4.4).
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import math
15
+ from dataclasses import dataclass, field
16
+
17
DEFAULT_VERSION_SNAPSHOTS: list[dict[str, str]] = [
    {"transformers": "4.36.0", "datasets": "2.14.0", "trl": "0.7.0"},
    {"transformers": "4.40.0", "datasets": "2.18.0", "trl": "0.8.0"},
    {"transformers": "4.45.0", "datasets": "3.0.0", "trl": "0.10.0"},
    {"transformers": "4.50.0", "datasets": "3.2.0", "trl": "0.12.0"},
]


@dataclass
class LibraryDriftEngine:
    """Tracks which library-version snapshot is "current" and periodically
    advances it to create non-stationary verification pressure.

    The active snapshot index influences breakage selection and is exposed
    in observations so the Repair Agent can adapt. Also provides the SNR
    statistic from Chojecki GVU (https://arxiv.org/abs/2512.02731,
    Definition 4.4).
    """

    snapshots: list[dict[str, str]] = field(
        default_factory=lambda: list(DEFAULT_VERSION_SNAPSHOTS)
    )
    current_index: int = 0
    drift_history: list[dict] = field(default_factory=list)

    def current_versions(self) -> dict[str, str]:
        """A copy of the active version snapshot."""
        return {**self.snapshots[self.current_index]}

    def maybe_drift(self, episode_num: int, drift_every: int = 50) -> bool:
        """Advance one snapshot every *drift_every* episodes.

        Returns True iff a drift occurred; each drift is appended to
        ``drift_history`` with the episode number and both snapshots.
        """
        # Guard clauses: episode 0 never drifts, only exact multiples of
        # the cadence drift, and we stop once the newest snapshot is live.
        if episode_num <= 0 or episode_num % drift_every != 0:
            return False
        if self.current_index >= len(self.snapshots) - 1:
            return False

        before = self.snapshots[self.current_index]
        self.current_index += 1
        self.drift_history.append(
            {
                "episode": episode_num,
                "from": before,
                "to": self.snapshots[self.current_index],
            }
        )
        return True

    def reset(self) -> None:
        """Return to the oldest snapshot and forget recorded drifts."""
        self.current_index = 0
        self.drift_history.clear()

    @staticmethod
    def compute_snr(
        recent_held_out: list[float], recent_visible: list[float]
    ) -> dict[str, float]:
        """SNR per Chojecki GVU Def 4.4: mean(rewards)^2 / variance(rewards).

        Population variance, floored at 1e-8 to avoid division by zero;
        fewer than two samples yields 0.0.
        """

        def _snr(samples: list[float]) -> float:
            count = len(samples)
            if count < 2:
                return 0.0
            mu = sum(samples) / count
            variance = sum((s - mu) ** 2 for s in samples) / count
            return mu**2 / max(variance, 1e-8)

        return {
            "snr_verifier": _snr(recent_held_out),
            "snr_generator": _snr(recent_visible),
        }
forgeenv-space/forgeenv/env/__init__.py ADDED
File without changes
forgeenv-space/forgeenv/env/actions.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Pydantic action models for ForgeEnv (compatible with OpenEnv 0.2.x).
2
+
3
+ Episodes have two phases — drift_gen (Challenger) and repair (Solver) — so
4
+ we expose a single union ForgeAction that carries either a BreakageAction
5
+ or a RepairAction. The environment dispatches on which sub-field is set.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ from typing import Any, Literal, Optional
10
+
11
+ from pydantic import Field
12
+
13
+ from openenv.core import Action
14
+
15
+
16
class BreakageAction(Action):
    """Drift Generator's action: pick a primitive type + parameters."""

    # Constant literal tag so consumers can discriminate the action union
    # without isinstance checks.
    action_type: Literal["breakage"] = "breakage"
    primitive_type: str = Field(
        ..., description="One of the registered breakage primitive class names"
    )
    params: dict[str, Any] = Field(
        default_factory=dict, description="Primitive-specific parameters"
    )
26
+
27
+
28
class RepairAction(Action):
    """Repair Agent's action: a unified diff (or full replacement script)."""

    # Constant literal tag for union discrimination (see BreakageAction).
    action_type: Literal["repair"] = "repair"
    unified_diff: str = Field(..., description="Unified diff or full replacement script")
33
+
34
+
35
class ForgeAction(Action):
    """Union action: exactly one of `breakage` / `repair` must be set.

    This is the type registered with OpenEnv's `create_app`. It avoids
    Pydantic discriminated unions to keep the OpenAPI schema flat and
    cross-version-friendly.
    """

    breakage: Optional[BreakageAction] = None
    repair: Optional[RepairAction] = None

    def model_post_init(self, __context: Any) -> None:
        """Enforce the exactly-one-of invariant after Pydantic init.

        Raises:
            ValueError: if both sub-actions are set, or neither is.
        """
        # (a is None) == (b is None) is True when both are set or both
        # are unset — i.e. exactly the invalid configurations.
        if (self.breakage is None) == (self.repair is None):
            raise ValueError(
                "ForgeAction requires exactly one of `breakage` or `repair` to be set."
            )
forgeenv-space/forgeenv/env/diff_utils.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unified-diff application utilities.
2
+
3
+ The Repair Agent submits a unified diff. We need a permissive applier
4
+ because LLM diffs are often malformed (wrong line numbers, missing
5
+ context, extra prose). We try the strict applier first, then fall
6
+ back to applying hunks via plain string replacement.
7
+
8
+ The agent may also submit a full Python script instead of a diff
9
+ (common when the model's diff format breaks). We detect this and
10
+ treat it as a complete replacement.
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import difflib
15
+ import re
16
+
17
+
18
_HUNK_RE = re.compile(r"^@@.*@@", re.MULTILINE)
_SCRIPT_MARKERS = ("import ", "from ", "def ", "class ", "print(")


def looks_like_full_script(text: str) -> bool:
    """Heuristic: True when *text* reads like a whole Python script
    rather than a unified diff."""
    candidate_lines = text.lstrip().splitlines()
    if not candidate_lines:
        return False
    # Diff headers near the top mean it is definitely a diff, not a script.
    if any(ln.startswith(("---", "+++", "@@")) for ln in candidate_lines[:5]):
        return False
    # Two or more script-style markers within the first 30 lines => treat
    # the submission as a full replacement script.
    head = "\n".join(candidate_lines[:30])
    marker_hits = sum(1 for marker in _SCRIPT_MARKERS if marker in head)
    return marker_hits >= 2
37
+
38
+
39
def _strict_apply(broken_script: str, diff_text: str) -> str | None:
    """Apply a unified diff strictly. Returns None on any failure.

    Hunks are located by searching for the exact concatenation of their
    old/context lines in the not-yet-consumed part of the source — the
    @@ line numbers are deliberately ignored, since LLM-emitted diffs
    frequently get them wrong.
    """
    lines = broken_script.splitlines(keepends=True)
    out: list[str] = []
    diff_lines = diff_text.splitlines()
    i = 0
    # src_idx: index into `lines` marking how much source has been consumed.
    src_idx = 0
    in_hunk = False
    hunk_old: list[str] = []   # old + context lines of the current hunk
    hunk_new: list[str] = []   # new + context lines of the current hunk

    while i < len(diff_lines):
        line = diff_lines[i]
        if line.startswith(("---", "+++")):
            # File headers carry no content — skip.
            i += 1
            continue
        if line.startswith("@@"):
            # Flush previous hunk
            if in_hunk:
                # Find the hunk_old block in the source starting at src_idx.
                target = "".join(hunk_old)
                source_remainder = "".join(lines[src_idx:])
                pos = source_remainder.find(target)
                if pos == -1:
                    # Old block not found => diff does not match the source.
                    return None
                out.append(source_remainder[:pos])
                out.append("".join(hunk_new))
                # Advance src_idx by the number of source lines consumed
                # (prefix before the match plus the matched block itself).
                src_idx += len(source_remainder[: pos + len(target)].splitlines(keepends=True))
                hunk_old, hunk_new = [], []
            in_hunk = True
            i += 1
            continue
        if in_hunk:
            if line.startswith("+"):
                hunk_new.append(line[1:] + "\n")
            elif line.startswith("-"):
                hunk_old.append(line[1:] + "\n")
            else:
                # context line — appears in both the old and new block.
                ctx = line[1:] if line.startswith(" ") else line
                hunk_old.append(ctx + "\n")
                hunk_new.append(ctx + "\n")
        i += 1

    # Flush trailing hunk
    if in_hunk and (hunk_old or hunk_new):
        target = "".join(hunk_old)
        source_remainder = "".join(lines[src_idx:])
        pos = source_remainder.find(target)
        if pos == -1:
            return None
        out.append(source_remainder[:pos])
        out.append("".join(hunk_new))
        consumed = source_remainder[: pos + len(target)]
        src_idx += len(consumed.splitlines(keepends=True))

    # Emit whatever source remains after the last hunk.
    out.append("".join(lines[src_idx:]))
    return "".join(out)
97
+
98
+
99
+ def _permissive_apply(broken_script: str, diff_text: str) -> str:
100
+ """Apply a malformed diff by extracting (-,+) line pairs and doing
101
+ a tolerant search-and-replace.
102
+ """
103
+ repaired = broken_script
104
+ pairs: list[tuple[str, str]] = []
105
+ lines = diff_text.splitlines()
106
+ pending_minus: str | None = None
107
+
108
+ for line in lines:
109
+ if line.startswith("---") or line.startswith("+++") or line.startswith("@@"):
110
+ pending_minus = None
111
+ continue
112
+ if line.startswith("-"):
113
+ pending_minus = line[1:].strip()
114
+ elif line.startswith("+") and pending_minus is not None:
115
+ pairs.append((pending_minus, line[1:].strip()))
116
+ pending_minus = None
117
+ elif pending_minus is not None and not line.startswith(" "):
118
+ # standalone deletion — skip in permissive mode (we can't
119
+ # reliably know what to delete without context)
120
+ pending_minus = None
121
+
122
+ for old, new in pairs:
123
+ if old and old in repaired:
124
+ repaired = repaired.replace(old, new, 1)
125
+
126
+ return repaired
127
+
128
+
129
def apply_unified_diff(broken_script: str, diff_text: str) -> str:
    """Apply *diff_text* to *broken_script*, trying strategies in order.

    1. A submission that looks like a full script replaces the source.
    2. If diff markers are present, attempt strict hunk application.
    3. Otherwise (or on strict failure), fall back to permissive
       (-,+) line-pair replacement — which, at worst, returns the
       broken script unchanged.
    """
    text = diff_text or ""
    if not text.strip():
        return broken_script

    if looks_like_full_script(text):
        return text

    has_diff_markers = bool(_HUNK_RE.search(text)) or "---" in text or "+++" in text
    if has_diff_markers:
        strict_result = _strict_apply(broken_script, text)
        if strict_result is not None and strict_result != broken_script:
            return strict_result

    return _permissive_apply(broken_script, text)
152
+
153
+
154
def make_unified_diff(before: str, after: str, path: str = "train.py") -> str:
    """Render the canonical unified diff that turns *before* into *after*.

    Uses a/ and b/ path prefixes and 2 lines of context; identical inputs
    yield an empty string.
    """
    old_lines = before.splitlines(keepends=True)
    new_lines = after.splitlines(keepends=True)
    return "".join(
        difflib.unified_diff(
            old_lines,
            new_lines,
            fromfile=f"a/{path}",
            tofile=f"b/{path}",
            n=2,
        )
    )
forgeenv-space/forgeenv/env/forge_environment.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ForgeEnvironment: the OpenEnv Environment subclass for ForgeEnv.
2
+
3
+ Episode flow (exactly 2 steps per episode):
4
+ reset() -> sample task, ask Teacher for category
5
+ step(BreakageAction) -> Drift Generator's proposal is applied; broken
6
+ script is run, error trace captured.
7
+ step(RepairAction) -> Repair diff is applied; script is re-executed;
8
+ visible + held-out rewards computed; episode ends.
9
+ """
10
+ from __future__ import annotations
11
+
12
+ import time
13
+ import uuid
14
+ from typing import Any, Optional
15
+
16
+ from openenv.core import Environment
17
+
18
+ from forgeenv.drift.library_drift_engine import LibraryDriftEngine
19
+ from forgeenv.env.actions import BreakageAction, ForgeAction, RepairAction
20
+ from forgeenv.env.diff_utils import apply_unified_diff
21
+ from forgeenv.env.observations import ForgeObservation
22
+ from forgeenv.primitives.breakage_primitives import (
23
+ PRIMITIVE_REGISTRY,
24
+ parse_breakage_spec,
25
+ )
26
+ from forgeenv.roles.teacher import Teacher
27
+ from forgeenv.sandbox.simulation_mode import SimulationExecutor
28
+ from forgeenv.tasks.models import ExecutionResult, Task
29
+ from forgeenv.tasks.task_sampler import TaskSampler
30
+ from forgeenv.verifier.held_out_evaluator import compute_held_out_scores
31
+ from forgeenv.verifier.visible_verifier import compute_visible_reward
32
+
33
# All registered breakage primitive names, sorted for deterministic order.
DEFAULT_CATEGORIES = sorted(PRIMITIVE_REGISTRY.keys())


class ForgeEnvironment(Environment[ForgeAction, ForgeObservation, dict]):
    """OpenEnv-compliant environment for HuggingFace ecosystem repair.

    Episode flow (exactly two steps):
      1. reset()            -> sample task, Teacher picks a target category.
      2. step(breakage)     -> primitive applied; broken script executed,
                               error trace captured; phase -> "repair".
      3. step(repair)       -> diff applied, script re-executed; visible +
                               held-out rewards computed; episode done.
    """

    SUPPORTS_CONCURRENT_SESSIONS = False  # Teacher state is global per env

    def __init__(
        self,
        task_sampler: Optional[TaskSampler] = None,
        teacher: Optional[Teacher] = None,
        executor: Optional[SimulationExecutor] = None,
        drift_engine: Optional[LibraryDriftEngine] = None,
        seed: Optional[int] = None,
    ) -> None:
        """Wire up collaborators; each falls back to a default instance."""
        super().__init__()
        self.task_sampler = task_sampler or TaskSampler()
        # Teacher drives the curriculum over breakage categories;
        # "api_drift" is the fallback if the registry is somehow empty.
        self.teacher = teacher or Teacher(
            categories=list(DEFAULT_CATEGORIES) or ["api_drift"]
        )
        self.executor = executor or SimulationExecutor(seed=seed)
        self.drift_engine = drift_engine or LibraryDriftEngine()

        # Per-episode mutable state (reset() reinitialises most of it).
        self._episode_id: Optional[str] = None
        self._episode_count: int = 0
        self._current_task: Optional[Task] = None
        self._original_script: str = ""
        self._broken_script: str = ""
        self._error_trace: str = ""
        self._breakage_spec: Optional[dict[str, Any]] = None
        self._target_category: str = ""
        self._current_phase: str = "idle"
        self._last_obs: Optional[ForgeObservation] = None

    # ------------------------------------------------------------------ API
    def reset(
        self,
        seed: Optional[int] = None,
        episode_id: Optional[str] = None,
        difficulty: Optional[str] = "easy",
        **kwargs: Any,
    ) -> ForgeObservation:
        """Start a new episode in the "drift_gen" phase.

        Samples a fresh task, asks the Teacher for a target category, and
        may advance the library-version snapshot (every 50 episodes).

        Raises:
            RuntimeError: if the task sampler yields no task.
        """
        self._episode_id = episode_id or str(uuid.uuid4())
        self._episode_count += 1
        self._target_category = self.teacher.select_next_category()

        task = self.task_sampler.sample(difficulty=difficulty)
        if task is None:
            raise RuntimeError("Task sampler returned no tasks (empty seed corpus?)")
        self._current_task = task
        self._original_script = task.script_content
        self._broken_script = ""
        self._error_trace = ""
        self._breakage_spec = None
        self._current_phase = "drift_gen"

        # Library drift trigger every 50 episodes (configurable from outside).
        drifted = self.drift_engine.maybe_drift(self._episode_count, drift_every=50)

        obs = ForgeObservation(
            current_phase="drift_gen",
            task_id=task.task_id,
            task_description=task.description,
            target_category=self._target_category,
            script_content=self._original_script,
            error_trace=None,
            library_versions=self.drift_engine.current_versions(),
            episode_step=0,
            done=False,
            reward=0.0,
            info={
                "episode_id": self._episode_id,
                "episode_count": self._episode_count,
                "drift_triggered": drifted,
                "available_primitives": sorted(PRIMITIVE_REGISTRY),
            },
        )
        self._last_obs = obs
        return obs

    def step(
        self,
        action: ForgeAction,
        timeout_s: Optional[float] = None,
        **kwargs: Any,
    ) -> ForgeObservation:
        """Dispatch the action according to the current phase.

        Phase/action mismatches and out-of-episode calls return a
        done=True error observation instead of raising.
        """
        if self._current_phase == "drift_gen":
            if action.breakage is None:
                return self._error_obs("Expected BreakageAction in drift_gen phase")
            return self._handle_breakage(action.breakage)

        if self._current_phase == "repair":
            if action.repair is None:
                return self._error_obs("Expected RepairAction in repair phase")
            return self._handle_repair(action.repair)

        return self._error_obs(
            f"step() called in invalid phase {self._current_phase!r} — call reset() first"
        )

    @property
    def state(self) -> dict:
        """Inspectable snapshot of internal state (served at GET /state)."""
        return {
            "phase": self._current_phase,
            "episode_id": self._episode_id,
            "episode_count": self._episode_count,
            "task_id": self._current_task.task_id if self._current_task else None,
            "target_category": self._target_category,
            "library_versions": self.drift_engine.current_versions(),
            "teacher": self.teacher.get_state(),
            "drift_history": list(self.drift_engine.drift_history),
            "breakage_spec": dict(self._breakage_spec) if self._breakage_spec else None,
        }

    # ---------------------------------------------------------------- helpers
    def _handle_breakage(self, breakage: BreakageAction) -> ForgeObservation:
        """Apply the Drift Generator's primitive and capture the error trace."""
        spec = {"primitive_type": breakage.primitive_type, "params": dict(breakage.params)}
        try:
            primitive = parse_breakage_spec(spec)
        except ValueError as exc:
            return self._error_obs(f"Invalid breakage spec: {exc}")

        try:
            self._broken_script = primitive.apply(self._original_script)
        except Exception as exc:  # primitive bug — surface but don't crash server
            return self._error_obs(f"Primitive apply failed: {exc}")

        self._breakage_spec = spec

        # Run the broken script to harvest the error trace shown to the agent.
        result = self.executor.execute(self._broken_script, self._current_task)
        if result.exit_code != 0:
            self._error_trace = result.stderr or "non-zero exit code, no stderr"
        else:
            # The breakage didn't actually break it; still proceed to repair phase
            # (no-op repair is then a valid choice).
            self._error_trace = "Script ran without observable error"

        self._current_phase = "repair"

        obs = ForgeObservation(
            current_phase="repair",
            task_id=self._current_task.task_id,
            task_description=self._current_task.description,
            target_category=primitive.category,
            script_content=self._broken_script,
            error_trace=self._error_trace,
            library_versions=self.drift_engine.current_versions(),
            episode_step=1,
            done=False,
            reward=0.0,
            info={
                "episode_id": self._episode_id,
                "breakage_primitive": primitive.name,
                "breakage_description": primitive.description,
            },
        )
        self._last_obs = obs
        return obs

    def _handle_repair(self, repair: RepairAction) -> ForgeObservation:
        """Apply the repair diff, re-execute, score, and finish the episode."""
        repaired = apply_unified_diff(self._broken_script, repair.unified_diff or "")

        t0 = time.time()
        result = self.executor.execute(repaired, self._current_task)
        result.script_content = repaired  # ensure verifier sees what we ran
        wall_ms = int((time.time() - t0) * 1000)

        # Visible reward trains the agent; held-out scores resist reward hacking.
        visible_reward, visible_breakdown = compute_visible_reward(
            result, self._current_task
        )
        held_out = compute_held_out_scores(
            result, self._current_task, repair_diff=repair.unified_diff or ""
        )

        success = result.exit_code == 0
        category = (
            self._breakage_spec.get("primitive_type", "unknown")
            if self._breakage_spec
            else "unknown"
        )
        # Update Teacher's curriculum state
        self.teacher.update(category, success)

        self._current_phase = "done"

        obs = ForgeObservation(
            current_phase="done",
            task_id=self._current_task.task_id,
            task_description=self._current_task.description,
            target_category=category,
            script_content=repaired,
            error_trace=result.stderr or None,
            library_versions=self.drift_engine.current_versions(),
            episode_step=2,
            done=True,
            reward=visible_reward,
            reward_breakdown=visible_breakdown,
            held_out_breakdown=held_out,
            info={
                "episode_id": self._episode_id,
                "exit_code": result.exit_code,
                "wall_time_ms": wall_ms,
                "checkpoint_exists": result.checkpoint_exists,
                "stdout_tail": "\n".join(result.stdout.splitlines()[-5:]),
                "breakage_spec": self._breakage_spec,
                "teacher_state": self.teacher.get_state(),
            },
        )
        self._last_obs = obs
        return obs

    def _error_obs(self, message: str) -> ForgeObservation:
        """Return a `done=True` error observation rather than raising."""
        return ForgeObservation(
            current_phase="done",
            task_id=self._current_task.task_id if self._current_task else "",
            task_description=self._current_task.description if self._current_task else "",
            target_category=self._target_category,
            script_content=self._broken_script or self._original_script,
            error_trace=message,
            library_versions=self.drift_engine.current_versions(),
            episode_step=2,
            done=True,
            reward=0.0,
            info={"error": message, "episode_id": self._episode_id},
        )
forgeenv-space/forgeenv/env/observations.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Pydantic observation model for ForgeEnv."""
2
+ from __future__ import annotations
3
+
4
+ from typing import Any, Optional
5
+
6
+ from pydantic import Field
7
+
8
+ from openenv.core import Observation
9
+
10
+
11
class ForgeObservation(Observation):
    """What the agent (or the trainer's rollout function) sees at each step.

    Inherits `done`, `reward`, `metadata` from the OpenEnv `Observation` base.
    """

    current_phase: str = Field(
        ..., description="One of 'drift_gen', 'repair', 'verify', 'done'"
    )
    task_id: str = ""
    task_description: str = ""
    # Breakage category the Teacher asked for (or the applied primitive's).
    target_category: str = ""
    script_content: str = Field(default="", description="Current state of the script")
    # Stderr/trace from the last execution; None before anything has run.
    error_trace: Optional[str] = None
    # Active library-version snapshot, e.g. {"transformers": "4.45.0", ...}.
    library_versions: dict[str, str] = Field(default_factory=dict)
    # Per-component visible-reward breakdown (populated on the final step).
    reward_breakdown: dict[str, Any] = Field(default_factory=dict)
    # Held-out evaluator scores (populated on the final step).
    held_out_breakdown: dict[str, float] = Field(default_factory=dict)
    episode_step: int = 0
    # Free-form extras: episode_id, exit_code, teacher state, etc.
    info: dict[str, Any] = Field(default_factory=dict)
forgeenv-space/forgeenv/env/server.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """FastAPI server for ForgeEnv (OpenEnv-compliant).
2
+
3
+ Exposes /reset, /step, /state HTTP endpoints via OpenEnv's `create_app`.
4
+ HF Spaces sets PORT=7860 automatically.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import os
9
+
10
+ from fastapi.responses import HTMLResponse
11
+ from openenv.core import create_app
12
+
13
+ from forgeenv.env.actions import ForgeAction
14
+ from forgeenv.env.forge_environment import ForgeEnvironment
15
+ from forgeenv.env.observations import ForgeObservation
16
+
17
# Build the OpenEnv-standard FastAPI app: wires the /reset, /step and /state
# endpoints around ForgeEnvironment using the typed action/observation models.
app = create_app(
    env=ForgeEnvironment,
    action_cls=ForgeAction,
    observation_cls=ForgeObservation,
    env_name="forgeenv",
)
23
+
24
+
25
+ _LANDING_HTML = """<!doctype html>
26
+ <html lang="en">
27
+ <head>
28
+ <meta charset="utf-8">
29
+ <title>ForgeEnv — OpenEnv server</title>
30
+ <meta name="viewport" content="width=device-width,initial-scale=1">
31
+ <style>
32
+ :root { color-scheme: light dark; }
33
+ body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
34
+ max-width: 760px; margin: 2.5rem auto; padding: 0 1.25rem;
35
+ line-height: 1.55; color: #1f2937; background: #fafafa; }
36
+ @media (prefers-color-scheme: dark) { body { color: #e5e7eb; background: #0f172a; } }
37
+ h1 { font-size: 1.65rem; margin-bottom: 0.25rem; }
38
+ .sub { color: #6b7280; margin-top: 0; }
39
+ code, pre { font-family: ui-monospace, "SF Mono", Menlo, monospace; }
40
+ pre { background: rgba(127,127,127,0.12); padding: 0.9rem; border-radius: 8px;
41
+ overflow-x: auto; }
42
+ table { border-collapse: collapse; width: 100%; margin: 0.75rem 0 1.25rem; }
43
+ td, th { text-align: left; padding: 0.5rem 0.75rem;
44
+ border-bottom: 1px solid rgba(127,127,127,0.25); }
45
+ th { font-weight: 600; }
46
+ a { color: #2563eb; text-decoration: none; } a:hover { text-decoration: underline; }
47
+ .ok { color: #16a34a; font-weight: 600; }
48
+ .muted { color: #6b7280; font-size: 0.9rem; }
49
+ .pill { display: inline-block; padding: 0.1rem 0.5rem; border-radius: 999px;
50
+ background: rgba(34,197,94,0.15); color: #16a34a; font-size: 0.85rem; }
51
+ </style>
52
+ </head>
53
+ <body>
54
+ <h1>ForgeEnv 🔧 <span class="pill">running</span></h1>
55
+ <p class="sub">OpenEnv-compliant RL environment for HuggingFace
56
+ ecosystem repair under library version drift.</p>
57
+
58
+ <p>This URL serves the environment over HTTP. It is not a UI — it's the
59
+ runtime that <strong>training notebooks connect to</strong>. Open one of
60
+ the endpoints below, or use the demo Space to try the trained Repair
61
+ Agent in a browser.</p>
62
+
63
+ <h2>Endpoints</h2>
64
+ <table>
65
+ <tr><th>Method</th><th>Path</th><th>Purpose</th></tr>
66
+ <tr><td>GET </td><td><a href="/health">/health</a></td><td>Health probe</td></tr>
67
+ <tr><td>POST</td><td><code>/reset</code></td><td>Sample task, return drift-gen observation</td></tr>
68
+ <tr><td>POST</td><td><code>/step</code></td><td>Apply <code>ForgeAction</code> (breakage or repair)</td></tr>
69
+ <tr><td>GET </td><td><a href="/state">/state</a></td><td>Current internal state</td></tr>
70
+ <tr><td>GET </td><td><a href="/metadata">/metadata</a></td><td>Env name + version + schema URLs</td></tr>
71
+ <tr><td>GET </td><td><a href="/schema">/schema</a></td><td>Action / observation JSON schemas</td></tr>
72
+ <tr><td>GET </td><td><a href="/docs">/docs</a></td><td>Interactive Swagger UI</td></tr>
73
+ </table>
74
+
75
+ <h2>Quick start (Python)</h2>
76
+ <pre><code>import asyncio
77
+ from openenv.core import GenericEnvClient
78
+
79
+ async def go():
80
+ client = GenericEnvClient(base_url="https://akhiilll-forgeenv.hf.space")
81
+ obs = await client.reset()
82
+ print(obs.observation["current_phase"], obs.observation["task_id"])
83
+
84
+ asyncio.run(go())</code></pre>
85
+
86
+ <h2>Project links</h2>
87
+ <ul>
88
+ <li>Space card &amp; README:
89
+ <a href="https://huggingface.co/spaces/akhiilll/forgeenv" target="_blank" rel="noopener noreferrer">huggingface.co/spaces/akhiilll/forgeenv</a></li>
90
+ <li>Gradio demo:
91
+ <a href="https://huggingface.co/spaces/akhiilll/forgeenv-demo" target="_blank" rel="noopener noreferrer">huggingface.co/spaces/akhiilll/forgeenv-demo</a></li>
92
+ <li>Trained model (LoRA) <span class="muted">— published after the Colab training run finishes</span>:
93
+ <a href="https://huggingface.co/akhiilll/forgeenv-repair-agent" target="_blank" rel="noopener noreferrer">huggingface.co/akhiilll/forgeenv-repair-agent</a></li>
94
+ </ul>
95
+ <p class="muted">Tip: if links don't open from inside the embedded Space frame,
96
+ right-click and choose <em>Open in new tab</em>, or open this URL directly
97
+ at <a href="https://akhiilll-forgeenv.hf.space/" target="_blank" rel="noopener noreferrer">akhiilll-forgeenv.hf.space</a>.</p>
98
+ </body>
99
+ </html>"""
100
+
101
+
102
def _attach_supplementary_routes(_app) -> None:
    """Register /health and a friendly GET / landing page unless the app already has them."""
    # Paths the generated app already exposes (some route types may lack .path).
    registered_paths = set()
    for route in getattr(_app, "routes", []):
        registered_paths.add(getattr(route, "path", None))

    if "/health" not in registered_paths:
        @_app.get("/health")
        def _health() -> dict:
            # Lightweight liveness probe for Spaces / load balancers.
            return {"status": "ok", "env": "forgeenv"}

    if "/" not in registered_paths:
        @_app.get("/", response_class=HTMLResponse, include_in_schema=False)
        def _root() -> str:
            # Human-readable landing page; excluded from the OpenAPI schema.
            return _LANDING_HTML
117
+
118
+
119
+ _attach_supplementary_routes(app)
120
+
121
+
122
if __name__ == "__main__":
    import uvicorn

    # HF Spaces injects PORT (7860); the default keeps local runs identical.
    serve_port = int(os.environ.get("PORT", "7860"))
    uvicorn.run(app, host="0.0.0.0", port=serve_port)
forgeenv-space/forgeenv/primitives/__init__.py ADDED
File without changes
forgeenv-space/forgeenv/primitives/breakage_primitives.py ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """8 breakage primitives representing real HuggingFace/PyTorch ecosystem drift.
2
+
3
+ Each primitive transforms a working script to simulate a library upgrade
4
+ breakage. They double as the Drift Generator's structured action space.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import re
9
+ from abc import ABC, abstractmethod
10
+ from dataclasses import dataclass, field
11
+
12
+
13
@dataclass
class BreakagePrimitive(ABC):
    """Common contract for all breakage types.

    Subclasses set `category`/`name`/`description` in `__post_init__` and
    implement `apply` (the actual script transformation) plus `_get_params`
    (their JSON-serializable constructor arguments).
    """

    category: str = field(default="generic", init=False)
    name: str = field(default="BreakagePrimitive", init=False)
    description: str = field(default="", init=False)

    @abstractmethod
    def apply(self, script: str) -> str:
        """Transform `script` to introduce the breakage."""

    def to_spec(self) -> dict:
        """Serialize to JSON-compatible spec for the LLM action space."""
        spec = {
            "primitive_type": type(self).__name__,
            "category": self.category,
            "params": self._get_params(),
        }
        return spec

    @abstractmethod
    def _get_params(self) -> dict:
        """Return a JSON-serializable dict of constructor parameters."""
36
+
37
+
38
@dataclass
class RenameApiCall(BreakagePrimitive):
    """Simulate an API deprecation by renaming standalone occurrences of
    `old_name` to `new_name`."""

    old_name: str = ""
    new_name: str = ""

    def __post_init__(self) -> None:
        self.category = "api_drift"
        self.name = "RenameApiCall"
        self.description = f"Rename {self.old_name} -> {self.new_name}"

    def apply(self, script: str) -> str:
        # Nothing configured — leave the script untouched.
        if not self.old_name:
            return script
        # Negative lookarounds keep us from rewriting longer identifiers
        # that merely contain `old_name` as a substring.
        token = re.compile(r"(?<!\w)" + re.escape(self.old_name) + r"(?!\w)")
        return token.sub(self.new_name, script)

    def _get_params(self) -> dict:
        return {"old_name": self.old_name, "new_name": self.new_name}
59
+
60
+
61
@dataclass
class DeprecateImport(BreakagePrimitive):
    """Change an import path to simulate module restructuring."""

    old_module: str = ""
    new_module: str = ""

    def __post_init__(self) -> None:
        self.category = "import_drift"
        self.name = "DeprecateImport"
        self.description = f"Move {self.old_module} -> {self.new_module}"

    def apply(self, script: str) -> str:
        # Plain substring swap: the taxonomy stores full "from x.y" prefixes,
        # so no word-boundary handling is needed here.
        if self.old_module:
            return script.replace(self.old_module, self.new_module)
        return script

    def _get_params(self) -> dict:
        return {"old_module": self.old_module, "new_module": self.new_module}
80
+
81
+
82
@dataclass
class ChangeArgumentSignature(BreakagePrimitive):
    """Remove an expected kwarg (and document a new required one)."""

    function_name: str = ""
    removed_arg: str = ""
    added_arg: str = ""
    added_value: str = ""

    def __post_init__(self) -> None:
        self.category = "api_drift"
        self.name = "ChangeArgumentSignature"
        self.description = (
            f"Change args of {self.function_name}: -{self.removed_arg} +{self.added_arg}"
        )

    def apply(self, script: str) -> str:
        """Delete every `removed_arg=<value>` kwarg from `script`.

        Bug fix: the previous regex used `[^,)]+` for the value, which cut
        comma-containing values (lists/tuples/nested calls such as the
        taxonomy's `columns=["input_ids", ...]`) at the first comma and
        left stray tokens behind. This version scans bracket-aware.
        """
        if not self.removed_arg:
            return script
        marker = re.compile(rf"\b{re.escape(self.removed_arg)}\s*=\s*")
        pieces = []
        pos = 0
        while True:
            m = marker.search(script, pos)
            if m is None:
                pieces.append(script[pos:])
                break
            pieces.append(script[pos:m.start()])
            # Skip the value up to the kwarg-terminating ',' / ')' / newline,
            # ignoring separators nested inside (), [] or {}.
            j = m.end()
            depth = 0
            while j < len(script):
                ch = script[j]
                if ch in "([{":
                    depth += 1
                elif ch in ")]}":
                    if depth == 0:
                        break
                    depth -= 1
                elif ch == "\n" and depth == 0:
                    # Statement context (plain assignment) — stop at the line end.
                    break
                elif ch == "," and depth == 0:
                    j += 1
                    # Swallow whitespace after the separating comma, mirroring
                    # the old `,?\s*` behavior.
                    while j < len(script) and script[j] in " \t\n":
                        j += 1
                    break
                j += 1
            pos = j
        return "".join(pieces)

    def _get_params(self) -> dict:
        return {
            "function_name": self.function_name,
            "removed_arg": self.removed_arg,
            "added_arg": self.added_arg,
            "added_value": self.added_value,
        }
111
+
112
+
113
@dataclass
class ModifyConfigField(BreakagePrimitive):
    """Change a config-class default value to simulate behaviour drift."""

    config_class: str = ""
    field_name: str = ""
    new_value: str = ""

    def __post_init__(self) -> None:
        self.category = "config_drift"
        self.name = "ModifyConfigField"
        self.description = f"Change {self.config_class}.{self.field_name}"

    def apply(self, script: str) -> str:
        """Rewrite every `field_name=<value>` occurrence to `field_name=new_value`.

        Bug fix: `new_value` was previously interpolated into the `re.sub`
        replacement template, so a value containing backslashes or a
        literal `\\g<...>` sequence raised `re.error` or corrupted the
        output. A callable replacement inserts it verbatim.
        """
        if not self.field_name:
            return script
        pattern = rf"({re.escape(self.field_name)}\s*=\s*)([^,)\n]+)"
        return re.sub(pattern, lambda m: m.group(1) + self.new_value, script)

    def _get_params(self) -> dict:
        return {
            "config_class": self.config_class,
            "field_name": self.field_name,
            "new_value": self.new_value,
        }
138
+
139
+
140
@dataclass
class RestructureDatasetSchema(BreakagePrimitive):
    """Rename a dataset column reference to simulate schema drift."""

    old_column: str = ""
    new_column: str = ""

    def __post_init__(self) -> None:
        self.category = "dataset_drift"
        self.name = "RestructureDatasetSchema"
        self.description = f"Rename column {self.old_column} -> {self.new_column}"

    def apply(self, script: str) -> str:
        if not self.old_column:
            return script
        # Cover both quoting styles so '"label"' and "'label'" are rewritten.
        result = script
        for quote in ('"', "'"):
            result = result.replace(
                f"{quote}{self.old_column}{quote}",
                f"{quote}{self.new_column}{quote}",
            )
        return result

    def _get_params(self) -> dict:
        return {"old_column": self.old_column, "new_column": self.new_column}
163
+
164
+
165
@dataclass
class ChangeTokenizerBehavior(BreakagePrimitive):
    """Change tokenizer call arguments."""

    old_kwarg: str = ""
    old_value: str = ""
    new_kwarg: str = ""
    new_value: str = ""

    def __post_init__(self) -> None:
        self.category = "tokenizer_drift"
        self.name = "ChangeTokenizerBehavior"
        self.description = f"Change tokenizer kwarg {self.old_kwarg}={self.old_value} -> {self.new_kwarg}={self.new_value}"

    def apply(self, script: str) -> str:
        """Rewrite `old_kwarg=old_value` occurrences to `new_kwarg=new_value`.

        Bug fix: the replacement text was passed to `re.sub` as a template,
        so a `new_value` containing backslashes was reinterpreted as escape
        sequences. A callable replacement inserts it literally.
        """
        if not self.old_kwarg:
            return script
        pattern = rf"{re.escape(self.old_kwarg)}\s*=\s*{re.escape(self.old_value)}"
        replacement = f"{self.new_kwarg}={self.new_value}"
        return re.sub(pattern, lambda _m: replacement, script)

    def _get_params(self) -> dict:
        return {
            "old_kwarg": self.old_kwarg,
            "old_value": self.old_value,
            "new_kwarg": self.new_kwarg,
            "new_value": self.new_value,
        }
193
+
194
+
195
@dataclass
class RemoveDeprecatedMethod(BreakagePrimitive):
    """Simulate a removed method by rewriting its call sites to a
    `_DEPRECATED` name, which fails with an AttributeError-style error
    when the script runs."""

    class_name: str = ""
    method_name: str = ""
    replacement: str = ""

    def __post_init__(self) -> None:
        self.category = "api_drift"
        self.name = "RemoveDeprecatedMethod"
        self.description = f"Remove {self.class_name}.{self.method_name}"

    def apply(self, script: str) -> str:
        if not self.method_name:
            return script
        call_site = f".{self.method_name}("
        poisoned = f".{self.method_name}_DEPRECATED("
        return script.replace(call_site, poisoned)

    def _get_params(self) -> dict:
        return {
            "class_name": self.class_name,
            "method_name": self.method_name,
            "replacement": self.replacement,
        }
222
+
223
+
224
@dataclass
class ChangeReturnType(BreakagePrimitive):
    """A function now returns a different structure (e.g. tuple -> object)."""

    function_name: str = ""
    old_access: str = ""
    new_access: str = ""

    def __post_init__(self) -> None:
        self.category = "api_drift"
        self.name = "ChangeReturnType"
        self.description = f"Change return type of {self.function_name}"

    def apply(self, script: str) -> str:
        # Both accessor strings must be configured for the swap to apply.
        if not (self.old_access and self.new_access):
            return script
        return script.replace(self.old_access, self.new_access)

    def _get_params(self) -> dict:
        return {
            "function_name": self.function_name,
            "old_access": self.old_access,
            "new_access": self.new_access,
        }
248
+
249
+
250
# Lookup table (class name -> class) used when parsing LLM breakage specs.
PRIMITIVE_REGISTRY: dict[str, type[BreakagePrimitive]] = {
    cls.__name__: cls
    for cls in (
        RenameApiCall,
        DeprecateImport,
        ChangeArgumentSignature,
        ModifyConfigField,
        RestructureDatasetSchema,
        ChangeTokenizerBehavior,
        RemoveDeprecatedMethod,
        ChangeReturnType,
    )
}
260
+
261
+
262
def parse_breakage_spec(spec: dict) -> BreakagePrimitive:
    """Parse a JSON breakage spec into a BreakagePrimitive object.

    Tolerates extra keys; ignores unknown params (LLMs hallucinate these).
    Raises ValueError for an unknown primitive type.
    """
    ptype = spec.get("primitive_type", "")
    raw_params = spec.get("params", {}) or {}

    cls = PRIMITIVE_REGISTRY.get(ptype)
    if cls is None:
        raise ValueError(
            f"Unknown primitive type: {ptype!r}. "
            f"Valid types: {list(PRIMITIVE_REGISTRY)}"
        )

    # Keep only kwargs that map onto real constructor fields so a
    # hallucinated key can't crash the dataclass constructor.
    accepted = {
        spec_field.name
        for spec_field in cls.__dataclass_fields__.values()  # type: ignore[attr-defined]
        if spec_field.init
    }
    return cls(**{k: v for k, v in raw_params.items() if k in accepted})
forgeenv-space/forgeenv/primitives/drift_taxonomy.yaml ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Drift taxonomy: real HuggingFace/PyTorch breakages observed across version bumps.
2
+ # Used to seed the Drift Generator's initial proposal distribution and to anchor
3
+ # warm-start pair generation in things that actually happened in the wild.
4
+ - version_range: "transformers 4.36 -> 4.45"
5
+ affected_api: "Trainer.evaluate"
6
+ description: "Trainer.evaluate() return type changed shape; metrics now nested under .metrics"
7
+ breakage_primitive: "ChangeReturnType"
8
+ params:
9
+ function_name: "evaluate"
10
+ old_access: "trainer.evaluate()"
11
+ new_access: "trainer.evaluate().metrics"
12
+ repair_primitive: "RestoreReturnAccess"
13
+ category: "api_drift"
14
+
15
+ - version_range: "transformers 4.30 -> 4.40"
16
+ affected_api: "TrainingArguments.evaluation_strategy"
17
+ description: "Renamed evaluation_strategy -> eval_strategy"
18
+ breakage_primitive: "RenameApiCall"
19
+ params:
20
+ old_name: "evaluation_strategy"
21
+ new_name: "eval_strategy"
22
+ repair_primitive: "RestoreApiCall"
23
+ category: "api_drift"
24
+
25
+ - version_range: "datasets 2.14 -> 3.0"
26
+ affected_api: "load_dataset"
27
+ description: "Default split column was renamed in some GLUE configs"
28
+ breakage_primitive: "RestructureDatasetSchema"
29
+ params:
30
+ old_column: "label"
31
+ new_column: "labels"
32
+ repair_primitive: "RestoreColumn"
33
+ category: "dataset_drift"
34
+
35
+ - version_range: "transformers 4.40 -> 4.50"
36
+ affected_api: "Trainer.predict"
37
+ description: "Method removed; users should use evaluate() with prediction_loss_only=False"
38
+ breakage_primitive: "RemoveDeprecatedMethod"
39
+ params:
40
+ class_name: "Trainer"
41
+ method_name: "predict"
42
+ replacement: "evaluate"
43
+ repair_primitive: "RestoreMethod"
44
+ category: "api_drift"
45
+
46
+ - version_range: "transformers 4.36 -> 4.40"
47
+ affected_api: "TrainingArguments"
48
+ description: "num_train_epochs default behavior changed; max_steps now preferred"
49
+ breakage_primitive: "ModifyConfigField"
50
+ params:
51
+ config_class: "TrainingArguments"
52
+ field_name: "num_train_epochs"
53
+ new_value: "0"
54
+ repair_primitive: "RestoreConfigField"
55
+ category: "config_drift"
56
+
57
+ - version_range: "transformers 4.34 -> 4.42"
58
+ affected_api: "Tokenizer.__call__"
59
+ description: "padding=True semantics changed; users should pass padding='max_length'"
60
+ breakage_primitive: "ChangeTokenizerBehavior"
61
+ params:
62
+ old_kwarg: "padding"
63
+ old_value: "True"
64
+ new_kwarg: "padding"
65
+ new_value: '"max_length"'
66
+ repair_primitive: "RestoreTokenizerKwarg"
67
+ category: "tokenizer_drift"
68
+
69
+ - version_range: "transformers 4.20 -> 4.30"
70
+ affected_api: "imports"
71
+ description: "transformers.training_args moved to transformers.training_args_pt"
72
+ breakage_primitive: "DeprecateImport"
73
+ params:
74
+ old_module: "from transformers.training_args"
75
+ new_module: "from transformers.training_args_pt"
76
+ repair_primitive: "RestoreImport"
77
+ category: "import_drift"
78
+
79
+ - version_range: "transformers 4.45 -> 4.50"
80
+ affected_api: "save_pretrained"
81
+ description: "save_pretrained() now requires safe_serialization to default True"
82
+ breakage_primitive: "ChangeArgumentSignature"
83
+ params:
84
+ function_name: "save_pretrained"
85
+ removed_arg: "safe_serialization"
86
+ added_arg: "safe_serialization"
87
+ added_value: "True"
88
+ repair_primitive: "RestoreArgument"
89
+ category: "api_drift"
90
+
91
+ - version_range: "datasets 2.18 -> 3.0"
92
+ affected_api: "Dataset.set_format"
93
+ description: "set_format(type='torch') signature stricter, columns required"
94
+ breakage_primitive: "ChangeArgumentSignature"
95
+ params:
96
+ function_name: "set_format"
97
+ removed_arg: "columns"
98
+ added_arg: "columns"
99
+ added_value: '["input_ids", "attention_mask", "labels"]'
100
+ repair_primitive: "RestoreArgument"
101
+ category: "api_drift"
102
+
103
+ - version_range: "transformers 4.36 -> 4.45"
104
+ affected_api: "Tokenizer.__call__"
105
+ description: "max_length default reduced from 512 -> 256 for some tokenizers"
106
+ breakage_primitive: "ModifyConfigField"
107
+ params:
108
+ config_class: "tokenizer"
109
+ field_name: "max_length"
110
+ new_value: "256"
111
+ repair_primitive: "RestoreConfigField"
112
+ category: "tokenizer_drift"
113
+
114
+ - version_range: "transformers 4.40 -> 4.45"
115
+ affected_api: "DataCollatorWithPadding"
116
+ description: "Renamed `tokenizer` -> `processing_class` in DataCollator constructors"
117
+ breakage_primitive: "RenameApiCall"
118
+ params:
119
+ old_name: "tokenizer"
120
+ new_name: "processing_class"
121
+ repair_primitive: "RestoreApiCall"
122
+ category: "api_drift"
123
+
124
+ - version_range: "datasets 2.14 -> 2.18"
125
+ affected_api: "load_dataset"
126
+ description: "Some splits renamed train[:500] semantics changed"
127
+ breakage_primitive: "RestructureDatasetSchema"
128
+ params:
129
+ old_column: "sentence"
130
+ new_column: "text"
131
+ repair_primitive: "RestoreColumn"
132
+ category: "dataset_drift"
133
+
134
+ - version_range: "transformers 4.45 -> 4.50"
135
+ affected_api: "Trainer"
136
+ description: "Trainer.evaluate was deprecated and removed (renamed to evaluate_legacy)"
137
+ breakage_primitive: "RemoveDeprecatedMethod"
138
+ params:
139
+ class_name: "Trainer"
140
+ method_name: "evaluate"
141
+ replacement: "evaluate_legacy"
142
+ repair_primitive: "RestoreMethod"
143
+ category: "api_drift"
144
+
145
+ - version_range: "transformers 4.30 -> 4.40"
146
+ affected_api: "PreTrainedModel.from_pretrained"
147
+ description: "torch_dtype now required for some quantized model paths"
148
+ breakage_primitive: "ChangeArgumentSignature"
149
+ params:
150
+ function_name: "from_pretrained"
151
+ removed_arg: "torch_dtype"
152
+ added_arg: "torch_dtype"
153
+ added_value: '"auto"'
154
+ repair_primitive: "RestoreArgument"
155
+ category: "api_drift"
156
+
157
+ - version_range: "datasets 3.0 -> 3.2"
158
+ affected_api: "Dataset.rename_column"
159
+ description: "rename_column raises if target name exists"
160
+ breakage_primitive: "RestructureDatasetSchema"
161
+ params:
162
+ old_column: "labels"
163
+ new_column: "label"
164
+ repair_primitive: "RestoreColumn"
165
+ category: "dataset_drift"
166
+
167
+ - version_range: "transformers 4.36 -> 4.42"
168
+ affected_api: "TrainingArguments.report_to"
169
+ description: "Default report_to changed from 'all' to 'none'"
170
+ breakage_primitive: "ModifyConfigField"
171
+ params:
172
+ config_class: "TrainingArguments"
173
+ field_name: "report_to"
174
+ new_value: '"all"'
175
+ repair_primitive: "RestoreConfigField"
176
+ category: "config_drift"
177
+
178
+ - version_range: "transformers 4.40 -> 4.50"
179
+ affected_api: "imports"
180
+ description: "transformers.deepspeed moved to accelerate.utils.deepspeed"
181
+ breakage_primitive: "DeprecateImport"
182
+ params:
183
+ old_module: "from transformers.deepspeed"
184
+ new_module: "from accelerate.utils.deepspeed"
185
+ repair_primitive: "RestoreImport"
186
+ category: "import_drift"
187
+
188
+ - version_range: "transformers 4.45 -> 4.50"
189
+ affected_api: "Tokenizer return"
190
+ description: "Tokenizer call output now returns a BatchEncoding with .encodings attribute"
191
+ breakage_primitive: "ChangeReturnType"
192
+ params:
193
+ function_name: "tokenizer"
194
+ old_access: "tokenizer(text)"
195
+ new_access: "tokenizer(text).encodings"
196
+ repair_primitive: "RestoreReturnAccess"
197
+ category: "api_drift"
198
+
199
+ - version_range: "transformers 4.30 -> 4.40"
200
+ affected_api: "save_pretrained"
201
+ description: "save_pretrained -> save_pretrained_directory rename in some classes"
202
+ breakage_primitive: "RenameApiCall"
203
+ params:
204
+ old_name: "save_pretrained"
205
+ new_name: "save_pretrained_directory"
206
+ repair_primitive: "RestoreApiCall"
207
+ category: "api_drift"
208
+
209
+ - version_range: "transformers 4.45 -> 4.50"
210
+ affected_api: "TrainingArguments.no_cuda"
211
+ description: "no_cuda renamed to use_cpu (logic inverted)"
212
+ breakage_primitive: "RenameApiCall"
213
+ params:
214
+ old_name: "no_cuda"
215
+ new_name: "use_cpu"
216
+ repair_primitive: "RestoreApiCall"
217
+ category: "config_drift"
forgeenv-space/forgeenv/primitives/repair_primitives.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Repair primitives — direct inverses of the 8 breakage primitives.
2
+
3
+ Used during warm-start data generation: for every (script, breakage)
4
+ pair we know the canonical repair, so we can write SFT pairs.
5
+
6
+ These are also useful for unit-testing the breakage primitives:
7
+ apply(breakage) then apply(repair) should be (close to) the identity.
8
+ """
9
+ from __future__ import annotations
10
+
11
+ import re
12
+ from abc import ABC, abstractmethod
13
+ from dataclasses import dataclass, field
14
+
15
+
16
@dataclass
class RepairPrimitive(ABC):
    """Common contract for repairs; each subclass inverts one breakage primitive."""

    category: str = field(default="generic", init=False)
    name: str = field(default="RepairPrimitive", init=False)
    description: str = field(default="", init=False)

    @abstractmethod
    def apply(self, script: str) -> str:
        """Transform `script` to undo the corresponding breakage."""

    def to_spec(self) -> dict:
        """Serialize to a JSON-compatible spec (mirrors BreakagePrimitive)."""
        spec = {
            "primitive_type": type(self).__name__,
            "category": self.category,
            "params": self._get_params(),
        }
        return spec

    @abstractmethod
    def _get_params(self) -> dict:
        """Return JSON-serializable constructor parameters."""
36
+
37
+
38
@dataclass
class RestoreApiCall(RepairPrimitive):
    """Invert RenameApiCall: rename `new_name` back to `old_name`."""

    new_name: str = ""
    old_name: str = ""

    def __post_init__(self) -> None:
        self.category = "api_drift"
        self.name = "RestoreApiCall"
        self.description = f"Rename {self.new_name} -> {self.old_name}"

    def apply(self, script: str) -> str:
        if not self.new_name:
            return script
        # Lookarounds avoid touching identifiers that contain `new_name`
        # only as a substring.
        token = re.compile(r"(?<!\w)" + re.escape(self.new_name) + r"(?!\w)")
        return token.sub(self.old_name, script)

    def _get_params(self) -> dict:
        return {"new_name": self.new_name, "old_name": self.old_name}
56
+
57
+
58
@dataclass
class RestoreImport(RepairPrimitive):
    """Invert DeprecateImport: restore the original import path."""

    new_module: str = ""
    old_module: str = ""

    def __post_init__(self) -> None:
        self.category = "import_drift"
        self.name = "RestoreImport"
        self.description = f"Restore import {self.new_module} -> {self.old_module}"

    def apply(self, script: str) -> str:
        # Bug fix: guard the degenerate empty pattern — str.replace("")
        # would splice `old_module` between every character of the script.
        # Also matches the guard style of every sibling primitive.
        if not self.new_module:
            return script
        return script.replace(self.new_module, self.old_module)

    def _get_params(self) -> dict:
        return {"new_module": self.new_module, "old_module": self.old_module}
73
+
74
+
75
@dataclass
class RestoreArgument(RepairPrimitive):
    """Re-add a removed argument to a function call."""

    function_name: str = ""
    arg_name: str = ""
    arg_value: str = ""

    def __post_init__(self) -> None:
        self.category = "api_drift"
        self.name = "RestoreArgument"
        self.description = (
            f"Add {self.arg_name}={self.arg_value} to {self.function_name}()"
        )

    def apply(self, script: str) -> str:
        """Insert `arg_name=arg_value, ` after the first call's opening paren.

        Bug fix: the kwarg text was previously interpolated into the
        `re.sub` replacement template, so an `arg_value` containing
        backslashes (e.g. a regex or Windows path literal) was mangled or
        raised `re.error`. A callable replacement inserts it verbatim.
        """
        if not self.function_name:
            return script
        pattern = rf"({re.escape(self.function_name)}\s*\()(\s*)"
        inserted = f"{self.arg_name}={self.arg_value}, "
        return re.sub(
            pattern,
            lambda m: m.group(1) + inserted + m.group(2),
            script,
            count=1,
        )

    def _get_params(self) -> dict:
        return {
            "function_name": self.function_name,
            "arg_name": self.arg_name,
            "arg_value": self.arg_value,
        }
104
+
105
+
106
@dataclass
class RestoreConfigField(RepairPrimitive):
    """Invert ModifyConfigField: restore a config field's original value."""

    field_name: str = ""
    old_value: str = ""

    def __post_init__(self) -> None:
        self.category = "config_drift"
        self.name = "RestoreConfigField"
        self.description = f"Restore {self.field_name}={self.old_value}"

    def apply(self, script: str) -> str:
        """Rewrite every `field_name=<value>` occurrence back to `old_value`.

        Bug fix: `old_value` was previously interpolated into the `re.sub`
        replacement template, so backslashes or `\\g<...>` sequences in it
        were expanded. A callable replacement inserts it literally.
        """
        if not self.field_name:
            return script
        pattern = rf"({re.escape(self.field_name)}\s*=\s*)([^,)\n]+)"
        return re.sub(pattern, lambda m: m.group(1) + self.old_value, script)

    def _get_params(self) -> dict:
        return {"field_name": self.field_name, "old_value": self.old_value}
124
+
125
+
126
@dataclass
class RestoreColumn(RepairPrimitive):
    """Invert RestructureDatasetSchema: rename the column reference back."""

    new_column: str = ""
    old_column: str = ""

    def __post_init__(self) -> None:
        self.category = "dataset_drift"
        self.name = "RestoreColumn"
        self.description = f"Rename column {self.new_column} -> {self.old_column}"

    def apply(self, script: str) -> str:
        # Bug fix: guard empty `new_column` — without it the pattern becomes
        # the two-character literal '""' (or "''") and rewrites empty-string
        # literals in the script. Mirrors RestructureDatasetSchema's guard.
        if not self.new_column:
            return script
        result = script
        # Cover both quoting styles, matching the breakage primitive.
        for quote in ('"', "'"):
            result = result.replace(
                f"{quote}{self.new_column}{quote}",
                f"{quote}{self.old_column}{quote}",
            )
        return result

    def _get_params(self) -> dict:
        return {"new_column": self.new_column, "old_column": self.old_column}
145
+
146
+
147
@dataclass
class RestoreTokenizerKwarg(RepairPrimitive):
    """Invert ChangeTokenizerBehavior: restore the original tokenizer kwarg."""

    new_kwarg: str = ""
    new_value: str = ""
    old_kwarg: str = ""
    old_value: str = ""

    def __post_init__(self) -> None:
        self.category = "tokenizer_drift"
        self.name = "RestoreTokenizerKwarg"
        self.description = (
            f"Restore tokenizer {self.new_kwarg}={self.new_value} -> "
            f"{self.old_kwarg}={self.old_value}"
        )

    def apply(self, script: str) -> str:
        """Rewrite `new_kwarg=new_value` occurrences back to `old_kwarg=old_value`.

        Bug fix: the replacement text was passed to `re.sub` as a template,
        so an `old_value` containing backslashes was reinterpreted as escape
        sequences. A callable replacement inserts it literally.
        """
        if not self.new_kwarg:
            return script
        pattern = rf"{re.escape(self.new_kwarg)}\s*=\s*{re.escape(self.new_value)}"
        replacement = f"{self.old_kwarg}={self.old_value}"
        return re.sub(pattern, lambda _m: replacement, script)

    def _get_params(self) -> dict:
        return {
            "new_kwarg": self.new_kwarg,
            "new_value": self.new_value,
            "old_kwarg": self.old_kwarg,
            "old_value": self.old_value,
        }
176
+
177
+
178
@dataclass
class RestoreMethod(RepairPrimitive):
    """Invert RemoveDeprecatedMethod: rewrite `_DEPRECATED` call sites back."""

    method_name: str = ""

    def __post_init__(self) -> None:
        self.category = "api_drift"
        self.name = "RestoreMethod"
        self.description = f"Un-deprecate .{self.method_name}()"

    def apply(self, script: str) -> str:
        if not self.method_name:
            return script
        poisoned = f".{self.method_name}_DEPRECATED("
        healthy = f".{self.method_name}("
        return script.replace(poisoned, healthy)

    def _get_params(self) -> dict:
        return {"method_name": self.method_name}
196
+
197
+
198
@dataclass
class RestoreReturnAccess(RepairPrimitive):
    """Invert ChangeReturnType: restore the original return-value access."""

    new_access: str = ""
    old_access: str = ""

    def __post_init__(self) -> None:
        self.category = "api_drift"
        self.name = "RestoreReturnAccess"
        self.description = f"Restore return-access {self.new_access} -> {self.old_access}"

    def apply(self, script: str) -> str:
        if self.new_access:
            return script.replace(self.new_access, self.old_access)
        return script

    def _get_params(self) -> dict:
        return {"new_access": self.new_access, "old_access": self.old_access}
215
+
216
+
217
# Lookup table (class name -> class) used when parsing repair specs.
REPAIR_REGISTRY: dict[str, type[RepairPrimitive]] = {
    cls.__name__: cls
    for cls in (
        RestoreApiCall,
        RestoreImport,
        RestoreArgument,
        RestoreConfigField,
        RestoreColumn,
        RestoreTokenizerKwarg,
        RestoreMethod,
        RestoreReturnAccess,
    )
}


# Map a breakage primitive's class name to the repair-primitive class that
# inverts it. Used by the warm-start pair generator and by the demo / repair
# library curator.
BREAKAGE_TO_REPAIR: dict[str, str] = {
    "RenameApiCall": "RestoreApiCall",
    "DeprecateImport": "RestoreImport",
    "ChangeArgumentSignature": "RestoreArgument",
    "ModifyConfigField": "RestoreConfigField",
    "RestructureDatasetSchema": "RestoreColumn",
    "ChangeTokenizerBehavior": "RestoreTokenizerKwarg",
    "RemoveDeprecatedMethod": "RestoreMethod",
    "ChangeReturnType": "RestoreReturnAccess",
}
forgeenv-space/forgeenv/roles/__init__.py ADDED
File without changes
forgeenv-space/forgeenv/roles/drift_generator.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Drift Generator parser + a deterministic baseline policy.
2
+
3
+ In training the LLM produces a JSON breakage spec; we parse it. In rollouts
4
+ where we want a baseline (or a fallback when the LLM emits malformed JSON)
5
+ we use `BaselineDriftGenerator`, which samples from the per-category set of
6
+ known good primitive parameterisations.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ import random
12
+ import re
13
+ from dataclasses import dataclass
14
+ from typing import Optional
15
+
16
+ from forgeenv.primitives.breakage_primitives import (
17
+ PRIMITIVE_REGISTRY,
18
+ parse_breakage_spec,
19
+ BreakagePrimitive,
20
+ )
21
+
22
+
23
# Greedy match from the first '{' to the last '}' in the text.
_JSON_RE = re.compile(r"\{[\s\S]*\}")


def parse_drift_output(text: str) -> Optional[dict]:
    """Best-effort extraction of a JSON object from noisy LLM output.

    Strips markdown fences and surrounding prose, then retries once after
    removing trailing commas. Returns None when nothing parseable remains.
    """
    if not text:
        return None
    candidate = text.strip()
    if candidate.startswith("```"):
        # Drop an opening ```lang fence and a closing ``` fence.
        candidate = re.sub(r"^```[a-zA-Z]*\n?", "", candidate)
        candidate = re.sub(r"\n?```$", "", candidate)
    found = _JSON_RE.search(candidate)
    if found is None:
        return None
    blob = found.group(0)
    # First attempt: verbatim. Second: with trailing commas removed.
    for attempt in (blob, re.sub(r",\s*([}\]])", r"\1", blob)):
        try:
            return json.loads(attempt)
        except json.JSONDecodeError:
            continue
    return None
50
+
51
+
52
def parse_drift_to_primitive(text: str) -> Optional[BreakagePrimitive]:
    """End-to-end: LLM text -> validated BreakagePrimitive (or None)."""
    data = parse_drift_output(text)
    if isinstance(data, dict):
        try:
            return parse_breakage_spec(data)
        except (ValueError, TypeError):
            # Spec was well-formed JSON but failed primitive validation.
            return None
    return None
61
+
62
+
63
# ---------------------------------------------------------------- baselines
# Known-good parameterisations for each breakage-primitive type. The
# BaselineDriftGenerator samples from these when standing in for the LLM
# (warm-start data, fallback on malformed JSON, unit tests). Values mirror
# real renames/removals seen across transformers/datasets releases.
_DEFAULT_PARAMS_BY_TYPE: dict[str, list[dict]] = {
    "RenameApiCall": [
        {"old_name": "trainer.train", "new_name": "trainer.start_training"},
        {"old_name": "save_pretrained", "new_name": "save_to_hub"},
        {"old_name": "from_pretrained", "new_name": "load_from_hub"},
    ],
    "DeprecateImport": [
        {
            "old_module": "from transformers import Trainer",
            "new_module": "from transformers.legacy import Trainer",
        },
        {
            "old_module": "from transformers import TrainingArguments",
            "new_module": "from transformers.training import TrainingArguments",
        },
    ],
    "ChangeArgumentSignature": [
        {
            "function_name": "TrainingArguments",
            "removed_arg": "num_train_epochs",
            "added_arg": "max_steps",
            "added_value": "1000",
        },
        {
            "function_name": "TrainingArguments",
            "removed_arg": "evaluation_strategy",
            "added_arg": "eval_strategy",
            "added_value": '"steps"',
        },
    ],
    "ModifyConfigField": [
        {"config_class": "TrainingArguments", "field_name": "learning_rate", "new_value": "5e-3"},
        {"config_class": "TrainingArguments", "field_name": "per_device_train_batch_size", "new_value": "1"},
    ],
    "RestructureDatasetSchema": [
        {"old_column": "text", "new_column": "input_text"},
        {"old_column": "label", "new_column": "labels"},
        {"old_column": "tokens", "new_column": "words"},
    ],
    "ChangeTokenizerBehavior": [
        {"old_kwarg": "padding", "old_value": "True", "new_kwarg": "pad_to_max_length", "new_value": "True"},
        {"old_kwarg": "truncation", "old_value": "True", "new_kwarg": "truncate", "new_value": "True"},
    ],
    "RemoveDeprecatedMethod": [
        {"class_name": "Trainer", "method_name": "evaluate", "replacement": "evaluation_loop"},
        {"class_name": "Trainer", "method_name": "save_model", "replacement": "save_to_hub"},
    ],
    "ChangeReturnType": [
        {"function_name": "Trainer.predict", "old_access": ".predictions", "new_access": "[0]"},
        {"function_name": "tokenizer", "old_access": '["input_ids"]', "new_access": ".input_ids"},
    ],
}
116
+
117
+
118
@dataclass
class BaselineDriftGenerator:
    """Deterministic stand-in for the LLM Drift Generator.

    Used for warm-start data, baseline rollouts, and unit tests. With a
    fixed `seed`, the same script always yields the same breakage spec.
    """

    seed: Optional[int] = None

    def __post_init__(self) -> None:
        # Seeded private RNG when a seed is given, else the module RNG.
        self._rng = random.Random(self.seed) if self.seed is not None else random

    def propose(
        self, target_category: str = "", script: str = ""
    ) -> dict:
        """Produce a JSON-serializable breakage spec for `target_category`.

        Order of preference:
          1. A primitive of `target_category` whose default params apply to `script`.
          2. A primitive of any type whose default params apply to `script`.
          3. A primitive of `target_category` (no-op fallback).
        """
        preferred = (
            [target_category] if target_category in _DEFAULT_PARAMS_BY_TYPE else []
        )
        every_type = list(_DEFAULT_PARAMS_BY_TYPE.keys())

        for candidates in (preferred, every_type):
            if not candidates:
                continue
            for ptype in self._rng.sample(candidates, len(candidates)):
                options = _DEFAULT_PARAMS_BY_TYPE[ptype]
                for params in self._rng.sample(options, len(options)):
                    if self._params_apply_to_script(ptype, params, script):
                        return {"primitive_type": ptype, "params": dict(params)}

        # Nothing applies: fall back to the first default of the preferred
        # (or first known) type even though it may be a no-op on `script`.
        fallback = preferred[0] if preferred else every_type[0]
        return {
            "primitive_type": fallback,
            "params": dict(_DEFAULT_PARAMS_BY_TYPE[fallback][0]),
        }

    @staticmethod
    def _params_apply_to_script(ptype: str, params: dict, script: str) -> bool:
        """Heuristic: would this primitive actually mutate `script`?"""
        if not script:
            return True
        target_keys = (
            "old_name", "old_module", "removed_arg", "field_name",
            "old_column", "old_kwarg", "method_name", "old_access",
        )
        return any(
            params.get(key) and params[key] in script for key in target_keys
        )
forgeenv-space/forgeenv/roles/prompts.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """System and user prompts for the two RL roles.
2
+
3
+ Both roles are trained from the same base policy (Qwen-2.5-Coder-7B) with
4
+ LoRA adapters per role, so role prompts are the only thing distinguishing
5
+ them at inference time. Keep them concise — every token is a token of GPU
6
+ budget during GRPO rollouts.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ from typing import Iterable
11
+
12
+
13
# One-line human-readable summary per breakage primitive (category in
# parentheses); rendered into docs and UIs via list_primitive_descriptions().
PRIMITIVE_DESCRIPTIONS = {
    "RenameApiCall": "Rename a function/method call (api_drift)",
    "DeprecateImport": "Change an import path (import_drift)",
    "ChangeArgumentSignature": "Remove an expected kwarg from a call (api_drift)",
    "ModifyConfigField": "Change a config-class default (config_drift)",
    "RestructureDatasetSchema": "Rename a dataset column reference (dataset_drift)",
    "ChangeTokenizerBehavior": "Change tokenizer call kwargs (tokenizer_drift)",
    "RemoveDeprecatedMethod": "Remove a method, leaving a sentinel _DEPRECATED suffix (api_drift)",
    "ChangeReturnType": "Function returns a different structure (api_drift)",
}

# System prompt for the Drift Generator role: asks for exactly one JSON
# breakage spec matching the schemas below. Parsed by
# forgeenv.roles.drift_generator.parse_drift_output.
DRIFT_GENERATOR_SYSTEM_PROMPT = """You are the Drift Generator.
You see a working HuggingFace training script and the curriculum target category.
Output exactly one JSON object describing a breakage primitive that simulates
realistic library version drift. The primitive must:
1. Be PLAUSIBLE — match the kind of breakage that happens between real
transformers/datasets/trl releases.
2. Be SOLVABLE — the Repair Agent should be able to fix it from the error trace alone.
3. Match the requested target_category.

Output schema:
{"primitive_type": "<one of the 8 types>", "params": { ... }}

Available primitive types and parameter schemas:
- RenameApiCall: {"old_name": str, "new_name": str}
- DeprecateImport: {"old_module": str, "new_module": str}
- ChangeArgumentSignature: {"function_name": str, "removed_arg": str, "added_arg": str, "added_value": str}
- ModifyConfigField: {"config_class": str, "field_name": str, "new_value": str}
- RestructureDatasetSchema: {"old_column": str, "new_column": str}
- ChangeTokenizerBehavior: {"old_kwarg": str, "old_value": str, "new_kwarg": str, "new_value": str}
- RemoveDeprecatedMethod: {"class_name": str, "method_name": str, "replacement": str}
- ChangeReturnType: {"function_name": str, "old_access": str, "new_access": str}

Output ONLY the JSON object — no commentary, no markdown fences.
"""


# System prompt for the Repair Agent role: asks for a unified diff only.
# Responses are sanitised by forgeenv.roles.repair_agent.extract_diff.
REPAIR_AGENT_SYSTEM_PROMPT = """You are the Repair Agent.
You see a broken HuggingFace training script, an error trace, and the current
library version snapshot. Output ONLY a unified diff that fixes the script.

Rules:
1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`
headers and `@@ ... @@` hunk markers.
2. Make the MINIMAL change that resolves the error AND preserves the original
training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit
calls.
3. Do NOT add any prose, markdown fences, or thinking output — diff only.
4. If the error is unfixable, output an empty diff.
"""
63
+
64
+
65
def render_drift_generator_prompt(
    script: str, target_category: str, library_versions: dict
) -> str:
    """Render the Drift Generator user prompt for one episode.

    Embeds the target category, the library-version snapshot, and the
    working script inside a fenced code block.
    """
    versions = ", ".join(
        "{}={}".format(lib, ver) for lib, ver in library_versions.items()
    )
    return "\n".join(
        [
            f"Target category: {target_category}",
            f"Library versions: {versions}",
            "",
            "Working script:",
            "```python",
            script,
            "```",
            "",
            "Output JSON breakage primitive:",
        ]
    )
78
+
79
+
80
def render_repair_agent_prompt(
    broken_script: str,
    error_trace: str,
    library_versions: dict,
    target_category: str = "",
) -> str:
    """Render the Repair Agent user prompt for one episode.

    Shows the version snapshot, an (optional) category hint, the broken
    script in a fenced block, and the raw error trace.
    """
    versions = ", ".join(
        "{}={}".format(lib, ver) for lib, ver in library_versions.items()
    )
    hint = target_category or "unknown"
    return "\n".join(
        [
            f"Library versions: {versions}",
            f"Target category hint: {hint}",
            "",
            "Broken script:",
            "```python",
            broken_script,
            "```",
            "",
            "Error trace:",
            error_trace,
            "",
            "Output unified diff (no prose, no fences):",
        ]
    )
99
+
100
+
101
def list_primitive_descriptions() -> Iterable[str]:
    """Lazily yield one '- name: description' bullet per primitive."""
    for primitive_name, summary in PRIMITIVE_DESCRIPTIONS.items():
        yield f"- {primitive_name}: {summary}"
forgeenv-space/forgeenv/roles/repair_agent.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Repair Agent helpers: response sanitisation + a deterministic baseline.
2
+
3
+ The Repair Agent's training output is a unified diff. LLMs frequently emit
4
+ prose / fences / chain-of-thought before the diff; this module strips that
5
+ preamble. The baseline policy uses the inverse-primitive map from
6
+ `repair_primitives.py` to produce ground-truth diffs for warm-start.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import re
11
+ from dataclasses import dataclass
12
+ from typing import Optional
13
+
14
+ from forgeenv.env.diff_utils import make_unified_diff
15
+ from forgeenv.primitives.breakage_primitives import (
16
+ parse_breakage_spec,
17
+ BreakagePrimitive,
18
+ )
19
+ from forgeenv.primitives.repair_primitives import (
20
+ BREAKAGE_TO_REPAIR,
21
+ REPAIR_REGISTRY,
22
+ RepairPrimitive,
23
+ )
24
+
25
+
26
# A unified-diff hunk header anywhere in the text (multiline anchor).
_DIFF_HUNK_RE = re.compile(r"^@@.*@@", re.MULTILINE)
# The first fenced code block, capturing its interior.
_FENCE_RE = re.compile(r"```[a-zA-Z]*\n([\s\S]*?)\n```")


def extract_diff(raw_text: str) -> str:
    """Pull the unified diff out of an LLM response.

    Handles code fences, leading prose / chain-of-thought, and trailing
    notes. Falls back to returning the (stripped/unfenced) text when no
    diff-looking line is found.
    """
    if not raw_text:
        return ""
    text = raw_text.strip()

    fenced = _FENCE_RE.search(text)
    if fenced:
        text = fenced.group(1).strip()

    lines = text.splitlines()
    # First line that looks like a diff header or hunk marker; 0 if none.
    start = next(
        (i for i, line in enumerate(lines) if line.startswith(("---", "+++", "@@"))),
        0,
    )
    return "\n".join(lines[start:])
51
+
52
+
53
def looks_like_diff(text: str) -> bool:
    """Cheap structural check: does `text` resemble a unified diff?

    True when it has header + hunk markers, or a hunk marker plus at least
    one +/- line.
    """
    if not text:
        return False
    has_hunk = re.search(r"^@@.*@@", text, re.MULTILINE) is not None
    has_header = "---" in text and "+++" in text
    has_plus_minus = any(
        line.startswith(("+", "-")) for line in text.splitlines()
    )
    return (has_header and has_hunk) or (has_hunk and has_plus_minus)
60
+
61
+
62
+ # ---------------------------------------------------------------- baselines
63
@dataclass
class BaselineRepairAgent:
    """Deterministic Repair Agent that uses the primitive inverse map.

    Used for warm-start dataset generation and baseline rollout comparisons.
    """

    def repair(
        self,
        broken_script: str,
        breakage_spec: Optional[dict] = None,
        original_script: str = "",
    ) -> str:
        """Return a unified diff that fixes `broken_script`, or "".

        Strategy preference:
          1. Oracle path: diff broken -> `original_script` when given
             (warm-start always knows the ground truth).
          2. Invert the structured `breakage_spec` via the repair registry.
          3. Give up with an empty diff.
        """
        # 1. Oracle diff against the known-good original.
        if original_script and original_script != broken_script:
            return make_unified_diff(broken_script, original_script)

        # 2. Structured inversion of the breakage spec.
        if not breakage_spec:
            return ""
        try:
            breakage = parse_breakage_spec(breakage_spec)
        except (ValueError, TypeError):
            return ""
        repair = _invert_breakage(breakage)
        if repair is None:
            return ""
        repaired = repair.apply(broken_script)
        if repaired == broken_script:
            # Inversion was a no-op; nothing useful to emit.
            return ""
        return make_unified_diff(broken_script, repaired)
103
+
104
+
105
# Per-breakage-type remapping from breakage-primitive parameter names
# (source keys) to the matching repair-primitive constructor kwargs
# (destination keys). Consumed by _invert_breakage(); breakage params with
# no entry here are dropped before constructing the repair primitive.
_PARAM_REMAP: dict[str, dict[str, str]] = {
    "RenameApiCall": {"old_name": "old_name", "new_name": "new_name"},
    "DeprecateImport": {"old_module": "old_module", "new_module": "new_module"},
    "ChangeArgumentSignature": {
        "function_name": "function_name",
        "removed_arg": "arg_name",
    },
    "ModifyConfigField": {"field_name": "field_name"},
    "RestructureDatasetSchema": {
        "old_column": "old_column",
        "new_column": "new_column",
    },
    "ChangeTokenizerBehavior": {
        "old_kwarg": "old_kwarg",
        "old_value": "old_value",
        "new_kwarg": "new_kwarg",
        "new_value": "new_value",
    },
    "RemoveDeprecatedMethod": {"method_name": "method_name"},
    "ChangeReturnType": {"old_access": "old_access", "new_access": "new_access"},
}
126
+
127
+
128
def _invert_breakage(breakage: BreakagePrimitive) -> Optional[RepairPrimitive]:
    """Build the repair primitive that undoes `breakage`, or None.

    Looks up the inverse class via BREAKAGE_TO_REPAIR, remaps parameter
    names with _PARAM_REMAP, and drops anything the repair dataclass
    cannot accept in its constructor.
    """
    source_name = type(breakage).__name__
    repair_name = BREAKAGE_TO_REPAIR.get(source_name)
    if repair_name is None:
        return None
    repair_cls = REPAIR_REGISTRY.get(repair_name)
    if repair_cls is None:
        return None

    source_params = breakage._get_params()  # type: ignore[attr-defined]
    remap = _PARAM_REMAP.get(source_name, {})
    remapped = {
        dst_key: source_params[src_key]
        for src_key, dst_key in remap.items()
        if src_key in source_params
    }

    # Only pass kwargs the repair dataclass actually declares as init fields.
    accepted = {
        f.name
        for f in repair_cls.__dataclass_fields__.values()  # type: ignore[attr-defined]
        if f.init
    }
    kwargs = {key: value for key, value in remapped.items() if key in accepted}
    try:
        return repair_cls(**kwargs)
    except TypeError:
        return None
forgeenv-space/forgeenv/roles/teacher.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Teacher (curriculum controller).
2
+
3
+ Deterministic — NOT an LLM. Maintains an EMA success rate per breakage
4
+ category and routes the next episode toward the category where the
5
+ Repair Agent is closest to a 50% success rate (R-Zero's difficulty band).
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import random
10
+ from dataclasses import dataclass, field
11
+
12
+
13
@dataclass
class Teacher:
    """Deterministic curriculum controller (NOT an LLM).

    Tracks an EMA of Repair-Agent success per breakage category and steers
    the next episode toward categories near the 50% success band
    (R-Zero's difficulty zone).
    """

    categories: list[str]
    alpha: float = 0.9  # EMA smoothing toward the previous estimate
    success_counts: dict[str, int] = field(default_factory=dict)
    attempt_counts: dict[str, int] = field(default_factory=dict)
    ema_success: dict[str, float] = field(default_factory=dict)

    def __post_init__(self) -> None:
        # Seed tracking state for every known category; EMA starts at the
        # neutral 0.5 so fresh categories sit inside the difficulty band.
        for cat in self.categories:
            self.success_counts.setdefault(cat, 0)
            self.attempt_counts.setdefault(cat, 0)
            self.ema_success.setdefault(cat, 0.5)

    def _register(self, category: str) -> None:
        """Start tracking a category first seen at update() time."""
        self.categories.append(category)
        self.ema_success[category] = 0.5
        self.success_counts[category] = 0
        self.attempt_counts[category] = 0

    def update(self, category: str, success: bool) -> None:
        """Record one episode outcome and refresh the category's EMA."""
        if category not in self.ema_success:
            self._register(category)

        self.attempt_counts[category] += 1
        self.success_counts[category] += int(success)
        observed = self.success_counts[category] / max(1, self.attempt_counts[category])
        previous = self.ema_success[category]
        self.ema_success[category] = self.alpha * previous + (1 - self.alpha) * observed

    def select_next_category(self) -> str:
        """Pick the next episode's category, favouring the 0.3–0.7 band."""
        distances = {
            cat: abs(rate - 0.5)
            for cat, rate in self.ema_success.items()
            if 0.3 <= rate <= 0.7
        }
        if distances:
            cats = list(distances)
            # Closer to 50% -> larger weight (epsilon avoids div-by-zero).
            weights = [1.0 / (distances[cat] + 0.01) for cat in cats]
            return random.choices(cats, weights=weights, k=1)[0]
        # Nothing in the band: take whichever is closest to 50%.
        return min(self.ema_success, key=lambda cat: abs(self.ema_success[cat] - 0.5))

    def get_state(self) -> dict:
        """Snapshot of per-category curriculum statistics."""
        snapshot: dict = {}
        for cat in self.categories:
            snapshot[cat] = {
                "ema_success": round(self.ema_success[cat], 4),
                "attempts": self.attempt_counts[cat],
                "successes": self.success_counts[cat],
            }
        return snapshot
forgeenv-space/forgeenv/sandbox/__init__.py ADDED
File without changes
forgeenv-space/forgeenv/sandbox/ast_validator.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """AST-based script validator.
2
+
3
+ Catches forbidden imports and dangerous patterns BEFORE any execution
4
+ happens. This is a critical defense against reward hacking via system
5
+ calls, network access, or process manipulation.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import ast
10
+
11
+ from forgeenv.tasks.models import ValidationResult
12
+
13
# Top-level module names whose import is rejected outright (system,
# network, and process-manipulation surfaces).
FORBIDDEN_MODULES = {
    "os",
    "subprocess",
    "socket",
    "urllib",
    "requests",
    "ctypes",
    "shutil",
    "signal",
    "multiprocessing",
    "threading",
}

# Builtin callables that enable arbitrary code execution.
FORBIDDEN_FUNCTIONS = {"eval", "exec", "compile", "__import__"}


def validate_script(script_content: str) -> ValidationResult:
    """Parse a script as AST and reject forbidden patterns.

    Returns a ValidationResult with `is_valid` and a list of `violations`.
    """
    try:
        tree = ast.parse(script_content)
    except SyntaxError as e:
        return ValidationResult(is_valid=False, violations=[f"SyntaxError: {e}"])

    violations: list[str] = []
    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            violations.extend(
                f"Forbidden import: {alias.name}"
                for alias in node.names
                if alias.name.split(".")[0] in FORBIDDEN_MODULES
            )
        elif isinstance(node, ast.ImportFrom):
            # Relative imports (`from . import x`) have node.module == None.
            if node.module and node.module.split(".")[0] in FORBIDDEN_MODULES:
                violations.append(f"Forbidden import from: {node.module}")
        elif isinstance(node, ast.Call):
            func = node.func
            if isinstance(func, ast.Name) and func.id in FORBIDDEN_FUNCTIONS:
                violations.append(f"Forbidden call: {func.id}()")
            elif isinstance(func, ast.Attribute) and func.attr in FORBIDDEN_FUNCTIONS:
                violations.append(f"Forbidden call: .{func.attr}()")
        elif isinstance(node, ast.Assign):
            violations.extend(
                "Forbidden: __builtins__ assignment"
                for target in node.targets
                if isinstance(target, ast.Name) and target.id == "__builtins__"
            )

    return ValidationResult(
        is_valid=not violations,
        violations=violations,
    )
forgeenv-space/forgeenv/sandbox/simulation_mode.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Fast simulation executor for development.
2
+
3
+ Static-analysis-based execution simulator. Sub-100ms per call. No Docker
4
+ required. The success probability of a simulated run depends on whether
5
+ the script contains expected HF training markers (model imports, training
6
+ calls, save calls). When the simulation succeeds, a synthetic decreasing
7
+ loss curve is emitted; when it fails, a representative HF error is raised.
8
+ """
9
+ from __future__ import annotations
10
+
11
+ import random
12
+ import time
13
+ from typing import Optional
14
+
15
+ from forgeenv.sandbox.ast_validator import validate_script
16
+ from forgeenv.tasks.models import ExecutionResult, Task
17
+
18
+
19
class SimulationExecutor:
    """Simulates script execution via static analysis.

    Use this throughout development phases. Real Docker execution is added
    later for grounded final-stage verification.
    """

    # Substrings that mark a script as definitely drifted/broken even when
    # it compiles — simulates a linter catching AttributeError/ImportError
    # signatures before runtime.
    _BROKEN_MARKERS = (
        "_DEPRECATED(",
        "transformers.legacy",
        "from transformers.training import",
        ".start_training(",
        "load_from_hub(",
        "save_to_hub(",
        "pad_to_max_length=",
        "evaluation_loop(",
    )

    # Representative HF failure messages for the simulated-error path.
    _ERROR_TYPES = [
        "ImportError: cannot import name 'OldTrainer' from 'transformers'",
        "AttributeError: 'Trainer' object has no attribute 'evaluate_model'",
        "KeyError: 'text' column not found in dataset",
        "TypeError: __init__() got an unexpected keyword argument 'num_epochs'",
        "RuntimeError: Expected input batch_size (16) to match target batch_size (32)",
        "ModuleNotFoundError: No module named 'transformers.legacy'",
    ]

    def __init__(self, seed: Optional[int] = None) -> None:
        # Seeded private RNG for reproducible simulations; module RNG otherwise.
        self._rng = random.Random(seed) if seed is not None else random

    def execute(
        self, script_content: str, task: Optional[Task] = None
    ) -> ExecutionResult:
        """Validate, 'run', and score a script without executing it."""
        started = time.time()

        validation = validate_script(script_content)
        if not validation.is_valid:
            return ExecutionResult(
                exit_code=1,
                stdout="",
                stderr=f"Validation failed: {'; '.join(validation.violations)}",
                wall_time_ms=int((time.time() - started) * 1000),
                script_content=script_content,
            )

        try:
            compile(script_content, "<forge_script>", "exec")
        except SyntaxError as e:
            return ExecutionResult(
                exit_code=1,
                stdout="",
                stderr=f"SyntaxError: {e}",
                wall_time_ms=int((time.time() - started) * 1000),
                script_content=script_content,
            )

        success_prob = self._estimate_success_prob(script_content)
        if self._rng.random() < success_prob:
            return self._simulate_success(script_content, started)
        return self._simulate_failure(script_content, started)

    def _estimate_success_prob(self, script_content: str) -> float:
        """Score how plausible a clean run is from static markers alone."""
        import re as _re

        success_prob = 0.3
        if any(
            kw in script_content
            for kw in ("from transformers", "import torch", "from datasets")
        ):
            success_prob += 0.3
        if any(
            kw in script_content
            for kw in ("trainer.train()", ".fit(", "train_loop", "for epoch")
        ):
            success_prob += 0.2
        if any(
            kw in script_content
            for kw in ("save_pretrained", "save_model", "torch.save")
        ):
            success_prob += 0.1

        if any(marker in script_content for marker in self._BROKEN_MARKERS):
            success_prob = 0.0

        # Dataset-column drift: renamed columns that don't exist in real
        # HF datasets. Tokenizer kwarg drift: `truncate=` is not valid.
        if _re.search(r"['\"]input_text['\"]\s*[]:),]", script_content):
            success_prob = min(success_prob, 0.05)
        if _re.search(r"['\"]words['\"]\s*[]:),]", script_content):
            success_prob = min(success_prob, 0.05)
        if _re.search(r"\btruncate\s*=", script_content):
            success_prob = min(success_prob, 0.05)
        return success_prob

    def _simulate_success(self, script_content: str, started: float) -> ExecutionResult:
        """Emit a synthetic decreasing loss curve and a completed run."""
        steps = self._rng.randint(20, 50)
        log_lines: list[str] = []
        loss = self._rng.uniform(2.0, 4.0)
        for step in range(1, steps + 1):
            loss *= self._rng.uniform(0.92, 0.99)
            log_lines.append(f"step={step} loss={loss:.4f}")
        log_lines.append("eval_accuracy=0.78")
        log_lines.append("TRAINING_COMPLETE")

        return ExecutionResult(
            exit_code=0,
            stdout="\n".join(log_lines),
            stderr="",
            wall_time_ms=int((time.time() - started) * 1000)
            + self._rng.randint(1000, 5000),
            checkpoint_exists=True,
            peak_memory_mb=self._rng.uniform(500, 2000),
            script_content=script_content,
        )

    def _simulate_failure(self, script_content: str, started: float) -> ExecutionResult:
        """Emit a representative HF error on the failure path."""
        return ExecutionResult(
            exit_code=1,
            stdout="",
            stderr=self._rng.choice(self._ERROR_TYPES),
            wall_time_ms=int((time.time() - started) * 1000)
            + self._rng.randint(100, 500),
            script_content=script_content,
        )
forgeenv-space/forgeenv/tasks/__init__.py ADDED
File without changes
forgeenv-space/forgeenv/tasks/models.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Core data models for ForgeEnv tasks and execution results.
2
+
3
+ These are framework-internal dataclasses (not Pydantic) used throughout the
4
+ simulation, verifier, and primitive layers. The OpenEnv-facing Pydantic
5
+ models live in `forgeenv.env.actions` / `forgeenv.env.observations`.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ from dataclasses import dataclass, field
10
+ from typing import Optional
11
+
12
+
13
@dataclass
class Task:
    """A HuggingFace training script with execution metadata."""

    task_id: str  # unique identifier for this task
    description: str  # human-readable summary of what the script trains
    script_content: str  # full Python source of the training script
    difficulty: str  # "easy", "medium", "hard"
    category: str = "general"  # task family / drift category label
    # Bounds used by verifiers to sanity-check reported metrics.
    expected_loss_range: tuple[float, float] = (0.0, 5.0)
    expected_accuracy_range: tuple[float, float] = (0.0, 1.0)
    # Where the script is expected to write its checkpoint.
    checkpoint_output_path: str = "/tmp/forge_output/checkpoint"
25
+
26
+
27
@dataclass
class ExecutionResult:
    """Result of executing a Python script in the sandbox."""

    exit_code: int  # 0 on success, non-zero on failure
    stdout: str  # captured standard output (training logs)
    stderr: str  # captured error output / failure message
    wall_time_ms: int  # end-to-end wall-clock duration in milliseconds
    checkpoint_exists: bool = False  # whether a checkpoint was produced
    peak_memory_mb: float = 0.0  # peak memory usage, if measured
    script_content: str = ""  # the script that was executed, for traceability
38
+
39
+
40
@dataclass
class ValidationResult:
    """Result of AST validation on a script."""

    is_valid: bool  # True when no forbidden patterns were found
    violations: list[str] = field(default_factory=list)  # human-readable findings
forgeenv-space/forgeenv/tasks/seed_corpus/__init__.py ADDED
File without changes
forgeenv-space/forgeenv/tasks/seed_corpus/albert_qa.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ALBERT-tiny extractive QA on 100-sample SQuAD subset."""
2
+ from transformers import (
3
+ AutoTokenizer,
4
+ AutoModelForQuestionAnswering,
5
+ Trainer,
6
+ TrainingArguments,
7
+ DefaultDataCollator,
8
+ )
9
+ from datasets import load_dataset
10
+
11
+ dataset = load_dataset("squad", split="train[:100]")
12
+ tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
13
+
14
+
15
+ def prepare(examples):
16
+ enc = tokenizer(
17
+ examples["question"],
18
+ examples["context"],
19
+ max_length=128,
20
+ truncation="only_second",
21
+ padding="max_length",
22
+ return_offsets_mapping=True,
23
+ )
24
+ start_positions, end_positions = [], []
25
+ for i, offsets in enumerate(enc["offset_mapping"]):
26
+ answer = examples["answers"][i]
27
+ start_char = answer["answer_start"][0]
28
+ end_char = start_char + len(answer["text"][0])
29
+
30
+ token_start = next(
31
+ (idx for idx, (a, b) in enumerate(offsets) if a <= start_char < b), 0
32
+ )
33
+ token_end = next(
34
+ (idx for idx, (a, b) in enumerate(offsets) if a < end_char <= b), token_start
35
+ )
36
+ start_positions.append(token_start)
37
+ end_positions.append(token_end)
38
+
39
+ enc["start_positions"] = start_positions
40
+ enc["end_positions"] = end_positions
41
+ enc.pop("offset_mapping")
42
+ return enc
43
+
44
+
45
+ dataset = dataset.map(prepare, batched=True, remove_columns=dataset.column_names)
46
+
47
+ model = AutoModelForQuestionAnswering.from_pretrained("albert-base-v2")
48
+
49
+ training_args = TrainingArguments(
50
+ output_dir="/tmp/forge_output/checkpoint",
51
+ num_train_epochs=1,
52
+ per_device_train_batch_size=4,
53
+ logging_steps=5,
54
+ save_strategy="epoch",
55
+ no_cuda=True,
56
+ report_to="none",
57
+ )
58
+
59
+ trainer = Trainer(
60
+ model=model,
61
+ args=training_args,
62
+ train_dataset=dataset,
63
+ data_collator=DefaultDataCollator(),
64
+ )
65
+ trainer.train()
66
+ trainer.save_model("/tmp/forge_output/checkpoint")
67
+ print("TRAINING_COMPLETE")
forgeenv-space/forgeenv/tasks/seed_corpus/bert_ner.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Bert tiny NER fine-tuning on a 200-sample CoNLL-2003 subset."""
2
+ from transformers import (
3
+ AutoTokenizer,
4
+ AutoModelForTokenClassification,
5
+ Trainer,
6
+ TrainingArguments,
7
+ DataCollatorForTokenClassification,
8
+ )
9
+ from datasets import load_dataset
10
+
11
+ dataset = load_dataset("conll2003", split="train[:200]")
12
+ tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")
13
+
14
+
15
+ def tokenize_and_align(example):
16
+ enc = tokenizer(example["tokens"], is_split_into_words=True, truncation=True, max_length=64)
17
+ word_ids = enc.word_ids()
18
+ labels = []
19
+ prev_id = None
20
+ for wid in word_ids:
21
+ if wid is None:
22
+ labels.append(-100)
23
+ elif wid != prev_id:
24
+ labels.append(example["ner_tags"][wid])
25
+ else:
26
+ labels.append(-100)
27
+ prev_id = wid
28
+ enc["labels"] = labels
29
+ return enc
30
+
31
+
32
+ dataset = dataset.map(tokenize_and_align, remove_columns=dataset.column_names)
33
+
34
+ model = AutoModelForTokenClassification.from_pretrained("prajjwal1/bert-tiny", num_labels=9)
35
+
36
+ training_args = TrainingArguments(
37
+ output_dir="/tmp/forge_output/checkpoint",
38
+ num_train_epochs=1,
39
+ per_device_train_batch_size=8,
40
+ logging_steps=5,
41
+ save_strategy="epoch",
42
+ no_cuda=True,
43
+ report_to="none",
44
+ )
45
+
46
+ trainer = Trainer(
47
+ model=model,
48
+ args=training_args,
49
+ train_dataset=dataset,
50
+ data_collator=DataCollatorForTokenClassification(tokenizer),
51
+ )
52
+
53
+ trainer.train()
54
+ trainer.save_model("/tmp/forge_output/checkpoint")
55
+ print("TRAINING_COMPLETE")
forgeenv-space/forgeenv/tasks/seed_corpus/distilbert_sst2.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """DistilBERT fine-tuning on a tiny SST-2 subset.
2
+
3
+ Minimal HuggingFace text-classification training script. Should complete
4
+ in ~60s on CPU.
5
+ """
6
+ from transformers import (
7
+ DistilBertTokenizer,
8
+ DistilBertForSequenceClassification,
9
+ Trainer,
10
+ TrainingArguments,
11
+ )
12
+ from datasets import load_dataset
13
+
14
+ dataset = load_dataset("glue", "sst2", split="train[:500]")
15
+ tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
16
+
17
+
18
+ def tokenize_function(examples):
19
+ return tokenizer(
20
+ examples["sentence"],
21
+ padding="max_length",
22
+ truncation=True,
23
+ max_length=64,
24
+ )
25
+
26
+
27
+ dataset = dataset.map(tokenize_function, batched=True)
28
+ dataset = dataset.rename_column("label", "labels")
29
+ dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
30
+
31
+ model = DistilBertForSequenceClassification.from_pretrained(
32
+ "distilbert-base-uncased", num_labels=2
33
+ )
34
+
35
+ training_args = TrainingArguments(
36
+ output_dir="/tmp/forge_output/checkpoint",
37
+ num_train_epochs=1,
38
+ per_device_train_batch_size=16,
39
+ logging_steps=5,
40
+ save_strategy="epoch",
41
+ no_cuda=True,
42
+ report_to="none",
43
+ )
44
+
45
+ trainer = Trainer(
46
+ model=model,
47
+ args=training_args,
48
+ train_dataset=dataset,
49
+ )
50
+
51
+ trainer.train()
52
+ trainer.save_model("/tmp/forge_output/checkpoint")
53
+ print("TRAINING_COMPLETE")
forgeenv-space/forgeenv/tasks/seed_corpus/electra_classification.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ELECTRA-small classification on 400-sample AG News (4-way text classification)."""
2
+ from transformers import (
3
+ AutoTokenizer,
4
+ AutoModelForSequenceClassification,
5
+ Trainer,
6
+ TrainingArguments,
7
+ )
8
+ from datasets import load_dataset
9
+
10
+ dataset = load_dataset("ag_news", split="train[:400]")
11
+ tokenizer = AutoTokenizer.from_pretrained("google/electra-small-discriminator")
12
+
13
+
14
+ def tokenize(examples):
15
+ return tokenizer(
16
+ examples["text"],
17
+ padding="max_length",
18
+ truncation=True,
19
+ max_length=64,
20
+ )
21
+
22
+
23
+ dataset = dataset.map(tokenize, batched=True)
24
+ dataset = dataset.rename_column("label", "labels")
25
+ dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
26
+
27
+ model = AutoModelForSequenceClassification.from_pretrained(
28
+ "google/electra-small-discriminator", num_labels=4
29
+ )
30
+
31
+ training_args = TrainingArguments(
32
+ output_dir="/tmp/forge_output/checkpoint",
33
+ num_train_epochs=1,
34
+ per_device_train_batch_size=8,
35
+ logging_steps=5,
36
+ save_strategy="epoch",
37
+ no_cuda=True,
38
+ report_to="none",
39
+ )
40
+
41
+ trainer = Trainer(model=model, args=training_args, train_dataset=dataset)
42
+ trainer.train()
43
+ trainer.save_model("/tmp/forge_output/checkpoint")
44
+ print("TRAINING_COMPLETE")
forgeenv-space/forgeenv/tasks/seed_corpus/gpt2_textgen.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """DistilGPT2 causal-LM fine-tuning on 300 lines of WikiText (text generation)."""
2
+ from transformers import (
3
+ AutoTokenizer,
4
+ AutoModelForCausalLM,
5
+ Trainer,
6
+ TrainingArguments,
7
+ DataCollatorForLanguageModeling,
8
+ )
9
+ from datasets import load_dataset
10
+
11
+ dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:300]")
12
+ tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
13
+ tokenizer.pad_token = tokenizer.eos_token
14
+
15
+
16
+ def tokenize(examples):
17
+ return tokenizer(examples["text"], truncation=True, max_length=64)
18
+
19
+
20
+ dataset = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names)
21
+
22
+ model = AutoModelForCausalLM.from_pretrained("distilgpt2")
23
+
24
+ training_args = TrainingArguments(
25
+ output_dir="/tmp/forge_output/checkpoint",
26
+ num_train_epochs=1,
27
+ per_device_train_batch_size=4,
28
+ logging_steps=5,
29
+ save_strategy="epoch",
30
+ no_cuda=True,
31
+ report_to="none",
32
+ )
33
+
34
+ trainer = Trainer(
35
+ model=model,
36
+ args=training_args,
37
+ train_dataset=dataset,
38
+ data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
39
+ )
40
+
41
+ trainer.train()
42
+ trainer.save_model("/tmp/forge_output/checkpoint")
43
+ print("TRAINING_COMPLETE")
forgeenv-space/forgeenv/tasks/seed_corpus/logistic_classifier.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Sklearn logistic-regression baseline on a 500-sample tabular task.
2
+
3
+ Sanity baseline that doesn't require torch / transformers / datasets.
4
+ """
5
+ import json
6
+ import pickle
7
+ from pathlib import Path
8
+
9
+ import numpy as np
10
+ from sklearn.datasets import make_classification
11
+ from sklearn.linear_model import LogisticRegression
12
+ from sklearn.model_selection import train_test_split
13
+
14
+ X, y = make_classification(
15
+ n_samples=500, n_features=20, n_informative=10, random_state=0
16
+ )
17
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
18
+
19
+ model = LogisticRegression(max_iter=200)
20
+ for step in range(1, 11):
21
+ model.set_params(max_iter=step * 20)
22
+ model.fit(X_train, y_train)
23
+ train_loss = -np.mean(np.log(np.maximum(model.predict_proba(X_train)[np.arange(len(y_train)), y_train], 1e-9)))
24
+ print(f"step={step} loss={train_loss:.4f}")
25
+
26
+ acc = model.score(X_test, y_test)
27
+ print(f"eval_accuracy={acc:.4f}")
28
+
29
+ ckpt_dir = Path("/tmp/forge_output/checkpoint")
30
+ ckpt_dir.mkdir(parents=True, exist_ok=True)
31
+ with open(ckpt_dir / "logreg.pkl", "wb") as f:
32
+ pickle.dump(model, f)
33
+ with open(ckpt_dir / "metrics.json", "w") as f:
34
+ json.dump({"accuracy": acc}, f)
35
+
36
+ print("TRAINING_COMPLETE")
forgeenv-space/forgeenv/tasks/seed_corpus/roberta_sentiment.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """DistilRoberta sentiment classification on 400-sample IMDB subset."""
2
+ from transformers import (
3
+ AutoTokenizer,
4
+ AutoModelForSequenceClassification,
5
+ Trainer,
6
+ TrainingArguments,
7
+ )
8
+ from datasets import load_dataset
9
+
10
+ dataset = load_dataset("imdb", split="train[:400]")
11
+ tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
12
+
13
+
14
+ def tokenize(examples):
15
+ return tokenizer(
16
+ examples["text"],
17
+ padding="max_length",
18
+ truncation=True,
19
+ max_length=64,
20
+ )
21
+
22
+
23
+ dataset = dataset.map(tokenize, batched=True)
24
+ dataset = dataset.rename_column("label", "labels")
25
+ dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
26
+
27
+ model = AutoModelForSequenceClassification.from_pretrained(
28
+ "distilroberta-base", num_labels=2
29
+ )
30
+
31
+ training_args = TrainingArguments(
32
+ output_dir="/tmp/forge_output/checkpoint",
33
+ num_train_epochs=1,
34
+ per_device_train_batch_size=8,
35
+ logging_steps=5,
36
+ save_strategy="epoch",
37
+ no_cuda=True,
38
+ report_to="none",
39
+ )
40
+
41
+ trainer = Trainer(model=model, args=training_args, train_dataset=dataset)
42
+ trainer.train()
43
+ trainer.save_model("/tmp/forge_output/checkpoint")
44
+ print("TRAINING_COMPLETE")