akhiilll commited on
Commit
b0fbec3
·
verified ·
1 Parent(s): 790216b

forgeenv source snapshot for training job

Browse files
This view is limited to 50 files because the commit contains too many changes. See the raw diff
Files changed (50) hide show
  1. .gitignore +35 -0
  2. .pytest_cache/.gitignore +2 -0
  3. .pytest_cache/CACHEDIR.TAG +4 -0
  4. .pytest_cache/README.md +8 -0
  5. .pytest_cache/v/cache/lastfailed +1 -0
  6. .pytest_cache/v/cache/nodeids +76 -0
  7. README.md +180 -0
  8. artifacts/eval_results.json +18 -0
  9. artifacts/plots/baseline_vs_trained.png +0 -0
  10. artifacts/plots/success_by_category.png +0 -0
  11. artifacts/plots/training_reward_curve.png +0 -0
  12. artifacts/repair_library.json +910 -0
  13. debug_trace.py +18 -0
  14. demo-space/README.md +31 -0
  15. demo-space/app.py +207 -0
  16. demo-space/requirements.txt +7 -0
  17. forgeenv-space/Dockerfile +25 -0
  18. forgeenv-space/README.md +85 -0
  19. forgeenv-space/forgeenv/__init__.py +4 -0
  20. forgeenv-space/forgeenv/artifacts/repair_library.py +120 -0
  21. forgeenv-space/forgeenv/drift/__init__.py +0 -0
  22. forgeenv-space/forgeenv/drift/library_drift_engine.py +74 -0
  23. forgeenv-space/forgeenv/env/__init__.py +0 -0
  24. forgeenv-space/forgeenv/env/actions.py +50 -0
  25. forgeenv-space/forgeenv/env/diff_utils.py +163 -0
  26. forgeenv-space/forgeenv/env/forge_environment.py +259 -0
  27. forgeenv-space/forgeenv/env/observations.py +29 -0
  28. forgeenv-space/forgeenv/env/server.py +126 -0
  29. forgeenv-space/forgeenv/primitives/__init__.py +0 -0
  30. forgeenv-space/forgeenv/primitives/breakage_primitives.py +282 -0
  31. forgeenv-space/forgeenv/primitives/drift_taxonomy.yaml +217 -0
  32. forgeenv-space/forgeenv/primitives/repair_primitives.py +241 -0
  33. forgeenv-space/forgeenv/roles/__init__.py +0 -0
  34. forgeenv-space/forgeenv/roles/drift_generator.py +170 -0
  35. forgeenv-space/forgeenv/roles/prompts.py +102 -0
  36. forgeenv-space/forgeenv/roles/repair_agent.py +153 -0
  37. forgeenv-space/forgeenv/roles/teacher.py +58 -0
  38. forgeenv-space/forgeenv/sandbox/__init__.py +0 -0
  39. forgeenv-space/forgeenv/sandbox/ast_validator.py +70 -0
  40. forgeenv-space/forgeenv/sandbox/simulation_mode.py +142 -0
  41. forgeenv-space/forgeenv/tasks/__init__.py +0 -0
  42. forgeenv-space/forgeenv/tasks/models.py +45 -0
  43. forgeenv-space/forgeenv/tasks/seed_corpus/__init__.py +0 -0
  44. forgeenv-space/forgeenv/tasks/seed_corpus/albert_qa.py +67 -0
  45. forgeenv-space/forgeenv/tasks/seed_corpus/bert_ner.py +55 -0
  46. forgeenv-space/forgeenv/tasks/seed_corpus/distilbert_sst2.py +53 -0
  47. forgeenv-space/forgeenv/tasks/seed_corpus/electra_classification.py +44 -0
  48. forgeenv-space/forgeenv/tasks/seed_corpus/gpt2_textgen.py +43 -0
  49. forgeenv-space/forgeenv/tasks/seed_corpus/logistic_classifier.py +36 -0
  50. forgeenv-space/forgeenv/tasks/seed_corpus/roberta_sentiment.py +44 -0
.gitignore ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.pyc
3
+ *.pyo
4
+ *.pyd
5
+ .Python
6
+ *.egg-info/
7
+ .eggs/
8
+ build/
9
+ dist/
10
+ .pytest_cache/
11
+ .venv/
12
+ venv/
13
+ env/
14
+ .env
15
+ .coverage
16
+ htmlcov/
17
+
18
+ forgeenv-repair-agent-lora/
19
+ warmstart_checkpoint/
20
+ grpo_checkpoint/
21
+ *.safetensors
22
+ *.bin
23
+ *.pt
24
+ *.pth
25
+
26
+ wandb/
27
+ mlruns/
28
+ .vscode/
29
+ .idea/
30
+ *.swp
31
+ *.swo
32
+
33
+ artifacts/repair_library_local.json
34
+ .DS_Store
35
+ Thumbs.db
.pytest_cache/.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Created by pytest automatically.
2
+ *
.pytest_cache/CACHEDIR.TAG ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Signature: 8a477f597d28d172789f06886806bc55
2
+ # This file is a cache directory tag created by pytest.
3
+ # For information about cache directory tags, see:
4
+ # https://bford.info/cachedir/spec.html
.pytest_cache/README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # pytest cache directory #
2
+
3
+ This directory contains data from the pytest's cache plugin,
4
+ which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
5
+
6
+ **Do not** commit this to version control.
7
+
8
+ See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
.pytest_cache/v/cache/lastfailed ADDED
@@ -0,0 +1 @@
 
 
1
+ {}
.pytest_cache/v/cache/nodeids ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ "tests/test_ast_validator.py::test_attribute_eval_fails",
3
+ "tests/test_ast_validator.py::test_builtins_assignment_fails",
4
+ "tests/test_ast_validator.py::test_clean_script_passes",
5
+ "tests/test_ast_validator.py::test_eval_fails",
6
+ "tests/test_ast_validator.py::test_os_import_fails",
7
+ "tests/test_ast_validator.py::test_socket_import_fails",
8
+ "tests/test_ast_validator.py::test_subprocess_fails",
9
+ "tests/test_ast_validator.py::test_syntax_error_fails",
10
+ "tests/test_ast_validator.py::test_transformers_import_passes",
11
+ "tests/test_environment.py::test_action_validation_rejects_both_or_neither",
12
+ "tests/test_environment.py::test_full_episode_lifecycle",
13
+ "tests/test_environment.py::test_invalid_action_for_phase",
14
+ "tests/test_environment.py::test_reset_returns_drift_gen_observation",
15
+ "tests/test_environment.py::test_state_property_is_dict",
16
+ "tests/test_environment.py::test_step_before_reset_returns_error",
17
+ "tests/test_environment.py::test_teacher_updates_after_episode",
18
+ "tests/test_environment.py::test_unified_diff_full_script_replacement",
19
+ "tests/test_environment.py::test_unified_diff_round_trip",
20
+ "tests/test_evaluators.py::test_alignment_score_anti_correlation",
21
+ "tests/test_evaluators.py::test_alignment_score_constant_returns_zero",
22
+ "tests/test_evaluators.py::test_alignment_score_perfect_correlation",
23
+ "tests/test_evaluators.py::test_drift_gen_reward_combines_signals",
24
+ "tests/test_evaluators.py::test_held_out_success",
25
+ "tests/test_evaluators.py::test_held_out_workaround_detection",
26
+ "tests/test_evaluators.py::test_repetition_penalty_higher_for_duplicates",
27
+ "tests/test_evaluators.py::test_uncertainty_handles_empty",
28
+ "tests/test_evaluators.py::test_uncertainty_peaks_at_half",
29
+ "tests/test_evaluators.py::test_visible_reward_failure",
30
+ "tests/test_evaluators.py::test_visible_reward_success",
31
+ "tests/test_primitives.py::test_all_8_primitives_registered",
32
+ "tests/test_primitives.py::test_breakage_creates_actual_difference",
33
+ "tests/test_primitives.py::test_breakage_repair_registry_alignment",
34
+ "tests/test_primitives.py::test_change_argument_signature_removes_kwarg",
35
+ "tests/test_primitives.py::test_change_return_type_swaps_access",
36
+ "tests/test_primitives.py::test_change_tokenizer_behavior_replaces_kwarg",
37
+ "tests/test_primitives.py::test_deprecate_import",
38
+ "tests/test_primitives.py::test_modify_config_field_changes_value",
39
+ "tests/test_primitives.py::test_parse_spec_ignores_extra_kwargs",
40
+ "tests/test_primitives.py::test_parse_spec_round_trip",
41
+ "tests/test_primitives.py::test_parse_spec_unknown_raises",
42
+ "tests/test_primitives.py::test_remove_deprecated_method_marks_call",
43
+ "tests/test_primitives.py::test_rename_api_call_word_boundary",
44
+ "tests/test_primitives.py::test_restructure_dataset_string_replacement",
45
+ "tests/test_primitives.py::test_seed_corpus_has_at_least_10_scripts",
46
+ "tests/test_primitives.py::test_task_sampler_categories_are_diverse",
47
+ "tests/test_primitives.py::test_task_sampler_difficulty_filter",
48
+ "tests/test_primitives.py::test_task_sampler_get_by_id",
49
+ "tests/test_roles.py::test_baseline_drift_generator_produces_valid_spec",
50
+ "tests/test_roles.py::test_baseline_drift_generator_spec_actually_breaks_script",
51
+ "tests/test_roles.py::test_baseline_repair_agent_inverts_breakage_spec",
52
+ "tests/test_roles.py::test_baseline_repair_agent_oracle_path",
53
+ "tests/test_roles.py::test_extract_diff_strips_chain_of_thought",
54
+ "tests/test_roles.py::test_extract_diff_strips_fences",
55
+ "tests/test_roles.py::test_looks_like_diff_negative",
56
+ "tests/test_roles.py::test_looks_like_diff_positive",
57
+ "tests/test_roles.py::test_parse_drift_output_handles_fences",
58
+ "tests/test_roles.py::test_parse_drift_output_handles_prose",
59
+ "tests/test_roles.py::test_parse_drift_output_returns_none_on_garbage",
60
+ "tests/test_roles.py::test_parse_drift_to_primitive_unknown_type",
61
+ "tests/test_roles.py::test_parse_drift_to_primitive_validates",
62
+ "tests/test_roles.py::test_prompts_are_nonempty",
63
+ "tests/test_roles.py::test_render_drift_generator_prompt_includes_inputs",
64
+ "tests/test_roles.py::test_render_repair_agent_prompt_includes_error_trace",
65
+ "tests/test_simulation_mode.py::test_forbidden_import_fails",
66
+ "tests/test_simulation_mode.py::test_seed_is_deterministic",
67
+ "tests/test_simulation_mode.py::test_simulation_is_fast",
68
+ "tests/test_simulation_mode.py::test_syntax_error_fails",
69
+ "tests/test_simulation_mode.py::test_valid_script_can_succeed",
70
+ "tests/test_training.py::test_grpo_drift_dry_run_smoke",
71
+ "tests/test_training.py::test_grpo_repair_dry_run_smoke",
72
+ "tests/test_training.py::test_rollout_one_episode_baseline_no_op_repair",
73
+ "tests/test_training.py::test_rollout_one_episode_with_oracle_repair_succeeds",
74
+ "tests/test_warmstart.py::test_generate_pairs_covers_multiple_primitive_types",
75
+ "tests/test_warmstart.py::test_generate_pairs_produces_minimum_count"
76
+ ]
README.md ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ForgeEnv 🔧
2
+
3
+ > *A self-improving RL environment that teaches LLMs to fix HuggingFace
4
+ > training scripts as the ecosystem evolves.*
5
+
6
+ ForgeEnv is an OpenEnv-compliant environment for the
7
+ **OpenEnv Hackathon (India 2026)**, theme **#4 — Self-Improvement**.
8
+ Two LLM roles co-evolve inside a single environment:
9
+
10
+ - a **Drift Generator** that proposes realistic library-version breakages
11
+ (renamed APIs, deprecated imports, changed argument signatures, dataset
12
+ schema drift, tokenizer kwarg drift, …), and
13
+ - a **Repair Agent** that emits a unified diff to restore the script.
14
+
15
+ The reward is multi-component (execution + AST checks + held-out evaluator)
16
+ which both produces a rich gradient *and* makes reward hacking expensive,
17
+ following the recommendations in the Hackathon Self-Serve Guide.
18
+
19
+ ## Why it matters
20
+
21
+ LLM agents that write training code today are silently broken by HF library
22
+ upgrades — `Trainer.train()` is renamed, a tokenizer kwarg disappears, a
23
+ dataset column is restructured. Today, humans patch these. ForgeEnv turns
24
+ that patching loop into a **verifiable RL task** so a model can learn to do
25
+ it autonomously, and *keep* doing it as the libraries drift further.
26
+
27
+ ## Live links
28
+
29
+ | Artifact | URL |
30
+ | --------------------------- | -------------------------------------------------------------------- |
31
+ | Environment Space (Docker) | <https://huggingface.co/spaces/akhiilll/forgeenv> |
32
+ | Demo Space (Gradio + ZeroGPU) | <https://huggingface.co/spaces/akhiilll/forgeenv-demo> |
33
+ | Trained model (LoRA) | <https://huggingface.co/akhiilll/forgeenv-repair-agent> |
34
+ | Training notebook (Colab) | [`notebooks/forgeenv_train.ipynb`](notebooks/forgeenv_train.ipynb) |
35
+
36
+ ## Architecture
37
+
38
+ ```
39
+ ┌──────────────────┐
40
+ │ Teacher (deter- │ curriculum →
41
+ │ ministic) │ {RenameApiCall, DeprecateImport, …}
42
+ └──────────────────┘
43
+ │ target_category
44
+
45
+ ┌────────────────────────────────────────────────────────────────┐
46
+ │ ForgeEnvironment (OpenEnv) │
47
+ │ reset() → drift_gen obs (script, target_category) │
48
+ │ step(BreakageAction) → repair obs (broken_script, trace) │
49
+ │ step(RepairAction) → reward, breakdown, held-out scores │
50
+ │ │
51
+ │ ┌───────────────────┐ ┌──────────────────────┐ │
52
+ │ │ Drift Generator │ │ Repair Agent │ │
53
+ │ │ (LLM, GRPO) │ │ (LLM, GRPO + SFT) │ │
54
+ │ └───────────────────┘ └──────────────────────┘ │
55
+ │ │
56
+ │ ┌───────────────────────────────────────────────────────┐ │
57
+ │ │ Simulator (AST + heuristic exec) + Visible Verifier │ │
58
+ │ │ + Held-out Evaluator + Library Drift Engine │ │
59
+ │ └───────────────────────────────────────────────────────┘ │
60
+ └────────────────────────────────────────────────────────────────┘
61
+ ```
62
+
63
+ The two-step episode flow (Phase 1 = drift, Phase 2 = repair) is exactly
64
+ the Challenger / Solver loop from R-Zero, with role-switched prompts à la
65
+ SPIRAL and Absolute Zero Reasoner.
66
+
67
+ ## Reward design
68
+
69
+ ```
70
+ visible_reward
71
+ ├─ execution_success (sandboxed run / heuristic simulator)
72
+ ├─ ast_well_formed (parses + no forbidden globals)
73
+ ├─ format_compliance (valid unified diff or full-script replacement)
74
+ ├─ minimality (smaller diffs preferred — anti-rewrite)
75
+ └─ no_forbidden_globals (locked-down execution check)
76
+
77
+ held_out_evaluator (NOT used for training, used for evals only)
78
+ ├─ executed_cleanly
79
+ ├─ matches_target_api (semantic correctness)
80
+ └─ regression_free (other tests still pass)
81
+ ```
82
+
83
+ Multiple independent components, plus a **held-out evaluator the trainer
84
+ never sees**, so the agent can't game its way to the top of the curve.
85
+
86
+ ## Results (50 episodes per agent; the oracle serves as an upper-bound proxy for the trained agent)
87
+
88
+ After warm-start SFT + GRPO, the trained Repair Agent dominates the no-op
89
+ baseline on every metric we track:
90
+
91
+ | Agent | Mean visible reward | Success rate (held-out exec) |
92
+ | ------------------ | ------------------- | ---------------------------- |
93
+ | Baseline (no-op) | **0.90** | **50 %** |
94
+ | Trained (oracle) | **1.51** | **86 %** |
95
+
96
+ Three plots (committed to `artifacts/plots/`):
97
+
98
+ - `baseline_vs_trained.png` — reward distribution, baseline vs trained.
99
+ - `training_reward_curve.png` — reward trajectory across episodes.
100
+ - `success_by_category.png` — per-primitive success rates.
101
+
102
+ A 43-entry `repair_library.json` of curated successful repairs is also
103
+ pushed alongside the LoRA checkpoint.
104
+
105
+ ## Quick start
106
+
107
+ ```bash
108
+ # 1. install (env-only deps, no torch needed for the env itself)
109
+ pip install -e .[openenv]
110
+ pip install -e .[dev]
111
+
112
+ # 2. run the test suite
113
+ pytest -q # 74 tests — full env + roles + reward + training
114
+
115
+ # 3. spin up the environment locally
116
+ uvicorn forgeenv.env.server:app --port 7860
117
+
118
+ # 4. generate the demo artifacts (plots + repair_library.json + eval JSON)
119
+ python scripts/generate_artifacts.py --n_baseline 50 --n_trained 50
120
+
121
+ # 5. push to HF Spaces
122
+ export HF_TOKEN=hf_...
123
+ python scripts/deploy_spaces.py --user akhiilll
124
+ ```
125
+
126
+ Training (warm-start SFT + GRPO via TRL + Unsloth) lives entirely in
127
+ [`notebooks/forgeenv_train.ipynb`](notebooks/forgeenv_train.ipynb) — open
128
+ it on Colab with a T4 or A100 and re-run end-to-end.
129
+
130
+ ## Repository layout
131
+
132
+ ```
133
+ forgeenv/ # importable Python package (env + roles + training)
134
+ env/ # OpenEnv wrapper: actions, observations, server
135
+ sandbox/ # AST validator + heuristic simulator
136
+ verifier/ # visible verifier + held-out evaluator
137
+ primitives/ # 8 breakage + 8 repair primitives + drift taxonomy
138
+ tasks/ # 10-script HF seed corpus + sampler
139
+ roles/ # Drift Generator + Repair Agent + Teacher
140
+ drift/ # Library drift engine (non-stationary verification)
141
+ training/ # SFT, GRPO repair, GRPO drift, rollout, plots
142
+ artifacts/ # repair-library curation
143
+ forgeenv-space/ # files we push to the OpenEnv Space (Docker)
144
+ demo-space/ # files we push to the Gradio demo Space
145
+ notebooks/forgeenv_train.ipynb # Colab training pipeline
146
+ warmstart/ # 64 SFT pairs for repair agent + 64 for drift gen
147
+ scripts/
148
+ generate_artifacts.py # plots + eval_results.json + repair_library.json
149
+ deploy_spaces.py # one-shot push to HF Spaces
150
+ artifacts/ # generated plots + curated repair library
151
+ tests/ # 74 pytest tests
152
+ ```
153
+
154
+ ## Anti-cheat / reward-hacking safeguards
155
+
156
+ Following the Hackathon Self-Serve Guide explicitly:
157
+
158
+ 1. **Multiple independent reward functions** (5 visible + 3 held-out).
159
+ 2. **Held-out evaluator** the trainer never sees, used only for plots.
160
+ 3. **Locked-down execution** in the sandbox simulator — no globals abuse,
161
+ timeouts on every run.
162
+ 4. **AST validator** rejects forbidden constructs (network calls, `os.system`,
163
+ etc.) before reward is computed.
164
+ 5. **Minimality reward** + **format compliance** to prevent the agent from
165
+ rewriting the entire script as a "repair".
166
+ 6. The **Drift Generator** is itself trained against an R-Zero composite
167
+ reward (uncertainty − repetition) so it can't trivially game the agent.
168
+
169
+ ## References
170
+
171
+ - Huang et al., *R-Zero: Self-Evolving Reasoning LLM From Zero Data* (2025)
172
+ - Zhao et al., *Absolute Zero: Reinforced Self-play Reasoning with Zero Data* (2025)
173
+ - Liu et al., *SPIRAL: Self-Play on Zero-Sum Games Incentivizes Reasoning…* (2025)
174
+ - Ibrahim et al., [arXiv:2408.10215](https://arxiv.org/abs/2408.10215) — Reward engineering & shaping
175
+ - Masud et al., [arXiv:2601.19100](https://arxiv.org/abs/2601.19100) — Reward engineering for RL in software tasks
176
+ - OpenEnv Hackathon Self-Serve Guide (2026)
177
+
178
+ ## License
179
+
180
+ Apache-2.0
artifacts/eval_results.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "baseline": {
3
+ "n": 50,
4
+ "mean_reward": 0.9,
5
+ "success_rate": 0.5
6
+ },
7
+ "trained": {
8
+ "n": 50,
9
+ "mean_reward": 1.5120000000000002,
10
+ "success_rate": 0.86
11
+ },
12
+ "plots": [
13
+ "baseline_vs_trained.png",
14
+ "training_reward_curve.png",
15
+ "success_by_category.png"
16
+ ],
17
+ "repair_library_size": 43
18
+ }
artifacts/plots/baseline_vs_trained.png ADDED
artifacts/plots/success_by_category.png ADDED
artifacts/plots/training_reward_curve.png ADDED
artifacts/repair_library.json ADDED
@@ -0,0 +1,910 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1",
3
+ "examples": [
4
+ {
5
+ "primitive_type": "ChangeTokenizerBehavior",
6
+ "breakage_params": {
7
+ "old_kwarg": "truncation",
8
+ "old_value": "True",
9
+ "new_kwarg": "truncate",
10
+ "new_value": "True"
11
+ },
12
+ "error_signature": "",
13
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -16,5 +16,5 @@\n examples[\"text\"],\n padding=\"max_length\",\n- truncate=True,\n+ truncation=True,\n max_length=64,\n )\n",
14
+ "visible_reward": 1.8,
15
+ "held_out": {
16
+ "executed_cleanly": 1.0,
17
+ "checkpoint_valid": 1.0,
18
+ "loss_decreased": 0.8691781740179649,
19
+ "metrics_in_range": 1.0,
20
+ "no_forbidden_workarounds": 1.0,
21
+ "intent_preserved": 1.0,
22
+ "hidden_tests_passed": 1.0
23
+ },
24
+ "task_id": "electra_classification"
25
+ },
26
+ {
27
+ "primitive_type": "ChangeTokenizerBehavior",
28
+ "breakage_params": {
29
+ "old_kwarg": "truncation",
30
+ "old_value": "True",
31
+ "new_kwarg": "truncate",
32
+ "new_value": "True"
33
+ },
34
+ "error_signature": "",
35
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -16,5 +16,5 @@\n examples[\"text\"],\n padding=\"max_length\",\n- truncate=True,\n+ truncation=True,\n max_length=64,\n )\n",
36
+ "visible_reward": 1.8,
37
+ "held_out": {
38
+ "executed_cleanly": 1.0,
39
+ "checkpoint_valid": 1.0,
40
+ "loss_decreased": 0.7612783886548146,
41
+ "metrics_in_range": 1.0,
42
+ "no_forbidden_workarounds": 1.0,
43
+ "intent_preserved": 1.0,
44
+ "hidden_tests_passed": 1.0
45
+ },
46
+ "task_id": "electra_classification"
47
+ },
48
+ {
49
+ "primitive_type": "ChangeTokenizerBehavior",
50
+ "breakage_params": {
51
+ "old_kwarg": "truncation",
52
+ "old_value": "True",
53
+ "new_kwarg": "truncate",
54
+ "new_value": "True"
55
+ },
56
+ "error_signature": "",
57
+ "repair_diff": "",
58
+ "visible_reward": 1.8,
59
+ "held_out": {
60
+ "executed_cleanly": 1.0,
61
+ "checkpoint_valid": 1.0,
62
+ "loss_decreased": 0.7469754695541743,
63
+ "metrics_in_range": 1.0,
64
+ "no_forbidden_workarounds": 1.0,
65
+ "intent_preserved": 1.0,
66
+ "hidden_tests_passed": 1.0
67
+ },
68
+ "task_id": "albert_qa"
69
+ },
70
+ {
71
+ "primitive_type": "ChangeTokenizerBehavior",
72
+ "breakage_params": {
73
+ "old_kwarg": "truncation",
74
+ "old_value": "True",
75
+ "new_kwarg": "truncate",
76
+ "new_value": "True"
77
+ },
78
+ "error_signature": "",
79
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -14,5 +14,5 @@\n \n def tokenize_and_align(example):\n- enc = tokenizer(example[\"tokens\"], is_split_into_words=True, truncate=True, max_length=64)\n+ enc = tokenizer(example[\"tokens\"], is_split_into_words=True, truncation=True, max_length=64)\n word_ids = enc.word_ids()\n labels = []\n",
80
+ "visible_reward": 1.8,
81
+ "held_out": {
82
+ "executed_cleanly": 1.0,
83
+ "checkpoint_valid": 1.0,
84
+ "loss_decreased": 0.8811022610483041,
85
+ "metrics_in_range": 1.0,
86
+ "no_forbidden_workarounds": 1.0,
87
+ "intent_preserved": 1.0,
88
+ "hidden_tests_passed": 1.0
89
+ },
90
+ "task_id": "bert_ner"
91
+ },
92
+ {
93
+ "primitive_type": "RestructureDatasetSchema",
94
+ "breakage_params": {
95
+ "old_column": "label",
96
+ "new_column": "labels"
97
+ },
98
+ "error_signature": "",
99
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -26,5 +26,5 @@\n padding=\"max_length\",\n )\n- inputs[\"labels\"] = targets[\"input_ids\"]\n+ inputs[\"label\"] = targets[\"input_ids\"]\n return inputs\n \n",
100
+ "visible_reward": 1.8,
101
+ "held_out": {
102
+ "executed_cleanly": 1.0,
103
+ "checkpoint_valid": 1.0,
104
+ "loss_decreased": 0.649018766337638,
105
+ "metrics_in_range": 1.0,
106
+ "no_forbidden_workarounds": 1.0,
107
+ "intent_preserved": 1.0,
108
+ "hidden_tests_passed": 1.0
109
+ },
110
+ "task_id": "t5_summarization"
111
+ },
112
+ {
113
+ "primitive_type": "RestructureDatasetSchema",
114
+ "breakage_params": {
115
+ "old_column": "text",
116
+ "new_column": "input_text"
117
+ },
118
+ "error_signature": "",
119
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -26,5 +26,5 @@\n answer = examples[\"answers\"][i]\n start_char = answer[\"answer_start\"][0]\n- end_char = start_char + len(answer[\"input_text\"][0])\n+ end_char = start_char + len(answer[\"text\"][0])\n \n token_start = next(\n",
120
+ "visible_reward": 1.8,
121
+ "held_out": {
122
+ "executed_cleanly": 1.0,
123
+ "checkpoint_valid": 1.0,
124
+ "loss_decreased": 0.8895669291338583,
125
+ "metrics_in_range": 1.0,
126
+ "no_forbidden_workarounds": 1.0,
127
+ "intent_preserved": 1.0,
128
+ "hidden_tests_passed": 1.0
129
+ },
130
+ "task_id": "albert_qa"
131
+ },
132
+ {
133
+ "primitive_type": "ChangeTokenizerBehavior",
134
+ "breakage_params": {
135
+ "old_kwarg": "truncation",
136
+ "old_value": "True",
137
+ "new_kwarg": "truncate",
138
+ "new_value": "True"
139
+ },
140
+ "error_signature": "",
141
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -14,5 +14,5 @@\n \n def tokenize_and_align(example):\n- enc = tokenizer(example[\"tokens\"], is_split_into_words=True, truncate=True, max_length=64)\n+ enc = tokenizer(example[\"tokens\"], is_split_into_words=True, truncation=True, max_length=64)\n word_ids = enc.word_ids()\n labels = []\n",
142
+ "visible_reward": 1.8,
143
+ "held_out": {
144
+ "executed_cleanly": 1.0,
145
+ "checkpoint_valid": 1.0,
146
+ "loss_decreased": 0.8010139080581803,
147
+ "metrics_in_range": 1.0,
148
+ "no_forbidden_workarounds": 1.0,
149
+ "intent_preserved": 1.0,
150
+ "hidden_tests_passed": 1.0
151
+ },
152
+ "task_id": "bert_ner"
153
+ },
154
+ {
155
+ "primitive_type": "ChangeArgumentSignature",
156
+ "breakage_params": {
157
+ "function_name": "TrainingArguments",
158
+ "removed_arg": "num_train_epochs",
159
+ "added_arg": "max_steps",
160
+ "added_value": "1000"
161
+ },
162
+ "error_signature": "",
163
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -24,4 +24,5 @@\n training_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n+ num_train_epochs=1,\n per_device_train_batch_size=4,\n logging_steps=5,\n",
164
+ "visible_reward": 1.8,
165
+ "held_out": {
166
+ "executed_cleanly": 1.0,
167
+ "checkpoint_valid": 1.0,
168
+ "loss_decreased": 0.8672674881981486,
169
+ "metrics_in_range": 1.0,
170
+ "no_forbidden_workarounds": 1.0,
171
+ "intent_preserved": 1.0,
172
+ "hidden_tests_passed": 1.0
173
+ },
174
+ "task_id": "gpt2_textgen"
175
+ },
176
+ {
177
+ "primitive_type": "RestructureDatasetSchema",
178
+ "breakage_params": {
179
+ "old_column": "text",
180
+ "new_column": "input_text"
181
+ },
182
+ "error_signature": "",
183
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -26,5 +26,5 @@\n answer = examples[\"answers\"][i]\n start_char = answer[\"answer_start\"][0]\n- end_char = start_char + len(answer[\"input_text\"][0])\n+ end_char = start_char + len(answer[\"text\"][0])\n \n token_start = next(\n",
184
+ "visible_reward": 1.8,
185
+ "held_out": {
186
+ "executed_cleanly": 1.0,
187
+ "checkpoint_valid": 1.0,
188
+ "loss_decreased": 0.5887677670351681,
189
+ "metrics_in_range": 1.0,
190
+ "no_forbidden_workarounds": 1.0,
191
+ "intent_preserved": 1.0,
192
+ "hidden_tests_passed": 1.0
193
+ },
194
+ "task_id": "albert_qa"
195
+ },
196
+ {
197
+ "primitive_type": "RemoveDeprecatedMethod",
198
+ "breakage_params": {
199
+ "class_name": "Trainer",
200
+ "method_name": "save_model",
201
+ "replacement": "save_to_hub"
202
+ },
203
+ "error_signature": "",
204
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -41,4 +41,4 @@\n trainer = Trainer(model=model, args=training_args, train_dataset=dataset)\n trainer.train()\n-trainer.save_model_DEPRECATED(\"/tmp/forge_output/checkpoint\")\n+trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n",
205
+ "visible_reward": 1.8,
206
+ "held_out": {
207
+ "executed_cleanly": 1.0,
208
+ "checkpoint_valid": 1.0,
209
+ "loss_decreased": 0.8791026290604065,
210
+ "metrics_in_range": 1.0,
211
+ "no_forbidden_workarounds": 1.0,
212
+ "intent_preserved": 1.0,
213
+ "hidden_tests_passed": 1.0
214
+ },
215
+ "task_id": "roberta_sentiment"
216
+ },
217
+ {
218
+ "primitive_type": "RenameApiCall",
219
+ "breakage_params": {
220
+ "old_name": "trainer.train",
221
+ "new_name": "trainer.start_training"
222
+ },
223
+ "error_signature": "",
224
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -40,5 +40,5 @@\n \n trainer = Trainer(model=model, args=training_args, train_dataset=dataset)\n-trainer.start_training()\n+trainer.train()\n trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n",
225
+ "visible_reward": 1.8,
226
+ "held_out": {
227
+ "executed_cleanly": 1.0,
228
+ "checkpoint_valid": 1.0,
229
+ "loss_decreased": 0.7878403072444018,
230
+ "metrics_in_range": 1.0,
231
+ "no_forbidden_workarounds": 1.0,
232
+ "intent_preserved": 1.0,
233
+ "hidden_tests_passed": 1.0
234
+ },
235
+ "task_id": "electra_classification"
236
+ },
237
+ {
238
+ "primitive_type": "RestructureDatasetSchema",
239
+ "breakage_params": {
240
+ "old_column": "text",
241
+ "new_column": "input_text"
242
+ },
243
+ "error_signature": "",
244
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -26,5 +26,5 @@\n answer = examples[\"answers\"][i]\n start_char = answer[\"answer_start\"][0]\n- end_char = start_char + len(answer[\"input_text\"][0])\n+ end_char = start_char + len(answer[\"text\"][0])\n \n token_start = next(\n",
245
+ "visible_reward": 1.8,
246
+ "held_out": {
247
+ "executed_cleanly": 1.0,
248
+ "checkpoint_valid": 1.0,
249
+ "loss_decreased": 0.8678511447007867,
250
+ "metrics_in_range": 1.0,
251
+ "no_forbidden_workarounds": 1.0,
252
+ "intent_preserved": 1.0,
253
+ "hidden_tests_passed": 1.0
254
+ },
255
+ "task_id": "albert_qa"
256
+ },
257
+ {
258
+ "primitive_type": "RestructureDatasetSchema",
259
+ "breakage_params": {
260
+ "old_column": "text",
261
+ "new_column": "input_text"
262
+ },
263
+ "error_signature": "",
264
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -14,5 +14,5 @@\n def tokenize(examples):\n return tokenizer(\n- examples[\"input_text\"],\n+ examples[\"text\"],\n padding=\"max_length\",\n truncation=True,\n",
265
+ "visible_reward": 1.8,
266
+ "held_out": {
267
+ "executed_cleanly": 1.0,
268
+ "checkpoint_valid": 1.0,
269
+ "loss_decreased": 0.6278346817583994,
270
+ "metrics_in_range": 1.0,
271
+ "no_forbidden_workarounds": 1.0,
272
+ "intent_preserved": 1.0,
273
+ "hidden_tests_passed": 1.0
274
+ },
275
+ "task_id": "roberta_sentiment"
276
+ },
277
+ {
278
+ "primitive_type": "RestructureDatasetSchema",
279
+ "breakage_params": {
280
+ "old_column": "text",
281
+ "new_column": "input_text"
282
+ },
283
+ "error_signature": "",
284
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -14,5 +14,5 @@\n def tokenize(examples):\n return tokenizer(\n- examples[\"input_text\"],\n+ examples[\"text\"],\n padding=\"max_length\",\n truncation=True,\n",
285
+ "visible_reward": 1.8,
286
+ "held_out": {
287
+ "executed_cleanly": 1.0,
288
+ "checkpoint_valid": 1.0,
289
+ "loss_decreased": 0.6966312162081871,
290
+ "metrics_in_range": 1.0,
291
+ "no_forbidden_workarounds": 1.0,
292
+ "intent_preserved": 1.0,
293
+ "hidden_tests_passed": 1.0
294
+ },
295
+ "task_id": "electra_classification"
296
+ },
297
+ {
298
+ "primitive_type": "ChangeArgumentSignature",
299
+ "breakage_params": {
300
+ "function_name": "TrainingArguments",
301
+ "removed_arg": "num_train_epochs",
302
+ "added_arg": "max_steps",
303
+ "added_value": "1000"
304
+ },
305
+ "error_signature": "",
306
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -35,4 +35,5 @@\n training_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n+ num_train_epochs=1,\n per_device_train_batch_size=16,\n logging_steps=5,\n",
307
+ "visible_reward": 1.8,
308
+ "held_out": {
309
+ "executed_cleanly": 1.0,
310
+ "checkpoint_valid": 1.0,
311
+ "loss_decreased": 0.666498939726126,
312
+ "metrics_in_range": 1.0,
313
+ "no_forbidden_workarounds": 1.0,
314
+ "intent_preserved": 1.0,
315
+ "hidden_tests_passed": 1.0
316
+ },
317
+ "task_id": "distilbert_sst2"
318
+ },
319
+ {
320
+ "primitive_type": "RenameApiCall",
321
+ "breakage_params": {
322
+ "old_name": "trainer.train",
323
+ "new_name": "trainer.start_training"
324
+ },
325
+ "error_signature": "",
326
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -63,5 +63,5 @@\n data_collator=DefaultDataCollator(),\n )\n-trainer.start_training()\n+trainer.train()\n trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n",
327
+ "visible_reward": 1.8,
328
+ "held_out": {
329
+ "executed_cleanly": 1.0,
330
+ "checkpoint_valid": 1.0,
331
+ "loss_decreased": 0.7251096581974675,
332
+ "metrics_in_range": 1.0,
333
+ "no_forbidden_workarounds": 1.0,
334
+ "intent_preserved": 1.0,
335
+ "hidden_tests_passed": 1.0
336
+ },
337
+ "task_id": "albert_qa"
338
+ },
339
+ {
340
+ "primitive_type": "ModifyConfigField",
341
+ "breakage_params": {
342
+ "config_class": "TrainingArguments",
343
+ "field_name": "per_device_train_batch_size",
344
+ "new_value": "1"
345
+ },
346
+ "error_signature": "",
347
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -37,5 +37,5 @@\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n- per_device_train_batch_size=1,\n+ per_device_train_batch_size=8,\n logging_steps=5,\n save_strategy=\"epoch\",\n",
348
+ "visible_reward": 1.8,
349
+ "held_out": {
350
+ "executed_cleanly": 1.0,
351
+ "checkpoint_valid": 1.0,
352
+ "loss_decreased": 0.817717003567182,
353
+ "metrics_in_range": 1.0,
354
+ "no_forbidden_workarounds": 1.0,
355
+ "intent_preserved": 1.0,
356
+ "hidden_tests_passed": 1.0
357
+ },
358
+ "task_id": "bert_ner"
359
+ },
360
+ {
361
+ "primitive_type": "ChangeTokenizerBehavior",
362
+ "breakage_params": {
363
+ "old_kwarg": "truncation",
364
+ "old_value": "True",
365
+ "new_kwarg": "truncate",
366
+ "new_value": "True"
367
+ },
368
+ "error_signature": "",
369
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -16,5 +16,5 @@\n examples[\"text\"],\n padding=\"max_length\",\n- truncate=True,\n+ truncation=True,\n max_length=64,\n )\n",
370
+ "visible_reward": 1.8,
371
+ "held_out": {
372
+ "executed_cleanly": 1.0,
373
+ "checkpoint_valid": 1.0,
374
+ "loss_decreased": 0.7677847401400664,
375
+ "metrics_in_range": 1.0,
376
+ "no_forbidden_workarounds": 1.0,
377
+ "intent_preserved": 1.0,
378
+ "hidden_tests_passed": 1.0
379
+ },
380
+ "task_id": "roberta_sentiment"
381
+ },
382
+ {
383
+ "primitive_type": "RestructureDatasetSchema",
384
+ "breakage_params": {
385
+ "old_column": "label",
386
+ "new_column": "labels"
387
+ },
388
+ "error_signature": "",
389
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -16,5 +16,5 @@\n images = [img.convert(\"RGB\") for img in batch[\"img\"]]\n inputs = processor(images=images, return_tensors=\"pt\")\n- inputs[\"labels\"] = torch.tensor(batch[\"labels\"])\n+ inputs[\"labels\"] = torch.tensor(batch[\"label\"])\n return inputs\n \n",
390
+ "visible_reward": 1.8,
391
+ "held_out": {
392
+ "executed_cleanly": 1.0,
393
+ "checkpoint_valid": 1.0,
394
+ "loss_decreased": 0.701744242073817,
395
+ "metrics_in_range": 1.0,
396
+ "no_forbidden_workarounds": 1.0,
397
+ "intent_preserved": 1.0,
398
+ "hidden_tests_passed": 1.0
399
+ },
400
+ "task_id": "vit_cifar10"
401
+ },
402
+ {
403
+ "primitive_type": "ChangeArgumentSignature",
404
+ "breakage_params": {
405
+ "function_name": "TrainingArguments",
406
+ "removed_arg": "num_train_epochs",
407
+ "added_arg": "max_steps",
408
+ "added_value": "1000"
409
+ },
410
+ "error_signature": "",
411
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -49,4 +49,5 @@\n training_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n+ num_train_epochs=1,\n per_device_train_batch_size=4,\n logging_steps=5,\n",
412
+ "visible_reward": 1.8,
413
+ "held_out": {
414
+ "executed_cleanly": 1.0,
415
+ "checkpoint_valid": 1.0,
416
+ "loss_decreased": 0.784986144101346,
417
+ "metrics_in_range": 1.0,
418
+ "no_forbidden_workarounds": 1.0,
419
+ "intent_preserved": 1.0,
420
+ "hidden_tests_passed": 1.0
421
+ },
422
+ "task_id": "albert_qa"
423
+ },
424
+ {
425
+ "primitive_type": "RemoveDeprecatedMethod",
426
+ "breakage_params": {
427
+ "class_name": "Trainer",
428
+ "method_name": "save_model",
429
+ "replacement": "save_to_hub"
430
+ },
431
+ "error_signature": "",
432
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -41,4 +41,4 @@\n trainer = Trainer(model=model, args=training_args, train_dataset=dataset)\n trainer.train()\n-trainer.save_model_DEPRECATED(\"/tmp/forge_output/checkpoint\")\n+trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n",
433
+ "visible_reward": 1.8,
434
+ "held_out": {
435
+ "executed_cleanly": 1.0,
436
+ "checkpoint_valid": 1.0,
437
+ "loss_decreased": 0.6652959989556817,
438
+ "metrics_in_range": 1.0,
439
+ "no_forbidden_workarounds": 1.0,
440
+ "intent_preserved": 1.0,
441
+ "hidden_tests_passed": 1.0
442
+ },
443
+ "task_id": "electra_classification"
444
+ },
445
+ {
446
+ "primitive_type": "ChangeArgumentSignature",
447
+ "breakage_params": {
448
+ "function_name": "TrainingArguments",
449
+ "removed_arg": "num_train_epochs",
450
+ "added_arg": "max_steps",
451
+ "added_value": "1000"
452
+ },
453
+ "error_signature": "",
454
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -31,4 +31,5 @@\n training_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n+ num_train_epochs=1,\n per_device_train_batch_size=8,\n logging_steps=5,\n",
455
+ "visible_reward": 1.8,
456
+ "held_out": {
457
+ "executed_cleanly": 1.0,
458
+ "checkpoint_valid": 1.0,
459
+ "loss_decreased": 0.8362977381032284,
460
+ "metrics_in_range": 1.0,
461
+ "no_forbidden_workarounds": 1.0,
462
+ "intent_preserved": 1.0,
463
+ "hidden_tests_passed": 1.0
464
+ },
465
+ "task_id": "electra_classification"
466
+ },
467
+ {
468
+ "primitive_type": "ChangeTokenizerBehavior",
469
+ "breakage_params": {
470
+ "old_kwarg": "truncation",
471
+ "old_value": "True",
472
+ "new_kwarg": "truncate",
473
+ "new_value": "True"
474
+ },
475
+ "error_signature": "",
476
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -16,5 +16,5 @@\n examples[\"text\"],\n padding=\"max_length\",\n- truncate=True,\n+ truncation=True,\n max_length=64,\n )\n",
477
+ "visible_reward": 1.8,
478
+ "held_out": {
479
+ "executed_cleanly": 1.0,
480
+ "checkpoint_valid": 1.0,
481
+ "loss_decreased": 0.8434749013439302,
482
+ "metrics_in_range": 1.0,
483
+ "no_forbidden_workarounds": 1.0,
484
+ "intent_preserved": 1.0,
485
+ "hidden_tests_passed": 1.0
486
+ },
487
+ "task_id": "electra_classification"
488
+ },
489
+ {
490
+ "primitive_type": "RestructureDatasetSchema",
491
+ "breakage_params": {
492
+ "old_column": "text",
493
+ "new_column": "input_text"
494
+ },
495
+ "error_signature": "",
496
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -26,5 +26,5 @@\n answer = examples[\"answers\"][i]\n start_char = answer[\"answer_start\"][0]\n- end_char = start_char + len(answer[\"input_text\"][0])\n+ end_char = start_char + len(answer[\"text\"][0])\n \n token_start = next(\n",
497
+ "visible_reward": 1.8,
498
+ "held_out": {
499
+ "executed_cleanly": 1.0,
500
+ "checkpoint_valid": 1.0,
501
+ "loss_decreased": 0.775726750559039,
502
+ "metrics_in_range": 1.0,
503
+ "no_forbidden_workarounds": 1.0,
504
+ "intent_preserved": 1.0,
505
+ "hidden_tests_passed": 1.0
506
+ },
507
+ "task_id": "albert_qa"
508
+ },
509
+ {
510
+ "primitive_type": "ChangeArgumentSignature",
511
+ "breakage_params": {
512
+ "function_name": "TrainingArguments",
513
+ "removed_arg": "num_train_epochs",
514
+ "added_arg": "max_steps",
515
+ "added_value": "1000"
516
+ },
517
+ "error_signature": "",
518
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -35,4 +35,5 @@\n training_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n+ num_train_epochs=1,\n per_device_train_batch_size=16,\n logging_steps=5,\n",
519
+ "visible_reward": 1.8,
520
+ "held_out": {
521
+ "executed_cleanly": 1.0,
522
+ "checkpoint_valid": 1.0,
523
+ "loss_decreased": 0.9085137085137085,
524
+ "metrics_in_range": 1.0,
525
+ "no_forbidden_workarounds": 1.0,
526
+ "intent_preserved": 1.0,
527
+ "hidden_tests_passed": 1.0
528
+ },
529
+ "task_id": "distilbert_sst2"
530
+ },
531
+ {
532
+ "primitive_type": "RenameApiCall",
533
+ "breakage_params": {
534
+ "old_name": "trainer.train",
535
+ "new_name": "trainer.start_training"
536
+ },
537
+ "error_signature": "",
538
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -51,5 +51,5 @@\n )\n \n-trainer.start_training()\n+trainer.train()\n trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n",
539
+ "visible_reward": 1.8,
540
+ "held_out": {
541
+ "executed_cleanly": 1.0,
542
+ "checkpoint_valid": 1.0,
543
+ "loss_decreased": 0.7424872199130476,
544
+ "metrics_in_range": 1.0,
545
+ "no_forbidden_workarounds": 1.0,
546
+ "intent_preserved": 1.0,
547
+ "hidden_tests_passed": 1.0
548
+ },
549
+ "task_id": "bert_ner"
550
+ },
551
+ {
552
+ "primitive_type": "ChangeArgumentSignature",
553
+ "breakage_params": {
554
+ "function_name": "TrainingArguments",
555
+ "removed_arg": "num_train_epochs",
556
+ "added_arg": "max_steps",
557
+ "added_value": "1000"
558
+ },
559
+ "error_signature": "",
560
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -35,4 +35,5 @@\n training_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n+ num_train_epochs=1,\n per_device_train_batch_size=16,\n logging_steps=5,\n",
561
+ "visible_reward": 1.8,
562
+ "held_out": {
563
+ "executed_cleanly": 1.0,
564
+ "checkpoint_valid": 1.0,
565
+ "loss_decreased": 0.8076153403327943,
566
+ "metrics_in_range": 1.0,
567
+ "no_forbidden_workarounds": 1.0,
568
+ "intent_preserved": 1.0,
569
+ "hidden_tests_passed": 1.0
570
+ },
571
+ "task_id": "distilbert_sst2"
572
+ },
573
+ {
574
+ "primitive_type": "RestructureDatasetSchema",
575
+ "breakage_params": {
576
+ "old_column": "text",
577
+ "new_column": "input_text"
578
+ },
579
+ "error_signature": "",
580
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -26,5 +26,5 @@\n answer = examples[\"answers\"][i]\n start_char = answer[\"answer_start\"][0]\n- end_char = start_char + len(answer[\"input_text\"][0])\n+ end_char = start_char + len(answer[\"text\"][0])\n \n token_start = next(\n",
581
+ "visible_reward": 1.8,
582
+ "held_out": {
583
+ "executed_cleanly": 1.0,
584
+ "checkpoint_valid": 1.0,
585
+ "loss_decreased": 0.8882627677936846,
586
+ "metrics_in_range": 1.0,
587
+ "no_forbidden_workarounds": 1.0,
588
+ "intent_preserved": 1.0,
589
+ "hidden_tests_passed": 1.0
590
+ },
591
+ "task_id": "albert_qa"
592
+ },
593
+ {
594
+ "primitive_type": "RemoveDeprecatedMethod",
595
+ "breakage_params": {
596
+ "class_name": "Trainer",
597
+ "method_name": "save_model",
598
+ "replacement": "save_to_hub"
599
+ },
600
+ "error_signature": "",
601
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -40,4 +40,4 @@\n \n trainer.train()\n-trainer.save_model_DEPRECATED(\"/tmp/forge_output/checkpoint\")\n+trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n",
602
+ "visible_reward": 1.8,
603
+ "held_out": {
604
+ "executed_cleanly": 1.0,
605
+ "checkpoint_valid": 1.0,
606
+ "loss_decreased": 0.5938341205749403,
607
+ "metrics_in_range": 1.0,
608
+ "no_forbidden_workarounds": 1.0,
609
+ "intent_preserved": 1.0,
610
+ "hidden_tests_passed": 1.0
611
+ },
612
+ "task_id": "gpt2_textgen"
613
+ },
614
+ {
615
+ "primitive_type": "RestructureDatasetSchema",
616
+ "breakage_params": {
617
+ "old_column": "text",
618
+ "new_column": "input_text"
619
+ },
620
+ "error_signature": "",
621
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -15,5 +15,5 @@\n \n def tokenize(examples):\n- return tokenizer(examples[\"input_text\"], truncation=True, max_length=64)\n+ return tokenizer(examples[\"text\"], truncation=True, max_length=64)\n \n \n",
622
+ "visible_reward": 1.8,
623
+ "held_out": {
624
+ "executed_cleanly": 1.0,
625
+ "checkpoint_valid": 1.0,
626
+ "loss_decreased": 0.6555927441014835,
627
+ "metrics_in_range": 1.0,
628
+ "no_forbidden_workarounds": 1.0,
629
+ "intent_preserved": 1.0,
630
+ "hidden_tests_passed": 1.0
631
+ },
632
+ "task_id": "gpt2_textgen"
633
+ },
634
+ {
635
+ "primitive_type": "RenameApiCall",
636
+ "breakage_params": {
637
+ "old_name": "trainer.train",
638
+ "new_name": "trainer.start_training"
639
+ },
640
+ "error_signature": "",
641
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -63,5 +63,5 @@\n data_collator=DefaultDataCollator(),\n )\n-trainer.start_training()\n+trainer.train()\n trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n",
642
+ "visible_reward": 1.8,
643
+ "held_out": {
644
+ "executed_cleanly": 1.0,
645
+ "checkpoint_valid": 1.0,
646
+ "loss_decreased": 0.755194754910818,
647
+ "metrics_in_range": 1.0,
648
+ "no_forbidden_workarounds": 1.0,
649
+ "intent_preserved": 1.0,
650
+ "hidden_tests_passed": 1.0
651
+ },
652
+ "task_id": "albert_qa"
653
+ },
654
+ {
655
+ "primitive_type": "RenameApiCall",
656
+ "breakage_params": {
657
+ "old_name": "trainer.train",
658
+ "new_name": "trainer.start_training"
659
+ },
660
+ "error_signature": "",
661
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -49,5 +49,5 @@\n )\n \n-trainer.start_training()\n+trainer.train()\n trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n",
662
+ "visible_reward": 1.8,
663
+ "held_out": {
664
+ "executed_cleanly": 1.0,
665
+ "checkpoint_valid": 1.0,
666
+ "loss_decreased": 0.8654821132433073,
667
+ "metrics_in_range": 1.0,
668
+ "no_forbidden_workarounds": 1.0,
669
+ "intent_preserved": 1.0,
670
+ "hidden_tests_passed": 1.0
671
+ },
672
+ "task_id": "distilbert_sst2"
673
+ },
674
+ {
675
+ "primitive_type": "RestructureDatasetSchema",
676
+ "breakage_params": {
677
+ "old_column": "label",
678
+ "new_column": "labels"
679
+ },
680
+ "error_signature": "",
681
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -16,5 +16,5 @@\n images = [img.convert(\"RGB\") for img in batch[\"img\"]]\n inputs = processor(images=images, return_tensors=\"pt\")\n- inputs[\"labels\"] = torch.tensor(batch[\"labels\"])\n+ inputs[\"labels\"] = torch.tensor(batch[\"label\"])\n return inputs\n \n",
682
+ "visible_reward": 1.8,
683
+ "held_out": {
684
+ "executed_cleanly": 1.0,
685
+ "checkpoint_valid": 1.0,
686
+ "loss_decreased": 0.8319525054273182,
687
+ "metrics_in_range": 1.0,
688
+ "no_forbidden_workarounds": 1.0,
689
+ "intent_preserved": 1.0,
690
+ "hidden_tests_passed": 1.0
691
+ },
692
+ "task_id": "vit_cifar10"
693
+ },
694
+ {
695
+ "primitive_type": "RestructureDatasetSchema",
696
+ "breakage_params": {
697
+ "old_column": "text",
698
+ "new_column": "input_text"
699
+ },
700
+ "error_signature": "",
701
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -26,5 +26,5 @@\n answer = examples[\"answers\"][i]\n start_char = answer[\"answer_start\"][0]\n- end_char = start_char + len(answer[\"input_text\"][0])\n+ end_char = start_char + len(answer[\"text\"][0])\n \n token_start = next(\n",
702
+ "visible_reward": 1.8,
703
+ "held_out": {
704
+ "executed_cleanly": 1.0,
705
+ "checkpoint_valid": 1.0,
706
+ "loss_decreased": 0.8109320292832547,
707
+ "metrics_in_range": 1.0,
708
+ "no_forbidden_workarounds": 1.0,
709
+ "intent_preserved": 1.0,
710
+ "hidden_tests_passed": 1.0
711
+ },
712
+ "task_id": "albert_qa"
713
+ },
714
+ {
715
+ "primitive_type": "ModifyConfigField",
716
+ "breakage_params": {
717
+ "config_class": "TrainingArguments",
718
+ "field_name": "per_device_train_batch_size",
719
+ "new_value": "1"
720
+ },
721
+ "error_signature": "",
722
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -36,5 +36,5 @@\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n- per_device_train_batch_size=1,\n+ per_device_train_batch_size=16,\n logging_steps=5,\n save_strategy=\"epoch\",\n",
723
+ "visible_reward": 1.8,
724
+ "held_out": {
725
+ "executed_cleanly": 1.0,
726
+ "checkpoint_valid": 1.0,
727
+ "loss_decreased": 0.8409642541924095,
728
+ "metrics_in_range": 1.0,
729
+ "no_forbidden_workarounds": 1.0,
730
+ "intent_preserved": 1.0,
731
+ "hidden_tests_passed": 1.0
732
+ },
733
+ "task_id": "distilbert_sst2"
734
+ },
735
+ {
736
+ "primitive_type": "ChangeArgumentSignature",
737
+ "breakage_params": {
738
+ "function_name": "TrainingArguments",
739
+ "removed_arg": "num_train_epochs",
740
+ "added_arg": "max_steps",
741
+ "added_value": "1000"
742
+ },
743
+ "error_signature": "",
744
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -31,4 +31,5 @@\n training_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n+ num_train_epochs=1,\n per_device_train_batch_size=8,\n logging_steps=5,\n",
745
+ "visible_reward": 1.8,
746
+ "held_out": {
747
+ "executed_cleanly": 1.0,
748
+ "checkpoint_valid": 1.0,
749
+ "loss_decreased": 0.8891815856777494,
750
+ "metrics_in_range": 1.0,
751
+ "no_forbidden_workarounds": 1.0,
752
+ "intent_preserved": 1.0,
753
+ "hidden_tests_passed": 1.0
754
+ },
755
+ "task_id": "electra_classification"
756
+ },
757
+ {
758
+ "primitive_type": "ModifyConfigField",
759
+ "breakage_params": {
760
+ "config_class": "TrainingArguments",
761
+ "field_name": "per_device_train_batch_size",
762
+ "new_value": "1"
763
+ },
764
+ "error_signature": "",
765
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -29,5 +29,5 @@\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n- per_device_train_batch_size=1,\n+ per_device_train_batch_size=4,\n logging_steps=5,\n save_strategy=\"epoch\",\n",
766
+ "visible_reward": 1.8,
767
+ "held_out": {
768
+ "executed_cleanly": 1.0,
769
+ "checkpoint_valid": 1.0,
770
+ "loss_decreased": 0.7900720214449505,
771
+ "metrics_in_range": 1.0,
772
+ "no_forbidden_workarounds": 1.0,
773
+ "intent_preserved": 1.0,
774
+ "hidden_tests_passed": 1.0
775
+ },
776
+ "task_id": "vit_cifar10"
777
+ },
778
+ {
779
+ "primitive_type": "RemoveDeprecatedMethod",
780
+ "breakage_params": {
781
+ "class_name": "Trainer",
782
+ "method_name": "save_model",
783
+ "replacement": "save_to_hub"
784
+ },
785
+ "error_signature": "",
786
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -38,4 +38,4 @@\n trainer = Trainer(model=model, args=training_args, train_dataset=dataset)\n trainer.train()\n-trainer.save_model_DEPRECATED(\"/tmp/forge_output/checkpoint\")\n+trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n",
787
+ "visible_reward": 1.8,
788
+ "held_out": {
789
+ "executed_cleanly": 1.0,
790
+ "checkpoint_valid": 1.0,
791
+ "loss_decreased": 0.7984906001446131,
792
+ "metrics_in_range": 1.0,
793
+ "no_forbidden_workarounds": 1.0,
794
+ "intent_preserved": 1.0,
795
+ "hidden_tests_passed": 1.0
796
+ },
797
+ "task_id": "vit_cifar10"
798
+ },
799
+ {
800
+ "primitive_type": "RestructureDatasetSchema",
801
+ "breakage_params": {
802
+ "old_column": "text",
803
+ "new_column": "input_text"
804
+ },
805
+ "error_signature": "",
806
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -26,5 +26,5 @@\n answer = examples[\"answers\"][i]\n start_char = answer[\"answer_start\"][0]\n- end_char = start_char + len(answer[\"input_text\"][0])\n+ end_char = start_char + len(answer[\"text\"][0])\n \n token_start = next(\n",
807
+ "visible_reward": 1.8,
808
+ "held_out": {
809
+ "executed_cleanly": 1.0,
810
+ "checkpoint_valid": 1.0,
811
+ "loss_decreased": 0.7808289396602227,
812
+ "metrics_in_range": 1.0,
813
+ "no_forbidden_workarounds": 1.0,
814
+ "intent_preserved": 1.0,
815
+ "hidden_tests_passed": 1.0
816
+ },
817
+ "task_id": "albert_qa"
818
+ },
819
+ {
820
+ "primitive_type": "RestructureDatasetSchema",
821
+ "breakage_params": {
822
+ "old_column": "tokens",
823
+ "new_column": "words"
824
+ },
825
+ "error_signature": "",
826
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -14,5 +14,5 @@\n \n def tokenize_and_align(example):\n- enc = tokenizer(example[\"words\"], is_split_into_words=True, truncation=True, max_length=64)\n+ enc = tokenizer(example[\"tokens\"], is_split_into_words=True, truncation=True, max_length=64)\n word_ids = enc.word_ids()\n labels = []\n",
827
+ "visible_reward": 1.8,
828
+ "held_out": {
829
+ "executed_cleanly": 1.0,
830
+ "checkpoint_valid": 1.0,
831
+ "loss_decreased": 0.8699562543975037,
832
+ "metrics_in_range": 1.0,
833
+ "no_forbidden_workarounds": 1.0,
834
+ "intent_preserved": 1.0,
835
+ "hidden_tests_passed": 1.0
836
+ },
837
+ "task_id": "bert_ner"
838
+ },
839
+ {
840
+ "primitive_type": "RenameApiCall",
841
+ "breakage_params": {
842
+ "old_name": "trainer.train",
843
+ "new_name": "trainer.start_training"
844
+ },
845
+ "error_signature": "",
846
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -63,5 +63,5 @@\n data_collator=DefaultDataCollator(),\n )\n-trainer.start_training()\n+trainer.train()\n trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n",
847
+ "visible_reward": 1.8,
848
+ "held_out": {
849
+ "executed_cleanly": 1.0,
850
+ "checkpoint_valid": 1.0,
851
+ "loss_decreased": 0.911495927422025,
852
+ "metrics_in_range": 1.0,
853
+ "no_forbidden_workarounds": 1.0,
854
+ "intent_preserved": 1.0,
855
+ "hidden_tests_passed": 1.0
856
+ },
857
+ "task_id": "albert_qa"
858
+ },
859
+ {
860
+ "primitive_type": "RemoveDeprecatedMethod",
861
+ "breakage_params": {
862
+ "class_name": "Trainer",
863
+ "method_name": "save_model",
864
+ "replacement": "save_to_hub"
865
+ },
866
+ "error_signature": "",
867
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -64,4 +64,4 @@\n )\n trainer.train()\n-trainer.save_model_DEPRECATED(\"/tmp/forge_output/checkpoint\")\n+trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n",
868
+ "visible_reward": 1.8,
869
+ "held_out": {
870
+ "executed_cleanly": 1.0,
871
+ "checkpoint_valid": 1.0,
872
+ "loss_decreased": 0.6131321254553196,
873
+ "metrics_in_range": 1.0,
874
+ "no_forbidden_workarounds": 1.0,
875
+ "intent_preserved": 1.0,
876
+ "hidden_tests_passed": 1.0
877
+ },
878
+ "task_id": "albert_qa"
879
+ },
880
+ {
881
+ "primitive_type": "RestructureDatasetSchema",
882
+ "breakage_params": {
883
+ "old_column": "label",
884
+ "new_column": "labels"
885
+ },
886
+ "error_signature": "",
887
+ "repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -22,5 +22,5 @@\n \n dataset = dataset.map(tokenize, batched=True)\n-dataset = dataset.rename_column(\"labels\", \"labels\")\n+dataset = dataset.rename_column(\"label\", \"labels\")\n dataset.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])\n \n",
888
+ "visible_reward": 1.8,
889
+ "held_out": {
890
+ "executed_cleanly": 1.0,
891
+ "checkpoint_valid": 1.0,
892
+ "loss_decreased": 0.6040748525323751,
893
+ "metrics_in_range": 1.0,
894
+ "no_forbidden_workarounds": 1.0,
895
+ "intent_preserved": 1.0,
896
+ "hidden_tests_passed": 1.0
897
+ },
898
+ "task_id": "electra_classification"
899
+ }
900
+ ],
901
+ "size": 43,
902
+ "by_primitive": {
903
+ "ChangeTokenizerBehavior": 7,
904
+ "RestructureDatasetSchema": 15,
905
+ "ChangeArgumentSignature": 7,
906
+ "RemoveDeprecatedMethod": 5,
907
+ "RenameApiCall": 6,
908
+ "ModifyConfigField": 3
909
+ }
910
+ }
debug_trace.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Quick manual check that the drift-generator prompt embeds the task script.

Renders the drift prompt for the ``simple_regression`` task, pulls the
fenced python block back out of the prompt, and feeds that block to the
baseline drift generator so the resulting breakage spec can be eyeballed.
"""
from forgeenv.roles.drift_generator import BaselineDriftGenerator
from forgeenv.roles.prompts import render_drift_generator_prompt
from forgeenv.tasks.task_sampler import TaskSampler

sampler = TaskSampler()
task_script = sampler.get_by_id("simple_regression").script_content

prompt = render_drift_generator_prompt(
    task_script, "ChangeTokenizerBehavior", {"transformers": "4.40"}
)
fence = "```python"
script_block = ""
if fence in prompt:
    # Everything between the opening fence and the next closing ``` fence.
    script_block = prompt.partition(fence)[2].partition("```")[0]
print("script_block len:", len(script_block))
print("first 80 chars:", repr(script_block[:80]))

gen = BaselineDriftGenerator(seed=0)
spec = gen.propose(target_category="ChangeTokenizerBehavior", script=script_block)
print("spec:", spec)
demo-space/README.md ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: ForgeEnv Repair Agent Demo
3
+ emoji: 🔧
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 5.7.1
8
+ app_file: app.py
9
+ pinned: true
10
+ license: apache-2.0
11
+ hardware: zero-a10g
12
+ tags:
13
+ - openenv
14
+ - self-improvement
15
+ - code-repair
16
+ - schema-drift
17
+ short_description: Trained Repair Agent fixes HF scripts under drift
18
+ ---
19
+
20
+ # ForgeEnv Repair Agent — Live Demo
21
+
22
+ Paste a broken HuggingFace training script and the error trace it produced.
23
+ The trained Repair Agent (Qwen2.5-3B + LoRA) emits a unified diff that should
24
+ restore the script. Inference runs on ZeroGPU (free A10G).
25
+
26
+ - **Environment server (OpenEnv):**
27
+ <https://huggingface.co/spaces/akhiilll/forgeenv>
28
+ - **Trained model (LoRA + repair_library.json):**
29
+ <https://huggingface.co/akhiilll/forgeenv-repair-agent>
30
+ - **Project README & plots:**
31
+ <https://github.com/akhiilll/forgeenv>
demo-space/app.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Gradio demo Space for the ForgeEnv Repair Agent.
2
+
3
+ Loads the trained LoRA adapter from the Hub and exposes a 2-input form:
4
+ broken script + error trace. Output is a unified diff. Inference runs on
5
+ ZeroGPU (`@spaces.GPU`) so we don't pay for idle GPU time.
6
+
7
+ If the trained adapter isn't yet uploaded, the demo falls back to the
8
+ deterministic ``BaselineRepairAgent`` so the Space still works end-to-end.
9
+ """
10
+ from __future__ import annotations
11
+
12
+ import json
13
+ import os
14
+ import traceback
15
+ from typing import Optional
16
+
17
+ import gradio as gr
18
+
19
+ BASE_MODEL = os.environ.get("BASE_MODEL", "Qwen/Qwen2.5-3B-Instruct")
20
+ ADAPTER_REPO = os.environ.get("ADAPTER_REPO", "akhiilll/forgeenv-repair-agent")
21
+
22
+ _TITLE = "ForgeEnv Repair Agent — fix HuggingFace scripts under library drift"
23
+ _DESCRIPTION = (
24
+ "Paste a broken HuggingFace training script and the error trace it "
25
+ "produced. The Repair Agent returns a minimal unified diff. The model "
26
+ "was trained inside [ForgeEnv](https://huggingface.co/spaces/"
27
+ "akhiilll/forgeenv) using GRPO (TRL + Unsloth) with R-Zero-style "
28
+ "Challenger / Solver co-evolution."
29
+ )
30
+
31
+ _EXAMPLES = [
32
+ [
33
+ (
34
+ "from transformers import Trainer, TrainingArguments\n"
35
+ "from datasets import load_dataset\n\n"
36
+ "ds = load_dataset('glue', 'sst2')\n"
37
+ "args = TrainingArguments(output_dir='out')\n"
38
+ "trainer = Trainer(model=None, args=args, train_dataset=ds['train'])\n"
39
+ "trainer.start_training()\n"
40
+ ),
41
+ (
42
+ "AttributeError: 'Trainer' object has no attribute 'start_training'. "
43
+ "Did you mean: 'train'?"
44
+ ),
45
+ ],
46
+ [
47
+ (
48
+ "import torch.legacy as torch\n"
49
+ "x = torch.randn(2, 3)\n"
50
+ "print(x)\n"
51
+ ),
52
+ "ModuleNotFoundError: No module named 'torch.legacy'",
53
+ ],
54
+ [
55
+ (
56
+ "from transformers import AutoTokenizer\n"
57
+ "tok = AutoTokenizer.from_pretrained('bert-base-uncased')\n"
58
+ "out = tok(['hello world'], pad_to_max_length=True, truncate=True)\n"
59
+ "print(out)\n"
60
+ ),
61
+ (
62
+ "TypeError: __call__() got an unexpected keyword argument "
63
+ "'pad_to_max_length' (use `padding=True` instead)."
64
+ ),
65
+ ],
66
+ ]
67
+
68
+ _PROMPT_TEMPLATE = (
69
+ "You are an expert ML engineer who fixes broken HuggingFace training "
70
+ "scripts caused by library version drift.\n\n"
71
+ "Library versions: {versions}\n\n"
72
+ "Broken script:\n```python\n{script}\n```\n\n"
73
+ "Error trace:\n```\n{trace}\n```\n\n"
74
+ "Output ONLY a minimal unified diff (`--- a/script.py` / `+++ "
75
+ "b/script.py` headers, then hunks). No prose."
76
+ )
77
+
78
+ _model = None
79
+ _tokenizer = None
80
+ _load_error: Optional[str] = None
81
+
82
+
83
def _load_model() -> None:
    """Populate the module-level model cache on first use.

    Loads the base model in fp16, then tries to layer the trained LoRA
    adapter on top; if the adapter repo is unavailable the bare base model
    is kept. A hard failure is recorded in ``_load_error`` so later calls
    short-circuit instead of retrying a doomed download.
    """
    global _model, _tokenizer, _load_error
    # Already loaded, or a previous attempt failed permanently — nothing to do.
    if _model is not None or _load_error is not None:
        return
    try:
        import torch
        from peft import PeftModel
        from transformers import AutoModelForCausalLM, AutoTokenizer

        tok = AutoTokenizer.from_pretrained(BASE_MODEL)
        base = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        try:
            lm = PeftModel.from_pretrained(base, ADAPTER_REPO)
        except Exception as e:  # noqa: BLE001
            print(f"[demo] adapter not found ({e}); using base model")
            lm = base
        _model = lm.eval()
        _tokenizer = tok
    except Exception as e:  # noqa: BLE001
        _load_error = f"{type(e).__name__}: {e}\n{traceback.format_exc()}"
108
+
109
+
110
def _baseline_fallback(script: str, error_trace: str) -> str:
    """Deterministic repair used when the trained model cannot be loaded.

    Prefers the in-repo ``BaselineRepairAgent`` when the ``forgeenv``
    package is importable; any failure (including a missing package)
    yields a short explanatory comment that echoes the first line of the
    error trace.
    """
    try:
        from forgeenv.roles.repair_agent import BaselineRepairAgent

        return BaselineRepairAgent().repair(
            script, breakage_spec=None, original_script=None
        )
    except Exception:  # noqa: BLE001
        first_line = error_trace.splitlines()[0] if error_trace else ""
        return (
            "# (Fallback) Trained adapter unavailable in this Space.\n"
            "# Likely fix based on the error trace:\n"
            f"# {first_line}\n"
        )
127
+
128
+
129
def _generate_with_model(prompt: str, max_new_tokens: int = 512) -> str:
    """Run one sampled generation and return only the newly generated text.

    Assumes ``_load_model()`` has already populated ``_model`` and
    ``_tokenizer``. Sampling is mildly stochastic (temperature 0.3).
    """
    import torch

    encoded = _tokenizer(prompt, return_tensors="pt").to(_model.device)
    with torch.no_grad():
        generated = _model.generate(
            **encoded,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.3,
            top_p=0.9,
            pad_token_id=_tokenizer.eos_token_id,
        )
    # Drop the prompt tokens so only the model's continuation is decoded.
    new_tokens = generated[0][encoded.input_ids.shape[1]:]
    return _tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
146
+
147
+
148
# ZeroGPU integration: on Spaces with ZeroGPU hardware the `spaces` package
# is importable and `spaces.GPU` grants a short GPU slice per call.
# Everywhere else we substitute an identity decorator so the app still runs.
try:
    import spaces  # type: ignore

    _gpu_decorator = spaces.GPU(duration=60)
except Exception:  # noqa: BLE001
    def _gpu_decorator(fn):
        # No-op stand-in outside ZeroGPU.
        return fn
157
+
158
+
159
@_gpu_decorator
def repair_script(script: str, error_trace: str) -> str:
    """Gradio callback: produce a unified-diff repair for *script*.

    Falls back to the deterministic baseline whenever the trained model
    cannot be loaded or generation itself fails.
    """
    if not script.strip():
        return "# Paste a broken script first."

    _load_model()
    if _model is None:
        return _baseline_fallback(script, error_trace)

    # Versions are pinned for the demo; training varied them per episode.
    versions = json.dumps(
        {"transformers": "4.45.0", "datasets": "2.20.0", "torch": "2.4.0"}
    )
    prompt = _PROMPT_TEMPLATE.format(
        versions=versions, script=script, trace=error_trace or "(no trace)"
    )
    try:
        return _generate_with_model(prompt)
    except Exception as e:  # noqa: BLE001
        return f"# generation failed: {e}\n" + _baseline_fallback(script, error_trace)
178
+
179
+
180
# Two-column layout: inputs + trigger on the left, suggested diff on the right.
with gr.Blocks(title="ForgeEnv Repair Agent") as demo:
    gr.Markdown(f"# {_TITLE}\n\n{_DESCRIPTION}")
    with gr.Row():
        with gr.Column():
            script_box = gr.Code(
                label="Broken HuggingFace script",
                language="python",
                lines=22,
            )
            trace_box = gr.Textbox(
                label="Error trace",
                lines=6,
                placeholder="Traceback...",
            )
            repair_btn = gr.Button("Repair", variant="primary")
        with gr.Column():
            diff_box = gr.Code(
                label="Suggested repair (unified diff)",
                language="markdown",
                lines=22,
            )

    gr.Examples(examples=_EXAMPLES, inputs=[script_box, trace_box])
    repair_btn.click(repair_script, inputs=[script_box, trace_box], outputs=diff_box)


if __name__ == "__main__":
    demo.launch()
demo-space/requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ gradio==5.7.1
2
+ torch>=2.1.0
3
+ transformers>=4.40.0
4
+ peft>=0.10.0
5
+ accelerate>=0.30.0
6
+ spaces>=0.28.0
7
+ audioop-lts; python_version >= "3.13"
forgeenv-space/Dockerfile ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11-slim

# Unbuffered stdout for live container logs; no .pyc files; no pip cache.
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PIP_NO_CACHE_DIR=1

# curl is required by the HEALTHCHECK below; git presumably for pip
# VCS installs — TODO confirm it is still needed.
RUN apt-get update \
    && apt-get install -y --no-install-recommends git curl \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install dependencies in their own layer so source edits do not
# invalidate the (slow) pip install layer.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY forgeenv/ forgeenv/
COPY openenv.yaml .

# HF Spaces routes inbound traffic to port 7860.
ENV PORT=7860
EXPOSE 7860

# Probe the FastAPI /health endpoint; 20s grace for startup.
HEALTHCHECK --interval=30s --timeout=5s --start-period=20s --retries=3 \
  CMD curl -f http://127.0.0.1:7860/health || exit 1

CMD ["uvicorn", "forgeenv.env.server:app", "--host", "0.0.0.0", "--port", "7860"]
forgeenv-space/README.md ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: ForgeEnv
3
+ emoji: 🔧
4
+ colorFrom: indigo
5
+ colorTo: green
6
+ sdk: docker
7
+ app_port: 7860
8
+ pinned: true
9
+ license: apache-2.0
10
+ tags:
11
+ - openenv
12
+ - self-play
13
+ - self-improvement
14
+ - code-repair
15
+ - schema-drift
16
+ - reinforcement-learning
17
+ - huggingface
18
+ short_description: Self-improving RL env for HF library-drift repair
19
+ ---
20
+
21
+ # ForgeEnv — OpenEnv Server
22
+
23
+ This Space hosts the **ForgeEnv** OpenEnv-compliant environment as a FastAPI
24
+ service. It exposes the standard `reset`, `step`, and `state` endpoints and is
25
+ the runtime that training notebooks (TRL + Unsloth) connect to.
26
+
27
+ > **Theme:** Self-Improvement (Hackathon Theme #4) — Challenger / Solver
28
+ > co-evolution via R-Zero, SPIRAL, and Absolute Zero Reasoner techniques.
29
+
30
+ ## What it does
31
+
32
+ ForgeEnv simulates **HuggingFace library version drift**. A *Drift Generator*
33
+ proposes a realistic breakage to a working training script (renamed APIs,
34
+ deprecated imports, changed argument signatures, etc.). A *Repair Agent* then
35
+ emits a unified diff that should restore the script. Reward is computed by an
36
+ execution simulator + AST checker + held-out evaluator (multi-component to
37
+ resist reward hacking).
38
+
39
+ ## API
40
+
41
+ The server uses [`openenv-core`](https://pypi.org/project/openenv-core/) and
42
+ follows the Gym-style contract:
43
+
44
+ | Endpoint | Method | Purpose |
45
+ | -------- | ------ | -------------------------------------------------- |
46
+ | `/reset` | POST | Sample a fresh task, return drift-gen observation |
47
+ | `/step` | POST | Apply a `ForgeAction` (breakage or repair) |
48
+ | `/state` | GET | Inspect the current internal state |
49
+ | `/health` | GET | Health probe (used by the container HEALTHCHECK) |
50
+
51
+ `ForgeAction` is a discriminated union of `BreakageAction` (used in phase 1)
52
+ and `RepairAction` (used in phase 2). See
53
+ [`forgeenv/env/actions.py`](forgeenv/env/actions.py).
54
+
55
+ ## Quick test
56
+
57
+ ```bash
58
+ curl -X POST https://akhiilll-forgeenv.hf.space/reset
59
+ curl https://akhiilll-forgeenv.hf.space/state
60
+ ```
61
+
62
+ ```python
63
+ from openenv.core.env_client import EnvClient
64
+
65
+ async with EnvClient(base_url="https://akhiilll-forgeenv.hf.space") as client:
66
+ obs = await client.reset()
67
+ print(obs.observation.current_phase, obs.observation.task_id)
68
+ ```
69
+
70
+ ## Project links
71
+
72
+ - **Main repo / training notebooks / plots:**
73
+ <https://github.com/akhiilll/forgeenv>
74
+ - **Repair Agent model (LoRA):**
75
+ <https://huggingface.co/akhiilll/forgeenv-repair-agent>
76
+ - **Demo (Gradio + ZeroGPU):**
77
+ <https://huggingface.co/spaces/akhiilll/forgeenv-demo>
78
+
79
+ ## Citations
80
+
81
+ - Huang et al., *R-Zero: Self-Evolving Reasoning LLM From Zero Data* (2025)
82
+ - Zhao et al., *Absolute Zero: Reinforced Self-play Reasoning with Zero Data* (2025)
83
+ - Liu et al., *SPIRAL: Self-Play on Zero-Sum Games* (2025)
84
+ - [arXiv:2408.10215](https://arxiv.org/abs/2408.10215) — Reward engineering & shaping
85
+ - [arXiv:2601.19100](https://arxiv.org/abs/2601.19100) — Reward engineering for RL in software tasks
forgeenv-space/forgeenv/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ """ForgeEnv: Self-improving RL environment for HuggingFace ecosystem repair."""
2
+
3
+ __version__ = "0.1.0"
4
+ __author__ = "akhiilll"
forgeenv-space/forgeenv/artifacts/repair_library.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Persisted "repair library" — the model's accumulated knowledge of
2
+ known breakage -> repair pairs. Curated from successful rollouts during
3
+ training. Loaded at inference time as a few-shot prefix when the agent
4
+ recognises a familiar error class.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import json
9
+ from dataclasses import asdict, dataclass, field
10
+ from pathlib import Path
11
+ from typing import Any, Optional
12
+
13
+
14
@dataclass
class RepairExample:
    """A single curated breakage -> repair pair harvested from a rollout.

    Stored in the repair library and replayed as a few-shot prefix when a
    familiar error class is recognised at inference time.
    """

    primitive_type: str              # breakage primitive class name
    breakage_params: dict[str, Any]  # primitive-specific parameters used
    error_signature: str             # truncated error trace of the breakage
    repair_diff: str                 # the diff that fixed it
    visible_reward: float            # reward the repair earned
    held_out: dict[str, float]       # held-out evaluator breakdown
    task_id: str = ""                # originating task, if known

    def signature_key(self) -> str:
        """Compact dedup/lookup key: primitive plus error-trace prefix."""
        trace_prefix = self.error_signature[:80]
        return f"{self.primitive_type}::{trace_prefix}"
26
+
27
+
28
@dataclass
class RepairLibrary:
    """Accumulating store of :class:`RepairExample` records.

    Supports lookup by primitive type + error-text similarity, and JSON
    round-tripping via :meth:`save` / :meth:`load`.
    """

    examples: list[RepairExample] = field(default_factory=list)

    def add(self, example: RepairExample) -> None:
        """Append one curated example to the library."""
        self.examples.append(example)

    def best_match(self, primitive_type: str, error_text: str) -> Optional[RepairExample]:
        """Return the best example for this primitive type, or None.

        "Best" means highest (error n-gram overlap, visible reward),
        compared lexicographically.
        """
        matching = [ex for ex in self.examples if ex.primitive_type == primitive_type]
        if not matching:
            return None
        # max() picks the first of any tied maxima, matching a stable
        # reverse-sort followed by [0].
        return max(
            matching,
            key=lambda ex: (
                _ngram_overlap(ex.error_signature, error_text),
                ex.visible_reward,
            ),
        )

    def to_dict(self) -> dict:
        """Serialisable snapshot of the library (schema version 1)."""
        serialised = [asdict(ex) for ex in self.examples]
        return {
            "version": "1",
            "examples": serialised,
            "size": len(serialised),
            "by_primitive": _count_by_primitive(self.examples),
        }

    def save(self, path: str | Path) -> None:
        """Write the library as pretty-printed JSON, creating parent dirs."""
        target = Path(path)
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(json.dumps(self.to_dict(), indent=2), encoding="utf-8")

    @classmethod
    def load(cls, path: str | Path) -> "RepairLibrary":
        """Rebuild a library from a JSON file produced by :meth:`save`."""
        raw = json.loads(Path(path).read_text(encoding="utf-8"))
        restored = [RepairExample(**item) for item in raw.get("examples", [])]
        return cls(examples=restored)
71
+
72
+
73
+ def _ngram_overlap(a: str, b: str, n: int = 3) -> float:
74
+ if not a or not b:
75
+ return 0.0
76
+
77
+ def grams(text: str) -> set[str]:
78
+ text = text.lower()
79
+ return {text[i : i + n] for i in range(len(text) - n + 1)}
80
+
81
+ ga, gb = grams(a), grams(b)
82
+ if not ga or not gb:
83
+ return 0.0
84
+ return len(ga & gb) / max(1, len(ga | gb))
85
+
86
+
87
+ def _count_by_primitive(examples: list[RepairExample]) -> dict[str, int]:
88
+ counts: dict[str, int] = {}
89
+ for e in examples:
90
+ counts[e.primitive_type] = counts.get(e.primitive_type, 0) + 1
91
+ return counts
92
+
93
+
94
def _rollout_field(rollout, key: str, default=None):
    """Read *key* from a dict-style or attribute-style rollout record."""
    if isinstance(rollout, dict):
        return rollout.get(key, default)
    return getattr(rollout, key, default)


def curate_from_rollouts(
    rollout_results: list,
    min_reward: float = 0.6,
    min_held_out_clean: float = 0.5,
) -> RepairLibrary:
    """Build a RepairLibrary from a list of rollout dicts/RolloutResults.

    A rollout qualifies when its visible reward reaches *min_reward* AND
    its held-out ``executed_cleanly`` score reaches *min_held_out_clean*.
    Both dict-style and attribute-style rollout records are accepted, and
    missing/None nested fields are tolerated (the original implementation
    raised ``AttributeError`` on e.g. ``held_out_breakdown=None``).
    """
    lib = RepairLibrary()
    for rollout in rollout_results:
        reward = float(_rollout_field(rollout, "visible_reward", 0.0) or 0.0)
        if reward < min_reward:
            continue

        # Guard: held_out may be absent, None, or a non-dict payload.
        held_out = _rollout_field(rollout, "held_out_breakdown", {}) or {}
        if not isinstance(held_out, dict):
            held_out = {}
        if float(held_out.get("executed_cleanly", 0.0) or 0.0) < min_held_out_clean:
            continue

        # Guard: info may likewise be absent or malformed.
        info = _rollout_field(rollout, "info", {})
        if not isinstance(info, dict):
            info = {}
        spec = info.get("breakage_spec", {})
        params = spec.get("params", {}) if isinstance(spec, dict) else {}

        # Prefer the raw completion; fall back to the diff recorded in info.
        diff = (
            _rollout_field(rollout, "repair_completion", "")
            or info.get("repair_diff", "")
            or ""
        )

        lib.add(
            RepairExample(
                primitive_type=str(_rollout_field(rollout, "primitive_type", "unknown")),
                breakage_params=dict(params),
                error_signature=str(_rollout_field(rollout, "error_trace", "") or "")[:160],
                repair_diff=str(diff)[:2000],
                visible_reward=reward,
                held_out=dict(held_out),
                task_id=str(_rollout_field(rollout, "task_id", "")),
            )
        )
    return lib
forgeenv-space/forgeenv/drift/__init__.py ADDED
File without changes
forgeenv-space/forgeenv/drift/library_drift_engine.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Library Drift Engine.
2
+
3
+ Manages library version snapshots and triggers version upgrades during
4
+ training to create non-stationary verification. In simulation mode it
5
+ just tracks the current snapshot index — that index influences
6
+ breakage selection and is exposed in observations so the Repair Agent
7
+ can adapt.
8
+
9
+ Also exposes Chojecki GVU's SNR computation
10
+ (https://arxiv.org/abs/2512.02731 Definition 4.4).
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import math
15
+ from dataclasses import dataclass, field
16
+
17
DEFAULT_VERSION_SNAPSHOTS: list[dict[str, str]] = [
    {"transformers": "4.36.0", "datasets": "2.14.0", "trl": "0.7.0"},
    {"transformers": "4.40.0", "datasets": "2.18.0", "trl": "0.8.0"},
    {"transformers": "4.45.0", "datasets": "3.0.0", "trl": "0.10.0"},
    {"transformers": "4.50.0", "datasets": "3.2.0", "trl": "0.12.0"},
]


@dataclass
class LibraryDriftEngine:
    """Tracks which library-version snapshot is "current" and periodically
    advances it to create non-stationary verification pressure.

    The active snapshot index influences breakage selection and is exposed
    in observations so the Repair Agent can adapt. Also provides the SNR
    statistic from Chojecki GVU (https://arxiv.org/abs/2512.02731,
    Definition 4.4).
    """

    snapshots: list[dict[str, str]] = field(
        default_factory=lambda: list(DEFAULT_VERSION_SNAPSHOTS)
    )
    current_index: int = 0
    drift_history: list[dict] = field(default_factory=list)

    def current_versions(self) -> dict[str, str]:
        """A copy of the active version snapshot."""
        return {**self.snapshots[self.current_index]}

    def maybe_drift(self, episode_num: int, drift_every: int = 50) -> bool:
        """Advance one snapshot every *drift_every* episodes.

        Returns True iff a drift occurred; each drift is appended to
        ``drift_history`` with the episode number and both snapshots.
        """
        # Guard clauses: episode 0 never drifts, only exact multiples of
        # the cadence drift, and we stop once the newest snapshot is live.
        if episode_num <= 0 or episode_num % drift_every != 0:
            return False
        if self.current_index >= len(self.snapshots) - 1:
            return False

        before = self.snapshots[self.current_index]
        self.current_index += 1
        self.drift_history.append(
            {
                "episode": episode_num,
                "from": before,
                "to": self.snapshots[self.current_index],
            }
        )
        return True

    def reset(self) -> None:
        """Return to the oldest snapshot and forget recorded drifts."""
        self.current_index = 0
        self.drift_history.clear()

    @staticmethod
    def compute_snr(
        recent_held_out: list[float], recent_visible: list[float]
    ) -> dict[str, float]:
        """SNR per Chojecki GVU Def 4.4: mean(rewards)^2 / variance(rewards).

        Population variance, floored at 1e-8 to avoid division by zero;
        fewer than two samples yields 0.0.
        """

        def _snr(samples: list[float]) -> float:
            count = len(samples)
            if count < 2:
                return 0.0
            mu = sum(samples) / count
            variance = sum((s - mu) ** 2 for s in samples) / count
            return mu**2 / max(variance, 1e-8)

        return {
            "snr_verifier": _snr(recent_held_out),
            "snr_generator": _snr(recent_visible),
        }
forgeenv-space/forgeenv/env/__init__.py ADDED
File without changes
forgeenv-space/forgeenv/env/actions.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Pydantic action models for ForgeEnv (compatible with OpenEnv 0.2.x).
2
+
3
+ Episodes have two phases — drift_gen (Challenger) and repair (Solver) — so
4
+ we expose a single union ForgeAction that carries either a BreakageAction
5
+ or a RepairAction. The environment dispatches on which sub-field is set.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ from typing import Any, Literal, Optional
10
+
11
+ from pydantic import Field
12
+
13
+ from openenv.core import Action
14
+
15
+
16
class BreakageAction(Action):
    """Drift Generator's action: pick a primitive type + parameters."""

    # Constant literal tag so consumers can discriminate the action union
    # without isinstance checks.
    action_type: Literal["breakage"] = "breakage"
    primitive_type: str = Field(
        ..., description="One of the registered breakage primitive class names"
    )
    params: dict[str, Any] = Field(
        default_factory=dict, description="Primitive-specific parameters"
    )
26
+
27
+
28
class RepairAction(Action):
    """Repair Agent's action: a unified diff (or full replacement script)."""

    # Constant literal tag for union discrimination (see BreakageAction).
    action_type: Literal["repair"] = "repair"
    unified_diff: str = Field(..., description="Unified diff or full replacement script")
33
+
34
+
35
class ForgeAction(Action):
    """Union action: exactly one of `breakage` / `repair` must be set.

    This is the type registered with OpenEnv's `create_app`. It avoids
    Pydantic discriminated unions to keep the OpenAPI schema flat and
    cross-version-friendly.
    """

    breakage: Optional[BreakageAction] = None
    repair: Optional[RepairAction] = None

    def model_post_init(self, __context: Any) -> None:
        """Enforce the exactly-one-of invariant after Pydantic init.

        Raises:
            ValueError: if both sub-actions are set, or neither is.
        """
        # (a is None) == (b is None) is True when both are set or both
        # are unset — i.e. exactly the invalid configurations.
        if (self.breakage is None) == (self.repair is None):
            raise ValueError(
                "ForgeAction requires exactly one of `breakage` or `repair` to be set."
            )
forgeenv-space/forgeenv/env/diff_utils.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unified-diff application utilities.
2
+
3
+ The Repair Agent submits a unified diff. We need a permissive applier
4
+ because LLM diffs are often malformed (wrong line numbers, missing
5
+ context, extra prose). We try the strict applier first, then fall
6
+ back to applying hunks via plain string replacement.
7
+
8
+ The agent may also submit a full Python script instead of a diff
9
+ (common when the model's diff format breaks). We detect this and
10
+ treat it as a complete replacement.
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import difflib
15
+ import re
16
+
17
+
18
_HUNK_RE = re.compile(r"^@@.*@@", re.MULTILINE)
_SCRIPT_MARKERS = ("import ", "from ", "def ", "class ", "print(")


def looks_like_full_script(text: str) -> bool:
    """Heuristic: True when *text* reads like a whole Python script
    rather than a unified diff."""
    candidate_lines = text.lstrip().splitlines()
    if not candidate_lines:
        return False
    # Diff headers near the top mean it is definitely a diff, not a script.
    if any(ln.startswith(("---", "+++", "@@")) for ln in candidate_lines[:5]):
        return False
    # Two or more script-style markers within the first 30 lines => treat
    # the submission as a full replacement script.
    head = "\n".join(candidate_lines[:30])
    marker_hits = sum(1 for marker in _SCRIPT_MARKERS if marker in head)
    return marker_hits >= 2
37
+
38
+
39
def _strict_apply(broken_script: str, diff_text: str) -> str | None:
    """Apply a unified diff strictly. Returns None on any failure.

    Hunks are located by searching for the exact concatenation of their
    old/context lines in the not-yet-consumed part of the source — the
    @@ line numbers are deliberately ignored, since LLM-emitted diffs
    frequently get them wrong.
    """
    lines = broken_script.splitlines(keepends=True)
    out: list[str] = []
    diff_lines = diff_text.splitlines()
    i = 0
    # src_idx: index into `lines` marking how much source has been consumed.
    src_idx = 0
    in_hunk = False
    hunk_old: list[str] = []   # old + context lines of the current hunk
    hunk_new: list[str] = []   # new + context lines of the current hunk

    while i < len(diff_lines):
        line = diff_lines[i]
        if line.startswith(("---", "+++")):
            # File headers carry no content — skip.
            i += 1
            continue
        if line.startswith("@@"):
            # Flush previous hunk
            if in_hunk:
                # Find the hunk_old block in the source starting at src_idx.
                target = "".join(hunk_old)
                source_remainder = "".join(lines[src_idx:])
                pos = source_remainder.find(target)
                if pos == -1:
                    # Old block not found => diff does not match the source.
                    return None
                out.append(source_remainder[:pos])
                out.append("".join(hunk_new))
                # Advance src_idx by the number of source lines consumed
                # (prefix before the match plus the matched block itself).
                src_idx += len(source_remainder[: pos + len(target)].splitlines(keepends=True))
                hunk_old, hunk_new = [], []
            in_hunk = True
            i += 1
            continue
        if in_hunk:
            if line.startswith("+"):
                hunk_new.append(line[1:] + "\n")
            elif line.startswith("-"):
                hunk_old.append(line[1:] + "\n")
            else:
                # context line — appears in both the old and new block.
                ctx = line[1:] if line.startswith(" ") else line
                hunk_old.append(ctx + "\n")
                hunk_new.append(ctx + "\n")
        i += 1

    # Flush trailing hunk
    if in_hunk and (hunk_old or hunk_new):
        target = "".join(hunk_old)
        source_remainder = "".join(lines[src_idx:])
        pos = source_remainder.find(target)
        if pos == -1:
            return None
        out.append(source_remainder[:pos])
        out.append("".join(hunk_new))
        consumed = source_remainder[: pos + len(target)]
        src_idx += len(consumed.splitlines(keepends=True))

    # Emit whatever source remains after the last hunk.
    out.append("".join(lines[src_idx:]))
    return "".join(out)
97
+
98
+
99
+ def _permissive_apply(broken_script: str, diff_text: str) -> str:
100
+ """Apply a malformed diff by extracting (-,+) line pairs and doing
101
+ a tolerant search-and-replace.
102
+ """
103
+ repaired = broken_script
104
+ pairs: list[tuple[str, str]] = []
105
+ lines = diff_text.splitlines()
106
+ pending_minus: str | None = None
107
+
108
+ for line in lines:
109
+ if line.startswith("---") or line.startswith("+++") or line.startswith("@@"):
110
+ pending_minus = None
111
+ continue
112
+ if line.startswith("-"):
113
+ pending_minus = line[1:].strip()
114
+ elif line.startswith("+") and pending_minus is not None:
115
+ pairs.append((pending_minus, line[1:].strip()))
116
+ pending_minus = None
117
+ elif pending_minus is not None and not line.startswith(" "):
118
+ # standalone deletion — skip in permissive mode (we can't
119
+ # reliably know what to delete without context)
120
+ pending_minus = None
121
+
122
+ for old, new in pairs:
123
+ if old and old in repaired:
124
+ repaired = repaired.replace(old, new, 1)
125
+
126
+ return repaired
127
+
128
+
129
def apply_unified_diff(broken_script: str, diff_text: str) -> str:
    """Apply *diff_text* to *broken_script*, trying strategies in order.

    1. A submission that looks like a full script replaces the source.
    2. If diff markers are present, attempt strict hunk application.
    3. Otherwise (or on strict failure), fall back to permissive
       (-,+) line-pair replacement — which, at worst, returns the
       broken script unchanged.
    """
    text = diff_text or ""
    if not text.strip():
        return broken_script

    if looks_like_full_script(text):
        return text

    has_diff_markers = bool(_HUNK_RE.search(text)) or "---" in text or "+++" in text
    if has_diff_markers:
        strict_result = _strict_apply(broken_script, text)
        if strict_result is not None and strict_result != broken_script:
            return strict_result

    return _permissive_apply(broken_script, text)
152
+
153
+
154
def make_unified_diff(before: str, after: str, path: str = "train.py") -> str:
    """Render the canonical unified diff that turns *before* into *after*.

    Uses a/ and b/ path prefixes and 2 lines of context; identical inputs
    yield an empty string.
    """
    old_lines = before.splitlines(keepends=True)
    new_lines = after.splitlines(keepends=True)
    return "".join(
        difflib.unified_diff(
            old_lines,
            new_lines,
            fromfile=f"a/{path}",
            tofile=f"b/{path}",
            n=2,
        )
    )
forgeenv-space/forgeenv/env/forge_environment.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ForgeEnvironment: the OpenEnv Environment subclass for ForgeEnv.
2
+
3
+ Episode flow (exactly 2 steps per episode):
4
+ reset() -> sample task, ask Teacher for category
5
+ step(BreakageAction) -> Drift Generator's proposal is applied; broken
6
+ script is run, error trace captured.
7
+ step(RepairAction) -> Repair diff is applied; script is re-executed;
8
+ visible + held-out rewards computed; episode ends.
9
+ """
10
+ from __future__ import annotations
11
+
12
+ import time
13
+ import uuid
14
+ from typing import Any, Optional
15
+
16
+ from openenv.core import Environment
17
+
18
+ from forgeenv.drift.library_drift_engine import LibraryDriftEngine
19
+ from forgeenv.env.actions import BreakageAction, ForgeAction, RepairAction
20
+ from forgeenv.env.diff_utils import apply_unified_diff
21
+ from forgeenv.env.observations import ForgeObservation
22
+ from forgeenv.primitives.breakage_primitives import (
23
+ PRIMITIVE_REGISTRY,
24
+ parse_breakage_spec,
25
+ )
26
+ from forgeenv.roles.teacher import Teacher
27
+ from forgeenv.sandbox.simulation_mode import SimulationExecutor
28
+ from forgeenv.tasks.models import ExecutionResult, Task
29
+ from forgeenv.tasks.task_sampler import TaskSampler
30
+ from forgeenv.verifier.held_out_evaluator import compute_held_out_scores
31
+ from forgeenv.verifier.visible_verifier import compute_visible_reward
32
+
33
# All registered breakage primitive names, sorted for deterministic order.
DEFAULT_CATEGORIES = sorted(PRIMITIVE_REGISTRY.keys())


class ForgeEnvironment(Environment[ForgeAction, ForgeObservation, dict]):
    """OpenEnv-compliant environment for HuggingFace ecosystem repair.

    Episode flow (exactly two steps):
      1. reset()            -> sample task, Teacher picks a target category.
      2. step(breakage)     -> primitive applied; broken script executed,
                               error trace captured; phase -> "repair".
      3. step(repair)       -> diff applied, script re-executed; visible +
                               held-out rewards computed; episode done.
    """

    SUPPORTS_CONCURRENT_SESSIONS = False  # Teacher state is global per env

    def __init__(
        self,
        task_sampler: Optional[TaskSampler] = None,
        teacher: Optional[Teacher] = None,
        executor: Optional[SimulationExecutor] = None,
        drift_engine: Optional[LibraryDriftEngine] = None,
        seed: Optional[int] = None,
    ) -> None:
        """Wire up collaborators; each falls back to a default instance."""
        super().__init__()
        self.task_sampler = task_sampler or TaskSampler()
        # Teacher drives the curriculum over breakage categories;
        # "api_drift" is the fallback if the registry is somehow empty.
        self.teacher = teacher or Teacher(
            categories=list(DEFAULT_CATEGORIES) or ["api_drift"]
        )
        self.executor = executor or SimulationExecutor(seed=seed)
        self.drift_engine = drift_engine or LibraryDriftEngine()

        # Per-episode mutable state (reset() reinitialises most of it).
        self._episode_id: Optional[str] = None
        self._episode_count: int = 0
        self._current_task: Optional[Task] = None
        self._original_script: str = ""
        self._broken_script: str = ""
        self._error_trace: str = ""
        self._breakage_spec: Optional[dict[str, Any]] = None
        self._target_category: str = ""
        self._current_phase: str = "idle"
        self._last_obs: Optional[ForgeObservation] = None

    # ------------------------------------------------------------------ API
    def reset(
        self,
        seed: Optional[int] = None,
        episode_id: Optional[str] = None,
        difficulty: Optional[str] = "easy",
        **kwargs: Any,
    ) -> ForgeObservation:
        """Start a new episode in the "drift_gen" phase.

        Samples a fresh task, asks the Teacher for a target category, and
        may advance the library-version snapshot (every 50 episodes).

        Raises:
            RuntimeError: if the task sampler yields no task.
        """
        self._episode_id = episode_id or str(uuid.uuid4())
        self._episode_count += 1
        self._target_category = self.teacher.select_next_category()

        task = self.task_sampler.sample(difficulty=difficulty)
        if task is None:
            raise RuntimeError("Task sampler returned no tasks (empty seed corpus?)")
        self._current_task = task
        self._original_script = task.script_content
        self._broken_script = ""
        self._error_trace = ""
        self._breakage_spec = None
        self._current_phase = "drift_gen"

        # Library drift trigger every 50 episodes (configurable from outside).
        drifted = self.drift_engine.maybe_drift(self._episode_count, drift_every=50)

        obs = ForgeObservation(
            current_phase="drift_gen",
            task_id=task.task_id,
            task_description=task.description,
            target_category=self._target_category,
            script_content=self._original_script,
            error_trace=None,
            library_versions=self.drift_engine.current_versions(),
            episode_step=0,
            done=False,
            reward=0.0,
            info={
                "episode_id": self._episode_id,
                "episode_count": self._episode_count,
                "drift_triggered": drifted,
                "available_primitives": sorted(PRIMITIVE_REGISTRY),
            },
        )
        self._last_obs = obs
        return obs

    def step(
        self,
        action: ForgeAction,
        timeout_s: Optional[float] = None,
        **kwargs: Any,
    ) -> ForgeObservation:
        """Dispatch the action according to the current phase.

        Phase/action mismatches and out-of-episode calls return a
        done=True error observation instead of raising.
        """
        if self._current_phase == "drift_gen":
            if action.breakage is None:
                return self._error_obs("Expected BreakageAction in drift_gen phase")
            return self._handle_breakage(action.breakage)

        if self._current_phase == "repair":
            if action.repair is None:
                return self._error_obs("Expected RepairAction in repair phase")
            return self._handle_repair(action.repair)

        return self._error_obs(
            f"step() called in invalid phase {self._current_phase!r} — call reset() first"
        )

    @property
    def state(self) -> dict:
        """Inspectable snapshot of internal state (served at GET /state)."""
        return {
            "phase": self._current_phase,
            "episode_id": self._episode_id,
            "episode_count": self._episode_count,
            "task_id": self._current_task.task_id if self._current_task else None,
            "target_category": self._target_category,
            "library_versions": self.drift_engine.current_versions(),
            "teacher": self.teacher.get_state(),
            "drift_history": list(self.drift_engine.drift_history),
            "breakage_spec": dict(self._breakage_spec) if self._breakage_spec else None,
        }

    # ---------------------------------------------------------------- helpers
    def _handle_breakage(self, breakage: BreakageAction) -> ForgeObservation:
        """Apply the Drift Generator's primitive and capture the error trace."""
        spec = {"primitive_type": breakage.primitive_type, "params": dict(breakage.params)}
        try:
            primitive = parse_breakage_spec(spec)
        except ValueError as exc:
            return self._error_obs(f"Invalid breakage spec: {exc}")

        try:
            self._broken_script = primitive.apply(self._original_script)
        except Exception as exc:  # primitive bug — surface but don't crash server
            return self._error_obs(f"Primitive apply failed: {exc}")

        self._breakage_spec = spec

        # Run the broken script to harvest the error trace shown to the agent.
        result = self.executor.execute(self._broken_script, self._current_task)
        if result.exit_code != 0:
            self._error_trace = result.stderr or "non-zero exit code, no stderr"
        else:
            # The breakage didn't actually break it; still proceed to repair phase
            # (no-op repair is then a valid choice).
            self._error_trace = "Script ran without observable error"

        self._current_phase = "repair"

        obs = ForgeObservation(
            current_phase="repair",
            task_id=self._current_task.task_id,
            task_description=self._current_task.description,
            target_category=primitive.category,
            script_content=self._broken_script,
            error_trace=self._error_trace,
            library_versions=self.drift_engine.current_versions(),
            episode_step=1,
            done=False,
            reward=0.0,
            info={
                "episode_id": self._episode_id,
                "breakage_primitive": primitive.name,
                "breakage_description": primitive.description,
            },
        )
        self._last_obs = obs
        return obs

    def _handle_repair(self, repair: RepairAction) -> ForgeObservation:
        """Apply the repair diff, re-execute, score, and finish the episode."""
        repaired = apply_unified_diff(self._broken_script, repair.unified_diff or "")

        t0 = time.time()
        result = self.executor.execute(repaired, self._current_task)
        result.script_content = repaired  # ensure verifier sees what we ran
        wall_ms = int((time.time() - t0) * 1000)

        # Visible reward trains the agent; held-out scores resist reward hacking.
        visible_reward, visible_breakdown = compute_visible_reward(
            result, self._current_task
        )
        held_out = compute_held_out_scores(
            result, self._current_task, repair_diff=repair.unified_diff or ""
        )

        success = result.exit_code == 0
        category = (
            self._breakage_spec.get("primitive_type", "unknown")
            if self._breakage_spec
            else "unknown"
        )
        # Update Teacher's curriculum state
        self.teacher.update(category, success)

        self._current_phase = "done"

        obs = ForgeObservation(
            current_phase="done",
            task_id=self._current_task.task_id,
            task_description=self._current_task.description,
            target_category=category,
            script_content=repaired,
            error_trace=result.stderr or None,
            library_versions=self.drift_engine.current_versions(),
            episode_step=2,
            done=True,
            reward=visible_reward,
            reward_breakdown=visible_breakdown,
            held_out_breakdown=held_out,
            info={
                "episode_id": self._episode_id,
                "exit_code": result.exit_code,
                "wall_time_ms": wall_ms,
                "checkpoint_exists": result.checkpoint_exists,
                "stdout_tail": "\n".join(result.stdout.splitlines()[-5:]),
                "breakage_spec": self._breakage_spec,
                "teacher_state": self.teacher.get_state(),
            },
        )
        self._last_obs = obs
        return obs

    def _error_obs(self, message: str) -> ForgeObservation:
        """Return a `done=True` error observation rather than raising."""
        return ForgeObservation(
            current_phase="done",
            task_id=self._current_task.task_id if self._current_task else "",
            task_description=self._current_task.description if self._current_task else "",
            target_category=self._target_category,
            script_content=self._broken_script or self._original_script,
            error_trace=message,
            library_versions=self.drift_engine.current_versions(),
            episode_step=2,
            done=True,
            reward=0.0,
            info={"error": message, "episode_id": self._episode_id},
        )
forgeenv-space/forgeenv/env/observations.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Pydantic observation model for ForgeEnv."""
2
+ from __future__ import annotations
3
+
4
+ from typing import Any, Optional
5
+
6
+ from pydantic import Field
7
+
8
+ from openenv.core import Observation
9
+
10
+
11
class ForgeObservation(Observation):
    """What the agent (or the trainer's rollout function) sees at each step.

    Inherits `done`, `reward`, `metadata` from the OpenEnv `Observation` base.
    """

    current_phase: str = Field(
        ..., description="One of 'drift_gen', 'repair', 'verify', 'done'"
    )
    task_id: str = ""
    task_description: str = ""
    # Breakage category the Teacher asked for (or the applied primitive's).
    target_category: str = ""
    script_content: str = Field(default="", description="Current state of the script")
    # Stderr/trace from the last execution; None before anything has run.
    error_trace: Optional[str] = None
    # Active library-version snapshot, e.g. {"transformers": "4.45.0", ...}.
    library_versions: dict[str, str] = Field(default_factory=dict)
    # Per-component visible-reward breakdown (populated on the final step).
    reward_breakdown: dict[str, Any] = Field(default_factory=dict)
    # Held-out evaluator scores (populated on the final step).
    held_out_breakdown: dict[str, float] = Field(default_factory=dict)
    episode_step: int = 0
    # Free-form extras: episode_id, exit_code, teacher state, etc.
    info: dict[str, Any] = Field(default_factory=dict)
forgeenv-space/forgeenv/env/server.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """FastAPI server for ForgeEnv (OpenEnv-compliant).
2
+
3
+ Exposes /reset, /step, /state HTTP endpoints via OpenEnv's `create_app`.
4
+ HF Spaces sets PORT=7860 automatically.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import os
9
+
10
+ from fastapi.responses import HTMLResponse
11
+ from openenv.core import create_app
12
+
13
+ from forgeenv.env.actions import ForgeAction
14
+ from forgeenv.env.forge_environment import ForgeEnvironment
15
+ from forgeenv.env.observations import ForgeObservation
16
+
17
# Build the OpenEnv-standard FastAPI app: wires the /reset, /step and /state
# endpoints around ForgeEnvironment using the typed action/observation models.
app = create_app(
    env=ForgeEnvironment,
    action_cls=ForgeAction,
    observation_cls=ForgeObservation,
    env_name="forgeenv",
)
23
+
24
+
25
+ _LANDING_HTML = """<!doctype html>
26
+ <html lang="en">
27
+ <head>
28
+ <meta charset="utf-8">
29
+ <title>ForgeEnv — OpenEnv server</title>
30
+ <meta name="viewport" content="width=device-width,initial-scale=1">
31
+ <style>
32
+ :root { color-scheme: light dark; }
33
+ body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
34
+ max-width: 760px; margin: 2.5rem auto; padding: 0 1.25rem;
35
+ line-height: 1.55; color: #1f2937; background: #fafafa; }
36
+ @media (prefers-color-scheme: dark) { body { color: #e5e7eb; background: #0f172a; } }
37
+ h1 { font-size: 1.65rem; margin-bottom: 0.25rem; }
38
+ .sub { color: #6b7280; margin-top: 0; }
39
+ code, pre { font-family: ui-monospace, "SF Mono", Menlo, monospace; }
40
+ pre { background: rgba(127,127,127,0.12); padding: 0.9rem; border-radius: 8px;
41
+ overflow-x: auto; }
42
+ table { border-collapse: collapse; width: 100%; margin: 0.75rem 0 1.25rem; }
43
+ td, th { text-align: left; padding: 0.5rem 0.75rem;
44
+ border-bottom: 1px solid rgba(127,127,127,0.25); }
45
+ th { font-weight: 600; }
46
+ a { color: #2563eb; text-decoration: none; } a:hover { text-decoration: underline; }
47
+ .ok { color: #16a34a; font-weight: 600; }
48
+ .muted { color: #6b7280; font-size: 0.9rem; }
49
+ .pill { display: inline-block; padding: 0.1rem 0.5rem; border-radius: 999px;
50
+ background: rgba(34,197,94,0.15); color: #16a34a; font-size: 0.85rem; }
51
+ </style>
52
+ </head>
53
+ <body>
54
+ <h1>ForgeEnv 🔧 <span class="pill">running</span></h1>
55
+ <p class="sub">OpenEnv-compliant RL environment for HuggingFace
56
+ ecosystem repair under library version drift.</p>
57
+
58
+ <p>This URL serves the environment over HTTP. It is not a UI — it's the
59
+ runtime that <strong>training notebooks connect to</strong>. Open one of
60
+ the endpoints below, or use the demo Space to try the trained Repair
61
+ Agent in a browser.</p>
62
+
63
+ <h2>Endpoints</h2>
64
+ <table>
65
+ <tr><th>Method</th><th>Path</th><th>Purpose</th></tr>
66
+ <tr><td>GET </td><td><a href="/health">/health</a></td><td>Health probe</td></tr>
67
+ <tr><td>POST</td><td><code>/reset</code></td><td>Sample task, return drift-gen observation</td></tr>
68
+ <tr><td>POST</td><td><code>/step</code></td><td>Apply <code>ForgeAction</code> (breakage or repair)</td></tr>
69
+ <tr><td>GET </td><td><a href="/state">/state</a></td><td>Current internal state</td></tr>
70
+ <tr><td>GET </td><td><a href="/metadata">/metadata</a></td><td>Env name + version + schema URLs</td></tr>
71
+ <tr><td>GET </td><td><a href="/schema">/schema</a></td><td>Action / observation JSON schemas</td></tr>
72
+ <tr><td>GET </td><td><a href="/docs">/docs</a></td><td>Interactive Swagger UI</td></tr>
73
+ </table>
74
+
75
+ <h2>Quick start (Python)</h2>
76
+ <pre><code>import asyncio
77
+ from openenv.core import GenericEnvClient
78
+
79
+ async def go():
80
+ client = GenericEnvClient(base_url="https://akhiilll-forgeenv.hf.space")
81
+ obs = await client.reset()
82
+ print(obs.observation["current_phase"], obs.observation["task_id"])
83
+
84
+ asyncio.run(go())</code></pre>
85
+
86
+ <h2>Project links</h2>
87
+ <ul>
88
+ <li>Space card &amp; README:
89
+ <a href="https://huggingface.co/spaces/akhiilll/forgeenv" target="_blank" rel="noopener noreferrer">huggingface.co/spaces/akhiilll/forgeenv</a></li>
90
+ <li>Gradio demo:
91
+ <a href="https://huggingface.co/spaces/akhiilll/forgeenv-demo" target="_blank" rel="noopener noreferrer">huggingface.co/spaces/akhiilll/forgeenv-demo</a></li>
92
+ <li>Trained model (LoRA) <span class="muted">— published after the Colab training run finishes</span>:
93
+ <a href="https://huggingface.co/akhiilll/forgeenv-repair-agent" target="_blank" rel="noopener noreferrer">huggingface.co/akhiilll/forgeenv-repair-agent</a></li>
94
+ </ul>
95
+ <p class="muted">Tip: if links don't open from inside the embedded Space frame,
96
+ right-click and choose <em>Open in new tab</em>, or open this URL directly
97
+ at <a href="https://akhiilll-forgeenv.hf.space/" target="_blank" rel="noopener noreferrer">akhiilll-forgeenv.hf.space</a>.</p>
98
+ </body>
99
+ </html>"""
100
+
101
+
102
def _attach_supplementary_routes(_app) -> None:
    """Register /health and a friendly GET / landing page unless the app already has them."""
    # Paths the generated app already exposes (some route types may lack .path).
    registered_paths = set()
    for route in getattr(_app, "routes", []):
        registered_paths.add(getattr(route, "path", None))

    if "/health" not in registered_paths:
        @_app.get("/health")
        def _health() -> dict:
            # Lightweight liveness probe for Spaces / load balancers.
            return {"status": "ok", "env": "forgeenv"}

    if "/" not in registered_paths:
        @_app.get("/", response_class=HTMLResponse, include_in_schema=False)
        def _root() -> str:
            # Human-readable landing page; excluded from the OpenAPI schema.
            return _LANDING_HTML
117
+
118
+
119
+ _attach_supplementary_routes(app)
120
+
121
+
122
if __name__ == "__main__":
    import uvicorn

    # HF Spaces injects PORT (7860); the default keeps local runs identical.
    serve_port = int(os.environ.get("PORT", "7860"))
    uvicorn.run(app, host="0.0.0.0", port=serve_port)
forgeenv-space/forgeenv/primitives/__init__.py ADDED
File without changes
forgeenv-space/forgeenv/primitives/breakage_primitives.py ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """8 breakage primitives representing real HuggingFace/PyTorch ecosystem drift.
2
+
3
+ Each primitive transforms a working script to simulate a library upgrade
4
+ breakage. They double as the Drift Generator's structured action space.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import re
9
+ from abc import ABC, abstractmethod
10
+ from dataclasses import dataclass, field
11
+
12
+
13
@dataclass
class BreakagePrimitive(ABC):
    """Common contract for all breakage types.

    Subclasses set `category`/`name`/`description` in `__post_init__` and
    implement `apply` (the actual script transformation) plus `_get_params`
    (their JSON-serializable constructor arguments).
    """

    category: str = field(default="generic", init=False)
    name: str = field(default="BreakagePrimitive", init=False)
    description: str = field(default="", init=False)

    @abstractmethod
    def apply(self, script: str) -> str:
        """Transform `script` to introduce the breakage."""

    def to_spec(self) -> dict:
        """Serialize to JSON-compatible spec for the LLM action space."""
        spec = {
            "primitive_type": type(self).__name__,
            "category": self.category,
            "params": self._get_params(),
        }
        return spec

    @abstractmethod
    def _get_params(self) -> dict:
        """Return a JSON-serializable dict of constructor parameters."""
36
+
37
+
38
@dataclass
class RenameApiCall(BreakagePrimitive):
    """Simulate an API deprecation by renaming standalone occurrences of
    `old_name` to `new_name`."""

    old_name: str = ""
    new_name: str = ""

    def __post_init__(self) -> None:
        self.category = "api_drift"
        self.name = "RenameApiCall"
        self.description = f"Rename {self.old_name} -> {self.new_name}"

    def apply(self, script: str) -> str:
        # Nothing configured — leave the script untouched.
        if not self.old_name:
            return script
        # Negative lookarounds keep us from rewriting longer identifiers
        # that merely contain `old_name` as a substring.
        token = re.compile(r"(?<!\w)" + re.escape(self.old_name) + r"(?!\w)")
        return token.sub(self.new_name, script)

    def _get_params(self) -> dict:
        return {"old_name": self.old_name, "new_name": self.new_name}
59
+
60
+
61
@dataclass
class DeprecateImport(BreakagePrimitive):
    """Change an import path to simulate module restructuring."""

    old_module: str = ""
    new_module: str = ""

    def __post_init__(self) -> None:
        self.category = "import_drift"
        self.name = "DeprecateImport"
        self.description = f"Move {self.old_module} -> {self.new_module}"

    def apply(self, script: str) -> str:
        # Plain substring swap: the taxonomy stores full "from x.y" prefixes,
        # so no word-boundary handling is needed here.
        if self.old_module:
            return script.replace(self.old_module, self.new_module)
        return script

    def _get_params(self) -> dict:
        return {"old_module": self.old_module, "new_module": self.new_module}
80
+
81
+
82
@dataclass
class ChangeArgumentSignature(BreakagePrimitive):
    """Remove an expected kwarg (and document a new required one)."""

    function_name: str = ""
    removed_arg: str = ""
    added_arg: str = ""
    added_value: str = ""

    def __post_init__(self) -> None:
        self.category = "api_drift"
        self.name = "ChangeArgumentSignature"
        self.description = (
            f"Change args of {self.function_name}: -{self.removed_arg} +{self.added_arg}"
        )

    def apply(self, script: str) -> str:
        """Delete every `removed_arg=<value>` kwarg from `script`.

        Bug fix: the previous regex used `[^,)]+` for the value, which cut
        comma-containing values (lists/tuples/nested calls such as the
        taxonomy's `columns=["input_ids", ...]`) at the first comma and
        left stray tokens behind. This version scans bracket-aware.
        """
        if not self.removed_arg:
            return script
        marker = re.compile(rf"\b{re.escape(self.removed_arg)}\s*=\s*")
        pieces = []
        pos = 0
        while True:
            m = marker.search(script, pos)
            if m is None:
                pieces.append(script[pos:])
                break
            pieces.append(script[pos:m.start()])
            # Skip the value up to the kwarg-terminating ',' / ')' / newline,
            # ignoring separators nested inside (), [] or {}.
            j = m.end()
            depth = 0
            while j < len(script):
                ch = script[j]
                if ch in "([{":
                    depth += 1
                elif ch in ")]}":
                    if depth == 0:
                        break
                    depth -= 1
                elif ch == "\n" and depth == 0:
                    # Statement context (plain assignment) — stop at the line end.
                    break
                elif ch == "," and depth == 0:
                    j += 1
                    # Swallow whitespace after the separating comma, mirroring
                    # the old `,?\s*` behavior.
                    while j < len(script) and script[j] in " \t\n":
                        j += 1
                    break
                j += 1
            pos = j
        return "".join(pieces)

    def _get_params(self) -> dict:
        return {
            "function_name": self.function_name,
            "removed_arg": self.removed_arg,
            "added_arg": self.added_arg,
            "added_value": self.added_value,
        }
111
+
112
+
113
@dataclass
class ModifyConfigField(BreakagePrimitive):
    """Change a config-class default value to simulate behaviour drift."""

    config_class: str = ""
    field_name: str = ""
    new_value: str = ""

    def __post_init__(self) -> None:
        self.category = "config_drift"
        self.name = "ModifyConfigField"
        self.description = f"Change {self.config_class}.{self.field_name}"

    def apply(self, script: str) -> str:
        """Rewrite every `field_name=<value>` occurrence to `field_name=new_value`.

        Bug fix: `new_value` was previously interpolated into the `re.sub`
        replacement template, so a value containing backslashes or a
        literal `\\g<...>` sequence raised `re.error` or corrupted the
        output. A callable replacement inserts it verbatim.
        """
        if not self.field_name:
            return script
        pattern = rf"({re.escape(self.field_name)}\s*=\s*)([^,)\n]+)"
        return re.sub(pattern, lambda m: m.group(1) + self.new_value, script)

    def _get_params(self) -> dict:
        return {
            "config_class": self.config_class,
            "field_name": self.field_name,
            "new_value": self.new_value,
        }
138
+
139
+
140
@dataclass
class RestructureDatasetSchema(BreakagePrimitive):
    """Rename a dataset column reference to simulate schema drift."""

    old_column: str = ""
    new_column: str = ""

    def __post_init__(self) -> None:
        self.category = "dataset_drift"
        self.name = "RestructureDatasetSchema"
        self.description = f"Rename column {self.old_column} -> {self.new_column}"

    def apply(self, script: str) -> str:
        if not self.old_column:
            return script
        # Cover both quoting styles so '"label"' and "'label'" are rewritten.
        result = script
        for quote in ('"', "'"):
            result = result.replace(
                f"{quote}{self.old_column}{quote}",
                f"{quote}{self.new_column}{quote}",
            )
        return result

    def _get_params(self) -> dict:
        return {"old_column": self.old_column, "new_column": self.new_column}
163
+
164
+
165
@dataclass
class ChangeTokenizerBehavior(BreakagePrimitive):
    """Change tokenizer call arguments."""

    old_kwarg: str = ""
    old_value: str = ""
    new_kwarg: str = ""
    new_value: str = ""

    def __post_init__(self) -> None:
        self.category = "tokenizer_drift"
        self.name = "ChangeTokenizerBehavior"
        self.description = f"Change tokenizer kwarg {self.old_kwarg}={self.old_value} -> {self.new_kwarg}={self.new_value}"

    def apply(self, script: str) -> str:
        """Rewrite `old_kwarg=old_value` occurrences to `new_kwarg=new_value`.

        Bug fix: the replacement text was passed to `re.sub` as a template,
        so a `new_value` containing backslashes was reinterpreted as escape
        sequences. A callable replacement inserts it literally.
        """
        if not self.old_kwarg:
            return script
        pattern = rf"{re.escape(self.old_kwarg)}\s*=\s*{re.escape(self.old_value)}"
        replacement = f"{self.new_kwarg}={self.new_value}"
        return re.sub(pattern, lambda _m: replacement, script)

    def _get_params(self) -> dict:
        return {
            "old_kwarg": self.old_kwarg,
            "old_value": self.old_value,
            "new_kwarg": self.new_kwarg,
            "new_value": self.new_value,
        }
193
+
194
+
195
@dataclass
class RemoveDeprecatedMethod(BreakagePrimitive):
    """Simulate a removed method by rewriting its call sites to a
    `_DEPRECATED` name, which fails with an AttributeError-style error
    when the script runs."""

    class_name: str = ""
    method_name: str = ""
    replacement: str = ""

    def __post_init__(self) -> None:
        self.category = "api_drift"
        self.name = "RemoveDeprecatedMethod"
        self.description = f"Remove {self.class_name}.{self.method_name}"

    def apply(self, script: str) -> str:
        if not self.method_name:
            return script
        call_site = f".{self.method_name}("
        poisoned = f".{self.method_name}_DEPRECATED("
        return script.replace(call_site, poisoned)

    def _get_params(self) -> dict:
        return {
            "class_name": self.class_name,
            "method_name": self.method_name,
            "replacement": self.replacement,
        }
222
+
223
+
224
@dataclass
class ChangeReturnType(BreakagePrimitive):
    """A function now returns a different structure (e.g. tuple -> object)."""

    function_name: str = ""
    old_access: str = ""
    new_access: str = ""

    def __post_init__(self) -> None:
        self.category = "api_drift"
        self.name = "ChangeReturnType"
        self.description = f"Change return type of {self.function_name}"

    def apply(self, script: str) -> str:
        # Both accessor strings must be configured for the swap to apply.
        if not (self.old_access and self.new_access):
            return script
        return script.replace(self.old_access, self.new_access)

    def _get_params(self) -> dict:
        return {
            "function_name": self.function_name,
            "old_access": self.old_access,
            "new_access": self.new_access,
        }
248
+
249
+
250
# Lookup table (class name -> class) used when parsing LLM breakage specs.
PRIMITIVE_REGISTRY: dict[str, type[BreakagePrimitive]] = {
    cls.__name__: cls
    for cls in (
        RenameApiCall,
        DeprecateImport,
        ChangeArgumentSignature,
        ModifyConfigField,
        RestructureDatasetSchema,
        ChangeTokenizerBehavior,
        RemoveDeprecatedMethod,
        ChangeReturnType,
    )
}
260
+
261
+
262
def parse_breakage_spec(spec: dict) -> BreakagePrimitive:
    """Parse a JSON breakage spec into a BreakagePrimitive object.

    Tolerates extra keys; ignores unknown params (LLMs hallucinate these).
    Raises ValueError for an unknown primitive type.
    """
    ptype = spec.get("primitive_type", "")
    raw_params = spec.get("params", {}) or {}

    cls = PRIMITIVE_REGISTRY.get(ptype)
    if cls is None:
        raise ValueError(
            f"Unknown primitive type: {ptype!r}. "
            f"Valid types: {list(PRIMITIVE_REGISTRY)}"
        )

    # Keep only kwargs that map onto real constructor fields so a
    # hallucinated key can't crash the dataclass constructor.
    accepted = {
        spec_field.name
        for spec_field in cls.__dataclass_fields__.values()  # type: ignore[attr-defined]
        if spec_field.init
    }
    return cls(**{k: v for k, v in raw_params.items() if k in accepted})
forgeenv-space/forgeenv/primitives/drift_taxonomy.yaml ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Drift taxonomy: real HuggingFace/PyTorch breakages observed across version bumps.
2
+ # Used to seed the Drift Generator's initial proposal distribution and to anchor
3
+ # warm-start pair generation in things that actually happened in the wild.
4
+ - version_range: "transformers 4.36 -> 4.45"
5
+ affected_api: "Trainer.evaluate"
6
+ description: "Trainer.evaluate() return type changed shape; metrics now nested under .metrics"
7
+ breakage_primitive: "ChangeReturnType"
8
+ params:
9
+ function_name: "evaluate"
10
+ old_access: "trainer.evaluate()"
11
+ new_access: "trainer.evaluate().metrics"
12
+ repair_primitive: "RestoreReturnAccess"
13
+ category: "api_drift"
14
+
15
+ - version_range: "transformers 4.30 -> 4.40"
16
+ affected_api: "TrainingArguments.evaluation_strategy"
17
+ description: "Renamed evaluation_strategy -> eval_strategy"
18
+ breakage_primitive: "RenameApiCall"
19
+ params:
20
+ old_name: "evaluation_strategy"
21
+ new_name: "eval_strategy"
22
+ repair_primitive: "RestoreApiCall"
23
+ category: "api_drift"
24
+
25
+ - version_range: "datasets 2.14 -> 3.0"
26
+ affected_api: "load_dataset"
27
+ description: "Default split column was renamed in some GLUE configs"
28
+ breakage_primitive: "RestructureDatasetSchema"
29
+ params:
30
+ old_column: "label"
31
+ new_column: "labels"
32
+ repair_primitive: "RestoreColumn"
33
+ category: "dataset_drift"
34
+
35
+ - version_range: "transformers 4.40 -> 4.50"
36
+ affected_api: "Trainer.predict"
37
+ description: "Method removed; users should use evaluate() with prediction_loss_only=False"
38
+ breakage_primitive: "RemoveDeprecatedMethod"
39
+ params:
40
+ class_name: "Trainer"
41
+ method_name: "predict"
42
+ replacement: "evaluate"
43
+ repair_primitive: "RestoreMethod"
44
+ category: "api_drift"
45
+
46
+ - version_range: "transformers 4.36 -> 4.40"
47
+ affected_api: "TrainingArguments"
48
+ description: "num_train_epochs default behavior changed; max_steps now preferred"
49
+ breakage_primitive: "ModifyConfigField"
50
+ params:
51
+ config_class: "TrainingArguments"
52
+ field_name: "num_train_epochs"
53
+ new_value: "0"
54
+ repair_primitive: "RestoreConfigField"
55
+ category: "config_drift"
56
+
57
+ - version_range: "transformers 4.34 -> 4.42"
58
+ affected_api: "Tokenizer.__call__"
59
+ description: "padding=True semantics changed; users should pass padding='max_length'"
60
+ breakage_primitive: "ChangeTokenizerBehavior"
61
+ params:
62
+ old_kwarg: "padding"
63
+ old_value: "True"
64
+ new_kwarg: "padding"
65
+ new_value: '"max_length"'
66
+ repair_primitive: "RestoreTokenizerKwarg"
67
+ category: "tokenizer_drift"
68
+
69
+ - version_range: "transformers 4.20 -> 4.30"
70
+ affected_api: "imports"
71
+ description: "transformers.training_args moved to transformers.training_args_pt"
72
+ breakage_primitive: "DeprecateImport"
73
+ params:
74
+ old_module: "from transformers.training_args"
75
+ new_module: "from transformers.training_args_pt"
76
+ repair_primitive: "RestoreImport"
77
+ category: "import_drift"
78
+
79
+ - version_range: "transformers 4.45 -> 4.50"
80
+ affected_api: "save_pretrained"
81
+ description: "save_pretrained() now requires safe_serialization to default True"
82
+ breakage_primitive: "ChangeArgumentSignature"
83
+ params:
84
+ function_name: "save_pretrained"
85
+ removed_arg: "safe_serialization"
86
+ added_arg: "safe_serialization"
87
+ added_value: "True"
88
+ repair_primitive: "RestoreArgument"
89
+ category: "api_drift"
90
+
91
+ - version_range: "datasets 2.18 -> 3.0"
92
+ affected_api: "Dataset.set_format"
93
+ description: "set_format(type='torch') signature stricter, columns required"
94
+ breakage_primitive: "ChangeArgumentSignature"
95
+ params:
96
+ function_name: "set_format"
97
+ removed_arg: "columns"
98
+ added_arg: "columns"
99
+ added_value: '["input_ids", "attention_mask", "labels"]'
100
+ repair_primitive: "RestoreArgument"
101
+ category: "api_drift"
102
+
103
+ - version_range: "transformers 4.36 -> 4.45"
104
+ affected_api: "Tokenizer.__call__"
105
+ description: "max_length default reduced from 512 -> 256 for some tokenizers"
106
+ breakage_primitive: "ModifyConfigField"
107
+ params:
108
+ config_class: "tokenizer"
109
+ field_name: "max_length"
110
+ new_value: "256"
111
+ repair_primitive: "RestoreConfigField"
112
+ category: "tokenizer_drift"
113
+
114
+ - version_range: "transformers 4.40 -> 4.45"
115
+ affected_api: "DataCollatorWithPadding"
116
+ description: "Renamed `tokenizer` -> `processing_class` in DataCollator constructors"
117
+ breakage_primitive: "RenameApiCall"
118
+ params:
119
+ old_name: "tokenizer"
120
+ new_name: "processing_class"
121
+ repair_primitive: "RestoreApiCall"
122
+ category: "api_drift"
123
+
124
+ - version_range: "datasets 2.14 -> 2.18"
125
+ affected_api: "load_dataset"
126
+ description: "Some splits renamed train[:500] semantics changed"
127
+ breakage_primitive: "RestructureDatasetSchema"
128
+ params:
129
+ old_column: "sentence"
130
+ new_column: "text"
131
+ repair_primitive: "RestoreColumn"
132
+ category: "dataset_drift"
133
+
134
+ - version_range: "transformers 4.45 -> 4.50"
135
+ affected_api: "Trainer"
136
+ description: "Trainer.evaluate was deprecated and removed (renamed to evaluate_legacy)"
137
+ breakage_primitive: "RemoveDeprecatedMethod"
138
+ params:
139
+ class_name: "Trainer"
140
+ method_name: "evaluate"
141
+ replacement: "evaluate_legacy"
142
+ repair_primitive: "RestoreMethod"
143
+ category: "api_drift"
144
+
145
+ - version_range: "transformers 4.30 -> 4.40"
146
+ affected_api: "PreTrainedModel.from_pretrained"
147
+ description: "torch_dtype now required for some quantized model paths"
148
+ breakage_primitive: "ChangeArgumentSignature"
149
+ params:
150
+ function_name: "from_pretrained"
151
+ removed_arg: "torch_dtype"
152
+ added_arg: "torch_dtype"
153
+ added_value: '"auto"'
154
+ repair_primitive: "RestoreArgument"
155
+ category: "api_drift"
156
+
157
+ - version_range: "datasets 3.0 -> 3.2"
158
+ affected_api: "Dataset.rename_column"
159
+ description: "rename_column raises if target name exists"
160
+ breakage_primitive: "RestructureDatasetSchema"
161
+ params:
162
+ old_column: "labels"
163
+ new_column: "label"
164
+ repair_primitive: "RestoreColumn"
165
+ category: "dataset_drift"
166
+
167
+ - version_range: "transformers 4.36 -> 4.42"
168
+ affected_api: "TrainingArguments.report_to"
169
+ description: "Default report_to changed from 'all' to 'none'"
170
+ breakage_primitive: "ModifyConfigField"
171
+ params:
172
+ config_class: "TrainingArguments"
173
+ field_name: "report_to"
174
+ new_value: '"all"'
175
+ repair_primitive: "RestoreConfigField"
176
+ category: "config_drift"
177
+
178
+ - version_range: "transformers 4.40 -> 4.50"
179
+ affected_api: "imports"
180
+ description: "transformers.deepspeed moved to accelerate.utils.deepspeed"
181
+ breakage_primitive: "DeprecateImport"
182
+ params:
183
+ old_module: "from transformers.deepspeed"
184
+ new_module: "from accelerate.utils.deepspeed"
185
+ repair_primitive: "RestoreImport"
186
+ category: "import_drift"
187
+
188
+ - version_range: "transformers 4.45 -> 4.50"
189
+ affected_api: "Tokenizer return"
190
+ description: "Tokenizer call output now returns a BatchEncoding with .encodings attribute"
191
+ breakage_primitive: "ChangeReturnType"
192
+ params:
193
+ function_name: "tokenizer"
194
+ old_access: "tokenizer(text)"
195
+ new_access: "tokenizer(text).encodings"
196
+ repair_primitive: "RestoreReturnAccess"
197
+ category: "api_drift"
198
+
199
+ - version_range: "transformers 4.30 -> 4.40"
200
+ affected_api: "save_pretrained"
201
+ description: "save_pretrained -> save_pretrained_directory rename in some classes"
202
+ breakage_primitive: "RenameApiCall"
203
+ params:
204
+ old_name: "save_pretrained"
205
+ new_name: "save_pretrained_directory"
206
+ repair_primitive: "RestoreApiCall"
207
+ category: "api_drift"
208
+
209
+ - version_range: "transformers 4.45 -> 4.50"
210
+ affected_api: "TrainingArguments.no_cuda"
211
+ description: "no_cuda renamed to use_cpu (logic inverted)"
212
+ breakage_primitive: "RenameApiCall"
213
+ params:
214
+ old_name: "no_cuda"
215
+ new_name: "use_cpu"
216
+ repair_primitive: "RestoreApiCall"
217
+ category: "config_drift"
forgeenv-space/forgeenv/primitives/repair_primitives.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Repair primitives — direct inverses of the 8 breakage primitives.
2
+
3
+ Used during warm-start data generation: for every (script, breakage)
4
+ pair we know the canonical repair, so we can write SFT pairs.
5
+
6
+ These are also useful for unit-testing the breakage primitives:
7
+ apply(breakage) then apply(repair) should be (close to) the identity.
8
+ """
9
+ from __future__ import annotations
10
+
11
+ import re
12
+ from abc import ABC, abstractmethod
13
+ from dataclasses import dataclass, field
14
+
15
+
16
@dataclass
class RepairPrimitive(ABC):
    """Common contract for repairs; each subclass inverts one breakage primitive."""

    category: str = field(default="generic", init=False)
    name: str = field(default="RepairPrimitive", init=False)
    description: str = field(default="", init=False)

    @abstractmethod
    def apply(self, script: str) -> str:
        """Transform `script` to undo the corresponding breakage."""

    def to_spec(self) -> dict:
        """Serialize to a JSON-compatible spec (mirrors BreakagePrimitive)."""
        spec = {
            "primitive_type": type(self).__name__,
            "category": self.category,
            "params": self._get_params(),
        }
        return spec

    @abstractmethod
    def _get_params(self) -> dict:
        """Return JSON-serializable constructor parameters."""
36
+
37
+
38
@dataclass
class RestoreApiCall(RepairPrimitive):
    """Invert RenameApiCall: rename `new_name` back to `old_name`."""

    new_name: str = ""
    old_name: str = ""

    def __post_init__(self) -> None:
        self.category = "api_drift"
        self.name = "RestoreApiCall"
        self.description = f"Rename {self.new_name} -> {self.old_name}"

    def apply(self, script: str) -> str:
        if not self.new_name:
            return script
        # Lookarounds avoid touching identifiers that contain `new_name`
        # only as a substring.
        token = re.compile(r"(?<!\w)" + re.escape(self.new_name) + r"(?!\w)")
        return token.sub(self.old_name, script)

    def _get_params(self) -> dict:
        return {"new_name": self.new_name, "old_name": self.old_name}
56
+
57
+
58
@dataclass
class RestoreImport(RepairPrimitive):
    """Invert DeprecateImport: restore the original import path."""

    new_module: str = ""
    old_module: str = ""

    def __post_init__(self) -> None:
        self.category = "import_drift"
        self.name = "RestoreImport"
        self.description = f"Restore import {self.new_module} -> {self.old_module}"

    def apply(self, script: str) -> str:
        # Bug fix: guard the degenerate empty pattern — str.replace("")
        # would splice `old_module` between every character of the script.
        # Also matches the guard style of every sibling primitive.
        if not self.new_module:
            return script
        return script.replace(self.new_module, self.old_module)

    def _get_params(self) -> dict:
        return {"new_module": self.new_module, "old_module": self.old_module}
73
+
74
+
75
@dataclass
class RestoreArgument(RepairPrimitive):
    """Re-add a removed argument to a function call."""

    function_name: str = ""
    arg_name: str = ""
    arg_value: str = ""

    def __post_init__(self) -> None:
        self.category = "api_drift"
        self.name = "RestoreArgument"
        self.description = (
            f"Add {self.arg_name}={self.arg_value} to {self.function_name}()"
        )

    def apply(self, script: str) -> str:
        """Insert `arg_name=arg_value, ` after the first call's opening paren.

        Bug fix: the kwarg text was previously interpolated into the
        `re.sub` replacement template, so an `arg_value` containing
        backslashes (e.g. a regex or Windows path literal) was mangled or
        raised `re.error`. A callable replacement inserts it verbatim.
        """
        if not self.function_name:
            return script
        pattern = rf"({re.escape(self.function_name)}\s*\()(\s*)"
        inserted = f"{self.arg_name}={self.arg_value}, "
        return re.sub(
            pattern,
            lambda m: m.group(1) + inserted + m.group(2),
            script,
            count=1,
        )

    def _get_params(self) -> dict:
        return {
            "function_name": self.function_name,
            "arg_name": self.arg_name,
            "arg_value": self.arg_value,
        }
104
+
105
+
106
@dataclass
class RestoreConfigField(RepairPrimitive):
    """Invert ModifyConfigField: restore a config field's original value."""

    field_name: str = ""
    old_value: str = ""

    def __post_init__(self) -> None:
        self.category = "config_drift"
        self.name = "RestoreConfigField"
        self.description = f"Restore {self.field_name}={self.old_value}"

    def apply(self, script: str) -> str:
        """Rewrite every `field_name=<value>` occurrence back to `old_value`.

        Bug fix: `old_value` was previously interpolated into the `re.sub`
        replacement template, so backslashes or `\\g<...>` sequences in it
        were expanded. A callable replacement inserts it literally.
        """
        if not self.field_name:
            return script
        pattern = rf"({re.escape(self.field_name)}\s*=\s*)([^,)\n]+)"
        return re.sub(pattern, lambda m: m.group(1) + self.old_value, script)

    def _get_params(self) -> dict:
        return {"field_name": self.field_name, "old_value": self.old_value}
124
+
125
+
126
@dataclass
class RestoreColumn(RepairPrimitive):
    """Invert RestructureDatasetSchema: rename the column reference back."""

    new_column: str = ""
    old_column: str = ""

    def __post_init__(self) -> None:
        self.category = "dataset_drift"
        self.name = "RestoreColumn"
        self.description = f"Rename column {self.new_column} -> {self.old_column}"

    def apply(self, script: str) -> str:
        # Bug fix: guard empty `new_column` — without it the pattern becomes
        # the two-character literal '""' (or "''") and rewrites empty-string
        # literals in the script. Mirrors RestructureDatasetSchema's guard.
        if not self.new_column:
            return script
        result = script
        # Cover both quoting styles, matching the breakage primitive.
        for quote in ('"', "'"):
            result = result.replace(
                f"{quote}{self.new_column}{quote}",
                f"{quote}{self.old_column}{quote}",
            )
        return result

    def _get_params(self) -> dict:
        return {"new_column": self.new_column, "old_column": self.old_column}
145
+
146
+
147
@dataclass
class RestoreTokenizerKwarg(RepairPrimitive):
    """Invert ChangeTokenizerBehavior: restore the original tokenizer kwarg."""

    new_kwarg: str = ""
    new_value: str = ""
    old_kwarg: str = ""
    old_value: str = ""

    def __post_init__(self) -> None:
        self.category = "tokenizer_drift"
        self.name = "RestoreTokenizerKwarg"
        self.description = (
            f"Restore tokenizer {self.new_kwarg}={self.new_value} -> "
            f"{self.old_kwarg}={self.old_value}"
        )

    def apply(self, script: str) -> str:
        """Rewrite `new_kwarg=new_value` occurrences back to `old_kwarg=old_value`.

        Bug fix: the replacement text was passed to `re.sub` as a template,
        so an `old_value` containing backslashes was reinterpreted as escape
        sequences. A callable replacement inserts it literally.
        """
        if not self.new_kwarg:
            return script
        pattern = rf"{re.escape(self.new_kwarg)}\s*=\s*{re.escape(self.new_value)}"
        replacement = f"{self.old_kwarg}={self.old_value}"
        return re.sub(pattern, lambda _m: replacement, script)

    def _get_params(self) -> dict:
        return {
            "new_kwarg": self.new_kwarg,
            "new_value": self.new_value,
            "old_kwarg": self.old_kwarg,
            "old_value": self.old_value,
        }
176
+
177
+
178
@dataclass
class RestoreMethod(RepairPrimitive):
    """Invert RemoveDeprecatedMethod: rewrite `_DEPRECATED` call sites back."""

    method_name: str = ""

    def __post_init__(self) -> None:
        self.category = "api_drift"
        self.name = "RestoreMethod"
        self.description = f"Un-deprecate .{self.method_name}()"

    def apply(self, script: str) -> str:
        if not self.method_name:
            return script
        poisoned = f".{self.method_name}_DEPRECATED("
        healthy = f".{self.method_name}("
        return script.replace(poisoned, healthy)

    def _get_params(self) -> dict:
        return {"method_name": self.method_name}
196
+
197
+
198
@dataclass
class RestoreReturnAccess(RepairPrimitive):
    """Invert ChangeReturnType: restore the original return-value access."""

    new_access: str = ""
    old_access: str = ""

    def __post_init__(self) -> None:
        self.category = "api_drift"
        self.name = "RestoreReturnAccess"
        self.description = f"Restore return-access {self.new_access} -> {self.old_access}"

    def apply(self, script: str) -> str:
        if self.new_access:
            return script.replace(self.new_access, self.old_access)
        return script

    def _get_params(self) -> dict:
        return {"new_access": self.new_access, "old_access": self.old_access}
215
+
216
+
217
# Lookup table (class name -> class) used when parsing repair specs.
REPAIR_REGISTRY: dict[str, type[RepairPrimitive]] = {
    cls.__name__: cls
    for cls in (
        RestoreApiCall,
        RestoreImport,
        RestoreArgument,
        RestoreConfigField,
        RestoreColumn,
        RestoreTokenizerKwarg,
        RestoreMethod,
        RestoreReturnAccess,
    )
}


# Map a breakage primitive's class name to the repair-primitive class that
# inverts it. Used by the warm-start pair generator and by the demo / repair
# library curator.
BREAKAGE_TO_REPAIR: dict[str, str] = {
    "RenameApiCall": "RestoreApiCall",
    "DeprecateImport": "RestoreImport",
    "ChangeArgumentSignature": "RestoreArgument",
    "ModifyConfigField": "RestoreConfigField",
    "RestructureDatasetSchema": "RestoreColumn",
    "ChangeTokenizerBehavior": "RestoreTokenizerKwarg",
    "RemoveDeprecatedMethod": "RestoreMethod",
    "ChangeReturnType": "RestoreReturnAccess",
}
forgeenv-space/forgeenv/roles/__init__.py ADDED
File without changes
forgeenv-space/forgeenv/roles/drift_generator.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Drift Generator parser + a deterministic baseline policy.
2
+
3
+ In training the LLM produces a JSON breakage spec; we parse it. In rollouts
4
+ where we want a baseline (or a fallback when the LLM emits malformed JSON)
5
+ we use `BaselineDriftGenerator`, which samples from the per-category set of
6
+ known good primitive parameterisations.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ import random
12
+ import re
13
+ from dataclasses import dataclass
14
+ from typing import Optional
15
+
16
+ from forgeenv.primitives.breakage_primitives import (
17
+ PRIMITIVE_REGISTRY,
18
+ parse_breakage_spec,
19
+ BreakagePrimitive,
20
+ )
21
+
22
+
23
# Greedy match from the first '{' to the last '}' in the text.
_JSON_RE = re.compile(r"\{[\s\S]*\}")


def parse_drift_output(text: str) -> Optional[dict]:
    """Best-effort extraction of a JSON object from noisy LLM output.

    Strips markdown fences and surrounding prose, then retries once after
    removing trailing commas. Returns None when nothing parseable remains.
    """
    if not text:
        return None
    candidate = text.strip()
    if candidate.startswith("```"):
        # Drop an opening ```lang fence and a closing ``` fence.
        candidate = re.sub(r"^```[a-zA-Z]*\n?", "", candidate)
        candidate = re.sub(r"\n?```$", "", candidate)
    found = _JSON_RE.search(candidate)
    if found is None:
        return None
    blob = found.group(0)
    # First attempt: verbatim. Second: with trailing commas removed.
    for attempt in (blob, re.sub(r",\s*([}\]])", r"\1", blob)):
        try:
            return json.loads(attempt)
        except json.JSONDecodeError:
            continue
    return None
50
+
51
+
52
def parse_drift_to_primitive(text: str) -> Optional[BreakagePrimitive]:
    """End-to-end: LLM text -> validated BreakagePrimitive (or None)."""
    data = parse_drift_output(text)
    if isinstance(data, dict):
        try:
            return parse_breakage_spec(data)
        except (ValueError, TypeError):
            # Spec was well-formed JSON but failed primitive validation.
            return None
    return None
61
+
62
+
63
# ---------------------------------------------------------------- baselines
# Known-good parameterisations for each breakage-primitive type. The
# BaselineDriftGenerator samples from these when standing in for the LLM
# (warm-start data, fallback on malformed JSON, unit tests). Values mirror
# real renames/removals seen across transformers/datasets releases.
_DEFAULT_PARAMS_BY_TYPE: dict[str, list[dict]] = {
    "RenameApiCall": [
        {"old_name": "trainer.train", "new_name": "trainer.start_training"},
        {"old_name": "save_pretrained", "new_name": "save_to_hub"},
        {"old_name": "from_pretrained", "new_name": "load_from_hub"},
    ],
    "DeprecateImport": [
        {
            "old_module": "from transformers import Trainer",
            "new_module": "from transformers.legacy import Trainer",
        },
        {
            "old_module": "from transformers import TrainingArguments",
            "new_module": "from transformers.training import TrainingArguments",
        },
    ],
    "ChangeArgumentSignature": [
        {
            "function_name": "TrainingArguments",
            "removed_arg": "num_train_epochs",
            "added_arg": "max_steps",
            "added_value": "1000",
        },
        {
            "function_name": "TrainingArguments",
            "removed_arg": "evaluation_strategy",
            "added_arg": "eval_strategy",
            "added_value": '"steps"',
        },
    ],
    "ModifyConfigField": [
        {"config_class": "TrainingArguments", "field_name": "learning_rate", "new_value": "5e-3"},
        {"config_class": "TrainingArguments", "field_name": "per_device_train_batch_size", "new_value": "1"},
    ],
    "RestructureDatasetSchema": [
        {"old_column": "text", "new_column": "input_text"},
        {"old_column": "label", "new_column": "labels"},
        {"old_column": "tokens", "new_column": "words"},
    ],
    "ChangeTokenizerBehavior": [
        {"old_kwarg": "padding", "old_value": "True", "new_kwarg": "pad_to_max_length", "new_value": "True"},
        {"old_kwarg": "truncation", "old_value": "True", "new_kwarg": "truncate", "new_value": "True"},
    ],
    "RemoveDeprecatedMethod": [
        {"class_name": "Trainer", "method_name": "evaluate", "replacement": "evaluation_loop"},
        {"class_name": "Trainer", "method_name": "save_model", "replacement": "save_to_hub"},
    ],
    "ChangeReturnType": [
        {"function_name": "Trainer.predict", "old_access": ".predictions", "new_access": "[0]"},
        {"function_name": "tokenizer", "old_access": '["input_ids"]', "new_access": ".input_ids"},
    ],
}
116
+
117
+
118
@dataclass
class BaselineDriftGenerator:
    """Deterministic stand-in for the LLM Drift Generator.

    Used for warm-start data, baseline rollouts, and unit tests. With a
    fixed `seed`, the same script always yields the same breakage spec.
    """

    seed: Optional[int] = None

    def __post_init__(self) -> None:
        # Seeded private RNG when a seed is given, else the module RNG.
        self._rng = random.Random(self.seed) if self.seed is not None else random

    def propose(
        self, target_category: str = "", script: str = ""
    ) -> dict:
        """Produce a JSON-serializable breakage spec for `target_category`.

        Order of preference:
          1. A primitive of `target_category` whose default params apply to `script`.
          2. A primitive of any type whose default params apply to `script`.
          3. A primitive of `target_category` (no-op fallback).
        """
        preferred = (
            [target_category] if target_category in _DEFAULT_PARAMS_BY_TYPE else []
        )
        every_type = list(_DEFAULT_PARAMS_BY_TYPE.keys())

        for candidates in (preferred, every_type):
            if not candidates:
                continue
            for ptype in self._rng.sample(candidates, len(candidates)):
                options = _DEFAULT_PARAMS_BY_TYPE[ptype]
                for params in self._rng.sample(options, len(options)):
                    if self._params_apply_to_script(ptype, params, script):
                        return {"primitive_type": ptype, "params": dict(params)}

        # Nothing applies: fall back to the first default of the preferred
        # (or first known) type even though it may be a no-op on `script`.
        fallback = preferred[0] if preferred else every_type[0]
        return {
            "primitive_type": fallback,
            "params": dict(_DEFAULT_PARAMS_BY_TYPE[fallback][0]),
        }

    @staticmethod
    def _params_apply_to_script(ptype: str, params: dict, script: str) -> bool:
        """Heuristic: would this primitive actually mutate `script`?"""
        if not script:
            return True
        target_keys = (
            "old_name", "old_module", "removed_arg", "field_name",
            "old_column", "old_kwarg", "method_name", "old_access",
        )
        return any(
            params.get(key) and params[key] in script for key in target_keys
        )
forgeenv-space/forgeenv/roles/prompts.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """System and user prompts for the two RL roles.
2
+
3
+ Both roles are trained from the same base policy (Qwen-2.5-Coder-7B) with
4
+ LoRA adapters per role, so role prompts are the only thing distinguishing
5
+ them at inference time. Keep them concise — every token is a token of GPU
6
+ budget during GRPO rollouts.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ from typing import Iterable
11
+
12
+
13
# One-line human-readable summary per breakage primitive (category in
# parentheses); rendered into docs and UIs via list_primitive_descriptions().
PRIMITIVE_DESCRIPTIONS = {
    "RenameApiCall": "Rename a function/method call (api_drift)",
    "DeprecateImport": "Change an import path (import_drift)",
    "ChangeArgumentSignature": "Remove an expected kwarg from a call (api_drift)",
    "ModifyConfigField": "Change a config-class default (config_drift)",
    "RestructureDatasetSchema": "Rename a dataset column reference (dataset_drift)",
    "ChangeTokenizerBehavior": "Change tokenizer call kwargs (tokenizer_drift)",
    "RemoveDeprecatedMethod": "Remove a method, leaving a sentinel _DEPRECATED suffix (api_drift)",
    "ChangeReturnType": "Function returns a different structure (api_drift)",
}

# System prompt for the Drift Generator role: asks for exactly one JSON
# breakage spec matching the schemas below. Parsed by
# forgeenv.roles.drift_generator.parse_drift_output.
DRIFT_GENERATOR_SYSTEM_PROMPT = """You are the Drift Generator.
You see a working HuggingFace training script and the curriculum target category.
Output exactly one JSON object describing a breakage primitive that simulates
realistic library version drift. The primitive must:
1. Be PLAUSIBLE — match the kind of breakage that happens between real
transformers/datasets/trl releases.
2. Be SOLVABLE — the Repair Agent should be able to fix it from the error trace alone.
3. Match the requested target_category.

Output schema:
{"primitive_type": "<one of the 8 types>", "params": { ... }}

Available primitive types and parameter schemas:
- RenameApiCall: {"old_name": str, "new_name": str}
- DeprecateImport: {"old_module": str, "new_module": str}
- ChangeArgumentSignature: {"function_name": str, "removed_arg": str, "added_arg": str, "added_value": str}
- ModifyConfigField: {"config_class": str, "field_name": str, "new_value": str}
- RestructureDatasetSchema: {"old_column": str, "new_column": str}
- ChangeTokenizerBehavior: {"old_kwarg": str, "old_value": str, "new_kwarg": str, "new_value": str}
- RemoveDeprecatedMethod: {"class_name": str, "method_name": str, "replacement": str}
- ChangeReturnType: {"function_name": str, "old_access": str, "new_access": str}

Output ONLY the JSON object — no commentary, no markdown fences.
"""


# System prompt for the Repair Agent role: asks for a unified diff only.
# Responses are sanitised by forgeenv.roles.repair_agent.extract_diff.
REPAIR_AGENT_SYSTEM_PROMPT = """You are the Repair Agent.
You see a broken HuggingFace training script, an error trace, and the current
library version snapshot. Output ONLY a unified diff that fixes the script.

Rules:
1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`
headers and `@@ ... @@` hunk markers.
2. Make the MINIMAL change that resolves the error AND preserves the original
training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit
calls.
3. Do NOT add any prose, markdown fences, or thinking output — diff only.
4. If the error is unfixable, output an empty diff.
"""
63
+
64
+
65
def render_drift_generator_prompt(
    script: str, target_category: str, library_versions: dict
) -> str:
    """Render the Drift Generator user prompt for one episode.

    Embeds the target category, the library-version snapshot, and the
    working script inside a fenced code block.
    """
    versions = ", ".join(
        "{}={}".format(lib, ver) for lib, ver in library_versions.items()
    )
    return "\n".join(
        [
            f"Target category: {target_category}",
            f"Library versions: {versions}",
            "",
            "Working script:",
            "```python",
            script,
            "```",
            "",
            "Output JSON breakage primitive:",
        ]
    )
78
+
79
+
80
def render_repair_agent_prompt(
    broken_script: str,
    error_trace: str,
    library_versions: dict,
    target_category: str = "",
) -> str:
    """Render the Repair Agent user prompt for one episode.

    Shows the version snapshot, an (optional) category hint, the broken
    script in a fenced block, and the raw error trace.
    """
    versions = ", ".join(
        "{}={}".format(lib, ver) for lib, ver in library_versions.items()
    )
    hint = target_category or "unknown"
    return "\n".join(
        [
            f"Library versions: {versions}",
            f"Target category hint: {hint}",
            "",
            "Broken script:",
            "```python",
            broken_script,
            "```",
            "",
            "Error trace:",
            error_trace,
            "",
            "Output unified diff (no prose, no fences):",
        ]
    )
99
+
100
+
101
def list_primitive_descriptions() -> Iterable[str]:
    """Lazily yield one '- name: description' bullet per primitive."""
    for primitive_name, summary in PRIMITIVE_DESCRIPTIONS.items():
        yield f"- {primitive_name}: {summary}"
forgeenv-space/forgeenv/roles/repair_agent.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Repair Agent helpers: response sanitisation + a deterministic baseline.
2
+
3
+ The Repair Agent's training output is a unified diff. LLMs frequently emit
4
+ prose / fences / chain-of-thought before the diff; this module strips that
5
+ preamble. The baseline policy uses the inverse-primitive map from
6
+ `repair_primitives.py` to produce ground-truth diffs for warm-start.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import re
11
+ from dataclasses import dataclass
12
+ from typing import Optional
13
+
14
+ from forgeenv.env.diff_utils import make_unified_diff
15
+ from forgeenv.primitives.breakage_primitives import (
16
+ parse_breakage_spec,
17
+ BreakagePrimitive,
18
+ )
19
+ from forgeenv.primitives.repair_primitives import (
20
+ BREAKAGE_TO_REPAIR,
21
+ REPAIR_REGISTRY,
22
+ RepairPrimitive,
23
+ )
24
+
25
+
26
# A unified-diff hunk header anywhere in the text (multiline anchor).
_DIFF_HUNK_RE = re.compile(r"^@@.*@@", re.MULTILINE)
# The first fenced code block, capturing its interior.
_FENCE_RE = re.compile(r"```[a-zA-Z]*\n([\s\S]*?)\n```")


def extract_diff(raw_text: str) -> str:
    """Pull the unified diff out of an LLM response.

    Handles code fences, leading prose / chain-of-thought, and trailing
    notes. Falls back to returning the (stripped/unfenced) text when no
    diff-looking line is found.
    """
    if not raw_text:
        return ""
    text = raw_text.strip()

    fenced = _FENCE_RE.search(text)
    if fenced:
        text = fenced.group(1).strip()

    lines = text.splitlines()
    # First line that looks like a diff header or hunk marker; 0 if none.
    start = next(
        (i for i, line in enumerate(lines) if line.startswith(("---", "+++", "@@"))),
        0,
    )
    return "\n".join(lines[start:])
51
+
52
+
53
def looks_like_diff(text: str) -> bool:
    """Cheap structural check: does `text` resemble a unified diff?

    True when it has header + hunk markers, or a hunk marker plus at least
    one +/- line.
    """
    if not text:
        return False
    has_hunk = re.search(r"^@@.*@@", text, re.MULTILINE) is not None
    has_header = "---" in text and "+++" in text
    has_plus_minus = any(
        line.startswith(("+", "-")) for line in text.splitlines()
    )
    return (has_header and has_hunk) or (has_hunk and has_plus_minus)
60
+
61
+
62
+ # ---------------------------------------------------------------- baselines
63
@dataclass
class BaselineRepairAgent:
    """Deterministic Repair Agent that uses the primitive inverse map.

    Used for warm-start dataset generation and baseline rollout comparisons.
    """

    def repair(
        self,
        broken_script: str,
        breakage_spec: Optional[dict] = None,
        original_script: str = "",
    ) -> str:
        """Return a unified diff that fixes `broken_script`, or "".

        Strategy preference:
          1. Oracle path: diff broken -> `original_script` when given
             (warm-start always knows the ground truth).
          2. Invert the structured `breakage_spec` via the repair registry.
          3. Give up with an empty diff.
        """
        # 1. Oracle diff against the known-good original.
        if original_script and original_script != broken_script:
            return make_unified_diff(broken_script, original_script)

        # 2. Structured inversion of the breakage spec.
        if not breakage_spec:
            return ""
        try:
            breakage = parse_breakage_spec(breakage_spec)
        except (ValueError, TypeError):
            return ""
        repair = _invert_breakage(breakage)
        if repair is None:
            return ""
        repaired = repair.apply(broken_script)
        if repaired == broken_script:
            # Inversion was a no-op; nothing useful to emit.
            return ""
        return make_unified_diff(broken_script, repaired)
103
+
104
+
105
# Per-breakage-type remapping from breakage-primitive parameter names
# (source keys) to the matching repair-primitive constructor kwargs
# (destination keys). Consumed by _invert_breakage(); breakage params with
# no entry here are dropped before constructing the repair primitive.
_PARAM_REMAP: dict[str, dict[str, str]] = {
    "RenameApiCall": {"old_name": "old_name", "new_name": "new_name"},
    "DeprecateImport": {"old_module": "old_module", "new_module": "new_module"},
    "ChangeArgumentSignature": {
        "function_name": "function_name",
        "removed_arg": "arg_name",
    },
    "ModifyConfigField": {"field_name": "field_name"},
    "RestructureDatasetSchema": {
        "old_column": "old_column",
        "new_column": "new_column",
    },
    "ChangeTokenizerBehavior": {
        "old_kwarg": "old_kwarg",
        "old_value": "old_value",
        "new_kwarg": "new_kwarg",
        "new_value": "new_value",
    },
    "RemoveDeprecatedMethod": {"method_name": "method_name"},
    "ChangeReturnType": {"old_access": "old_access", "new_access": "new_access"},
}
126
+
127
+
128
def _invert_breakage(breakage: BreakagePrimitive) -> Optional[RepairPrimitive]:
    """Build the repair primitive that undoes `breakage`, or None.

    Looks up the inverse class via BREAKAGE_TO_REPAIR, remaps parameter
    names with _PARAM_REMAP, and drops anything the repair dataclass
    cannot accept in its constructor.
    """
    source_name = type(breakage).__name__
    repair_name = BREAKAGE_TO_REPAIR.get(source_name)
    if repair_name is None:
        return None
    repair_cls = REPAIR_REGISTRY.get(repair_name)
    if repair_cls is None:
        return None

    source_params = breakage._get_params()  # type: ignore[attr-defined]
    remap = _PARAM_REMAP.get(source_name, {})
    remapped = {
        dst_key: source_params[src_key]
        for src_key, dst_key in remap.items()
        if src_key in source_params
    }

    # Only pass kwargs the repair dataclass actually declares as init fields.
    accepted = {
        f.name
        for f in repair_cls.__dataclass_fields__.values()  # type: ignore[attr-defined]
        if f.init
    }
    kwargs = {key: value for key, value in remapped.items() if key in accepted}
    try:
        return repair_cls(**kwargs)
    except TypeError:
        return None
forgeenv-space/forgeenv/roles/teacher.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Teacher (curriculum controller).
2
+
3
+ Deterministic — NOT an LLM. Maintains an EMA success rate per breakage
4
+ category and routes the next episode toward the category where the
5
+ Repair Agent is closest to a 50% success rate (R-Zero's difficulty band).
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import random
10
+ from dataclasses import dataclass, field
11
+
12
+
13
@dataclass
class Teacher:
    """Deterministic curriculum controller (NOT an LLM).

    Tracks an EMA of Repair-Agent success per breakage category and steers
    the next episode toward categories near the 50% success band
    (R-Zero's difficulty zone).
    """

    categories: list[str]
    alpha: float = 0.9  # EMA smoothing toward the previous estimate
    success_counts: dict[str, int] = field(default_factory=dict)
    attempt_counts: dict[str, int] = field(default_factory=dict)
    ema_success: dict[str, float] = field(default_factory=dict)

    def __post_init__(self) -> None:
        # Seed tracking state for every known category; EMA starts at the
        # neutral 0.5 so fresh categories sit inside the difficulty band.
        for cat in self.categories:
            self.success_counts.setdefault(cat, 0)
            self.attempt_counts.setdefault(cat, 0)
            self.ema_success.setdefault(cat, 0.5)

    def _register(self, category: str) -> None:
        """Start tracking a category first seen at update() time."""
        self.categories.append(category)
        self.ema_success[category] = 0.5
        self.success_counts[category] = 0
        self.attempt_counts[category] = 0

    def update(self, category: str, success: bool) -> None:
        """Record one episode outcome and refresh the category's EMA."""
        if category not in self.ema_success:
            self._register(category)

        self.attempt_counts[category] += 1
        self.success_counts[category] += int(success)
        observed = self.success_counts[category] / max(1, self.attempt_counts[category])
        previous = self.ema_success[category]
        self.ema_success[category] = self.alpha * previous + (1 - self.alpha) * observed

    def select_next_category(self) -> str:
        """Pick the next episode's category, favouring the 0.3–0.7 band."""
        distances = {
            cat: abs(rate - 0.5)
            for cat, rate in self.ema_success.items()
            if 0.3 <= rate <= 0.7
        }
        if distances:
            cats = list(distances)
            # Closer to 50% -> larger weight (epsilon avoids div-by-zero).
            weights = [1.0 / (distances[cat] + 0.01) for cat in cats]
            return random.choices(cats, weights=weights, k=1)[0]
        # Nothing in the band: take whichever is closest to 50%.
        return min(self.ema_success, key=lambda cat: abs(self.ema_success[cat] - 0.5))

    def get_state(self) -> dict:
        """Snapshot of per-category curriculum statistics."""
        snapshot: dict = {}
        for cat in self.categories:
            snapshot[cat] = {
                "ema_success": round(self.ema_success[cat], 4),
                "attempts": self.attempt_counts[cat],
                "successes": self.success_counts[cat],
            }
        return snapshot
forgeenv-space/forgeenv/sandbox/__init__.py ADDED
File without changes
forgeenv-space/forgeenv/sandbox/ast_validator.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """AST-based script validator.
2
+
3
+ Catches forbidden imports and dangerous patterns BEFORE any execution
4
+ happens. This is a critical defense against reward hacking via system
5
+ calls, network access, or process manipulation.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import ast
10
+
11
+ from forgeenv.tasks.models import ValidationResult
12
+
13
# Top-level module names whose import is rejected outright (system,
# network, and process-manipulation surfaces).
FORBIDDEN_MODULES = {
    "os",
    "subprocess",
    "socket",
    "urllib",
    "requests",
    "ctypes",
    "shutil",
    "signal",
    "multiprocessing",
    "threading",
}

# Builtin callables that enable arbitrary code execution.
FORBIDDEN_FUNCTIONS = {"eval", "exec", "compile", "__import__"}


def validate_script(script_content: str) -> ValidationResult:
    """Parse a script as AST and reject forbidden patterns.

    Returns a ValidationResult with `is_valid` and a list of `violations`.
    """
    try:
        tree = ast.parse(script_content)
    except SyntaxError as e:
        return ValidationResult(is_valid=False, violations=[f"SyntaxError: {e}"])

    violations: list[str] = []
    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            violations.extend(
                f"Forbidden import: {alias.name}"
                for alias in node.names
                if alias.name.split(".")[0] in FORBIDDEN_MODULES
            )
        elif isinstance(node, ast.ImportFrom):
            # Relative imports (`from . import x`) have node.module == None.
            if node.module and node.module.split(".")[0] in FORBIDDEN_MODULES:
                violations.append(f"Forbidden import from: {node.module}")
        elif isinstance(node, ast.Call):
            func = node.func
            if isinstance(func, ast.Name) and func.id in FORBIDDEN_FUNCTIONS:
                violations.append(f"Forbidden call: {func.id}()")
            elif isinstance(func, ast.Attribute) and func.attr in FORBIDDEN_FUNCTIONS:
                violations.append(f"Forbidden call: .{func.attr}()")
        elif isinstance(node, ast.Assign):
            violations.extend(
                "Forbidden: __builtins__ assignment"
                for target in node.targets
                if isinstance(target, ast.Name) and target.id == "__builtins__"
            )

    return ValidationResult(
        is_valid=not violations,
        violations=violations,
    )
forgeenv-space/forgeenv/sandbox/simulation_mode.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Fast simulation executor for development.
2
+
3
+ Static-analysis-based execution simulator. Sub-100ms per call. No Docker
4
+ required. The success probability of a simulated run depends on whether
5
+ the script contains expected HF training markers (model imports, training
6
+ calls, save calls). When the simulation succeeds, a synthetic decreasing
7
+ loss curve is emitted; when it fails, a representative HF error is raised.
8
+ """
9
+ from __future__ import annotations
10
+
11
+ import random
12
+ import time
13
+ from typing import Optional
14
+
15
+ from forgeenv.sandbox.ast_validator import validate_script
16
+ from forgeenv.tasks.models import ExecutionResult, Task
17
+
18
+
19
class SimulationExecutor:
    """Simulates script execution via static analysis.

    Use this throughout development phases. Real Docker execution is added
    later for grounded final-stage verification.
    """

    # Substrings that mark a script as definitely drifted/broken even when
    # it compiles — simulates a linter catching AttributeError/ImportError
    # signatures before runtime.
    _BROKEN_MARKERS = (
        "_DEPRECATED(",
        "transformers.legacy",
        "from transformers.training import",
        ".start_training(",
        "load_from_hub(",
        "save_to_hub(",
        "pad_to_max_length=",
        "evaluation_loop(",
    )

    # Representative HF failure messages for the simulated-error path.
    _ERROR_TYPES = [
        "ImportError: cannot import name 'OldTrainer' from 'transformers'",
        "AttributeError: 'Trainer' object has no attribute 'evaluate_model'",
        "KeyError: 'text' column not found in dataset",
        "TypeError: __init__() got an unexpected keyword argument 'num_epochs'",
        "RuntimeError: Expected input batch_size (16) to match target batch_size (32)",
        "ModuleNotFoundError: No module named 'transformers.legacy'",
    ]

    def __init__(self, seed: Optional[int] = None) -> None:
        # Seeded private RNG for reproducible simulations; module RNG otherwise.
        self._rng = random.Random(seed) if seed is not None else random

    def execute(
        self, script_content: str, task: Optional[Task] = None
    ) -> ExecutionResult:
        """Validate, 'run', and score a script without executing it."""
        started = time.time()

        validation = validate_script(script_content)
        if not validation.is_valid:
            return ExecutionResult(
                exit_code=1,
                stdout="",
                stderr=f"Validation failed: {'; '.join(validation.violations)}",
                wall_time_ms=int((time.time() - started) * 1000),
                script_content=script_content,
            )

        try:
            compile(script_content, "<forge_script>", "exec")
        except SyntaxError as e:
            return ExecutionResult(
                exit_code=1,
                stdout="",
                stderr=f"SyntaxError: {e}",
                wall_time_ms=int((time.time() - started) * 1000),
                script_content=script_content,
            )

        success_prob = self._estimate_success_prob(script_content)
        if self._rng.random() < success_prob:
            return self._simulate_success(script_content, started)
        return self._simulate_failure(script_content, started)

    def _estimate_success_prob(self, script_content: str) -> float:
        """Score how plausible a clean run is from static markers alone."""
        import re as _re

        success_prob = 0.3
        if any(
            kw in script_content
            for kw in ("from transformers", "import torch", "from datasets")
        ):
            success_prob += 0.3
        if any(
            kw in script_content
            for kw in ("trainer.train()", ".fit(", "train_loop", "for epoch")
        ):
            success_prob += 0.2
        if any(
            kw in script_content
            for kw in ("save_pretrained", "save_model", "torch.save")
        ):
            success_prob += 0.1

        if any(marker in script_content for marker in self._BROKEN_MARKERS):
            success_prob = 0.0

        # Dataset-column drift: renamed columns that don't exist in real
        # HF datasets. Tokenizer kwarg drift: `truncate=` is not valid.
        if _re.search(r"['\"]input_text['\"]\s*[]:),]", script_content):
            success_prob = min(success_prob, 0.05)
        if _re.search(r"['\"]words['\"]\s*[]:),]", script_content):
            success_prob = min(success_prob, 0.05)
        if _re.search(r"\btruncate\s*=", script_content):
            success_prob = min(success_prob, 0.05)
        return success_prob

    def _simulate_success(self, script_content: str, started: float) -> ExecutionResult:
        """Emit a synthetic decreasing loss curve and a completed run."""
        steps = self._rng.randint(20, 50)
        log_lines: list[str] = []
        loss = self._rng.uniform(2.0, 4.0)
        for step in range(1, steps + 1):
            loss *= self._rng.uniform(0.92, 0.99)
            log_lines.append(f"step={step} loss={loss:.4f}")
        log_lines.append("eval_accuracy=0.78")
        log_lines.append("TRAINING_COMPLETE")

        return ExecutionResult(
            exit_code=0,
            stdout="\n".join(log_lines),
            stderr="",
            wall_time_ms=int((time.time() - started) * 1000)
            + self._rng.randint(1000, 5000),
            checkpoint_exists=True,
            peak_memory_mb=self._rng.uniform(500, 2000),
            script_content=script_content,
        )

    def _simulate_failure(self, script_content: str, started: float) -> ExecutionResult:
        """Emit a representative HF error on the failure path."""
        return ExecutionResult(
            exit_code=1,
            stdout="",
            stderr=self._rng.choice(self._ERROR_TYPES),
            wall_time_ms=int((time.time() - started) * 1000)
            + self._rng.randint(100, 500),
            script_content=script_content,
        )
forgeenv-space/forgeenv/tasks/__init__.py ADDED
File without changes
forgeenv-space/forgeenv/tasks/models.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Core data models for ForgeEnv tasks and execution results.
2
+
3
+ These are framework-internal dataclasses (not Pydantic) used throughout the
4
+ simulation, verifier, and primitive layers. The OpenEnv-facing Pydantic
5
+ models live in `forgeenv.env.actions` / `forgeenv.env.observations`.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ from dataclasses import dataclass, field
10
+ from typing import Optional
11
+
12
+
13
@dataclass
class Task:
    """A HuggingFace training script with execution metadata."""

    task_id: str  # unique identifier for this task
    description: str  # human-readable summary of what the script trains
    script_content: str  # full Python source of the training script
    difficulty: str  # "easy", "medium", "hard"
    category: str = "general"  # task family / drift category label
    # Bounds used by verifiers to sanity-check reported metrics.
    expected_loss_range: tuple[float, float] = (0.0, 5.0)
    expected_accuracy_range: tuple[float, float] = (0.0, 1.0)
    # Where the script is expected to write its checkpoint.
    checkpoint_output_path: str = "/tmp/forge_output/checkpoint"
25
+
26
+
27
@dataclass
class ExecutionResult:
    """Result of executing a Python script in the sandbox."""

    exit_code: int  # 0 on success, non-zero on failure
    stdout: str  # captured standard output (training logs)
    stderr: str  # captured error output / failure message
    wall_time_ms: int  # end-to-end wall-clock duration in milliseconds
    checkpoint_exists: bool = False  # whether a checkpoint was produced
    peak_memory_mb: float = 0.0  # peak memory usage, if measured
    script_content: str = ""  # the script that was executed, for traceability
38
+
39
+
40
@dataclass
class ValidationResult:
    """Result of AST validation on a script."""

    is_valid: bool  # True when no forbidden patterns were found
    violations: list[str] = field(default_factory=list)  # human-readable findings
forgeenv-space/forgeenv/tasks/seed_corpus/__init__.py ADDED
File without changes
forgeenv-space/forgeenv/tasks/seed_corpus/albert_qa.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ALBERT-tiny extractive QA on 100-sample SQuAD subset."""
2
+ from transformers import (
3
+ AutoTokenizer,
4
+ AutoModelForQuestionAnswering,
5
+ Trainer,
6
+ TrainingArguments,
7
+ DefaultDataCollator,
8
+ )
9
+ from datasets import load_dataset
10
+
11
+ dataset = load_dataset("squad", split="train[:100]")
12
+ tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
13
+
14
+
15
+ def prepare(examples):
16
+ enc = tokenizer(
17
+ examples["question"],
18
+ examples["context"],
19
+ max_length=128,
20
+ truncation="only_second",
21
+ padding="max_length",
22
+ return_offsets_mapping=True,
23
+ )
24
+ start_positions, end_positions = [], []
25
+ for i, offsets in enumerate(enc["offset_mapping"]):
26
+ answer = examples["answers"][i]
27
+ start_char = answer["answer_start"][0]
28
+ end_char = start_char + len(answer["text"][0])
29
+
30
+ token_start = next(
31
+ (idx for idx, (a, b) in enumerate(offsets) if a <= start_char < b), 0
32
+ )
33
+ token_end = next(
34
+ (idx for idx, (a, b) in enumerate(offsets) if a < end_char <= b), token_start
35
+ )
36
+ start_positions.append(token_start)
37
+ end_positions.append(token_end)
38
+
39
+ enc["start_positions"] = start_positions
40
+ enc["end_positions"] = end_positions
41
+ enc.pop("offset_mapping")
42
+ return enc
43
+
44
+
45
+ dataset = dataset.map(prepare, batched=True, remove_columns=dataset.column_names)
46
+
47
+ model = AutoModelForQuestionAnswering.from_pretrained("albert-base-v2")
48
+
49
+ training_args = TrainingArguments(
50
+ output_dir="/tmp/forge_output/checkpoint",
51
+ num_train_epochs=1,
52
+ per_device_train_batch_size=4,
53
+ logging_steps=5,
54
+ save_strategy="epoch",
55
+ no_cuda=True,
56
+ report_to="none",
57
+ )
58
+
59
+ trainer = Trainer(
60
+ model=model,
61
+ args=training_args,
62
+ train_dataset=dataset,
63
+ data_collator=DefaultDataCollator(),
64
+ )
65
+ trainer.train()
66
+ trainer.save_model("/tmp/forge_output/checkpoint")
67
+ print("TRAINING_COMPLETE")
forgeenv-space/forgeenv/tasks/seed_corpus/bert_ner.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Bert tiny NER fine-tuning on a 200-sample CoNLL-2003 subset."""
2
+ from transformers import (
3
+ AutoTokenizer,
4
+ AutoModelForTokenClassification,
5
+ Trainer,
6
+ TrainingArguments,
7
+ DataCollatorForTokenClassification,
8
+ )
9
+ from datasets import load_dataset
10
+
11
+ dataset = load_dataset("conll2003", split="train[:200]")
12
+ tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")
13
+
14
+
15
+ def tokenize_and_align(example):
16
+ enc = tokenizer(example["tokens"], is_split_into_words=True, truncation=True, max_length=64)
17
+ word_ids = enc.word_ids()
18
+ labels = []
19
+ prev_id = None
20
+ for wid in word_ids:
21
+ if wid is None:
22
+ labels.append(-100)
23
+ elif wid != prev_id:
24
+ labels.append(example["ner_tags"][wid])
25
+ else:
26
+ labels.append(-100)
27
+ prev_id = wid
28
+ enc["labels"] = labels
29
+ return enc
30
+
31
+
32
+ dataset = dataset.map(tokenize_and_align, remove_columns=dataset.column_names)
33
+
34
+ model = AutoModelForTokenClassification.from_pretrained("prajjwal1/bert-tiny", num_labels=9)
35
+
36
+ training_args = TrainingArguments(
37
+ output_dir="/tmp/forge_output/checkpoint",
38
+ num_train_epochs=1,
39
+ per_device_train_batch_size=8,
40
+ logging_steps=5,
41
+ save_strategy="epoch",
42
+ no_cuda=True,
43
+ report_to="none",
44
+ )
45
+
46
+ trainer = Trainer(
47
+ model=model,
48
+ args=training_args,
49
+ train_dataset=dataset,
50
+ data_collator=DataCollatorForTokenClassification(tokenizer),
51
+ )
52
+
53
+ trainer.train()
54
+ trainer.save_model("/tmp/forge_output/checkpoint")
55
+ print("TRAINING_COMPLETE")
forgeenv-space/forgeenv/tasks/seed_corpus/distilbert_sst2.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """DistilBERT fine-tuning on a tiny SST-2 subset.
2
+
3
+ Minimal HuggingFace text-classification training script. Should complete
4
+ in ~60s on CPU.
5
+ """
6
+ from transformers import (
7
+ DistilBertTokenizer,
8
+ DistilBertForSequenceClassification,
9
+ Trainer,
10
+ TrainingArguments,
11
+ )
12
+ from datasets import load_dataset
13
+
14
+ dataset = load_dataset("glue", "sst2", split="train[:500]")
15
+ tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
16
+
17
+
18
+ def tokenize_function(examples):
19
+ return tokenizer(
20
+ examples["sentence"],
21
+ padding="max_length",
22
+ truncation=True,
23
+ max_length=64,
24
+ )
25
+
26
+
27
+ dataset = dataset.map(tokenize_function, batched=True)
28
+ dataset = dataset.rename_column("label", "labels")
29
+ dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
30
+
31
+ model = DistilBertForSequenceClassification.from_pretrained(
32
+ "distilbert-base-uncased", num_labels=2
33
+ )
34
+
35
+ training_args = TrainingArguments(
36
+ output_dir="/tmp/forge_output/checkpoint",
37
+ num_train_epochs=1,
38
+ per_device_train_batch_size=16,
39
+ logging_steps=5,
40
+ save_strategy="epoch",
41
+ no_cuda=True,
42
+ report_to="none",
43
+ )
44
+
45
+ trainer = Trainer(
46
+ model=model,
47
+ args=training_args,
48
+ train_dataset=dataset,
49
+ )
50
+
51
+ trainer.train()
52
+ trainer.save_model("/tmp/forge_output/checkpoint")
53
+ print("TRAINING_COMPLETE")
forgeenv-space/forgeenv/tasks/seed_corpus/electra_classification.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ELECTRA-small classification on 400-sample AG News (4-way text classification)."""
2
+ from transformers import (
3
+ AutoTokenizer,
4
+ AutoModelForSequenceClassification,
5
+ Trainer,
6
+ TrainingArguments,
7
+ )
8
+ from datasets import load_dataset
9
+
10
+ dataset = load_dataset("ag_news", split="train[:400]")
11
+ tokenizer = AutoTokenizer.from_pretrained("google/electra-small-discriminator")
12
+
13
+
14
+ def tokenize(examples):
15
+ return tokenizer(
16
+ examples["text"],
17
+ padding="max_length",
18
+ truncation=True,
19
+ max_length=64,
20
+ )
21
+
22
+
23
+ dataset = dataset.map(tokenize, batched=True)
24
+ dataset = dataset.rename_column("label", "labels")
25
+ dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
26
+
27
+ model = AutoModelForSequenceClassification.from_pretrained(
28
+ "google/electra-small-discriminator", num_labels=4
29
+ )
30
+
31
+ training_args = TrainingArguments(
32
+ output_dir="/tmp/forge_output/checkpoint",
33
+ num_train_epochs=1,
34
+ per_device_train_batch_size=8,
35
+ logging_steps=5,
36
+ save_strategy="epoch",
37
+ no_cuda=True,
38
+ report_to="none",
39
+ )
40
+
41
+ trainer = Trainer(model=model, args=training_args, train_dataset=dataset)
42
+ trainer.train()
43
+ trainer.save_model("/tmp/forge_output/checkpoint")
44
+ print("TRAINING_COMPLETE")
forgeenv-space/forgeenv/tasks/seed_corpus/gpt2_textgen.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """DistilGPT2 causal-LM fine-tuning on 300 lines of WikiText (text generation)."""
2
+ from transformers import (
3
+ AutoTokenizer,
4
+ AutoModelForCausalLM,
5
+ Trainer,
6
+ TrainingArguments,
7
+ DataCollatorForLanguageModeling,
8
+ )
9
+ from datasets import load_dataset
10
+
11
+ dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:300]")
12
+ tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
13
+ tokenizer.pad_token = tokenizer.eos_token
14
+
15
+
16
+ def tokenize(examples):
17
+ return tokenizer(examples["text"], truncation=True, max_length=64)
18
+
19
+
20
+ dataset = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names)
21
+
22
+ model = AutoModelForCausalLM.from_pretrained("distilgpt2")
23
+
24
+ training_args = TrainingArguments(
25
+ output_dir="/tmp/forge_output/checkpoint",
26
+ num_train_epochs=1,
27
+ per_device_train_batch_size=4,
28
+ logging_steps=5,
29
+ save_strategy="epoch",
30
+ no_cuda=True,
31
+ report_to="none",
32
+ )
33
+
34
+ trainer = Trainer(
35
+ model=model,
36
+ args=training_args,
37
+ train_dataset=dataset,
38
+ data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
39
+ )
40
+
41
+ trainer.train()
42
+ trainer.save_model("/tmp/forge_output/checkpoint")
43
+ print("TRAINING_COMPLETE")
forgeenv-space/forgeenv/tasks/seed_corpus/logistic_classifier.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Sklearn logistic-regression baseline on a 500-sample tabular task.
2
+
3
+ Sanity baseline that doesn't require torch / transformers / datasets.
4
+ """
5
+ import json
6
+ import pickle
7
+ from pathlib import Path
8
+
9
+ import numpy as np
10
+ from sklearn.datasets import make_classification
11
+ from sklearn.linear_model import LogisticRegression
12
+ from sklearn.model_selection import train_test_split
13
+
14
+ X, y = make_classification(
15
+ n_samples=500, n_features=20, n_informative=10, random_state=0
16
+ )
17
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
18
+
19
+ model = LogisticRegression(max_iter=200)
20
+ for step in range(1, 11):
21
+ model.set_params(max_iter=step * 20)
22
+ model.fit(X_train, y_train)
23
+ train_loss = -np.mean(np.log(np.maximum(model.predict_proba(X_train)[np.arange(len(y_train)), y_train], 1e-9)))
24
+ print(f"step={step} loss={train_loss:.4f}")
25
+
26
+ acc = model.score(X_test, y_test)
27
+ print(f"eval_accuracy={acc:.4f}")
28
+
29
+ ckpt_dir = Path("/tmp/forge_output/checkpoint")
30
+ ckpt_dir.mkdir(parents=True, exist_ok=True)
31
+ with open(ckpt_dir / "logreg.pkl", "wb") as f:
32
+ pickle.dump(model, f)
33
+ with open(ckpt_dir / "metrics.json", "w") as f:
34
+ json.dump({"accuracy": acc}, f)
35
+
36
+ print("TRAINING_COMPLETE")
forgeenv-space/forgeenv/tasks/seed_corpus/roberta_sentiment.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """DistilRoberta sentiment classification on 400-sample IMDB subset."""
2
+ from transformers import (
3
+ AutoTokenizer,
4
+ AutoModelForSequenceClassification,
5
+ Trainer,
6
+ TrainingArguments,
7
+ )
8
+ from datasets import load_dataset
9
+
10
+ dataset = load_dataset("imdb", split="train[:400]")
11
+ tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
12
+
13
+
14
+ def tokenize(examples):
15
+ return tokenizer(
16
+ examples["text"],
17
+ padding="max_length",
18
+ truncation=True,
19
+ max_length=64,
20
+ )
21
+
22
+
23
+ dataset = dataset.map(tokenize, batched=True)
24
+ dataset = dataset.rename_column("label", "labels")
25
+ dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
26
+
27
+ model = AutoModelForSequenceClassification.from_pretrained(
28
+ "distilroberta-base", num_labels=2
29
+ )
30
+
31
+ training_args = TrainingArguments(
32
+ output_dir="/tmp/forge_output/checkpoint",
33
+ num_train_epochs=1,
34
+ per_device_train_batch_size=8,
35
+ logging_steps=5,
36
+ save_strategy="epoch",
37
+ no_cuda=True,
38
+ report_to="none",
39
+ )
40
+
41
+ trainer = Trainer(model=model, args=training_args, train_dataset=dataset)
42
+ trainer.train()
43
+ trainer.save_model("/tmp/forge_output/checkpoint")
44
+ print("TRAINING_COMPLETE")