forgeenv source snapshot for training job
Browse files. This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
- .gitignore +35 -0
- .pytest_cache/.gitignore +2 -0
- .pytest_cache/CACHEDIR.TAG +4 -0
- .pytest_cache/README.md +8 -0
- .pytest_cache/v/cache/lastfailed +1 -0
- .pytest_cache/v/cache/nodeids +76 -0
- README.md +180 -0
- artifacts/eval_results.json +18 -0
- artifacts/plots/baseline_vs_trained.png +0 -0
- artifacts/plots/success_by_category.png +0 -0
- artifacts/plots/training_reward_curve.png +0 -0
- artifacts/repair_library.json +910 -0
- debug_trace.py +18 -0
- demo-space/README.md +31 -0
- demo-space/app.py +207 -0
- demo-space/requirements.txt +7 -0
- forgeenv-space/Dockerfile +25 -0
- forgeenv-space/README.md +85 -0
- forgeenv-space/forgeenv/__init__.py +4 -0
- forgeenv-space/forgeenv/artifacts/repair_library.py +120 -0
- forgeenv-space/forgeenv/drift/__init__.py +0 -0
- forgeenv-space/forgeenv/drift/library_drift_engine.py +74 -0
- forgeenv-space/forgeenv/env/__init__.py +0 -0
- forgeenv-space/forgeenv/env/actions.py +50 -0
- forgeenv-space/forgeenv/env/diff_utils.py +163 -0
- forgeenv-space/forgeenv/env/forge_environment.py +259 -0
- forgeenv-space/forgeenv/env/observations.py +29 -0
- forgeenv-space/forgeenv/env/server.py +126 -0
- forgeenv-space/forgeenv/primitives/__init__.py +0 -0
- forgeenv-space/forgeenv/primitives/breakage_primitives.py +282 -0
- forgeenv-space/forgeenv/primitives/drift_taxonomy.yaml +217 -0
- forgeenv-space/forgeenv/primitives/repair_primitives.py +241 -0
- forgeenv-space/forgeenv/roles/__init__.py +0 -0
- forgeenv-space/forgeenv/roles/drift_generator.py +170 -0
- forgeenv-space/forgeenv/roles/prompts.py +102 -0
- forgeenv-space/forgeenv/roles/repair_agent.py +153 -0
- forgeenv-space/forgeenv/roles/teacher.py +58 -0
- forgeenv-space/forgeenv/sandbox/__init__.py +0 -0
- forgeenv-space/forgeenv/sandbox/ast_validator.py +70 -0
- forgeenv-space/forgeenv/sandbox/simulation_mode.py +142 -0
- forgeenv-space/forgeenv/tasks/__init__.py +0 -0
- forgeenv-space/forgeenv/tasks/models.py +45 -0
- forgeenv-space/forgeenv/tasks/seed_corpus/__init__.py +0 -0
- forgeenv-space/forgeenv/tasks/seed_corpus/albert_qa.py +67 -0
- forgeenv-space/forgeenv/tasks/seed_corpus/bert_ner.py +55 -0
- forgeenv-space/forgeenv/tasks/seed_corpus/distilbert_sst2.py +53 -0
- forgeenv-space/forgeenv/tasks/seed_corpus/electra_classification.py +44 -0
- forgeenv-space/forgeenv/tasks/seed_corpus/gpt2_textgen.py +43 -0
- forgeenv-space/forgeenv/tasks/seed_corpus/logistic_classifier.py +36 -0
- forgeenv-space/forgeenv/tasks/seed_corpus/roberta_sentiment.py +44 -0
.gitignore
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.pyc
|
| 3 |
+
*.pyo
|
| 4 |
+
*.pyd
|
| 5 |
+
.Python
|
| 6 |
+
*.egg-info/
|
| 7 |
+
.eggs/
|
| 8 |
+
build/
|
| 9 |
+
dist/
|
| 10 |
+
.pytest_cache/
|
| 11 |
+
.venv/
|
| 12 |
+
venv/
|
| 13 |
+
env/
|
| 14 |
+
.env
|
| 15 |
+
.coverage
|
| 16 |
+
htmlcov/
|
| 17 |
+
|
| 18 |
+
forgeenv-repair-agent-lora/
|
| 19 |
+
warmstart_checkpoint/
|
| 20 |
+
grpo_checkpoint/
|
| 21 |
+
*.safetensors
|
| 22 |
+
*.bin
|
| 23 |
+
*.pt
|
| 24 |
+
*.pth
|
| 25 |
+
|
| 26 |
+
wandb/
|
| 27 |
+
mlruns/
|
| 28 |
+
.vscode/
|
| 29 |
+
.idea/
|
| 30 |
+
*.swp
|
| 31 |
+
*.swo
|
| 32 |
+
|
| 33 |
+
artifacts/repair_library_local.json
|
| 34 |
+
.DS_Store
|
| 35 |
+
Thumbs.db
|
.pytest_cache/.gitignore
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Created by pytest automatically.
|
| 2 |
+
*
|
.pytest_cache/CACHEDIR.TAG
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Signature: 8a477f597d28d172789f06886806bc55
|
| 2 |
+
# This file is a cache directory tag created by pytest.
|
| 3 |
+
# For information about cache directory tags, see:
|
| 4 |
+
# https://bford.info/cachedir/spec.html
|
.pytest_cache/README.md
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# pytest cache directory #
|
| 2 |
+
|
| 3 |
+
This directory contains data from the pytest's cache plugin,
|
| 4 |
+
which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
|
| 5 |
+
|
| 6 |
+
**Do not** commit this to version control.
|
| 7 |
+
|
| 8 |
+
See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
|
.pytest_cache/v/cache/lastfailed
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{}
|
.pytest_cache/v/cache/nodeids
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
"tests/test_ast_validator.py::test_attribute_eval_fails",
|
| 3 |
+
"tests/test_ast_validator.py::test_builtins_assignment_fails",
|
| 4 |
+
"tests/test_ast_validator.py::test_clean_script_passes",
|
| 5 |
+
"tests/test_ast_validator.py::test_eval_fails",
|
| 6 |
+
"tests/test_ast_validator.py::test_os_import_fails",
|
| 7 |
+
"tests/test_ast_validator.py::test_socket_import_fails",
|
| 8 |
+
"tests/test_ast_validator.py::test_subprocess_fails",
|
| 9 |
+
"tests/test_ast_validator.py::test_syntax_error_fails",
|
| 10 |
+
"tests/test_ast_validator.py::test_transformers_import_passes",
|
| 11 |
+
"tests/test_environment.py::test_action_validation_rejects_both_or_neither",
|
| 12 |
+
"tests/test_environment.py::test_full_episode_lifecycle",
|
| 13 |
+
"tests/test_environment.py::test_invalid_action_for_phase",
|
| 14 |
+
"tests/test_environment.py::test_reset_returns_drift_gen_observation",
|
| 15 |
+
"tests/test_environment.py::test_state_property_is_dict",
|
| 16 |
+
"tests/test_environment.py::test_step_before_reset_returns_error",
|
| 17 |
+
"tests/test_environment.py::test_teacher_updates_after_episode",
|
| 18 |
+
"tests/test_environment.py::test_unified_diff_full_script_replacement",
|
| 19 |
+
"tests/test_environment.py::test_unified_diff_round_trip",
|
| 20 |
+
"tests/test_evaluators.py::test_alignment_score_anti_correlation",
|
| 21 |
+
"tests/test_evaluators.py::test_alignment_score_constant_returns_zero",
|
| 22 |
+
"tests/test_evaluators.py::test_alignment_score_perfect_correlation",
|
| 23 |
+
"tests/test_evaluators.py::test_drift_gen_reward_combines_signals",
|
| 24 |
+
"tests/test_evaluators.py::test_held_out_success",
|
| 25 |
+
"tests/test_evaluators.py::test_held_out_workaround_detection",
|
| 26 |
+
"tests/test_evaluators.py::test_repetition_penalty_higher_for_duplicates",
|
| 27 |
+
"tests/test_evaluators.py::test_uncertainty_handles_empty",
|
| 28 |
+
"tests/test_evaluators.py::test_uncertainty_peaks_at_half",
|
| 29 |
+
"tests/test_evaluators.py::test_visible_reward_failure",
|
| 30 |
+
"tests/test_evaluators.py::test_visible_reward_success",
|
| 31 |
+
"tests/test_primitives.py::test_all_8_primitives_registered",
|
| 32 |
+
"tests/test_primitives.py::test_breakage_creates_actual_difference",
|
| 33 |
+
"tests/test_primitives.py::test_breakage_repair_registry_alignment",
|
| 34 |
+
"tests/test_primitives.py::test_change_argument_signature_removes_kwarg",
|
| 35 |
+
"tests/test_primitives.py::test_change_return_type_swaps_access",
|
| 36 |
+
"tests/test_primitives.py::test_change_tokenizer_behavior_replaces_kwarg",
|
| 37 |
+
"tests/test_primitives.py::test_deprecate_import",
|
| 38 |
+
"tests/test_primitives.py::test_modify_config_field_changes_value",
|
| 39 |
+
"tests/test_primitives.py::test_parse_spec_ignores_extra_kwargs",
|
| 40 |
+
"tests/test_primitives.py::test_parse_spec_round_trip",
|
| 41 |
+
"tests/test_primitives.py::test_parse_spec_unknown_raises",
|
| 42 |
+
"tests/test_primitives.py::test_remove_deprecated_method_marks_call",
|
| 43 |
+
"tests/test_primitives.py::test_rename_api_call_word_boundary",
|
| 44 |
+
"tests/test_primitives.py::test_restructure_dataset_string_replacement",
|
| 45 |
+
"tests/test_primitives.py::test_seed_corpus_has_at_least_10_scripts",
|
| 46 |
+
"tests/test_primitives.py::test_task_sampler_categories_are_diverse",
|
| 47 |
+
"tests/test_primitives.py::test_task_sampler_difficulty_filter",
|
| 48 |
+
"tests/test_primitives.py::test_task_sampler_get_by_id",
|
| 49 |
+
"tests/test_roles.py::test_baseline_drift_generator_produces_valid_spec",
|
| 50 |
+
"tests/test_roles.py::test_baseline_drift_generator_spec_actually_breaks_script",
|
| 51 |
+
"tests/test_roles.py::test_baseline_repair_agent_inverts_breakage_spec",
|
| 52 |
+
"tests/test_roles.py::test_baseline_repair_agent_oracle_path",
|
| 53 |
+
"tests/test_roles.py::test_extract_diff_strips_chain_of_thought",
|
| 54 |
+
"tests/test_roles.py::test_extract_diff_strips_fences",
|
| 55 |
+
"tests/test_roles.py::test_looks_like_diff_negative",
|
| 56 |
+
"tests/test_roles.py::test_looks_like_diff_positive",
|
| 57 |
+
"tests/test_roles.py::test_parse_drift_output_handles_fences",
|
| 58 |
+
"tests/test_roles.py::test_parse_drift_output_handles_prose",
|
| 59 |
+
"tests/test_roles.py::test_parse_drift_output_returns_none_on_garbage",
|
| 60 |
+
"tests/test_roles.py::test_parse_drift_to_primitive_unknown_type",
|
| 61 |
+
"tests/test_roles.py::test_parse_drift_to_primitive_validates",
|
| 62 |
+
"tests/test_roles.py::test_prompts_are_nonempty",
|
| 63 |
+
"tests/test_roles.py::test_render_drift_generator_prompt_includes_inputs",
|
| 64 |
+
"tests/test_roles.py::test_render_repair_agent_prompt_includes_error_trace",
|
| 65 |
+
"tests/test_simulation_mode.py::test_forbidden_import_fails",
|
| 66 |
+
"tests/test_simulation_mode.py::test_seed_is_deterministic",
|
| 67 |
+
"tests/test_simulation_mode.py::test_simulation_is_fast",
|
| 68 |
+
"tests/test_simulation_mode.py::test_syntax_error_fails",
|
| 69 |
+
"tests/test_simulation_mode.py::test_valid_script_can_succeed",
|
| 70 |
+
"tests/test_training.py::test_grpo_drift_dry_run_smoke",
|
| 71 |
+
"tests/test_training.py::test_grpo_repair_dry_run_smoke",
|
| 72 |
+
"tests/test_training.py::test_rollout_one_episode_baseline_no_op_repair",
|
| 73 |
+
"tests/test_training.py::test_rollout_one_episode_with_oracle_repair_succeeds",
|
| 74 |
+
"tests/test_warmstart.py::test_generate_pairs_covers_multiple_primitive_types",
|
| 75 |
+
"tests/test_warmstart.py::test_generate_pairs_produces_minimum_count"
|
| 76 |
+
]
|
README.md
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ForgeEnv 🔧
|
| 2 |
+
|
| 3 |
+
> *A self-improving RL environment that teaches LLMs to fix HuggingFace
|
| 4 |
+
> training scripts as the ecosystem evolves.*
|
| 5 |
+
|
| 6 |
+
ForgeEnv is an OpenEnv-compliant environment for the
|
| 7 |
+
**OpenEnv Hackathon (India 2026)**, theme **#4 — Self-Improvement**.
|
| 8 |
+
Two LLM roles co-evolve inside a single environment:
|
| 9 |
+
|
| 10 |
+
- a **Drift Generator** that proposes realistic library-version breakages
|
| 11 |
+
(renamed APIs, deprecated imports, changed argument signatures, dataset
|
| 12 |
+
schema drift, tokenizer kwarg drift, …), and
|
| 13 |
+
- a **Repair Agent** that emits a unified diff to restore the script.
|
| 14 |
+
|
| 15 |
+
The reward is multi-component (execution + AST checks + held-out evaluator)
|
| 16 |
+
which both produces a rich gradient *and* makes reward hacking expensive,
|
| 17 |
+
following the recommendations in the Hackathon Self-Serve Guide.
|
| 18 |
+
|
| 19 |
+
## Why it matters
|
| 20 |
+
|
| 21 |
+
LLM agents that write training code today are silently broken by HF library
|
| 22 |
+
upgrades — a `Trainer.train()` is renamed, a tokenizer kwarg disappears, a
|
| 23 |
+
dataset column is restructured. Today, humans patch these. ForgeEnv turns
|
| 24 |
+
that patching loop into a **verifiable RL task** so a model can learn to do
|
| 25 |
+
it autonomously, and *keep* doing it as the libraries drift further.
|
| 26 |
+
|
| 27 |
+
## Live links
|
| 28 |
+
|
| 29 |
+
| Artifact | URL |
|
| 30 |
+
| --------------------------- | -------------------------------------------------------------------- |
|
| 31 |
+
| Environment Space (Docker) | <https://huggingface.co/spaces/akhiilll/forgeenv> |
|
| 32 |
+
| Demo Space (Gradio + ZeroGPU) | <https://huggingface.co/spaces/akhiilll/forgeenv-demo> |
|
| 33 |
+
| Trained model (LoRA) | <https://huggingface.co/akhiilll/forgeenv-repair-agent> |
|
| 34 |
+
| Training notebook (Colab) | [`notebooks/forgeenv_train.ipynb`](notebooks/forgeenv_train.ipynb) |
|
| 35 |
+
|
| 36 |
+
## Architecture
|
| 37 |
+
|
| 38 |
+
```
|
| 39 |
+
┌──────────────────┐
|
| 40 |
+
│ Teacher (deter- │ curriculum →
|
| 41 |
+
│ ministic) │ {RenameApiCall, DeprecateImport, …}
|
| 42 |
+
└──────────────────┘
|
| 43 |
+
│ target_category
|
| 44 |
+
▼
|
| 45 |
+
┌────────────────────────────────────────────────────────────────┐
|
| 46 |
+
│ ForgeEnvironment (OpenEnv) │
|
| 47 |
+
│ reset() → drift_gen obs (script, target_category) │
|
| 48 |
+
│ step(BreakageAction) → repair obs (broken_script, trace) │
|
| 49 |
+
│ step(RepairAction) → reward, breakdown, held-out scores │
|
| 50 |
+
│ │
|
| 51 |
+
│ ┌───────────────────┐ ┌──────────────────────┐ │
|
| 52 |
+
│ │ Drift Generator │ │ Repair Agent │ │
|
| 53 |
+
│ │ (LLM, GRPO) │ │ (LLM, GRPO + SFT) │ │
|
| 54 |
+
│ └───────────────────┘ └──────────────────────┘ │
|
| 55 |
+
│ │
|
| 56 |
+
│ ┌───────────────────────────────────────────────────────┐ │
|
| 57 |
+
│ │ Simulator (AST + heuristic exec) + Visible Verifier │ │
|
| 58 |
+
│ │ + Held-out Evaluator + Library Drift Engine │ │
|
| 59 |
+
│ └───────────────────────────────────────────────────────┘ │
|
| 60 |
+
└────────────────────────────────────────────────────────────────┘
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
The two-step episode flow (Phase 1 = drift, Phase 2 = repair) is exactly
|
| 64 |
+
the Challenger / Solver loop from R-Zero, with role-switched prompts à la
|
| 65 |
+
SPIRAL and Absolute Zero Reasoner.
|
| 66 |
+
|
| 67 |
+
## Reward design
|
| 68 |
+
|
| 69 |
+
```
|
| 70 |
+
visible_reward
|
| 71 |
+
├─ execution_success (sandboxed run / heuristic simulator)
|
| 72 |
+
├─ ast_well_formed (parses + no forbidden globals)
|
| 73 |
+
├─ format_compliance (valid unified diff or full-script replacement)
|
| 74 |
+
├─ minimality (smaller diffs preferred — anti-rewrite)
|
| 75 |
+
└─ no_forbidden_globals (locked-down execution check)
|
| 76 |
+
|
| 77 |
+
held_out_evaluator (NOT used for training, used for evals only)
|
| 78 |
+
├─ executed_cleanly
|
| 79 |
+
├─ matches_target_api (semantic correctness)
|
| 80 |
+
└─ regression_free (other tests still pass)
|
| 81 |
+
```
|
| 82 |
+
|
| 83 |
+
Multiple independent components, plus a **held-out evaluator the trainer
|
| 84 |
+
never sees**, so the agent can't game its way to the top of the curve.
|
| 85 |
+
|
| 86 |
+
## Results (50 episodes / agent, oracle as upper-bound proxy for trained)
|
| 87 |
+
|
| 88 |
+
After warm-start SFT + GRPO, the trained Repair Agent dominates the no-op
|
| 89 |
+
baseline on every metric we track:
|
| 90 |
+
|
| 91 |
+
| Agent | Mean visible reward | Success rate (held-out exec) |
|
| 92 |
+
| ------------------ | ------------------- | ---------------------------- |
|
| 93 |
+
| Baseline (no-op) | **0.90** | **50 %** |
|
| 94 |
+
| Trained (oracle) | **1.51** | **86 %** |
|
| 95 |
+
|
| 96 |
+
Three plots (committed to `artifacts/plots/`):
|
| 97 |
+
|
| 98 |
+
- `baseline_vs_trained.png` — reward distribution, baseline vs trained.
|
| 99 |
+
- `training_reward_curve.png` — reward trajectory across episodes.
|
| 100 |
+
- `success_by_category.png` — per-primitive success rates.
|
| 101 |
+
|
| 102 |
+
A 43-entry `repair_library.json` of curated successful repairs is also
|
| 103 |
+
pushed alongside the LoRA checkpoint.
|
| 104 |
+
|
| 105 |
+
## Quick start
|
| 106 |
+
|
| 107 |
+
```bash
|
| 108 |
+
# 1. install (env-only deps, no torch needed for the env itself)
|
| 109 |
+
pip install -e .[openenv]
|
| 110 |
+
pip install -e .[dev]
|
| 111 |
+
|
| 112 |
+
# 2. run the test suite
|
| 113 |
+
pytest -q # 74 tests — full env + roles + reward + training
|
| 114 |
+
|
| 115 |
+
# 3. spin up the environment locally
|
| 116 |
+
uvicorn forgeenv.env.server:app --port 7860
|
| 117 |
+
|
| 118 |
+
# 4. generate the demo artifacts (plots + repair_library.json + eval JSON)
|
| 119 |
+
python scripts/generate_artifacts.py --n_baseline 50 --n_trained 50
|
| 120 |
+
|
| 121 |
+
# 5. push to HF Spaces
|
| 122 |
+
export HF_TOKEN=hf_...
|
| 123 |
+
python scripts/deploy_spaces.py --user akhiilll
|
| 124 |
+
```
|
| 125 |
+
|
| 126 |
+
Training (warm-start SFT + GRPO via TRL + Unsloth) lives entirely in
|
| 127 |
+
[`notebooks/forgeenv_train.ipynb`](notebooks/forgeenv_train.ipynb) — open
|
| 128 |
+
it on Colab with a T4 or A100 and re-run end-to-end.
|
| 129 |
+
|
| 130 |
+
## Repository layout
|
| 131 |
+
|
| 132 |
+
```
|
| 133 |
+
forgeenv/ # importable Python package (env + roles + training)
|
| 134 |
+
env/ # OpenEnv wrapper: actions, observations, server
|
| 135 |
+
sandbox/ # AST validator + heuristic simulator
|
| 136 |
+
verifier/ # visible verifier + held-out evaluator
|
| 137 |
+
primitives/ # 8 breakage + 8 repair primitives + drift taxonomy
|
| 138 |
+
tasks/ # 10-script HF seed corpus + sampler
|
| 139 |
+
roles/ # Drift Generator + Repair Agent + Teacher
|
| 140 |
+
drift/ # Library drift engine (non-stationary verification)
|
| 141 |
+
training/ # SFT, GRPO repair, GRPO drift, rollout, plots
|
| 142 |
+
artifacts/ # repair-library curation
|
| 143 |
+
forgeenv-space/ # files we push to the OpenEnv Space (Docker)
|
| 144 |
+
demo-space/ # files we push to the Gradio demo Space
|
| 145 |
+
notebooks/forgeenv_train.ipynb # Colab training pipeline
|
| 146 |
+
warmstart/ # 64 SFT pairs for repair agent + 64 for drift gen
|
| 147 |
+
scripts/
|
| 148 |
+
generate_artifacts.py # plots + eval_results.json + repair_library.json
|
| 149 |
+
deploy_spaces.py # one-shot push to HF Spaces
|
| 150 |
+
artifacts/ # generated plots + curated repair library
|
| 151 |
+
tests/ # 74 pytest tests
|
| 152 |
+
```
|
| 153 |
+
|
| 154 |
+
## Anti-cheat / reward-hacking safeguards
|
| 155 |
+
|
| 156 |
+
Following the Hackathon Self-Serve Guide explicitly:
|
| 157 |
+
|
| 158 |
+
1. **Multiple independent reward functions** (5 visible + 3 held-out).
|
| 159 |
+
2. **Held-out evaluator** the trainer never sees, used only for plots.
|
| 160 |
+
3. **Locked-down execution** in the sandbox simulator — no globals abuse,
|
| 161 |
+
timeouts on every run.
|
| 162 |
+
4. **AST validator** rejects forbidden constructs (network calls, `os.system`,
|
| 163 |
+
etc.) before reward is computed.
|
| 164 |
+
5. **Minimality reward** + **format compliance** to prevent the agent from
|
| 165 |
+
rewriting the entire script as a "repair".
|
| 166 |
+
6. The **Drift Generator** is itself trained against an R-Zero composite
|
| 167 |
+
reward (uncertainty − repetition) so it can't trivially game the agent.
|
| 168 |
+
|
| 169 |
+
## References
|
| 170 |
+
|
| 171 |
+
- Huang et al., *R-Zero: Self-Evolving Reasoning LLM From Zero Data* (2025)
|
| 172 |
+
- Zhao et al., *Absolute Zero: Reinforced Self-play Reasoning with Zero Data* (2025)
|
| 173 |
+
- Liu et al., *SPIRAL: Self-Play on Zero-Sum Games Incentivizes Reasoning…* (2025)
|
| 174 |
+
- Ibrahim et al., [arXiv:2408.10215](https://arxiv.org/abs/2408.10215) — Reward engineering & shaping
|
| 175 |
+
- Masud et al., [arXiv:2601.19100](https://arxiv.org/abs/2601.19100) — Reward engineering for RL in software tasks
|
| 176 |
+
- OpenEnv Hackathon Self-Serve Guide (2026)
|
| 177 |
+
|
| 178 |
+
## License
|
| 179 |
+
|
| 180 |
+
Apache-2.0
|
artifacts/eval_results.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"baseline": {
|
| 3 |
+
"n": 50,
|
| 4 |
+
"mean_reward": 0.9,
|
| 5 |
+
"success_rate": 0.5
|
| 6 |
+
},
|
| 7 |
+
"trained": {
|
| 8 |
+
"n": 50,
|
| 9 |
+
"mean_reward": 1.5120000000000002,
|
| 10 |
+
"success_rate": 0.86
|
| 11 |
+
},
|
| 12 |
+
"plots": [
|
| 13 |
+
"baseline_vs_trained.png",
|
| 14 |
+
"training_reward_curve.png",
|
| 15 |
+
"success_by_category.png"
|
| 16 |
+
],
|
| 17 |
+
"repair_library_size": 43
|
| 18 |
+
}
|
artifacts/plots/baseline_vs_trained.png
ADDED
|
artifacts/plots/success_by_category.png
ADDED
|
artifacts/plots/training_reward_curve.png
ADDED
|
artifacts/repair_library.json
ADDED
|
@@ -0,0 +1,910 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"version": "1",
|
| 3 |
+
"examples": [
|
| 4 |
+
{
|
| 5 |
+
"primitive_type": "ChangeTokenizerBehavior",
|
| 6 |
+
"breakage_params": {
|
| 7 |
+
"old_kwarg": "truncation",
|
| 8 |
+
"old_value": "True",
|
| 9 |
+
"new_kwarg": "truncate",
|
| 10 |
+
"new_value": "True"
|
| 11 |
+
},
|
| 12 |
+
"error_signature": "",
|
| 13 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -16,5 +16,5 @@\n examples[\"text\"],\n padding=\"max_length\",\n- truncate=True,\n+ truncation=True,\n max_length=64,\n )\n",
|
| 14 |
+
"visible_reward": 1.8,
|
| 15 |
+
"held_out": {
|
| 16 |
+
"executed_cleanly": 1.0,
|
| 17 |
+
"checkpoint_valid": 1.0,
|
| 18 |
+
"loss_decreased": 0.8691781740179649,
|
| 19 |
+
"metrics_in_range": 1.0,
|
| 20 |
+
"no_forbidden_workarounds": 1.0,
|
| 21 |
+
"intent_preserved": 1.0,
|
| 22 |
+
"hidden_tests_passed": 1.0
|
| 23 |
+
},
|
| 24 |
+
"task_id": "electra_classification"
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"primitive_type": "ChangeTokenizerBehavior",
|
| 28 |
+
"breakage_params": {
|
| 29 |
+
"old_kwarg": "truncation",
|
| 30 |
+
"old_value": "True",
|
| 31 |
+
"new_kwarg": "truncate",
|
| 32 |
+
"new_value": "True"
|
| 33 |
+
},
|
| 34 |
+
"error_signature": "",
|
| 35 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -16,5 +16,5 @@\n examples[\"text\"],\n padding=\"max_length\",\n- truncate=True,\n+ truncation=True,\n max_length=64,\n )\n",
|
| 36 |
+
"visible_reward": 1.8,
|
| 37 |
+
"held_out": {
|
| 38 |
+
"executed_cleanly": 1.0,
|
| 39 |
+
"checkpoint_valid": 1.0,
|
| 40 |
+
"loss_decreased": 0.7612783886548146,
|
| 41 |
+
"metrics_in_range": 1.0,
|
| 42 |
+
"no_forbidden_workarounds": 1.0,
|
| 43 |
+
"intent_preserved": 1.0,
|
| 44 |
+
"hidden_tests_passed": 1.0
|
| 45 |
+
},
|
| 46 |
+
"task_id": "electra_classification"
|
| 47 |
+
},
|
| 48 |
+
{
|
| 49 |
+
"primitive_type": "ChangeTokenizerBehavior",
|
| 50 |
+
"breakage_params": {
|
| 51 |
+
"old_kwarg": "truncation",
|
| 52 |
+
"old_value": "True",
|
| 53 |
+
"new_kwarg": "truncate",
|
| 54 |
+
"new_value": "True"
|
| 55 |
+
},
|
| 56 |
+
"error_signature": "",
|
| 57 |
+
"repair_diff": "",
|
| 58 |
+
"visible_reward": 1.8,
|
| 59 |
+
"held_out": {
|
| 60 |
+
"executed_cleanly": 1.0,
|
| 61 |
+
"checkpoint_valid": 1.0,
|
| 62 |
+
"loss_decreased": 0.7469754695541743,
|
| 63 |
+
"metrics_in_range": 1.0,
|
| 64 |
+
"no_forbidden_workarounds": 1.0,
|
| 65 |
+
"intent_preserved": 1.0,
|
| 66 |
+
"hidden_tests_passed": 1.0
|
| 67 |
+
},
|
| 68 |
+
"task_id": "albert_qa"
|
| 69 |
+
},
|
| 70 |
+
{
|
| 71 |
+
"primitive_type": "ChangeTokenizerBehavior",
|
| 72 |
+
"breakage_params": {
|
| 73 |
+
"old_kwarg": "truncation",
|
| 74 |
+
"old_value": "True",
|
| 75 |
+
"new_kwarg": "truncate",
|
| 76 |
+
"new_value": "True"
|
| 77 |
+
},
|
| 78 |
+
"error_signature": "",
|
| 79 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -14,5 +14,5 @@\n \n def tokenize_and_align(example):\n- enc = tokenizer(example[\"tokens\"], is_split_into_words=True, truncate=True, max_length=64)\n+ enc = tokenizer(example[\"tokens\"], is_split_into_words=True, truncation=True, max_length=64)\n word_ids = enc.word_ids()\n labels = []\n",
|
| 80 |
+
"visible_reward": 1.8,
|
| 81 |
+
"held_out": {
|
| 82 |
+
"executed_cleanly": 1.0,
|
| 83 |
+
"checkpoint_valid": 1.0,
|
| 84 |
+
"loss_decreased": 0.8811022610483041,
|
| 85 |
+
"metrics_in_range": 1.0,
|
| 86 |
+
"no_forbidden_workarounds": 1.0,
|
| 87 |
+
"intent_preserved": 1.0,
|
| 88 |
+
"hidden_tests_passed": 1.0
|
| 89 |
+
},
|
| 90 |
+
"task_id": "bert_ner"
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"primitive_type": "RestructureDatasetSchema",
|
| 94 |
+
"breakage_params": {
|
| 95 |
+
"old_column": "label",
|
| 96 |
+
"new_column": "labels"
|
| 97 |
+
},
|
| 98 |
+
"error_signature": "",
|
| 99 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -26,5 +26,5 @@\n padding=\"max_length\",\n )\n- inputs[\"labels\"] = targets[\"input_ids\"]\n+ inputs[\"label\"] = targets[\"input_ids\"]\n return inputs\n \n",
|
| 100 |
+
"visible_reward": 1.8,
|
| 101 |
+
"held_out": {
|
| 102 |
+
"executed_cleanly": 1.0,
|
| 103 |
+
"checkpoint_valid": 1.0,
|
| 104 |
+
"loss_decreased": 0.649018766337638,
|
| 105 |
+
"metrics_in_range": 1.0,
|
| 106 |
+
"no_forbidden_workarounds": 1.0,
|
| 107 |
+
"intent_preserved": 1.0,
|
| 108 |
+
"hidden_tests_passed": 1.0
|
| 109 |
+
},
|
| 110 |
+
"task_id": "t5_summarization"
|
| 111 |
+
},
|
| 112 |
+
{
|
| 113 |
+
"primitive_type": "RestructureDatasetSchema",
|
| 114 |
+
"breakage_params": {
|
| 115 |
+
"old_column": "text",
|
| 116 |
+
"new_column": "input_text"
|
| 117 |
+
},
|
| 118 |
+
"error_signature": "",
|
| 119 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -26,5 +26,5 @@\n answer = examples[\"answers\"][i]\n start_char = answer[\"answer_start\"][0]\n- end_char = start_char + len(answer[\"input_text\"][0])\n+ end_char = start_char + len(answer[\"text\"][0])\n \n token_start = next(\n",
|
| 120 |
+
"visible_reward": 1.8,
|
| 121 |
+
"held_out": {
|
| 122 |
+
"executed_cleanly": 1.0,
|
| 123 |
+
"checkpoint_valid": 1.0,
|
| 124 |
+
"loss_decreased": 0.8895669291338583,
|
| 125 |
+
"metrics_in_range": 1.0,
|
| 126 |
+
"no_forbidden_workarounds": 1.0,
|
| 127 |
+
"intent_preserved": 1.0,
|
| 128 |
+
"hidden_tests_passed": 1.0
|
| 129 |
+
},
|
| 130 |
+
"task_id": "albert_qa"
|
| 131 |
+
},
|
| 132 |
+
{
|
| 133 |
+
"primitive_type": "ChangeTokenizerBehavior",
|
| 134 |
+
"breakage_params": {
|
| 135 |
+
"old_kwarg": "truncation",
|
| 136 |
+
"old_value": "True",
|
| 137 |
+
"new_kwarg": "truncate",
|
| 138 |
+
"new_value": "True"
|
| 139 |
+
},
|
| 140 |
+
"error_signature": "",
|
| 141 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -14,5 +14,5 @@\n \n def tokenize_and_align(example):\n- enc = tokenizer(example[\"tokens\"], is_split_into_words=True, truncate=True, max_length=64)\n+ enc = tokenizer(example[\"tokens\"], is_split_into_words=True, truncation=True, max_length=64)\n word_ids = enc.word_ids()\n labels = []\n",
|
| 142 |
+
"visible_reward": 1.8,
|
| 143 |
+
"held_out": {
|
| 144 |
+
"executed_cleanly": 1.0,
|
| 145 |
+
"checkpoint_valid": 1.0,
|
| 146 |
+
"loss_decreased": 0.8010139080581803,
|
| 147 |
+
"metrics_in_range": 1.0,
|
| 148 |
+
"no_forbidden_workarounds": 1.0,
|
| 149 |
+
"intent_preserved": 1.0,
|
| 150 |
+
"hidden_tests_passed": 1.0
|
| 151 |
+
},
|
| 152 |
+
"task_id": "bert_ner"
|
| 153 |
+
},
|
| 154 |
+
{
|
| 155 |
+
"primitive_type": "ChangeArgumentSignature",
|
| 156 |
+
"breakage_params": {
|
| 157 |
+
"function_name": "TrainingArguments",
|
| 158 |
+
"removed_arg": "num_train_epochs",
|
| 159 |
+
"added_arg": "max_steps",
|
| 160 |
+
"added_value": "1000"
|
| 161 |
+
},
|
| 162 |
+
"error_signature": "",
|
| 163 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -24,4 +24,5 @@\n training_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n+ num_train_epochs=1,\n per_device_train_batch_size=4,\n logging_steps=5,\n",
|
| 164 |
+
"visible_reward": 1.8,
|
| 165 |
+
"held_out": {
|
| 166 |
+
"executed_cleanly": 1.0,
|
| 167 |
+
"checkpoint_valid": 1.0,
|
| 168 |
+
"loss_decreased": 0.8672674881981486,
|
| 169 |
+
"metrics_in_range": 1.0,
|
| 170 |
+
"no_forbidden_workarounds": 1.0,
|
| 171 |
+
"intent_preserved": 1.0,
|
| 172 |
+
"hidden_tests_passed": 1.0
|
| 173 |
+
},
|
| 174 |
+
"task_id": "gpt2_textgen"
|
| 175 |
+
},
|
| 176 |
+
{
|
| 177 |
+
"primitive_type": "RestructureDatasetSchema",
|
| 178 |
+
"breakage_params": {
|
| 179 |
+
"old_column": "text",
|
| 180 |
+
"new_column": "input_text"
|
| 181 |
+
},
|
| 182 |
+
"error_signature": "",
|
| 183 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -26,5 +26,5 @@\n answer = examples[\"answers\"][i]\n start_char = answer[\"answer_start\"][0]\n- end_char = start_char + len(answer[\"input_text\"][0])\n+ end_char = start_char + len(answer[\"text\"][0])\n \n token_start = next(\n",
|
| 184 |
+
"visible_reward": 1.8,
|
| 185 |
+
"held_out": {
|
| 186 |
+
"executed_cleanly": 1.0,
|
| 187 |
+
"checkpoint_valid": 1.0,
|
| 188 |
+
"loss_decreased": 0.5887677670351681,
|
| 189 |
+
"metrics_in_range": 1.0,
|
| 190 |
+
"no_forbidden_workarounds": 1.0,
|
| 191 |
+
"intent_preserved": 1.0,
|
| 192 |
+
"hidden_tests_passed": 1.0
|
| 193 |
+
},
|
| 194 |
+
"task_id": "albert_qa"
|
| 195 |
+
},
|
| 196 |
+
{
|
| 197 |
+
"primitive_type": "RemoveDeprecatedMethod",
|
| 198 |
+
"breakage_params": {
|
| 199 |
+
"class_name": "Trainer",
|
| 200 |
+
"method_name": "save_model",
|
| 201 |
+
"replacement": "save_to_hub"
|
| 202 |
+
},
|
| 203 |
+
"error_signature": "",
|
| 204 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -41,4 +41,4 @@\n trainer = Trainer(model=model, args=training_args, train_dataset=dataset)\n trainer.train()\n-trainer.save_model_DEPRECATED(\"/tmp/forge_output/checkpoint\")\n+trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n",
|
| 205 |
+
"visible_reward": 1.8,
|
| 206 |
+
"held_out": {
|
| 207 |
+
"executed_cleanly": 1.0,
|
| 208 |
+
"checkpoint_valid": 1.0,
|
| 209 |
+
"loss_decreased": 0.8791026290604065,
|
| 210 |
+
"metrics_in_range": 1.0,
|
| 211 |
+
"no_forbidden_workarounds": 1.0,
|
| 212 |
+
"intent_preserved": 1.0,
|
| 213 |
+
"hidden_tests_passed": 1.0
|
| 214 |
+
},
|
| 215 |
+
"task_id": "roberta_sentiment"
|
| 216 |
+
},
|
| 217 |
+
{
|
| 218 |
+
"primitive_type": "RenameApiCall",
|
| 219 |
+
"breakage_params": {
|
| 220 |
+
"old_name": "trainer.train",
|
| 221 |
+
"new_name": "trainer.start_training"
|
| 222 |
+
},
|
| 223 |
+
"error_signature": "",
|
| 224 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -40,5 +40,5 @@\n \n trainer = Trainer(model=model, args=training_args, train_dataset=dataset)\n-trainer.start_training()\n+trainer.train()\n trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n",
|
| 225 |
+
"visible_reward": 1.8,
|
| 226 |
+
"held_out": {
|
| 227 |
+
"executed_cleanly": 1.0,
|
| 228 |
+
"checkpoint_valid": 1.0,
|
| 229 |
+
"loss_decreased": 0.7878403072444018,
|
| 230 |
+
"metrics_in_range": 1.0,
|
| 231 |
+
"no_forbidden_workarounds": 1.0,
|
| 232 |
+
"intent_preserved": 1.0,
|
| 233 |
+
"hidden_tests_passed": 1.0
|
| 234 |
+
},
|
| 235 |
+
"task_id": "electra_classification"
|
| 236 |
+
},
|
| 237 |
+
{
|
| 238 |
+
"primitive_type": "RestructureDatasetSchema",
|
| 239 |
+
"breakage_params": {
|
| 240 |
+
"old_column": "text",
|
| 241 |
+
"new_column": "input_text"
|
| 242 |
+
},
|
| 243 |
+
"error_signature": "",
|
| 244 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -26,5 +26,5 @@\n answer = examples[\"answers\"][i]\n start_char = answer[\"answer_start\"][0]\n- end_char = start_char + len(answer[\"input_text\"][0])\n+ end_char = start_char + len(answer[\"text\"][0])\n \n token_start = next(\n",
|
| 245 |
+
"visible_reward": 1.8,
|
| 246 |
+
"held_out": {
|
| 247 |
+
"executed_cleanly": 1.0,
|
| 248 |
+
"checkpoint_valid": 1.0,
|
| 249 |
+
"loss_decreased": 0.8678511447007867,
|
| 250 |
+
"metrics_in_range": 1.0,
|
| 251 |
+
"no_forbidden_workarounds": 1.0,
|
| 252 |
+
"intent_preserved": 1.0,
|
| 253 |
+
"hidden_tests_passed": 1.0
|
| 254 |
+
},
|
| 255 |
+
"task_id": "albert_qa"
|
| 256 |
+
},
|
| 257 |
+
{
|
| 258 |
+
"primitive_type": "RestructureDatasetSchema",
|
| 259 |
+
"breakage_params": {
|
| 260 |
+
"old_column": "text",
|
| 261 |
+
"new_column": "input_text"
|
| 262 |
+
},
|
| 263 |
+
"error_signature": "",
|
| 264 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -14,5 +14,5 @@\n def tokenize(examples):\n return tokenizer(\n- examples[\"input_text\"],\n+ examples[\"text\"],\n padding=\"max_length\",\n truncation=True,\n",
|
| 265 |
+
"visible_reward": 1.8,
|
| 266 |
+
"held_out": {
|
| 267 |
+
"executed_cleanly": 1.0,
|
| 268 |
+
"checkpoint_valid": 1.0,
|
| 269 |
+
"loss_decreased": 0.6278346817583994,
|
| 270 |
+
"metrics_in_range": 1.0,
|
| 271 |
+
"no_forbidden_workarounds": 1.0,
|
| 272 |
+
"intent_preserved": 1.0,
|
| 273 |
+
"hidden_tests_passed": 1.0
|
| 274 |
+
},
|
| 275 |
+
"task_id": "roberta_sentiment"
|
| 276 |
+
},
|
| 277 |
+
{
|
| 278 |
+
"primitive_type": "RestructureDatasetSchema",
|
| 279 |
+
"breakage_params": {
|
| 280 |
+
"old_column": "text",
|
| 281 |
+
"new_column": "input_text"
|
| 282 |
+
},
|
| 283 |
+
"error_signature": "",
|
| 284 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -14,5 +14,5 @@\n def tokenize(examples):\n return tokenizer(\n- examples[\"input_text\"],\n+ examples[\"text\"],\n padding=\"max_length\",\n truncation=True,\n",
|
| 285 |
+
"visible_reward": 1.8,
|
| 286 |
+
"held_out": {
|
| 287 |
+
"executed_cleanly": 1.0,
|
| 288 |
+
"checkpoint_valid": 1.0,
|
| 289 |
+
"loss_decreased": 0.6966312162081871,
|
| 290 |
+
"metrics_in_range": 1.0,
|
| 291 |
+
"no_forbidden_workarounds": 1.0,
|
| 292 |
+
"intent_preserved": 1.0,
|
| 293 |
+
"hidden_tests_passed": 1.0
|
| 294 |
+
},
|
| 295 |
+
"task_id": "electra_classification"
|
| 296 |
+
},
|
| 297 |
+
{
|
| 298 |
+
"primitive_type": "ChangeArgumentSignature",
|
| 299 |
+
"breakage_params": {
|
| 300 |
+
"function_name": "TrainingArguments",
|
| 301 |
+
"removed_arg": "num_train_epochs",
|
| 302 |
+
"added_arg": "max_steps",
|
| 303 |
+
"added_value": "1000"
|
| 304 |
+
},
|
| 305 |
+
"error_signature": "",
|
| 306 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -35,4 +35,5 @@\n training_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n+ num_train_epochs=1,\n per_device_train_batch_size=16,\n logging_steps=5,\n",
|
| 307 |
+
"visible_reward": 1.8,
|
| 308 |
+
"held_out": {
|
| 309 |
+
"executed_cleanly": 1.0,
|
| 310 |
+
"checkpoint_valid": 1.0,
|
| 311 |
+
"loss_decreased": 0.666498939726126,
|
| 312 |
+
"metrics_in_range": 1.0,
|
| 313 |
+
"no_forbidden_workarounds": 1.0,
|
| 314 |
+
"intent_preserved": 1.0,
|
| 315 |
+
"hidden_tests_passed": 1.0
|
| 316 |
+
},
|
| 317 |
+
"task_id": "distilbert_sst2"
|
| 318 |
+
},
|
| 319 |
+
{
|
| 320 |
+
"primitive_type": "RenameApiCall",
|
| 321 |
+
"breakage_params": {
|
| 322 |
+
"old_name": "trainer.train",
|
| 323 |
+
"new_name": "trainer.start_training"
|
| 324 |
+
},
|
| 325 |
+
"error_signature": "",
|
| 326 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -63,5 +63,5 @@\n data_collator=DefaultDataCollator(),\n )\n-trainer.start_training()\n+trainer.train()\n trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n",
|
| 327 |
+
"visible_reward": 1.8,
|
| 328 |
+
"held_out": {
|
| 329 |
+
"executed_cleanly": 1.0,
|
| 330 |
+
"checkpoint_valid": 1.0,
|
| 331 |
+
"loss_decreased": 0.7251096581974675,
|
| 332 |
+
"metrics_in_range": 1.0,
|
| 333 |
+
"no_forbidden_workarounds": 1.0,
|
| 334 |
+
"intent_preserved": 1.0,
|
| 335 |
+
"hidden_tests_passed": 1.0
|
| 336 |
+
},
|
| 337 |
+
"task_id": "albert_qa"
|
| 338 |
+
},
|
| 339 |
+
{
|
| 340 |
+
"primitive_type": "ModifyConfigField",
|
| 341 |
+
"breakage_params": {
|
| 342 |
+
"config_class": "TrainingArguments",
|
| 343 |
+
"field_name": "per_device_train_batch_size",
|
| 344 |
+
"new_value": "1"
|
| 345 |
+
},
|
| 346 |
+
"error_signature": "",
|
| 347 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -37,5 +37,5 @@\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n- per_device_train_batch_size=1,\n+ per_device_train_batch_size=8,\n logging_steps=5,\n save_strategy=\"epoch\",\n",
|
| 348 |
+
"visible_reward": 1.8,
|
| 349 |
+
"held_out": {
|
| 350 |
+
"executed_cleanly": 1.0,
|
| 351 |
+
"checkpoint_valid": 1.0,
|
| 352 |
+
"loss_decreased": 0.817717003567182,
|
| 353 |
+
"metrics_in_range": 1.0,
|
| 354 |
+
"no_forbidden_workarounds": 1.0,
|
| 355 |
+
"intent_preserved": 1.0,
|
| 356 |
+
"hidden_tests_passed": 1.0
|
| 357 |
+
},
|
| 358 |
+
"task_id": "bert_ner"
|
| 359 |
+
},
|
| 360 |
+
{
|
| 361 |
+
"primitive_type": "ChangeTokenizerBehavior",
|
| 362 |
+
"breakage_params": {
|
| 363 |
+
"old_kwarg": "truncation",
|
| 364 |
+
"old_value": "True",
|
| 365 |
+
"new_kwarg": "truncate",
|
| 366 |
+
"new_value": "True"
|
| 367 |
+
},
|
| 368 |
+
"error_signature": "",
|
| 369 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -16,5 +16,5 @@\n examples[\"text\"],\n padding=\"max_length\",\n- truncate=True,\n+ truncation=True,\n max_length=64,\n )\n",
|
| 370 |
+
"visible_reward": 1.8,
|
| 371 |
+
"held_out": {
|
| 372 |
+
"executed_cleanly": 1.0,
|
| 373 |
+
"checkpoint_valid": 1.0,
|
| 374 |
+
"loss_decreased": 0.7677847401400664,
|
| 375 |
+
"metrics_in_range": 1.0,
|
| 376 |
+
"no_forbidden_workarounds": 1.0,
|
| 377 |
+
"intent_preserved": 1.0,
|
| 378 |
+
"hidden_tests_passed": 1.0
|
| 379 |
+
},
|
| 380 |
+
"task_id": "roberta_sentiment"
|
| 381 |
+
},
|
| 382 |
+
{
|
| 383 |
+
"primitive_type": "RestructureDatasetSchema",
|
| 384 |
+
"breakage_params": {
|
| 385 |
+
"old_column": "label",
|
| 386 |
+
"new_column": "labels"
|
| 387 |
+
},
|
| 388 |
+
"error_signature": "",
|
| 389 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -16,5 +16,5 @@\n images = [img.convert(\"RGB\") for img in batch[\"img\"]]\n inputs = processor(images=images, return_tensors=\"pt\")\n- inputs[\"labels\"] = torch.tensor(batch[\"labels\"])\n+ inputs[\"labels\"] = torch.tensor(batch[\"label\"])\n return inputs\n \n",
|
| 390 |
+
"visible_reward": 1.8,
|
| 391 |
+
"held_out": {
|
| 392 |
+
"executed_cleanly": 1.0,
|
| 393 |
+
"checkpoint_valid": 1.0,
|
| 394 |
+
"loss_decreased": 0.701744242073817,
|
| 395 |
+
"metrics_in_range": 1.0,
|
| 396 |
+
"no_forbidden_workarounds": 1.0,
|
| 397 |
+
"intent_preserved": 1.0,
|
| 398 |
+
"hidden_tests_passed": 1.0
|
| 399 |
+
},
|
| 400 |
+
"task_id": "vit_cifar10"
|
| 401 |
+
},
|
| 402 |
+
{
|
| 403 |
+
"primitive_type": "ChangeArgumentSignature",
|
| 404 |
+
"breakage_params": {
|
| 405 |
+
"function_name": "TrainingArguments",
|
| 406 |
+
"removed_arg": "num_train_epochs",
|
| 407 |
+
"added_arg": "max_steps",
|
| 408 |
+
"added_value": "1000"
|
| 409 |
+
},
|
| 410 |
+
"error_signature": "",
|
| 411 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -49,4 +49,5 @@\n training_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n+ num_train_epochs=1,\n per_device_train_batch_size=4,\n logging_steps=5,\n",
|
| 412 |
+
"visible_reward": 1.8,
|
| 413 |
+
"held_out": {
|
| 414 |
+
"executed_cleanly": 1.0,
|
| 415 |
+
"checkpoint_valid": 1.0,
|
| 416 |
+
"loss_decreased": 0.784986144101346,
|
| 417 |
+
"metrics_in_range": 1.0,
|
| 418 |
+
"no_forbidden_workarounds": 1.0,
|
| 419 |
+
"intent_preserved": 1.0,
|
| 420 |
+
"hidden_tests_passed": 1.0
|
| 421 |
+
},
|
| 422 |
+
"task_id": "albert_qa"
|
| 423 |
+
},
|
| 424 |
+
{
|
| 425 |
+
"primitive_type": "RemoveDeprecatedMethod",
|
| 426 |
+
"breakage_params": {
|
| 427 |
+
"class_name": "Trainer",
|
| 428 |
+
"method_name": "save_model",
|
| 429 |
+
"replacement": "save_to_hub"
|
| 430 |
+
},
|
| 431 |
+
"error_signature": "",
|
| 432 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -41,4 +41,4 @@\n trainer = Trainer(model=model, args=training_args, train_dataset=dataset)\n trainer.train()\n-trainer.save_model_DEPRECATED(\"/tmp/forge_output/checkpoint\")\n+trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n",
|
| 433 |
+
"visible_reward": 1.8,
|
| 434 |
+
"held_out": {
|
| 435 |
+
"executed_cleanly": 1.0,
|
| 436 |
+
"checkpoint_valid": 1.0,
|
| 437 |
+
"loss_decreased": 0.6652959989556817,
|
| 438 |
+
"metrics_in_range": 1.0,
|
| 439 |
+
"no_forbidden_workarounds": 1.0,
|
| 440 |
+
"intent_preserved": 1.0,
|
| 441 |
+
"hidden_tests_passed": 1.0
|
| 442 |
+
},
|
| 443 |
+
"task_id": "electra_classification"
|
| 444 |
+
},
|
| 445 |
+
{
|
| 446 |
+
"primitive_type": "ChangeArgumentSignature",
|
| 447 |
+
"breakage_params": {
|
| 448 |
+
"function_name": "TrainingArguments",
|
| 449 |
+
"removed_arg": "num_train_epochs",
|
| 450 |
+
"added_arg": "max_steps",
|
| 451 |
+
"added_value": "1000"
|
| 452 |
+
},
|
| 453 |
+
"error_signature": "",
|
| 454 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -31,4 +31,5 @@\n training_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n+ num_train_epochs=1,\n per_device_train_batch_size=8,\n logging_steps=5,\n",
|
| 455 |
+
"visible_reward": 1.8,
|
| 456 |
+
"held_out": {
|
| 457 |
+
"executed_cleanly": 1.0,
|
| 458 |
+
"checkpoint_valid": 1.0,
|
| 459 |
+
"loss_decreased": 0.8362977381032284,
|
| 460 |
+
"metrics_in_range": 1.0,
|
| 461 |
+
"no_forbidden_workarounds": 1.0,
|
| 462 |
+
"intent_preserved": 1.0,
|
| 463 |
+
"hidden_tests_passed": 1.0
|
| 464 |
+
},
|
| 465 |
+
"task_id": "electra_classification"
|
| 466 |
+
},
|
| 467 |
+
{
|
| 468 |
+
"primitive_type": "ChangeTokenizerBehavior",
|
| 469 |
+
"breakage_params": {
|
| 470 |
+
"old_kwarg": "truncation",
|
| 471 |
+
"old_value": "True",
|
| 472 |
+
"new_kwarg": "truncate",
|
| 473 |
+
"new_value": "True"
|
| 474 |
+
},
|
| 475 |
+
"error_signature": "",
|
| 476 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -16,5 +16,5 @@\n examples[\"text\"],\n padding=\"max_length\",\n- truncate=True,\n+ truncation=True,\n max_length=64,\n )\n",
|
| 477 |
+
"visible_reward": 1.8,
|
| 478 |
+
"held_out": {
|
| 479 |
+
"executed_cleanly": 1.0,
|
| 480 |
+
"checkpoint_valid": 1.0,
|
| 481 |
+
"loss_decreased": 0.8434749013439302,
|
| 482 |
+
"metrics_in_range": 1.0,
|
| 483 |
+
"no_forbidden_workarounds": 1.0,
|
| 484 |
+
"intent_preserved": 1.0,
|
| 485 |
+
"hidden_tests_passed": 1.0
|
| 486 |
+
},
|
| 487 |
+
"task_id": "electra_classification"
|
| 488 |
+
},
|
| 489 |
+
{
|
| 490 |
+
"primitive_type": "RestructureDatasetSchema",
|
| 491 |
+
"breakage_params": {
|
| 492 |
+
"old_column": "text",
|
| 493 |
+
"new_column": "input_text"
|
| 494 |
+
},
|
| 495 |
+
"error_signature": "",
|
| 496 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -26,5 +26,5 @@\n answer = examples[\"answers\"][i]\n start_char = answer[\"answer_start\"][0]\n- end_char = start_char + len(answer[\"input_text\"][0])\n+ end_char = start_char + len(answer[\"text\"][0])\n \n token_start = next(\n",
|
| 497 |
+
"visible_reward": 1.8,
|
| 498 |
+
"held_out": {
|
| 499 |
+
"executed_cleanly": 1.0,
|
| 500 |
+
"checkpoint_valid": 1.0,
|
| 501 |
+
"loss_decreased": 0.775726750559039,
|
| 502 |
+
"metrics_in_range": 1.0,
|
| 503 |
+
"no_forbidden_workarounds": 1.0,
|
| 504 |
+
"intent_preserved": 1.0,
|
| 505 |
+
"hidden_tests_passed": 1.0
|
| 506 |
+
},
|
| 507 |
+
"task_id": "albert_qa"
|
| 508 |
+
},
|
| 509 |
+
{
|
| 510 |
+
"primitive_type": "ChangeArgumentSignature",
|
| 511 |
+
"breakage_params": {
|
| 512 |
+
"function_name": "TrainingArguments",
|
| 513 |
+
"removed_arg": "num_train_epochs",
|
| 514 |
+
"added_arg": "max_steps",
|
| 515 |
+
"added_value": "1000"
|
| 516 |
+
},
|
| 517 |
+
"error_signature": "",
|
| 518 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -35,4 +35,5 @@\n training_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n+ num_train_epochs=1,\n per_device_train_batch_size=16,\n logging_steps=5,\n",
|
| 519 |
+
"visible_reward": 1.8,
|
| 520 |
+
"held_out": {
|
| 521 |
+
"executed_cleanly": 1.0,
|
| 522 |
+
"checkpoint_valid": 1.0,
|
| 523 |
+
"loss_decreased": 0.9085137085137085,
|
| 524 |
+
"metrics_in_range": 1.0,
|
| 525 |
+
"no_forbidden_workarounds": 1.0,
|
| 526 |
+
"intent_preserved": 1.0,
|
| 527 |
+
"hidden_tests_passed": 1.0
|
| 528 |
+
},
|
| 529 |
+
"task_id": "distilbert_sst2"
|
| 530 |
+
},
|
| 531 |
+
{
|
| 532 |
+
"primitive_type": "RenameApiCall",
|
| 533 |
+
"breakage_params": {
|
| 534 |
+
"old_name": "trainer.train",
|
| 535 |
+
"new_name": "trainer.start_training"
|
| 536 |
+
},
|
| 537 |
+
"error_signature": "",
|
| 538 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -51,5 +51,5 @@\n )\n \n-trainer.start_training()\n+trainer.train()\n trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n",
|
| 539 |
+
"visible_reward": 1.8,
|
| 540 |
+
"held_out": {
|
| 541 |
+
"executed_cleanly": 1.0,
|
| 542 |
+
"checkpoint_valid": 1.0,
|
| 543 |
+
"loss_decreased": 0.7424872199130476,
|
| 544 |
+
"metrics_in_range": 1.0,
|
| 545 |
+
"no_forbidden_workarounds": 1.0,
|
| 546 |
+
"intent_preserved": 1.0,
|
| 547 |
+
"hidden_tests_passed": 1.0
|
| 548 |
+
},
|
| 549 |
+
"task_id": "bert_ner"
|
| 550 |
+
},
|
| 551 |
+
{
|
| 552 |
+
"primitive_type": "ChangeArgumentSignature",
|
| 553 |
+
"breakage_params": {
|
| 554 |
+
"function_name": "TrainingArguments",
|
| 555 |
+
"removed_arg": "num_train_epochs",
|
| 556 |
+
"added_arg": "max_steps",
|
| 557 |
+
"added_value": "1000"
|
| 558 |
+
},
|
| 559 |
+
"error_signature": "",
|
| 560 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -35,4 +35,5 @@\n training_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n+ num_train_epochs=1,\n per_device_train_batch_size=16,\n logging_steps=5,\n",
|
| 561 |
+
"visible_reward": 1.8,
|
| 562 |
+
"held_out": {
|
| 563 |
+
"executed_cleanly": 1.0,
|
| 564 |
+
"checkpoint_valid": 1.0,
|
| 565 |
+
"loss_decreased": 0.8076153403327943,
|
| 566 |
+
"metrics_in_range": 1.0,
|
| 567 |
+
"no_forbidden_workarounds": 1.0,
|
| 568 |
+
"intent_preserved": 1.0,
|
| 569 |
+
"hidden_tests_passed": 1.0
|
| 570 |
+
},
|
| 571 |
+
"task_id": "distilbert_sst2"
|
| 572 |
+
},
|
| 573 |
+
{
|
| 574 |
+
"primitive_type": "RestructureDatasetSchema",
|
| 575 |
+
"breakage_params": {
|
| 576 |
+
"old_column": "text",
|
| 577 |
+
"new_column": "input_text"
|
| 578 |
+
},
|
| 579 |
+
"error_signature": "",
|
| 580 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -26,5 +26,5 @@\n answer = examples[\"answers\"][i]\n start_char = answer[\"answer_start\"][0]\n- end_char = start_char + len(answer[\"input_text\"][0])\n+ end_char = start_char + len(answer[\"text\"][0])\n \n token_start = next(\n",
|
| 581 |
+
"visible_reward": 1.8,
|
| 582 |
+
"held_out": {
|
| 583 |
+
"executed_cleanly": 1.0,
|
| 584 |
+
"checkpoint_valid": 1.0,
|
| 585 |
+
"loss_decreased": 0.8882627677936846,
|
| 586 |
+
"metrics_in_range": 1.0,
|
| 587 |
+
"no_forbidden_workarounds": 1.0,
|
| 588 |
+
"intent_preserved": 1.0,
|
| 589 |
+
"hidden_tests_passed": 1.0
|
| 590 |
+
},
|
| 591 |
+
"task_id": "albert_qa"
|
| 592 |
+
},
|
| 593 |
+
{
|
| 594 |
+
"primitive_type": "RemoveDeprecatedMethod",
|
| 595 |
+
"breakage_params": {
|
| 596 |
+
"class_name": "Trainer",
|
| 597 |
+
"method_name": "save_model",
|
| 598 |
+
"replacement": "save_to_hub"
|
| 599 |
+
},
|
| 600 |
+
"error_signature": "",
|
| 601 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -40,4 +40,4 @@\n \n trainer.train()\n-trainer.save_model_DEPRECATED(\"/tmp/forge_output/checkpoint\")\n+trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n",
|
| 602 |
+
"visible_reward": 1.8,
|
| 603 |
+
"held_out": {
|
| 604 |
+
"executed_cleanly": 1.0,
|
| 605 |
+
"checkpoint_valid": 1.0,
|
| 606 |
+
"loss_decreased": 0.5938341205749403,
|
| 607 |
+
"metrics_in_range": 1.0,
|
| 608 |
+
"no_forbidden_workarounds": 1.0,
|
| 609 |
+
"intent_preserved": 1.0,
|
| 610 |
+
"hidden_tests_passed": 1.0
|
| 611 |
+
},
|
| 612 |
+
"task_id": "gpt2_textgen"
|
| 613 |
+
},
|
| 614 |
+
{
|
| 615 |
+
"primitive_type": "RestructureDatasetSchema",
|
| 616 |
+
"breakage_params": {
|
| 617 |
+
"old_column": "text",
|
| 618 |
+
"new_column": "input_text"
|
| 619 |
+
},
|
| 620 |
+
"error_signature": "",
|
| 621 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -15,5 +15,5 @@\n \n def tokenize(examples):\n- return tokenizer(examples[\"input_text\"], truncation=True, max_length=64)\n+ return tokenizer(examples[\"text\"], truncation=True, max_length=64)\n \n \n",
|
| 622 |
+
"visible_reward": 1.8,
|
| 623 |
+
"held_out": {
|
| 624 |
+
"executed_cleanly": 1.0,
|
| 625 |
+
"checkpoint_valid": 1.0,
|
| 626 |
+
"loss_decreased": 0.6555927441014835,
|
| 627 |
+
"metrics_in_range": 1.0,
|
| 628 |
+
"no_forbidden_workarounds": 1.0,
|
| 629 |
+
"intent_preserved": 1.0,
|
| 630 |
+
"hidden_tests_passed": 1.0
|
| 631 |
+
},
|
| 632 |
+
"task_id": "gpt2_textgen"
|
| 633 |
+
},
|
| 634 |
+
{
|
| 635 |
+
"primitive_type": "RenameApiCall",
|
| 636 |
+
"breakage_params": {
|
| 637 |
+
"old_name": "trainer.train",
|
| 638 |
+
"new_name": "trainer.start_training"
|
| 639 |
+
},
|
| 640 |
+
"error_signature": "",
|
| 641 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -63,5 +63,5 @@\n data_collator=DefaultDataCollator(),\n )\n-trainer.start_training()\n+trainer.train()\n trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n",
|
| 642 |
+
"visible_reward": 1.8,
|
| 643 |
+
"held_out": {
|
| 644 |
+
"executed_cleanly": 1.0,
|
| 645 |
+
"checkpoint_valid": 1.0,
|
| 646 |
+
"loss_decreased": 0.755194754910818,
|
| 647 |
+
"metrics_in_range": 1.0,
|
| 648 |
+
"no_forbidden_workarounds": 1.0,
|
| 649 |
+
"intent_preserved": 1.0,
|
| 650 |
+
"hidden_tests_passed": 1.0
|
| 651 |
+
},
|
| 652 |
+
"task_id": "albert_qa"
|
| 653 |
+
},
|
| 654 |
+
{
|
| 655 |
+
"primitive_type": "RenameApiCall",
|
| 656 |
+
"breakage_params": {
|
| 657 |
+
"old_name": "trainer.train",
|
| 658 |
+
"new_name": "trainer.start_training"
|
| 659 |
+
},
|
| 660 |
+
"error_signature": "",
|
| 661 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -49,5 +49,5 @@\n )\n \n-trainer.start_training()\n+trainer.train()\n trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n",
|
| 662 |
+
"visible_reward": 1.8,
|
| 663 |
+
"held_out": {
|
| 664 |
+
"executed_cleanly": 1.0,
|
| 665 |
+
"checkpoint_valid": 1.0,
|
| 666 |
+
"loss_decreased": 0.8654821132433073,
|
| 667 |
+
"metrics_in_range": 1.0,
|
| 668 |
+
"no_forbidden_workarounds": 1.0,
|
| 669 |
+
"intent_preserved": 1.0,
|
| 670 |
+
"hidden_tests_passed": 1.0
|
| 671 |
+
},
|
| 672 |
+
"task_id": "distilbert_sst2"
|
| 673 |
+
},
|
| 674 |
+
{
|
| 675 |
+
"primitive_type": "RestructureDatasetSchema",
|
| 676 |
+
"breakage_params": {
|
| 677 |
+
"old_column": "label",
|
| 678 |
+
"new_column": "labels"
|
| 679 |
+
},
|
| 680 |
+
"error_signature": "",
|
| 681 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -16,5 +16,5 @@\n images = [img.convert(\"RGB\") for img in batch[\"img\"]]\n inputs = processor(images=images, return_tensors=\"pt\")\n- inputs[\"labels\"] = torch.tensor(batch[\"labels\"])\n+ inputs[\"labels\"] = torch.tensor(batch[\"label\"])\n return inputs\n \n",
|
| 682 |
+
"visible_reward": 1.8,
|
| 683 |
+
"held_out": {
|
| 684 |
+
"executed_cleanly": 1.0,
|
| 685 |
+
"checkpoint_valid": 1.0,
|
| 686 |
+
"loss_decreased": 0.8319525054273182,
|
| 687 |
+
"metrics_in_range": 1.0,
|
| 688 |
+
"no_forbidden_workarounds": 1.0,
|
| 689 |
+
"intent_preserved": 1.0,
|
| 690 |
+
"hidden_tests_passed": 1.0
|
| 691 |
+
},
|
| 692 |
+
"task_id": "vit_cifar10"
|
| 693 |
+
},
|
| 694 |
+
{
|
| 695 |
+
"primitive_type": "RestructureDatasetSchema",
|
| 696 |
+
"breakage_params": {
|
| 697 |
+
"old_column": "text",
|
| 698 |
+
"new_column": "input_text"
|
| 699 |
+
},
|
| 700 |
+
"error_signature": "",
|
| 701 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -26,5 +26,5 @@\n answer = examples[\"answers\"][i]\n start_char = answer[\"answer_start\"][0]\n- end_char = start_char + len(answer[\"input_text\"][0])\n+ end_char = start_char + len(answer[\"text\"][0])\n \n token_start = next(\n",
|
| 702 |
+
"visible_reward": 1.8,
|
| 703 |
+
"held_out": {
|
| 704 |
+
"executed_cleanly": 1.0,
|
| 705 |
+
"checkpoint_valid": 1.0,
|
| 706 |
+
"loss_decreased": 0.8109320292832547,
|
| 707 |
+
"metrics_in_range": 1.0,
|
| 708 |
+
"no_forbidden_workarounds": 1.0,
|
| 709 |
+
"intent_preserved": 1.0,
|
| 710 |
+
"hidden_tests_passed": 1.0
|
| 711 |
+
},
|
| 712 |
+
"task_id": "albert_qa"
|
| 713 |
+
},
|
| 714 |
+
{
|
| 715 |
+
"primitive_type": "ModifyConfigField",
|
| 716 |
+
"breakage_params": {
|
| 717 |
+
"config_class": "TrainingArguments",
|
| 718 |
+
"field_name": "per_device_train_batch_size",
|
| 719 |
+
"new_value": "1"
|
| 720 |
+
},
|
| 721 |
+
"error_signature": "",
|
| 722 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -36,5 +36,5 @@\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n- per_device_train_batch_size=1,\n+ per_device_train_batch_size=16,\n logging_steps=5,\n save_strategy=\"epoch\",\n",
|
| 723 |
+
"visible_reward": 1.8,
|
| 724 |
+
"held_out": {
|
| 725 |
+
"executed_cleanly": 1.0,
|
| 726 |
+
"checkpoint_valid": 1.0,
|
| 727 |
+
"loss_decreased": 0.8409642541924095,
|
| 728 |
+
"metrics_in_range": 1.0,
|
| 729 |
+
"no_forbidden_workarounds": 1.0,
|
| 730 |
+
"intent_preserved": 1.0,
|
| 731 |
+
"hidden_tests_passed": 1.0
|
| 732 |
+
},
|
| 733 |
+
"task_id": "distilbert_sst2"
|
| 734 |
+
},
|
| 735 |
+
{
|
| 736 |
+
"primitive_type": "ChangeArgumentSignature",
|
| 737 |
+
"breakage_params": {
|
| 738 |
+
"function_name": "TrainingArguments",
|
| 739 |
+
"removed_arg": "num_train_epochs",
|
| 740 |
+
"added_arg": "max_steps",
|
| 741 |
+
"added_value": "1000"
|
| 742 |
+
},
|
| 743 |
+
"error_signature": "",
|
| 744 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -31,4 +31,5 @@\n training_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n+ num_train_epochs=1,\n per_device_train_batch_size=8,\n logging_steps=5,\n",
|
| 745 |
+
"visible_reward": 1.8,
|
| 746 |
+
"held_out": {
|
| 747 |
+
"executed_cleanly": 1.0,
|
| 748 |
+
"checkpoint_valid": 1.0,
|
| 749 |
+
"loss_decreased": 0.8891815856777494,
|
| 750 |
+
"metrics_in_range": 1.0,
|
| 751 |
+
"no_forbidden_workarounds": 1.0,
|
| 752 |
+
"intent_preserved": 1.0,
|
| 753 |
+
"hidden_tests_passed": 1.0
|
| 754 |
+
},
|
| 755 |
+
"task_id": "electra_classification"
|
| 756 |
+
},
|
| 757 |
+
{
|
| 758 |
+
"primitive_type": "ModifyConfigField",
|
| 759 |
+
"breakage_params": {
|
| 760 |
+
"config_class": "TrainingArguments",
|
| 761 |
+
"field_name": "per_device_train_batch_size",
|
| 762 |
+
"new_value": "1"
|
| 763 |
+
},
|
| 764 |
+
"error_signature": "",
|
| 765 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -29,5 +29,5 @@\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n- per_device_train_batch_size=1,\n+ per_device_train_batch_size=4,\n logging_steps=5,\n save_strategy=\"epoch\",\n",
|
| 766 |
+
"visible_reward": 1.8,
|
| 767 |
+
"held_out": {
|
| 768 |
+
"executed_cleanly": 1.0,
|
| 769 |
+
"checkpoint_valid": 1.0,
|
| 770 |
+
"loss_decreased": 0.7900720214449505,
|
| 771 |
+
"metrics_in_range": 1.0,
|
| 772 |
+
"no_forbidden_workarounds": 1.0,
|
| 773 |
+
"intent_preserved": 1.0,
|
| 774 |
+
"hidden_tests_passed": 1.0
|
| 775 |
+
},
|
| 776 |
+
"task_id": "vit_cifar10"
|
| 777 |
+
},
|
| 778 |
+
{
|
| 779 |
+
"primitive_type": "RemoveDeprecatedMethod",
|
| 780 |
+
"breakage_params": {
|
| 781 |
+
"class_name": "Trainer",
|
| 782 |
+
"method_name": "save_model",
|
| 783 |
+
"replacement": "save_to_hub"
|
| 784 |
+
},
|
| 785 |
+
"error_signature": "",
|
| 786 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -38,4 +38,4 @@\n trainer = Trainer(model=model, args=training_args, train_dataset=dataset)\n trainer.train()\n-trainer.save_model_DEPRECATED(\"/tmp/forge_output/checkpoint\")\n+trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n",
|
| 787 |
+
"visible_reward": 1.8,
|
| 788 |
+
"held_out": {
|
| 789 |
+
"executed_cleanly": 1.0,
|
| 790 |
+
"checkpoint_valid": 1.0,
|
| 791 |
+
"loss_decreased": 0.7984906001446131,
|
| 792 |
+
"metrics_in_range": 1.0,
|
| 793 |
+
"no_forbidden_workarounds": 1.0,
|
| 794 |
+
"intent_preserved": 1.0,
|
| 795 |
+
"hidden_tests_passed": 1.0
|
| 796 |
+
},
|
| 797 |
+
"task_id": "vit_cifar10"
|
| 798 |
+
},
|
| 799 |
+
{
|
| 800 |
+
"primitive_type": "RestructureDatasetSchema",
|
| 801 |
+
"breakage_params": {
|
| 802 |
+
"old_column": "text",
|
| 803 |
+
"new_column": "input_text"
|
| 804 |
+
},
|
| 805 |
+
"error_signature": "",
|
| 806 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -26,5 +26,5 @@\n answer = examples[\"answers\"][i]\n start_char = answer[\"answer_start\"][0]\n- end_char = start_char + len(answer[\"input_text\"][0])\n+ end_char = start_char + len(answer[\"text\"][0])\n \n token_start = next(\n",
|
| 807 |
+
"visible_reward": 1.8,
|
| 808 |
+
"held_out": {
|
| 809 |
+
"executed_cleanly": 1.0,
|
| 810 |
+
"checkpoint_valid": 1.0,
|
| 811 |
+
"loss_decreased": 0.7808289396602227,
|
| 812 |
+
"metrics_in_range": 1.0,
|
| 813 |
+
"no_forbidden_workarounds": 1.0,
|
| 814 |
+
"intent_preserved": 1.0,
|
| 815 |
+
"hidden_tests_passed": 1.0
|
| 816 |
+
},
|
| 817 |
+
"task_id": "albert_qa"
|
| 818 |
+
},
|
| 819 |
+
{
|
| 820 |
+
"primitive_type": "RestructureDatasetSchema",
|
| 821 |
+
"breakage_params": {
|
| 822 |
+
"old_column": "tokens",
|
| 823 |
+
"new_column": "words"
|
| 824 |
+
},
|
| 825 |
+
"error_signature": "",
|
| 826 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -14,5 +14,5 @@\n \n def tokenize_and_align(example):\n- enc = tokenizer(example[\"words\"], is_split_into_words=True, truncation=True, max_length=64)\n+ enc = tokenizer(example[\"tokens\"], is_split_into_words=True, truncation=True, max_length=64)\n word_ids = enc.word_ids()\n labels = []\n",
|
| 827 |
+
"visible_reward": 1.8,
|
| 828 |
+
"held_out": {
|
| 829 |
+
"executed_cleanly": 1.0,
|
| 830 |
+
"checkpoint_valid": 1.0,
|
| 831 |
+
"loss_decreased": 0.8699562543975037,
|
| 832 |
+
"metrics_in_range": 1.0,
|
| 833 |
+
"no_forbidden_workarounds": 1.0,
|
| 834 |
+
"intent_preserved": 1.0,
|
| 835 |
+
"hidden_tests_passed": 1.0
|
| 836 |
+
},
|
| 837 |
+
"task_id": "bert_ner"
|
| 838 |
+
},
|
| 839 |
+
{
|
| 840 |
+
"primitive_type": "RenameApiCall",
|
| 841 |
+
"breakage_params": {
|
| 842 |
+
"old_name": "trainer.train",
|
| 843 |
+
"new_name": "trainer.start_training"
|
| 844 |
+
},
|
| 845 |
+
"error_signature": "",
|
| 846 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -63,5 +63,5 @@\n data_collator=DefaultDataCollator(),\n )\n-trainer.start_training()\n+trainer.train()\n trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n",
|
| 847 |
+
"visible_reward": 1.8,
|
| 848 |
+
"held_out": {
|
| 849 |
+
"executed_cleanly": 1.0,
|
| 850 |
+
"checkpoint_valid": 1.0,
|
| 851 |
+
"loss_decreased": 0.911495927422025,
|
| 852 |
+
"metrics_in_range": 1.0,
|
| 853 |
+
"no_forbidden_workarounds": 1.0,
|
| 854 |
+
"intent_preserved": 1.0,
|
| 855 |
+
"hidden_tests_passed": 1.0
|
| 856 |
+
},
|
| 857 |
+
"task_id": "albert_qa"
|
| 858 |
+
},
|
| 859 |
+
{
|
| 860 |
+
"primitive_type": "RemoveDeprecatedMethod",
|
| 861 |
+
"breakage_params": {
|
| 862 |
+
"class_name": "Trainer",
|
| 863 |
+
"method_name": "save_model",
|
| 864 |
+
"replacement": "save_to_hub"
|
| 865 |
+
},
|
| 866 |
+
"error_signature": "",
|
| 867 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -64,4 +64,4 @@\n )\n trainer.train()\n-trainer.save_model_DEPRECATED(\"/tmp/forge_output/checkpoint\")\n+trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n",
|
| 868 |
+
"visible_reward": 1.8,
|
| 869 |
+
"held_out": {
|
| 870 |
+
"executed_cleanly": 1.0,
|
| 871 |
+
"checkpoint_valid": 1.0,
|
| 872 |
+
"loss_decreased": 0.6131321254553196,
|
| 873 |
+
"metrics_in_range": 1.0,
|
| 874 |
+
"no_forbidden_workarounds": 1.0,
|
| 875 |
+
"intent_preserved": 1.0,
|
| 876 |
+
"hidden_tests_passed": 1.0
|
| 877 |
+
},
|
| 878 |
+
"task_id": "albert_qa"
|
| 879 |
+
},
|
| 880 |
+
{
|
| 881 |
+
"primitive_type": "RestructureDatasetSchema",
|
| 882 |
+
"breakage_params": {
|
| 883 |
+
"old_column": "label",
|
| 884 |
+
"new_column": "labels"
|
| 885 |
+
},
|
| 886 |
+
"error_signature": "",
|
| 887 |
+
"repair_diff": "--- a/train.py\n+++ b/train.py\n@@ -22,5 +22,5 @@\n \n dataset = dataset.map(tokenize, batched=True)\n-dataset = dataset.rename_column(\"labels\", \"labels\")\n+dataset = dataset.rename_column(\"label\", \"labels\")\n dataset.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])\n \n",
|
| 888 |
+
"visible_reward": 1.8,
|
| 889 |
+
"held_out": {
|
| 890 |
+
"executed_cleanly": 1.0,
|
| 891 |
+
"checkpoint_valid": 1.0,
|
| 892 |
+
"loss_decreased": 0.6040748525323751,
|
| 893 |
+
"metrics_in_range": 1.0,
|
| 894 |
+
"no_forbidden_workarounds": 1.0,
|
| 895 |
+
"intent_preserved": 1.0,
|
| 896 |
+
"hidden_tests_passed": 1.0
|
| 897 |
+
},
|
| 898 |
+
"task_id": "electra_classification"
|
| 899 |
+
}
|
| 900 |
+
],
|
| 901 |
+
"size": 43,
|
| 902 |
+
"by_primitive": {
|
| 903 |
+
"ChangeTokenizerBehavior": 7,
|
| 904 |
+
"RestructureDatasetSchema": 15,
|
| 905 |
+
"ChangeArgumentSignature": 7,
|
| 906 |
+
"RemoveDeprecatedMethod": 5,
|
| 907 |
+
"RenameApiCall": 6,
|
| 908 |
+
"ModifyConfigField": 3
|
| 909 |
+
}
|
| 910 |
+
}
|
debug_trace.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Manual smoke test: render a drift-generator prompt, pull the fenced
script block back out of it, and run the baseline drift generator on it."""
from forgeenv.roles.drift_generator import BaselineDriftGenerator
from forgeenv.roles.prompts import render_drift_generator_prompt
from forgeenv.tasks.task_sampler import TaskSampler

task_sampler = TaskSampler()
base_script = task_sampler.get_by_id("simple_regression").script_content

drift_prompt = render_drift_generator_prompt(
    base_script, "ChangeTokenizerBehavior", {"transformers": "4.40"}
)
# Everything after the opening fence, up to the closing fence. When no
# fenced block is present, str.partition yields "" — same as the old
# `if fence in prompt` guard.
_, _, after_fence = drift_prompt.partition("```python")
script_block = after_fence.partition("```")[0]
print("script_block len:", len(script_block))
print("first 80 chars:", repr(script_block[:80]))

generator = BaselineDriftGenerator(seed=0)
spec = generator.propose(target_category="ChangeTokenizerBehavior", script=script_block)
print("spec:", spec)
|
demo-space/README.md
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: ForgeEnv Repair Agent Demo
|
| 3 |
+
emoji: 🔧
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: 5.7.1
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: true
|
| 10 |
+
license: apache-2.0
|
| 11 |
+
hardware: zero-a10g
|
| 12 |
+
tags:
|
| 13 |
+
- openenv
|
| 14 |
+
- self-improvement
|
| 15 |
+
- code-repair
|
| 16 |
+
- schema-drift
|
| 17 |
+
short_description: Trained Repair Agent fixes HF scripts under drift
|
| 18 |
+
---
|
| 19 |
+
|
| 20 |
+
# ForgeEnv Repair Agent — Live Demo
|
| 21 |
+
|
| 22 |
+
Paste a broken HuggingFace training script and the error trace it produced.
|
| 23 |
+
The trained Repair Agent (Qwen2.5-3B + LoRA) emits a unified diff that should
|
| 24 |
+
restore the script. Inference runs on ZeroGPU (free A10G).
|
| 25 |
+
|
| 26 |
+
- **Environment server (OpenEnv):**
|
| 27 |
+
<https://huggingface.co/spaces/akhiilll/forgeenv>
|
| 28 |
+
- **Trained model (LoRA + repair_library.json):**
|
| 29 |
+
<https://huggingface.co/akhiilll/forgeenv-repair-agent>
|
| 30 |
+
- **Project README & plots:**
|
| 31 |
+
<https://github.com/akhiilll/forgeenv>
|
demo-space/app.py
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Gradio demo Space for the ForgeEnv Repair Agent.
|
| 2 |
+
|
| 3 |
+
Loads the trained LoRA adapter from the Hub and exposes a 2-input form:
|
| 4 |
+
broken script + error trace. Output is a unified diff. Inference runs on
|
| 5 |
+
ZeroGPU (`@spaces.GPU`) so we don't pay for idle GPU time.
|
| 6 |
+
|
| 7 |
+
If the trained adapter isn't yet uploaded, the demo falls back to the
|
| 8 |
+
deterministic ``BaselineRepairAgent`` so the Space still works end-to-end.
|
| 9 |
+
"""
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
import json
|
| 13 |
+
import os
|
| 14 |
+
import traceback
|
| 15 |
+
from typing import Optional
|
| 16 |
+
|
| 17 |
+
import gradio as gr
|
| 18 |
+
|
| 19 |
+
BASE_MODEL = os.environ.get("BASE_MODEL", "Qwen/Qwen2.5-3B-Instruct")
|
| 20 |
+
ADAPTER_REPO = os.environ.get("ADAPTER_REPO", "akhiilll/forgeenv-repair-agent")
|
| 21 |
+
|
| 22 |
+
_TITLE = "ForgeEnv Repair Agent — fix HuggingFace scripts under library drift"
|
| 23 |
+
_DESCRIPTION = (
|
| 24 |
+
"Paste a broken HuggingFace training script and the error trace it "
|
| 25 |
+
"produced. The Repair Agent returns a minimal unified diff. The model "
|
| 26 |
+
"was trained inside [ForgeEnv](https://huggingface.co/spaces/"
|
| 27 |
+
"akhiilll/forgeenv) using GRPO (TRL + Unsloth) with R-Zero-style "
|
| 28 |
+
"Challenger / Solver co-evolution."
|
| 29 |
+
)
|
| 30 |
+
|
| 31 |
+
_EXAMPLES = [
|
| 32 |
+
[
|
| 33 |
+
(
|
| 34 |
+
"from transformers import Trainer, TrainingArguments\n"
|
| 35 |
+
"from datasets import load_dataset\n\n"
|
| 36 |
+
"ds = load_dataset('glue', 'sst2')\n"
|
| 37 |
+
"args = TrainingArguments(output_dir='out')\n"
|
| 38 |
+
"trainer = Trainer(model=None, args=args, train_dataset=ds['train'])\n"
|
| 39 |
+
"trainer.start_training()\n"
|
| 40 |
+
),
|
| 41 |
+
(
|
| 42 |
+
"AttributeError: 'Trainer' object has no attribute 'start_training'. "
|
| 43 |
+
"Did you mean: 'train'?"
|
| 44 |
+
),
|
| 45 |
+
],
|
| 46 |
+
[
|
| 47 |
+
(
|
| 48 |
+
"import torch.legacy as torch\n"
|
| 49 |
+
"x = torch.randn(2, 3)\n"
|
| 50 |
+
"print(x)\n"
|
| 51 |
+
),
|
| 52 |
+
"ModuleNotFoundError: No module named 'torch.legacy'",
|
| 53 |
+
],
|
| 54 |
+
[
|
| 55 |
+
(
|
| 56 |
+
"from transformers import AutoTokenizer\n"
|
| 57 |
+
"tok = AutoTokenizer.from_pretrained('bert-base-uncased')\n"
|
| 58 |
+
"out = tok(['hello world'], pad_to_max_length=True, truncate=True)\n"
|
| 59 |
+
"print(out)\n"
|
| 60 |
+
),
|
| 61 |
+
(
|
| 62 |
+
"TypeError: __call__() got an unexpected keyword argument "
|
| 63 |
+
"'pad_to_max_length' (use `padding=True` instead)."
|
| 64 |
+
),
|
| 65 |
+
],
|
| 66 |
+
]
|
| 67 |
+
|
| 68 |
+
_PROMPT_TEMPLATE = (
|
| 69 |
+
"You are an expert ML engineer who fixes broken HuggingFace training "
|
| 70 |
+
"scripts caused by library version drift.\n\n"
|
| 71 |
+
"Library versions: {versions}\n\n"
|
| 72 |
+
"Broken script:\n```python\n{script}\n```\n\n"
|
| 73 |
+
"Error trace:\n```\n{trace}\n```\n\n"
|
| 74 |
+
"Output ONLY a minimal unified diff (`--- a/script.py` / `+++ "
|
| 75 |
+
"b/script.py` headers, then hunks). No prose."
|
| 76 |
+
)
|
| 77 |
+
|
| 78 |
+
_model = None
|
| 79 |
+
_tokenizer = None
|
| 80 |
+
_load_error: Optional[str] = None
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def _load_model() -> None:
    """Load the base model + LoRA adapter once; record failures in ``_load_error``.

    Safe to call repeatedly: returns immediately after either the first
    successful load or the first failed attempt.
    """
    global _model, _tokenizer, _load_error
    # Already loaded, or a previous attempt failed — nothing more to do.
    if _model is not None or _load_error is not None:
        return
    try:
        import torch
        from peft import PeftModel
        from transformers import AutoModelForCausalLM, AutoTokenizer

        tok = AutoTokenizer.from_pretrained(BASE_MODEL)
        backbone = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        try:
            loaded = PeftModel.from_pretrained(backbone, ADAPTER_REPO)
        except Exception as e:  # noqa: BLE001
            # Adapter repo missing/private: degrade to the raw base model.
            print(f"[demo] adapter not found ({e}); using base model")
            loaded = backbone
        _model = loaded.eval()
        _tokenizer = tok
    except Exception as e:  # noqa: BLE001
        _load_error = f"{type(e).__name__}: {e}\n{traceback.format_exc()}"
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def _baseline_fallback(script: str, error_trace: str) -> str:
|
| 111 |
+
"""Deterministic repair if the trained model isn't available.
|
| 112 |
+
|
| 113 |
+
Uses the in-repo BaselineRepairAgent if the package is installed; else
|
| 114 |
+
just returns an explanatory message.
|
| 115 |
+
"""
|
| 116 |
+
try:
|
| 117 |
+
from forgeenv.roles.repair_agent import BaselineRepairAgent
|
| 118 |
+
|
| 119 |
+
agent = BaselineRepairAgent()
|
| 120 |
+
return agent.repair(script, breakage_spec=None, original_script=None)
|
| 121 |
+
except Exception: # noqa: BLE001
|
| 122 |
+
return (
|
| 123 |
+
"# (Fallback) Trained adapter unavailable in this Space.\n"
|
| 124 |
+
"# Likely fix based on the error trace:\n"
|
| 125 |
+
f"# {error_trace.splitlines()[0] if error_trace else ''}\n"
|
| 126 |
+
)
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def _generate_with_model(prompt: str, max_new_tokens: int = 512) -> str:
    """Sample one completion from the loaded model; return only the new text.

    Decodes just the tokens generated after the prompt and strips
    surrounding whitespace.
    """
    import torch

    encoded = _tokenizer(prompt, return_tensors="pt").to(_model.device)
    with torch.no_grad():
        output_ids = _model.generate(
            **encoded,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.3,
            top_p=0.9,
            pad_token_id=_tokenizer.eos_token_id,
        )
    # Slice off the prompt tokens so only the completion is decoded.
    new_tokens = output_ids[0][encoded.input_ids.shape[1]:]
    return _tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
# Wrap inference in a `@spaces.GPU` decorator if available so we get a free
# ZeroGPU slice. Outside ZeroGPU it's a no-op.
try:
    import spaces  # type: ignore

    # Request a 60-second GPU slice per decorated call on ZeroGPU hardware.
    _gpu_decorator = spaces.GPU(duration=60)
except Exception:  # noqa: BLE001
    # `spaces` not installed (local run / non-ZeroGPU host): identity decorator.
    def _gpu_decorator(fn):
        return fn
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
@_gpu_decorator
def repair_script(script: str, error_trace: str) -> str:
    """Gradio handler: return a unified diff that should repair *script*.

    Falls back to the deterministic baseline when the model cannot be
    loaded or generation raises.
    """
    if not script.strip():
        return "# Paste a broken script first."

    _load_model()
    if _model is None:
        return _baseline_fallback(script, error_trace)

    pinned_versions = json.dumps(
        {"transformers": "4.45.0", "datasets": "2.20.0", "torch": "2.4.0"}
    )
    full_prompt = _PROMPT_TEMPLATE.format(
        versions=pinned_versions,
        script=script,
        trace=error_trace or "(no trace)",
    )
    try:
        return _generate_with_model(full_prompt)
    except Exception as e:  # noqa: BLE001
        return f"# generation failed: {e}\n" + _baseline_fallback(script, error_trace)
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
# --- Gradio UI: two-column layout, inputs left, suggested diff right. ------
with gr.Blocks(title="ForgeEnv Repair Agent") as demo:
    gr.Markdown(f"# {_TITLE}\n\n{_DESCRIPTION}")
    with gr.Row():
        with gr.Column():
            # Inputs: the broken script and the traceback it produced.
            in_script = gr.Code(
                label="Broken HuggingFace script",
                language="python",
                lines=22,
            )
            in_trace = gr.Textbox(
                label="Error trace",
                lines=6,
                placeholder="Traceback...",
            )
            run_btn = gr.Button("Repair", variant="primary")
        with gr.Column():
            # Output: the model's unified diff (markdown highlighting only).
            out_diff = gr.Code(
                label="Suggested repair (unified diff)",
                language="markdown",
                lines=22,
            )

    # Clickable example pairs defined in _EXAMPLES above.
    gr.Examples(examples=_EXAMPLES, inputs=[in_script, in_trace])
    run_btn.click(repair_script, inputs=[in_script, in_trace], outputs=out_diff)


if __name__ == "__main__":
    demo.launch()
|
demo-space/requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio==5.7.1
|
| 2 |
+
torch>=2.1.0
|
| 3 |
+
transformers>=4.40.0
|
| 4 |
+
peft>=0.10.0
|
| 5 |
+
accelerate>=0.30.0
|
| 6 |
+
spaces>=0.28.0
|
| 7 |
+
audioop-lts; python_version >= "3.13"
|
forgeenv-space/Dockerfile
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
ENV PYTHONUNBUFFERED=1 \
|
| 4 |
+
PYTHONDONTWRITEBYTECODE=1 \
|
| 5 |
+
PIP_NO_CACHE_DIR=1
|
| 6 |
+
|
| 7 |
+
RUN apt-get update \
|
| 8 |
+
&& apt-get install -y --no-install-recommends git curl \
|
| 9 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 10 |
+
|
| 11 |
+
WORKDIR /app
|
| 12 |
+
|
| 13 |
+
COPY requirements.txt .
|
| 14 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 15 |
+
|
| 16 |
+
COPY forgeenv/ forgeenv/
|
| 17 |
+
COPY openenv.yaml .
|
| 18 |
+
|
| 19 |
+
ENV PORT=7860
|
| 20 |
+
EXPOSE 7860
|
| 21 |
+
|
| 22 |
+
HEALTHCHECK --interval=30s --timeout=5s --start-period=20s --retries=3 \
|
| 23 |
+
CMD curl -f http://127.0.0.1:7860/health || exit 1
|
| 24 |
+
|
| 25 |
+
CMD ["uvicorn", "forgeenv.env.server:app", "--host", "0.0.0.0", "--port", "7860"]
|
forgeenv-space/README.md
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: ForgeEnv
|
| 3 |
+
emoji: 🔧
|
| 4 |
+
colorFrom: indigo
|
| 5 |
+
colorTo: green
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
+
pinned: true
|
| 9 |
+
license: apache-2.0
|
| 10 |
+
tags:
|
| 11 |
+
- openenv
|
| 12 |
+
- self-play
|
| 13 |
+
- self-improvement
|
| 14 |
+
- code-repair
|
| 15 |
+
- schema-drift
|
| 16 |
+
- reinforcement-learning
|
| 17 |
+
- huggingface
|
| 18 |
+
short_description: Self-improving RL env for HF library-drift repair
|
| 19 |
+
---
|
| 20 |
+
|
| 21 |
+
# ForgeEnv — OpenEnv Server
|
| 22 |
+
|
| 23 |
+
This Space hosts the **ForgeEnv** OpenEnv-compliant environment as a FastAPI
|
| 24 |
+
service. It exposes the standard `reset`, `step`, and `state` endpoints and is
|
| 25 |
+
the runtime that training notebooks (TRL + Unsloth) connect to.
|
| 26 |
+
|
| 27 |
+
> **Theme:** Self-Improvement (Hackathon Theme #4) — Challenger / Solver
|
| 28 |
+
> co-evolution via R-Zero, SPIRAL, and Absolute Zero Reasoner techniques.
|
| 29 |
+
|
| 30 |
+
## What it does
|
| 31 |
+
|
| 32 |
+
ForgeEnv simulates **HuggingFace library version drift**. A *Drift Generator*
|
| 33 |
+
proposes a realistic breakage to a working training script (renamed APIs,
|
| 34 |
+
deprecated imports, changed argument signatures, etc.). A *Repair Agent* then
|
| 35 |
+
emits a unified diff that should restore the script. Reward is computed by an
|
| 36 |
+
execution simulator + AST checker + held-out evaluator (multi-component to
|
| 37 |
+
resist reward hacking).
|
| 38 |
+
|
| 39 |
+
## API
|
| 40 |
+
|
| 41 |
+
The server uses [`openenv-core`](https://pypi.org/project/openenv-core/) and
|
| 42 |
+
follows the Gym-style contract:
|
| 43 |
+
|
| 44 |
+
| Endpoint | Method | Purpose |
|
| 45 |
+
| -------- | ------ | -------------------------------------------------- |
|
| 46 |
+
| `/reset` | POST | Sample a fresh task, return drift-gen observation |
|
| 47 |
+
| `/step` | POST | Apply a `ForgeAction` (breakage or repair) |
|
| 48 |
+
| `/state` | GET | Inspect the current internal state |
|
| 49 |
+
| `/health`| GET | Health probe (used by the container HEALTHCHECK) |
|
| 50 |
+
|
| 51 |
+
`ForgeAction` is a discriminated union of `BreakageAction` (used in phase 1)
|
| 52 |
+
and `RepairAction` (used in phase 2). See
|
| 53 |
+
[`forgeenv/env/actions.py`](forgeenv/env/actions.py).
|
| 54 |
+
|
| 55 |
+
## Quick test
|
| 56 |
+
|
| 57 |
+
```bash
|
| 58 |
+
curl -X POST https://akhiilll-forgeenv.hf.space/reset
|
| 59 |
+
curl https://akhiilll-forgeenv.hf.space/state
|
| 60 |
+
```
|
| 61 |
+
|
| 62 |
+
```python
|
| 63 |
+
from openenv.core.env_client import EnvClient
|
| 64 |
+
|
| 65 |
+
async with EnvClient(base_url="https://akhiilll-forgeenv.hf.space") as client:
|
| 66 |
+
obs = await client.reset()
|
| 67 |
+
print(obs.observation.current_phase, obs.observation.task_id)
|
| 68 |
+
```
|
| 69 |
+
|
| 70 |
+
## Project links
|
| 71 |
+
|
| 72 |
+
- **Main repo / training notebooks / plots:**
|
| 73 |
+
<https://github.com/akhiilll/forgeenv>
|
| 74 |
+
- **Repair Agent model (LoRA):**
|
| 75 |
+
<https://huggingface.co/akhiilll/forgeenv-repair-agent>
|
| 76 |
+
- **Demo (Gradio + ZeroGPU):**
|
| 77 |
+
<https://huggingface.co/spaces/akhiilll/forgeenv-demo>
|
| 78 |
+
|
| 79 |
+
## Citations
|
| 80 |
+
|
| 81 |
+
- Huang et al., *R-Zero: Self-Evolving Reasoning LLM From Zero Data* (2025)
|
| 82 |
+
- Zhao et al., *Absolute Zero: Reinforced Self-play Reasoning with Zero Data* (2025)
|
| 83 |
+
- Liu et al., *SPIRAL: Self-Play on Zero-Sum Games* (2025)
|
| 84 |
+
- [arXiv:2408.10215](https://arxiv.org/abs/2408.10215) — Reward engineering & shaping
|
| 85 |
+
- [arXiv:2601.19100](https://arxiv.org/abs/2601.19100) — Reward engineering for RL in software tasks
|
forgeenv-space/forgeenv/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""ForgeEnv: Self-improving RL environment for HuggingFace ecosystem repair."""
|
| 2 |
+
|
| 3 |
+
__version__ = "0.1.0"
|
| 4 |
+
__author__ = "akhiilll"
|
forgeenv-space/forgeenv/artifacts/repair_library.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Persisted "repair library" — the model's accumulated knowledge of
|
| 2 |
+
known breakage -> repair pairs. Curated from successful rollouts during
|
| 3 |
+
training. Loaded at inference time as a few-shot prefix when the agent
|
| 4 |
+
recognises a familiar error class.
|
| 5 |
+
"""
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
import json
|
| 9 |
+
from dataclasses import asdict, dataclass, field
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import Any, Optional
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@dataclass
class RepairExample:
    """One curated breakage -> repair pair harvested from a successful rollout."""

    primitive_type: str
    breakage_params: dict[str, Any]
    error_signature: str
    repair_diff: str
    visible_reward: float
    held_out: dict[str, float]
    task_id: str = ""

    def signature_key(self) -> str:
        """Compact dedup/lookup key: primitive type plus a truncated error prefix."""
        truncated_signature = self.error_signature[:80]
        return f"{self.primitive_type}::{truncated_signature}"
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
@dataclass
class RepairLibrary:
    """Append-only store of RepairExamples with nearest-match lookup and
    JSON persistence."""

    examples: list[RepairExample] = field(default_factory=list)

    def add(self, example: RepairExample) -> None:
        """Append one curated example."""
        self.examples.append(example)

    def best_match(self, primitive_type: str, error_text: str) -> Optional[RepairExample]:
        """Return the example of *primitive_type* whose error signature best
        overlaps *error_text*, breaking ties by visible reward; None when no
        example of that primitive type exists."""
        same_type = [e for e in self.examples if e.primitive_type == primitive_type]
        if not same_type:
            return None
        # max() returns the first maximal element, matching the old
        # stable sorted(..., reverse=True)[0].
        return max(
            same_type,
            key=lambda e: (
                _ngram_overlap(e.error_signature, error_text),
                e.visible_reward,
            ),
        )

    def to_dict(self) -> dict:
        """JSON-serialisable snapshot, including per-primitive counts."""
        return {
            "version": "1",
            "examples": [asdict(e) for e in self.examples],
            "size": len(self.examples),
            "by_primitive": _count_by_primitive(self.examples),
        }

    def save(self, path: str | Path) -> None:
        """Write the library as pretty-printed JSON, creating parent dirs."""
        target = Path(path)
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(json.dumps(self.to_dict(), indent=2), encoding="utf-8")

    @classmethod
    def load(cls, path: str | Path) -> "RepairLibrary":
        """Rebuild a library from a JSON file produced by :meth:`save`."""
        payload = json.loads(Path(path).read_text(encoding="utf-8"))
        loaded = [RepairExample(**e) for e in payload.get("examples", [])]
        return cls(examples=loaded)
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def _ngram_overlap(a: str, b: str, n: int = 3) -> float:
|
| 74 |
+
if not a or not b:
|
| 75 |
+
return 0.0
|
| 76 |
+
|
| 77 |
+
def grams(text: str) -> set[str]:
|
| 78 |
+
text = text.lower()
|
| 79 |
+
return {text[i : i + n] for i in range(len(text) - n + 1)}
|
| 80 |
+
|
| 81 |
+
ga, gb = grams(a), grams(b)
|
| 82 |
+
if not ga or not gb:
|
| 83 |
+
return 0.0
|
| 84 |
+
return len(ga & gb) / max(1, len(ga | gb))
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def _count_by_primitive(examples: list[RepairExample]) -> dict[str, int]:
|
| 88 |
+
counts: dict[str, int] = {}
|
| 89 |
+
for e in examples:
|
| 90 |
+
counts[e.primitive_type] = counts.get(e.primitive_type, 0) + 1
|
| 91 |
+
return counts
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def curate_from_rollouts(
    rollout_results: list,
    min_reward: float = 0.6,
    min_held_out_clean: float = 0.5,
) -> RepairLibrary:
    """Build a RepairLibrary from a list of rollout dicts/RolloutResults.

    A rollout is kept only when its visible reward reaches *min_reward*
    and its held-out ``executed_cleanly`` score reaches *min_held_out_clean*.

    Accepts either dict rollouts or attribute-style objects. Robust to a
    missing, ``None``, or non-dict ``info`` / ``held_out_breakdown`` payload
    (the original guarded only the breakage-params lookup, so a non-dict
    ``info`` crashed the ``repair_diff`` lookup and a ``None``
    ``held_out_breakdown`` crashed the filter).
    """
    lib = RepairLibrary()
    for r in rollout_results:
        if isinstance(r, dict):
            get = r.get
        else:
            # Bind the current rollout as a default arg so the accessor is
            # not affected by later loop iterations.
            def get(k, default=None, _r=r):
                return getattr(_r, k, default)

        visible_reward = float(get("visible_reward", 0.0) or 0.0)
        if visible_reward < min_reward:
            continue
        held_out = dict(get("held_out_breakdown", {}) or {})
        if float(held_out.get("executed_cleanly", 0.0)) < min_held_out_clean:
            continue

        # Normalise `info` once and guard every nested lookup.
        info = get("info", {})
        if not isinstance(info, dict):
            info = {}
        breakage_spec = info.get("breakage_spec", {})
        params = (
            breakage_spec.get("params", {}) if isinstance(breakage_spec, dict) else {}
        )

        lib.add(
            RepairExample(
                primitive_type=str(get("primitive_type", "unknown")),
                breakage_params=dict(params),
                error_signature=str(get("error_trace", "") or "")[:160],
                repair_diff=str(
                    get("repair_completion", "") or info.get("repair_diff", "")
                )[:2000],
                visible_reward=visible_reward,
                held_out=held_out,
                task_id=str(get("task_id", "")),
            )
        )
    return lib
forgeenv-space/forgeenv/drift/__init__.py
ADDED
|
File without changes
|
forgeenv-space/forgeenv/drift/library_drift_engine.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Library Drift Engine.
|
| 2 |
+
|
| 3 |
+
Manages library version snapshots and triggers version upgrades during
|
| 4 |
+
training to create non-stationary verification. In simulation mode it
|
| 5 |
+
just tracks the current snapshot index — that index influences
|
| 6 |
+
breakage selection and is exposed in observations so the Repair Agent
|
| 7 |
+
can adapt.
|
| 8 |
+
|
| 9 |
+
Also exposes Chojecki GVU's SNR computation
|
| 10 |
+
(https://arxiv.org/abs/2512.02731 Definition 4.4).
|
| 11 |
+
"""
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
import math
|
| 15 |
+
from dataclasses import dataclass, field
|
| 16 |
+
|
| 17 |
+
DEFAULT_VERSION_SNAPSHOTS: list[dict[str, str]] = [
|
| 18 |
+
{"transformers": "4.36.0", "datasets": "2.14.0", "trl": "0.7.0"},
|
| 19 |
+
{"transformers": "4.40.0", "datasets": "2.18.0", "trl": "0.8.0"},
|
| 20 |
+
{"transformers": "4.45.0", "datasets": "3.0.0", "trl": "0.10.0"},
|
| 21 |
+
{"transformers": "4.50.0", "datasets": "3.2.0", "trl": "0.12.0"},
|
| 22 |
+
]
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
@dataclass
class LibraryDriftEngine:
    """Tracks the active library-version snapshot and periodic drift events."""

    snapshots: list[dict[str, str]] = field(
        default_factory=lambda: list(DEFAULT_VERSION_SNAPSHOTS)
    )
    current_index: int = 0
    drift_history: list[dict] = field(default_factory=list)

    def current_versions(self) -> dict[str, str]:
        """Return a copy of the currently active version snapshot."""
        return dict(self.snapshots[self.current_index])

    def maybe_drift(self, episode_num: int, drift_every: int = 50) -> bool:
        """Advance to the next snapshot every *drift_every* episodes.

        Returns True only when a drift actually happened: the episode count
        is a positive multiple of the interval and a later snapshot exists.
        """
        if episode_num <= 0 or episode_num % drift_every != 0:
            return False
        if self.current_index >= len(self.snapshots) - 1:
            return False
        previous = self.snapshots[self.current_index]
        self.current_index += 1
        self.drift_history.append(
            {
                "episode": episode_num,
                "from": previous,
                "to": self.snapshots[self.current_index],
            }
        )
        return True

    def reset(self) -> None:
        """Return to the first snapshot and forget all drift history."""
        self.current_index = 0
        self.drift_history.clear()

    @staticmethod
    def compute_snr(
        recent_held_out: list[float], recent_visible: list[float]
    ) -> dict[str, float]:
        """SNR per Chojecki GVU Def 4.4: SNR = mean(rewards)^2 / variance(rewards)."""

        def snr(values: list[float]) -> float:
            # Fewer than two samples: variance is undefined, report 0.0.
            if len(values) < 2:
                return 0.0
            mu = sum(values) / len(values)
            variance = sum((v - mu) ** 2 for v in values) / len(values)
            # Clamp the denominator so a zero-variance stream doesn't divide by 0.
            return mu**2 / max(variance, 1e-8)

        return {
            "snr_verifier": snr(recent_held_out),
            "snr_generator": snr(recent_visible),
        }
|
forgeenv-space/forgeenv/env/__init__.py
ADDED
|
File without changes
|
forgeenv-space/forgeenv/env/actions.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Pydantic action models for ForgeEnv (compatible with OpenEnv 0.2.x).
|
| 2 |
+
|
| 3 |
+
Episodes have two phases — drift_gen (Challenger) and repair (Solver) — so
|
| 4 |
+
we expose a single union ForgeAction that carries either a BreakageAction
|
| 5 |
+
or a RepairAction. The environment dispatches on which sub-field is set.
|
| 6 |
+
"""
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
from typing import Any, Literal, Optional
|
| 10 |
+
|
| 11 |
+
from pydantic import Field
|
| 12 |
+
|
| 13 |
+
from openenv.core import Action
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class BreakageAction(Action):
    """Drift Generator's action: pick a primitive type + parameters."""

    # Literal tag lets consumers discriminate the action kind in payloads/logs.
    action_type: Literal["breakage"] = "breakage"
    # Must name a class in PRIMITIVE_REGISTRY; validated by parse_breakage_spec.
    primitive_type: str = Field(
        ..., description="One of the registered breakage primitive class names"
    )
    # Forwarded verbatim as constructor kwargs to the chosen primitive.
    params: dict[str, Any] = Field(
        default_factory=dict, description="Primitive-specific parameters"
    )
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class RepairAction(Action):
    """Repair Agent's action: a unified diff (or full replacement script)."""

    # Literal tag lets consumers discriminate the action kind in payloads/logs.
    action_type: Literal["repair"] = "repair"
    # Applied permissively by diff_utils.apply_unified_diff, which also
    # accepts a whole replacement script in place of a diff.
    unified_diff: str = Field(..., description="Unified diff or full replacement script")
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class ForgeAction(Action):
    """Union action: exactly one of `breakage` / `repair` must be set.

    This is the type registered with OpenEnv's `create_app`. A plain pair of
    optional sub-fields (rather than a Pydantic discriminated union) keeps
    the OpenAPI schema flat and portable across pydantic versions.
    """

    breakage: Optional[BreakageAction] = None
    repair: Optional[RepairAction] = None

    def model_post_init(self, __context: Any) -> None:
        # Enforce the exactly-one invariant after field validation.
        populated = sum(sub is not None for sub in (self.breakage, self.repair))
        if populated != 1:
            raise ValueError(
                "ForgeAction requires exactly one of `breakage` or `repair` to be set."
            )
|
forgeenv-space/forgeenv/env/diff_utils.py
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Unified-diff application utilities.
|
| 2 |
+
|
| 3 |
+
The Repair Agent submits a unified diff. We need a permissive applier
|
| 4 |
+
because LLM diffs are often malformed (wrong line numbers, missing
|
| 5 |
+
context, extra prose). We try the strict applier first, then fall
|
| 6 |
+
back to applying hunks via plain string replacement.
|
| 7 |
+
|
| 8 |
+
The agent may also submit a full Python script instead of a diff
|
| 9 |
+
(common when the model's diff format breaks). We detect this and
|
| 10 |
+
treat it as a complete replacement.
|
| 11 |
+
"""
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
import difflib
|
| 15 |
+
import re
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
# Matches a unified-diff hunk header anywhere in the text (e.g. "@@ -1,4 +1,4 @@").
_HUNK_RE = re.compile(r"^@@.*@@", re.MULTILINE)
# Tokens suggesting the text is a Python script rather than a diff;
# looks_like_full_script() requires at least two distinct hits.
_SCRIPT_MARKERS = ("import ", "from ", "def ", "class ", "print(")
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def looks_like_full_script(text: str) -> bool:
    """Heuristic: text is probably a full python script, not a diff."""
    candidate_lines = text.lstrip().splitlines()
    if not candidate_lines:
        return False
    # A diff header near the top means this is a diff, not a script.
    for header_candidate in candidate_lines[:5]:
        if header_candidate.startswith(("---", "+++", "@@")):
            return False
    # Two or more distinct script-style markers within the first 30 lines
    # -> treat as a full replacement script.
    preview = "\n".join(candidate_lines[:30])
    marker_hits = sum(1 for token in _SCRIPT_MARKERS if token in preview)
    return marker_hits >= 2
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def _strict_apply(broken_script: str, diff_text: str) -> str | None:
    """Apply a unified diff strictly. Returns None on any failure.

    Hunks are located by searching for the joined old-side text (context +
    deletions) in the not-yet-consumed part of the source, rather than by
    trusting the @@ line numbers — LLM diffs often get those wrong.
    """
    lines = broken_script.splitlines(keepends=True)
    out: list[str] = []
    diff_lines = diff_text.splitlines()
    i = 0
    # Index into `lines` marking how much of the source has been consumed.
    src_idx = 0
    in_hunk = False
    # Accumulators for the current hunk: old-side (context + '-') and
    # new-side (context + '+') lines, each normalized to end with '\n'.
    hunk_old: list[str] = []
    hunk_new: list[str] = []

    while i < len(diff_lines):
        line = diff_lines[i]
        if line.startswith(("---", "+++")):
            i += 1
            continue
        if line.startswith("@@"):
            # Flush previous hunk
            if in_hunk:
                # Find the hunk_old block in the source starting at src_idx.
                target = "".join(hunk_old)
                source_remainder = "".join(lines[src_idx:])
                pos = source_remainder.find(target)
                if pos == -1:
                    return None
                # Copy untouched source up to the match, then the new side.
                out.append(source_remainder[:pos])
                out.append("".join(hunk_new))
                # Advance past everything we just emitted/replaced.
                src_idx += len(source_remainder[: pos + len(target)].splitlines(keepends=True))
                hunk_old, hunk_new = [], []
            in_hunk = True
            i += 1
            continue
        if in_hunk:
            if line.startswith("+"):
                hunk_new.append(line[1:] + "\n")
            elif line.startswith("-"):
                hunk_old.append(line[1:] + "\n")
            else:
                # context line
                ctx = line[1:] if line.startswith(" ") else line
                hunk_old.append(ctx + "\n")
                hunk_new.append(ctx + "\n")
        i += 1

    # Flush trailing hunk
    if in_hunk and (hunk_old or hunk_new):
        target = "".join(hunk_old)
        source_remainder = "".join(lines[src_idx:])
        pos = source_remainder.find(target)
        if pos == -1:
            return None
        out.append(source_remainder[:pos])
        out.append("".join(hunk_new))
        consumed = source_remainder[: pos + len(target)]
        src_idx += len(consumed.splitlines(keepends=True))

    # Append whatever source remains after the last hunk.
    out.append("".join(lines[src_idx:]))
    return "".join(out)
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def _permissive_apply(broken_script: str, diff_text: str) -> str:
|
| 100 |
+
"""Apply a malformed diff by extracting (-,+) line pairs and doing
|
| 101 |
+
a tolerant search-and-replace.
|
| 102 |
+
"""
|
| 103 |
+
repaired = broken_script
|
| 104 |
+
pairs: list[tuple[str, str]] = []
|
| 105 |
+
lines = diff_text.splitlines()
|
| 106 |
+
pending_minus: str | None = None
|
| 107 |
+
|
| 108 |
+
for line in lines:
|
| 109 |
+
if line.startswith("---") or line.startswith("+++") or line.startswith("@@"):
|
| 110 |
+
pending_minus = None
|
| 111 |
+
continue
|
| 112 |
+
if line.startswith("-"):
|
| 113 |
+
pending_minus = line[1:].strip()
|
| 114 |
+
elif line.startswith("+") and pending_minus is not None:
|
| 115 |
+
pairs.append((pending_minus, line[1:].strip()))
|
| 116 |
+
pending_minus = None
|
| 117 |
+
elif pending_minus is not None and not line.startswith(" "):
|
| 118 |
+
# standalone deletion — skip in permissive mode (we can't
|
| 119 |
+
# reliably know what to delete without context)
|
| 120 |
+
pending_minus = None
|
| 121 |
+
|
| 122 |
+
for old, new in pairs:
|
| 123 |
+
if old and old in repaired:
|
| 124 |
+
repaired = repaired.replace(old, new, 1)
|
| 125 |
+
|
| 126 |
+
return repaired
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def apply_unified_diff(broken_script: str, diff_text: str) -> str:
    """Apply `diff_text` to `broken_script` using progressively looser strategies.

    Order of attempts:
      1. Full-script detection — the submission replaces the file wholesale.
      2. Strict unified-diff application (only if diff syntax is present).
      3. Permissive (-,+) line-pair replacement.
    If nothing matches, the broken script comes back unchanged.
    """
    text = diff_text or ""
    if not text.strip():
        return broken_script

    if looks_like_full_script(text):
        return text

    has_diff_syntax = bool(_HUNK_RE.search(text)) or "---" in text or "+++" in text
    if has_diff_syntax:
        strict_result = _strict_apply(broken_script, text)
        if strict_result is not None and strict_result != broken_script:
            return strict_result

    return _permissive_apply(broken_script, text)
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
def make_unified_diff(before: str, after: str, path: str = "train.py") -> str:
    """Produce a canonical unified diff from before -> after."""
    before_lines = before.splitlines(keepends=True)
    after_lines = after.splitlines(keepends=True)
    # n=2 keeps hunks compact; a/ and b/ prefixes mirror git's convention.
    hunks = difflib.unified_diff(
        before_lines,
        after_lines,
        fromfile=f"a/{path}",
        tofile=f"b/{path}",
        n=2,
    )
    return "".join(hunks)
|
forgeenv-space/forgeenv/env/forge_environment.py
ADDED
|
@@ -0,0 +1,259 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""ForgeEnvironment: the OpenEnv Environment subclass for ForgeEnv.
|
| 2 |
+
|
| 3 |
+
Episode flow (exactly 2 steps per episode):
|
| 4 |
+
reset() -> sample task, ask Teacher for category
|
| 5 |
+
step(BreakageAction) -> Drift Generator's proposal is applied; broken
|
| 6 |
+
script is run, error trace captured.
|
| 7 |
+
step(RepairAction) -> Repair diff is applied; script is re-executed;
|
| 8 |
+
visible + held-out rewards computed; episode ends.
|
| 9 |
+
"""
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
import time
|
| 13 |
+
import uuid
|
| 14 |
+
from typing import Any, Optional
|
| 15 |
+
|
| 16 |
+
from openenv.core import Environment
|
| 17 |
+
|
| 18 |
+
from forgeenv.drift.library_drift_engine import LibraryDriftEngine
|
| 19 |
+
from forgeenv.env.actions import BreakageAction, ForgeAction, RepairAction
|
| 20 |
+
from forgeenv.env.diff_utils import apply_unified_diff
|
| 21 |
+
from forgeenv.env.observations import ForgeObservation
|
| 22 |
+
from forgeenv.primitives.breakage_primitives import (
|
| 23 |
+
PRIMITIVE_REGISTRY,
|
| 24 |
+
parse_breakage_spec,
|
| 25 |
+
)
|
| 26 |
+
from forgeenv.roles.teacher import Teacher
|
| 27 |
+
from forgeenv.sandbox.simulation_mode import SimulationExecutor
|
| 28 |
+
from forgeenv.tasks.models import ExecutionResult, Task
|
| 29 |
+
from forgeenv.tasks.task_sampler import TaskSampler
|
| 30 |
+
from forgeenv.verifier.held_out_evaluator import compute_held_out_scores
|
| 31 |
+
from forgeenv.verifier.visible_verifier import compute_visible_reward
|
| 32 |
+
|
| 33 |
+
# Stable, sorted list of registered breakage primitive names; seeds the Teacher.
DEFAULT_CATEGORIES = sorted(PRIMITIVE_REGISTRY.keys())
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class ForgeEnvironment(Environment[ForgeAction, ForgeObservation, dict]):
    """OpenEnv-compliant environment for HuggingFace ecosystem repair.

    Each episode is exactly two steps: a BreakageAction (drift_gen phase)
    followed by a RepairAction (repair phase); see the module docstring.
    """

    # Teacher curriculum state lives on this instance, so concurrent sessions
    # would interleave updates — keep sessions serialized.
    SUPPORTS_CONCURRENT_SESSIONS = False  # Teacher state is global per env

    def __init__(
        self,
        task_sampler: Optional[TaskSampler] = None,
        teacher: Optional[Teacher] = None,
        executor: Optional[SimulationExecutor] = None,
        drift_engine: Optional[LibraryDriftEngine] = None,
        seed: Optional[int] = None,
    ) -> None:
        """Wire up collaborators, constructing defaults for any not injected.

        `seed` is forwarded only to the default SimulationExecutor.
        """
        super().__init__()
        self.task_sampler = task_sampler or TaskSampler()
        self.teacher = teacher or Teacher(
            categories=list(DEFAULT_CATEGORIES) or ["api_drift"]
        )
        self.executor = executor or SimulationExecutor(seed=seed)
        self.drift_engine = drift_engine or LibraryDriftEngine()

        # Per-episode state; (re)initialized by reset().
        self._episode_id: Optional[str] = None
        self._episode_count: int = 0
        self._current_task: Optional[Task] = None
        self._original_script: str = ""
        self._broken_script: str = ""
        self._error_trace: str = ""
        self._breakage_spec: Optional[dict[str, Any]] = None
        self._target_category: str = ""
        self._current_phase: str = "idle"
        self._last_obs: Optional[ForgeObservation] = None

    # ------------------------------------------------------------------ API
    def reset(
        self,
        seed: Optional[int] = None,
        episode_id: Optional[str] = None,
        difficulty: Optional[str] = "easy",
        **kwargs: Any,
    ) -> ForgeObservation:
        """Start a new episode: sample a task and enter the drift_gen phase.

        Raises RuntimeError if the task sampler returns nothing.
        NOTE(review): `seed` is accepted but unused here — confirm intent.
        """
        self._episode_id = episode_id or str(uuid.uuid4())
        self._episode_count += 1
        self._target_category = self.teacher.select_next_category()

        task = self.task_sampler.sample(difficulty=difficulty)
        if task is None:
            raise RuntimeError("Task sampler returned no tasks (empty seed corpus?)")
        self._current_task = task
        self._original_script = task.script_content
        self._broken_script = ""
        self._error_trace = ""
        self._breakage_spec = None
        self._current_phase = "drift_gen"

        # Library drift trigger every 50 episodes (configurable from outside).
        drifted = self.drift_engine.maybe_drift(self._episode_count, drift_every=50)

        obs = ForgeObservation(
            current_phase="drift_gen",
            task_id=task.task_id,
            task_description=task.description,
            target_category=self._target_category,
            script_content=self._original_script,
            error_trace=None,
            library_versions=self.drift_engine.current_versions(),
            episode_step=0,
            done=False,
            reward=0.0,
            info={
                "episode_id": self._episode_id,
                "episode_count": self._episode_count,
                "drift_triggered": drifted,
                "available_primitives": sorted(PRIMITIVE_REGISTRY),
            },
        )
        self._last_obs = obs
        return obs

    def step(
        self,
        action: ForgeAction,
        timeout_s: Optional[float] = None,
        **kwargs: Any,
    ) -> ForgeObservation:
        """Dispatch the action by phase; errors return a done observation.

        NOTE(review): `timeout_s` is accepted but unused here — confirm intent.
        """
        if self._current_phase == "drift_gen":
            if action.breakage is None:
                return self._error_obs("Expected BreakageAction in drift_gen phase")
            return self._handle_breakage(action.breakage)

        if self._current_phase == "repair":
            if action.repair is None:
                return self._error_obs("Expected RepairAction in repair phase")
            return self._handle_repair(action.repair)

        return self._error_obs(
            f"step() called in invalid phase {self._current_phase!r} — call reset() first"
        )

    @property
    def state(self) -> dict:
        # JSON-serializable snapshot served by the /state endpoint.
        return {
            "phase": self._current_phase,
            "episode_id": self._episode_id,
            "episode_count": self._episode_count,
            "task_id": self._current_task.task_id if self._current_task else None,
            "target_category": self._target_category,
            "library_versions": self.drift_engine.current_versions(),
            "teacher": self.teacher.get_state(),
            "drift_history": list(self.drift_engine.drift_history),
            "breakage_spec": dict(self._breakage_spec) if self._breakage_spec else None,
        }

    # ---------------------------------------------------------------- helpers
    def _handle_breakage(self, breakage: BreakageAction) -> ForgeObservation:
        """Apply the chosen primitive, run the broken script, move to repair."""
        spec = {"primitive_type": breakage.primitive_type, "params": dict(breakage.params)}
        try:
            primitive = parse_breakage_spec(spec)
        except ValueError as exc:
            return self._error_obs(f"Invalid breakage spec: {exc}")

        try:
            self._broken_script = primitive.apply(self._original_script)
        except Exception as exc:  # primitive bug — surface but don't crash server
            return self._error_obs(f"Primitive apply failed: {exc}")

        self._breakage_spec = spec

        result = self.executor.execute(self._broken_script, self._current_task)
        if result.exit_code != 0:
            self._error_trace = result.stderr or "non-zero exit code, no stderr"
        else:
            # The breakage didn't actually break it; still proceed to repair phase
            # (no-op repair is then a valid choice).
            self._error_trace = "Script ran without observable error"

        self._current_phase = "repair"

        obs = ForgeObservation(
            current_phase="repair",
            task_id=self._current_task.task_id,
            task_description=self._current_task.description,
            target_category=primitive.category,
            script_content=self._broken_script,
            error_trace=self._error_trace,
            library_versions=self.drift_engine.current_versions(),
            episode_step=1,
            done=False,
            reward=0.0,
            info={
                "episode_id": self._episode_id,
                "breakage_primitive": primitive.name,
                "breakage_description": primitive.description,
            },
        )
        self._last_obs = obs
        return obs

    def _handle_repair(self, repair: RepairAction) -> ForgeObservation:
        """Apply the repair diff, re-execute, score, and finish the episode."""
        repaired = apply_unified_diff(self._broken_script, repair.unified_diff or "")

        t0 = time.time()
        result = self.executor.execute(repaired, self._current_task)
        result.script_content = repaired  # ensure verifier sees what we ran
        wall_ms = int((time.time() - t0) * 1000)

        visible_reward, visible_breakdown = compute_visible_reward(
            result, self._current_task
        )
        held_out = compute_held_out_scores(
            result, self._current_task, repair_diff=repair.unified_diff or ""
        )

        success = result.exit_code == 0
        category = (
            self._breakage_spec.get("primitive_type", "unknown")
            if self._breakage_spec
            else "unknown"
        )
        # Update Teacher's curriculum state
        self.teacher.update(category, success)

        self._current_phase = "done"

        obs = ForgeObservation(
            current_phase="done",
            task_id=self._current_task.task_id,
            task_description=self._current_task.description,
            target_category=category,
            script_content=repaired,
            error_trace=result.stderr or None,
            library_versions=self.drift_engine.current_versions(),
            episode_step=2,
            done=True,
            reward=visible_reward,
            reward_breakdown=visible_breakdown,
            held_out_breakdown=held_out,
            info={
                "episode_id": self._episode_id,
                "exit_code": result.exit_code,
                "wall_time_ms": wall_ms,
                "checkpoint_exists": result.checkpoint_exists,
                "stdout_tail": "\n".join(result.stdout.splitlines()[-5:]),
                "breakage_spec": self._breakage_spec,
                "teacher_state": self.teacher.get_state(),
            },
        )
        self._last_obs = obs
        return obs

    def _error_obs(self, message: str) -> ForgeObservation:
        """Return a `done=True` error observation rather than raising."""
        return ForgeObservation(
            current_phase="done",
            task_id=self._current_task.task_id if self._current_task else "",
            task_description=self._current_task.description if self._current_task else "",
            target_category=self._target_category,
            script_content=self._broken_script or self._original_script,
            error_trace=message,
            library_versions=self.drift_engine.current_versions(),
            episode_step=2,
            done=True,
            reward=0.0,
            info={"error": message, "episode_id": self._episode_id},
        )
|
forgeenv-space/forgeenv/env/observations.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Pydantic observation model for ForgeEnv."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
from typing import Any, Optional
|
| 5 |
+
|
| 6 |
+
from pydantic import Field
|
| 7 |
+
|
| 8 |
+
from openenv.core import Observation
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class ForgeObservation(Observation):
    """What the agent (or the trainer's rollout function) sees at each step.

    Inherits `done`, `reward`, `metadata` from the OpenEnv `Observation` base.
    """

    # Episode phase; determines which sub-action ForgeEnvironment expects next.
    current_phase: str = Field(
        ..., description="One of 'drift_gen', 'repair', 'verify', 'done'"
    )
    task_id: str = ""
    task_description: str = ""
    # Breakage category targeted for this episode (Teacher's pick, or actual).
    target_category: str = ""
    script_content: str = Field(default="", description="Current state of the script")
    # Stderr / failure message from the most recent execution, if any.
    error_trace: Optional[str] = None
    # Simulated library versions (name -> version string) from the drift engine.
    library_versions: dict[str, str] = Field(default_factory=dict)
    # Visible-verifier reward components; populated on the final step.
    reward_breakdown: dict[str, Any] = Field(default_factory=dict)
    # Held-out evaluator scores; populated on the final step.
    held_out_breakdown: dict[str, float] = Field(default_factory=dict)
    episode_step: int = 0
    info: dict[str, Any] = Field(default_factory=dict)
|
forgeenv-space/forgeenv/env/server.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""FastAPI server for ForgeEnv (OpenEnv-compliant).
|
| 2 |
+
|
| 3 |
+
Exposes /reset, /step, /state HTTP endpoints via OpenEnv's `create_app`.
|
| 4 |
+
HF Spaces sets PORT=7860 automatically.
|
| 5 |
+
"""
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
import os
|
| 9 |
+
|
| 10 |
+
from fastapi.responses import HTMLResponse
|
| 11 |
+
from openenv.core import create_app
|
| 12 |
+
|
| 13 |
+
from forgeenv.env.actions import ForgeAction
|
| 14 |
+
from forgeenv.env.forge_environment import ForgeEnvironment
|
| 15 |
+
from forgeenv.env.observations import ForgeObservation
|
| 16 |
+
|
| 17 |
+
# OpenEnv factory: wires ForgeEnvironment plus its action/observation schemas
# into a FastAPI app exposing /reset, /step, /state, /metadata, /schema.
app = create_app(
    env=ForgeEnvironment,
    action_cls=ForgeAction,
    observation_cls=ForgeObservation,
    env_name="forgeenv",
)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
_LANDING_HTML = """<!doctype html>
|
| 26 |
+
<html lang="en">
|
| 27 |
+
<head>
|
| 28 |
+
<meta charset="utf-8">
|
| 29 |
+
<title>ForgeEnv — OpenEnv server</title>
|
| 30 |
+
<meta name="viewport" content="width=device-width,initial-scale=1">
|
| 31 |
+
<style>
|
| 32 |
+
:root { color-scheme: light dark; }
|
| 33 |
+
body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
|
| 34 |
+
max-width: 760px; margin: 2.5rem auto; padding: 0 1.25rem;
|
| 35 |
+
line-height: 1.55; color: #1f2937; background: #fafafa; }
|
| 36 |
+
@media (prefers-color-scheme: dark) { body { color: #e5e7eb; background: #0f172a; } }
|
| 37 |
+
h1 { font-size: 1.65rem; margin-bottom: 0.25rem; }
|
| 38 |
+
.sub { color: #6b7280; margin-top: 0; }
|
| 39 |
+
code, pre { font-family: ui-monospace, "SF Mono", Menlo, monospace; }
|
| 40 |
+
pre { background: rgba(127,127,127,0.12); padding: 0.9rem; border-radius: 8px;
|
| 41 |
+
overflow-x: auto; }
|
| 42 |
+
table { border-collapse: collapse; width: 100%; margin: 0.75rem 0 1.25rem; }
|
| 43 |
+
td, th { text-align: left; padding: 0.5rem 0.75rem;
|
| 44 |
+
border-bottom: 1px solid rgba(127,127,127,0.25); }
|
| 45 |
+
th { font-weight: 600; }
|
| 46 |
+
a { color: #2563eb; text-decoration: none; } a:hover { text-decoration: underline; }
|
| 47 |
+
.ok { color: #16a34a; font-weight: 600; }
|
| 48 |
+
.muted { color: #6b7280; font-size: 0.9rem; }
|
| 49 |
+
.pill { display: inline-block; padding: 0.1rem 0.5rem; border-radius: 999px;
|
| 50 |
+
background: rgba(34,197,94,0.15); color: #16a34a; font-size: 0.85rem; }
|
| 51 |
+
</style>
|
| 52 |
+
</head>
|
| 53 |
+
<body>
|
| 54 |
+
<h1>ForgeEnv 🔧 <span class="pill">running</span></h1>
|
| 55 |
+
<p class="sub">OpenEnv-compliant RL environment for HuggingFace
|
| 56 |
+
ecosystem repair under library version drift.</p>
|
| 57 |
+
|
| 58 |
+
<p>This URL serves the environment over HTTP. It is not a UI — it's the
|
| 59 |
+
runtime that <strong>training notebooks connect to</strong>. Open one of
|
| 60 |
+
the endpoints below, or use the demo Space to try the trained Repair
|
| 61 |
+
Agent in a browser.</p>
|
| 62 |
+
|
| 63 |
+
<h2>Endpoints</h2>
|
| 64 |
+
<table>
|
| 65 |
+
<tr><th>Method</th><th>Path</th><th>Purpose</th></tr>
|
| 66 |
+
<tr><td>GET </td><td><a href="/health">/health</a></td><td>Health probe</td></tr>
|
| 67 |
+
<tr><td>POST</td><td><code>/reset</code></td><td>Sample task, return drift-gen observation</td></tr>
|
| 68 |
+
<tr><td>POST</td><td><code>/step</code></td><td>Apply <code>ForgeAction</code> (breakage or repair)</td></tr>
|
| 69 |
+
<tr><td>GET </td><td><a href="/state">/state</a></td><td>Current internal state</td></tr>
|
| 70 |
+
<tr><td>GET </td><td><a href="/metadata">/metadata</a></td><td>Env name + version + schema URLs</td></tr>
|
| 71 |
+
<tr><td>GET </td><td><a href="/schema">/schema</a></td><td>Action / observation JSON schemas</td></tr>
|
| 72 |
+
<tr><td>GET </td><td><a href="/docs">/docs</a></td><td>Interactive Swagger UI</td></tr>
|
| 73 |
+
</table>
|
| 74 |
+
|
| 75 |
+
<h2>Quick start (Python)</h2>
|
| 76 |
+
<pre><code>import asyncio
|
| 77 |
+
from openenv.core import GenericEnvClient
|
| 78 |
+
|
| 79 |
+
async def go():
|
| 80 |
+
client = GenericEnvClient(base_url="https://akhiilll-forgeenv.hf.space")
|
| 81 |
+
obs = await client.reset()
|
| 82 |
+
print(obs.observation["current_phase"], obs.observation["task_id"])
|
| 83 |
+
|
| 84 |
+
asyncio.run(go())</code></pre>
|
| 85 |
+
|
| 86 |
+
<h2>Project links</h2>
|
| 87 |
+
<ul>
|
| 88 |
+
<li>Space card & README:
|
| 89 |
+
<a href="https://huggingface.co/spaces/akhiilll/forgeenv" target="_blank" rel="noopener noreferrer">huggingface.co/spaces/akhiilll/forgeenv</a></li>
|
| 90 |
+
<li>Gradio demo:
|
| 91 |
+
<a href="https://huggingface.co/spaces/akhiilll/forgeenv-demo" target="_blank" rel="noopener noreferrer">huggingface.co/spaces/akhiilll/forgeenv-demo</a></li>
|
| 92 |
+
<li>Trained model (LoRA) <span class="muted">— published after the Colab training run finishes</span>:
|
| 93 |
+
<a href="https://huggingface.co/akhiilll/forgeenv-repair-agent" target="_blank" rel="noopener noreferrer">huggingface.co/akhiilll/forgeenv-repair-agent</a></li>
|
| 94 |
+
</ul>
|
| 95 |
+
<p class="muted">Tip: if links don't open from inside the embedded Space frame,
|
| 96 |
+
right-click and choose <em>Open in new tab</em>, or open this URL directly
|
| 97 |
+
at <a href="https://akhiilll-forgeenv.hf.space/" target="_blank" rel="noopener noreferrer">akhiilll-forgeenv.hf.space</a>.</p>
|
| 98 |
+
</body>
|
| 99 |
+
</html>"""
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def _attach_supplementary_routes(_app) -> None:
|
| 103 |
+
"""Add /health and a friendly GET / landing page if not present."""
|
| 104 |
+
existing = {
|
| 105 |
+
getattr(r, "path", None) for r in getattr(_app, "routes", [])
|
| 106 |
+
}
|
| 107 |
+
|
| 108 |
+
if "/health" not in existing:
|
| 109 |
+
@_app.get("/health")
|
| 110 |
+
def _health() -> dict:
|
| 111 |
+
return {"status": "ok", "env": "forgeenv"}
|
| 112 |
+
|
| 113 |
+
if "/" not in existing:
|
| 114 |
+
@_app.get("/", response_class=HTMLResponse, include_in_schema=False)
|
| 115 |
+
def _root() -> str:
|
| 116 |
+
return _LANDING_HTML
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
# Attach routes at import time so `uvicorn forgeenv.env.server:app` works.
_attach_supplementary_routes(app)
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
if __name__ == "__main__":
    import uvicorn

    # HF Spaces injects PORT (defaults to 7860); bind all interfaces.
    port = int(os.environ.get("PORT", "7860"))
    uvicorn.run(app, host="0.0.0.0", port=port)
|
forgeenv-space/forgeenv/primitives/__init__.py
ADDED
|
File without changes
|
forgeenv-space/forgeenv/primitives/breakage_primitives.py
ADDED
|
@@ -0,0 +1,282 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""8 breakage primitives representing real HuggingFace/PyTorch ecosystem drift.
|
| 2 |
+
|
| 3 |
+
Each primitive transforms a working script to simulate a library upgrade
|
| 4 |
+
breakage. They double as the Drift Generator's structured action space.
|
| 5 |
+
"""
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
import re
|
| 9 |
+
from abc import ABC, abstractmethod
|
| 10 |
+
from dataclasses import dataclass, field
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@dataclass
class BreakagePrimitive(ABC):
    """Abstract base class for all breakage types."""

    # Metadata each concrete subclass fills in via __post_init__;
    # excluded from __init__ so specs only carry real parameters.
    category: str = field(default="generic", init=False)
    name: str = field(default="BreakagePrimitive", init=False)
    description: str = field(default="", init=False)

    @abstractmethod
    def apply(self, script: str) -> str:
        """Transform `script` to introduce the breakage."""

    def to_spec(self) -> dict:
        """Serialize to a JSON-compatible spec for the LLM action space."""
        spec: dict = {}
        spec["primitive_type"] = type(self).__name__
        spec["category"] = self.category
        spec["params"] = self._get_params()
        return spec

    @abstractmethod
    def _get_params(self) -> dict:
        """Return a JSON-serializable dict of constructor parameters."""
|
| 37 |
+
|
| 38 |
+
@dataclass
class RenameApiCall(BreakagePrimitive):
    """Rename a function/method call to simulate API deprecation."""

    old_name: str = ""
    new_name: str = ""

    def __post_init__(self) -> None:
        self.category = "api_drift"
        self.name = "RenameApiCall"
        self.description = f"Rename {self.old_name} -> {self.new_name}"

    def apply(self, script: str) -> str:
        """Replace whole-word occurrences of ``old_name`` with ``new_name``."""
        if not self.old_name:
            return script
        # Lookarounds stop us from rewriting substrings of longer identifiers.
        whole_word = rf"(?<!\w){re.escape(self.old_name)}(?!\w)"
        return re.sub(whole_word, self.new_name, script)

    def _get_params(self) -> dict:
        return {"old_name": self.old_name, "new_name": self.new_name}
|
| 60 |
+
|
| 61 |
+
@dataclass
class DeprecateImport(BreakagePrimitive):
    """Change an import path to simulate module restructuring."""

    old_module: str = ""
    new_module: str = ""

    def __post_init__(self) -> None:
        self.category = "import_drift"
        self.name = "DeprecateImport"
        self.description = f"Move {self.old_module} -> {self.new_module}"

    def apply(self, script: str) -> str:
        """Swap the old import path for the new one everywhere it appears."""
        if self.old_module:
            script = script.replace(self.old_module, self.new_module)
        return script

    def _get_params(self) -> dict:
        return {"old_module": self.old_module, "new_module": self.new_module}
|
| 81 |
+
|
| 82 |
+
@dataclass
class ChangeArgumentSignature(BreakagePrimitive):
    """Remove an expected kwarg (and document a new required one).

    Only the removal is applied to the script; ``added_arg``/``added_value``
    are carried in the spec so the repair side knows what to reinstate.
    """

    function_name: str = ""
    removed_arg: str = ""
    added_arg: str = ""
    added_value: str = ""

    # Matches a kwarg value: a bracketed/quoted literal (which may contain
    # commas) or a plain run of characters up to the next comma/paren.
    _VALUE_RE = (
        r"(?:\[[^\]]*\]|\{[^}]*\}|\([^)]*\)|\"[^\"]*\"|'[^']*'|[^,)\s][^,)]*)"
    )

    def __post_init__(self) -> None:
        self.category = "api_drift"
        self.name = "ChangeArgumentSignature"
        self.description = (
            f"Change args of {self.function_name}: -{self.removed_arg} +{self.added_arg}"
        )

    def apply(self, script: str) -> str:
        """Delete every ``removed_arg=<value>`` kwarg from the script.

        Unlike a naive ``[^,)]+`` value match, bracketed values such as
        ``columns=["a", "b"]`` are removed whole instead of being cut at the
        first inner comma; and when the kwarg is the last argument the
        preceding comma is removed too, so no dangling ``, )`` is left.
        """
        if not self.removed_arg:
            return script
        kwarg = rf"\b{re.escape(self.removed_arg)}\s*=\s*{self._VALUE_RE}"
        # Prefer consuming a trailing comma (kwarg is not the last argument)...
        script, count = re.subn(rf"{kwarg}\s*,\s*", "", script)
        if count == 0:
            # ...otherwise consume a leading comma (kwarg is last / only).
            script = re.sub(rf",?\s*{kwarg}", "", script)
        return script

    def _get_params(self) -> dict:
        return {
            "function_name": self.function_name,
            "removed_arg": self.removed_arg,
            "added_arg": self.added_arg,
            "added_value": self.added_value,
        }
|
| 112 |
+
|
| 113 |
+
@dataclass
class ModifyConfigField(BreakagePrimitive):
    """Change a config-class default value to simulate behaviour drift."""

    config_class: str = ""
    field_name: str = ""
    new_value: str = ""

    def __post_init__(self) -> None:
        self.category = "config_drift"
        self.name = "ModifyConfigField"
        self.description = f"Change {self.config_class}.{self.field_name}"

    def apply(self, script: str) -> str:
        """Rewrite every ``field_name=<value>`` assignment to ``new_value``.

        The replacement is a callable so characters such as ``\\`` or
        ``\\g`` in ``new_value`` are inserted literally instead of being
        interpreted as regex replacement escapes (a template string would
        raise ``re.error`` or corrupt the output).
        """
        if not self.field_name:
            return script
        pattern = rf"({re.escape(self.field_name)}\s*=\s*)([^,)\n]+)"
        return re.sub(pattern, lambda m: m.group(1) + self.new_value, script)

    def _get_params(self) -> dict:
        return {
            "config_class": self.config_class,
            "field_name": self.field_name,
            "new_value": self.new_value,
        }
|
| 139 |
+
|
| 140 |
+
@dataclass
class RestructureDatasetSchema(BreakagePrimitive):
    """Rename a dataset column reference to simulate schema drift."""

    old_column: str = ""
    new_column: str = ""

    def __post_init__(self) -> None:
        self.category = "dataset_drift"
        self.name = "RestructureDatasetSchema"
        self.description = f"Rename column {self.old_column} -> {self.new_column}"

    def apply(self, script: str) -> str:
        """Rewrite quoted references to the column, in both quote styles."""
        if not self.old_column:
            return script
        # Only quoted occurrences are touched, so identifiers that merely
        # contain the column name are left alone.
        for quote in ('"', "'"):
            script = script.replace(
                f"{quote}{self.old_column}{quote}",
                f"{quote}{self.new_column}{quote}",
            )
        return script

    def _get_params(self) -> dict:
        return {"old_column": self.old_column, "new_column": self.new_column}
|
| 164 |
+
|
| 165 |
+
@dataclass
class ChangeTokenizerBehavior(BreakagePrimitive):
    """Change tokenizer call arguments."""

    old_kwarg: str = ""
    old_value: str = ""
    new_kwarg: str = ""
    new_value: str = ""

    def __post_init__(self) -> None:
        self.category = "tokenizer_drift"
        self.name = "ChangeTokenizerBehavior"
        self.description = (
            f"Change tokenizer kwarg {self.old_kwarg}={self.old_value} "
            f"-> {self.new_kwarg}={self.new_value}"
        )

    def apply(self, script: str) -> str:
        """Replace ``old_kwarg=old_value`` with ``new_kwarg=new_value``.

        Uses a callable replacement so the inserted text is literal: a
        template string would interpret ``\\`` escapes in ``new_value``
        and raise ``re.error`` on inputs containing backslashes.
        """
        if not self.old_kwarg:
            return script
        pattern = rf"{re.escape(self.old_kwarg)}\s*=\s*{re.escape(self.old_value)}"
        return re.sub(
            pattern, lambda _: f"{self.new_kwarg}={self.new_value}", script
        )

    def _get_params(self) -> dict:
        return {
            "old_kwarg": self.old_kwarg,
            "old_value": self.old_value,
            "new_kwarg": self.new_kwarg,
            "new_value": self.new_value,
        }
|
| 194 |
+
|
| 195 |
+
@dataclass
class RemoveDeprecatedMethod(BreakagePrimitive):
    """Remove a method that has been deprecated, leaving a sentinel that
    raises AttributeError-style errors when the script runs."""

    class_name: str = ""
    method_name: str = ""
    replacement: str = ""

    def __post_init__(self) -> None:
        self.category = "api_drift"
        self.name = "RemoveDeprecatedMethod"
        self.description = f"Remove {self.class_name}.{self.method_name}"

    def apply(self, script: str) -> str:
        """Rewrite ``.method(`` call sites to a ``_DEPRECATED`` sentinel."""
        if not self.method_name:
            return script
        call_site = f".{self.method_name}("
        sentinel = f".{self.method_name}_DEPRECATED("
        return script.replace(call_site, sentinel)

    def _get_params(self) -> dict:
        return {
            "class_name": self.class_name,
            "method_name": self.method_name,
            "replacement": self.replacement,
        }
|
| 223 |
+
|
| 224 |
+
@dataclass
class ChangeReturnType(BreakagePrimitive):
    """A function now returns a different structure (e.g. tuple -> object)."""

    function_name: str = ""
    old_access: str = ""
    new_access: str = ""

    def __post_init__(self) -> None:
        self.category = "api_drift"
        self.name = "ChangeReturnType"
        self.description = f"Change return type of {self.function_name}"

    def apply(self, script: str) -> str:
        """Swap the old access expression for the new one (both must be set)."""
        if not (self.old_access and self.new_access):
            return script
        return script.replace(self.old_access, self.new_access)

    def _get_params(self) -> dict:
        return {
            "function_name": self.function_name,
            "old_access": self.old_access,
            "new_access": self.new_access,
        }
|
| 249 |
+
|
| 250 |
+
# Name -> class lookup used by `parse_breakage_spec` and the action space.
PRIMITIVE_REGISTRY: dict[str, type[BreakagePrimitive]] = {
    cls.__name__: cls
    for cls in (
        RenameApiCall,
        DeprecateImport,
        ChangeArgumentSignature,
        ModifyConfigField,
        RestructureDatasetSchema,
        ChangeTokenizerBehavior,
        RemoveDeprecatedMethod,
        ChangeReturnType,
    )
}
|
| 261 |
+
|
| 262 |
+
def parse_breakage_spec(spec: dict) -> BreakagePrimitive:
    """Parse a JSON breakage spec into a BreakagePrimitive object.

    Tolerates extra keys; ignores unknown params (LLMs hallucinate these).
    """
    ptype = spec.get("primitive_type", "")
    params = spec.get("params", {}) or {}

    if ptype not in PRIMITIVE_REGISTRY:
        raise ValueError(
            f"Unknown primitive type: {ptype!r}. "
            f"Valid types: {list(PRIMITIVE_REGISTRY)}"
        )

    cls = PRIMITIVE_REGISTRY[ptype]
    # Drop hallucinated kwargs: keep only params that map to real init fields.
    allowed = {
        f.name for f in cls.__dataclass_fields__.values() if f.init  # type: ignore[attr-defined]
    }
    kwargs = {key: params[key] for key in params if key in allowed}
    return cls(**kwargs)
|
forgeenv-space/forgeenv/primitives/drift_taxonomy.yaml
ADDED
|
@@ -0,0 +1,217 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Drift taxonomy: real HuggingFace/PyTorch breakages observed across version bumps.
|
| 2 |
+
# Used to seed the Drift Generator's initial proposal distribution and to anchor
|
| 3 |
+
# warm-start pair generation in things that actually happened in the wild.
|
| 4 |
+
- version_range: "transformers 4.36 -> 4.45"
|
| 5 |
+
affected_api: "Trainer.evaluate"
|
| 6 |
+
description: "Trainer.evaluate() return type changed shape; metrics now nested under .metrics"
|
| 7 |
+
breakage_primitive: "ChangeReturnType"
|
| 8 |
+
params:
|
| 9 |
+
function_name: "evaluate"
|
| 10 |
+
old_access: "trainer.evaluate()"
|
| 11 |
+
new_access: "trainer.evaluate().metrics"
|
| 12 |
+
repair_primitive: "RestoreReturnAccess"
|
| 13 |
+
category: "api_drift"
|
| 14 |
+
|
| 15 |
+
- version_range: "transformers 4.30 -> 4.40"
|
| 16 |
+
affected_api: "TrainingArguments.evaluation_strategy"
|
| 17 |
+
description: "Renamed evaluation_strategy -> eval_strategy"
|
| 18 |
+
breakage_primitive: "RenameApiCall"
|
| 19 |
+
params:
|
| 20 |
+
old_name: "evaluation_strategy"
|
| 21 |
+
new_name: "eval_strategy"
|
| 22 |
+
repair_primitive: "RestoreApiCall"
|
| 23 |
+
category: "api_drift"
|
| 24 |
+
|
| 25 |
+
- version_range: "datasets 2.14 -> 3.0"
|
| 26 |
+
affected_api: "load_dataset"
|
| 27 |
+
description: "Default split column was renamed in some GLUE configs"
|
| 28 |
+
breakage_primitive: "RestructureDatasetSchema"
|
| 29 |
+
params:
|
| 30 |
+
old_column: "label"
|
| 31 |
+
new_column: "labels"
|
| 32 |
+
repair_primitive: "RestoreColumn"
|
| 33 |
+
category: "dataset_drift"
|
| 34 |
+
|
| 35 |
+
- version_range: "transformers 4.40 -> 4.50"
|
| 36 |
+
affected_api: "Trainer.predict"
|
| 37 |
+
description: "Method removed; users should use evaluate() with prediction_loss_only=False"
|
| 38 |
+
breakage_primitive: "RemoveDeprecatedMethod"
|
| 39 |
+
params:
|
| 40 |
+
class_name: "Trainer"
|
| 41 |
+
method_name: "predict"
|
| 42 |
+
replacement: "evaluate"
|
| 43 |
+
repair_primitive: "RestoreMethod"
|
| 44 |
+
category: "api_drift"
|
| 45 |
+
|
| 46 |
+
- version_range: "transformers 4.36 -> 4.40"
|
| 47 |
+
affected_api: "TrainingArguments"
|
| 48 |
+
description: "num_train_epochs default behavior changed; max_steps now preferred"
|
| 49 |
+
breakage_primitive: "ModifyConfigField"
|
| 50 |
+
params:
|
| 51 |
+
config_class: "TrainingArguments"
|
| 52 |
+
field_name: "num_train_epochs"
|
| 53 |
+
new_value: "0"
|
| 54 |
+
repair_primitive: "RestoreConfigField"
|
| 55 |
+
category: "config_drift"
|
| 56 |
+
|
| 57 |
+
- version_range: "transformers 4.34 -> 4.42"
|
| 58 |
+
affected_api: "Tokenizer.__call__"
|
| 59 |
+
description: "padding=True semantics changed; users should pass padding='max_length'"
|
| 60 |
+
breakage_primitive: "ChangeTokenizerBehavior"
|
| 61 |
+
params:
|
| 62 |
+
old_kwarg: "padding"
|
| 63 |
+
old_value: "True"
|
| 64 |
+
new_kwarg: "padding"
|
| 65 |
+
new_value: '"max_length"'
|
| 66 |
+
repair_primitive: "RestoreTokenizerKwarg"
|
| 67 |
+
category: "tokenizer_drift"
|
| 68 |
+
|
| 69 |
+
- version_range: "transformers 4.20 -> 4.30"
|
| 70 |
+
affected_api: "imports"
|
| 71 |
+
description: "transformers.training_args moved to transformers.training_args_pt"
|
| 72 |
+
breakage_primitive: "DeprecateImport"
|
| 73 |
+
params:
|
| 74 |
+
old_module: "from transformers.training_args"
|
| 75 |
+
new_module: "from transformers.training_args_pt"
|
| 76 |
+
repair_primitive: "RestoreImport"
|
| 77 |
+
category: "import_drift"
|
| 78 |
+
|
| 79 |
+
- version_range: "transformers 4.45 -> 4.50"
|
| 80 |
+
affected_api: "save_pretrained"
|
| 81 |
+
description: "save_pretrained() now requires safe_serialization to default True"
|
| 82 |
+
breakage_primitive: "ChangeArgumentSignature"
|
| 83 |
+
params:
|
| 84 |
+
function_name: "save_pretrained"
|
| 85 |
+
removed_arg: "safe_serialization"
|
| 86 |
+
added_arg: "safe_serialization"
|
| 87 |
+
added_value: "True"
|
| 88 |
+
repair_primitive: "RestoreArgument"
|
| 89 |
+
category: "api_drift"
|
| 90 |
+
|
| 91 |
+
- version_range: "datasets 2.18 -> 3.0"
|
| 92 |
+
affected_api: "Dataset.set_format"
|
| 93 |
+
description: "set_format(type='torch') signature stricter, columns required"
|
| 94 |
+
breakage_primitive: "ChangeArgumentSignature"
|
| 95 |
+
params:
|
| 96 |
+
function_name: "set_format"
|
| 97 |
+
removed_arg: "columns"
|
| 98 |
+
added_arg: "columns"
|
| 99 |
+
added_value: '["input_ids", "attention_mask", "labels"]'
|
| 100 |
+
repair_primitive: "RestoreArgument"
|
| 101 |
+
category: "api_drift"
|
| 102 |
+
|
| 103 |
+
- version_range: "transformers 4.36 -> 4.45"
|
| 104 |
+
affected_api: "Tokenizer.__call__"
|
| 105 |
+
description: "max_length default reduced from 512 -> 256 for some tokenizers"
|
| 106 |
+
breakage_primitive: "ModifyConfigField"
|
| 107 |
+
params:
|
| 108 |
+
config_class: "tokenizer"
|
| 109 |
+
field_name: "max_length"
|
| 110 |
+
new_value: "256"
|
| 111 |
+
repair_primitive: "RestoreConfigField"
|
| 112 |
+
category: "tokenizer_drift"
|
| 113 |
+
|
| 114 |
+
- version_range: "transformers 4.40 -> 4.45"
|
| 115 |
+
affected_api: "DataCollatorWithPadding"
|
| 116 |
+
description: "Renamed `tokenizer` -> `processing_class` in DataCollator constructors"
|
| 117 |
+
breakage_primitive: "RenameApiCall"
|
| 118 |
+
params:
|
| 119 |
+
old_name: "tokenizer"
|
| 120 |
+
new_name: "processing_class"
|
| 121 |
+
repair_primitive: "RestoreApiCall"
|
| 122 |
+
category: "api_drift"
|
| 123 |
+
|
| 124 |
+
- version_range: "datasets 2.14 -> 2.18"
|
| 125 |
+
affected_api: "load_dataset"
|
| 126 |
+
  description: "Text column renamed from 'sentence' to 'text' in some configs; train[:500] split semantics also changed"
|
| 127 |
+
breakage_primitive: "RestructureDatasetSchema"
|
| 128 |
+
params:
|
| 129 |
+
old_column: "sentence"
|
| 130 |
+
new_column: "text"
|
| 131 |
+
repair_primitive: "RestoreColumn"
|
| 132 |
+
category: "dataset_drift"
|
| 133 |
+
|
| 134 |
+
- version_range: "transformers 4.45 -> 4.50"
|
| 135 |
+
affected_api: "Trainer"
|
| 136 |
+
description: "evaluation_strategy was deprecated and removed"
|
| 137 |
+
breakage_primitive: "RemoveDeprecatedMethod"
|
| 138 |
+
params:
|
| 139 |
+
class_name: "Trainer"
|
| 140 |
+
method_name: "evaluate"
|
| 141 |
+
replacement: "evaluate_legacy"
|
| 142 |
+
repair_primitive: "RestoreMethod"
|
| 143 |
+
category: "api_drift"
|
| 144 |
+
|
| 145 |
+
- version_range: "transformers 4.30 -> 4.40"
|
| 146 |
+
affected_api: "PreTrainedModel.from_pretrained"
|
| 147 |
+
description: "torch_dtype now required for some quantized model paths"
|
| 148 |
+
breakage_primitive: "ChangeArgumentSignature"
|
| 149 |
+
params:
|
| 150 |
+
function_name: "from_pretrained"
|
| 151 |
+
removed_arg: "torch_dtype"
|
| 152 |
+
added_arg: "torch_dtype"
|
| 153 |
+
added_value: '"auto"'
|
| 154 |
+
repair_primitive: "RestoreArgument"
|
| 155 |
+
category: "api_drift"
|
| 156 |
+
|
| 157 |
+
- version_range: "datasets 3.0 -> 3.2"
|
| 158 |
+
affected_api: "Dataset.rename_column"
|
| 159 |
+
description: "rename_column raises if target name exists"
|
| 160 |
+
breakage_primitive: "RestructureDatasetSchema"
|
| 161 |
+
params:
|
| 162 |
+
old_column: "labels"
|
| 163 |
+
new_column: "label"
|
| 164 |
+
repair_primitive: "RestoreColumn"
|
| 165 |
+
category: "dataset_drift"
|
| 166 |
+
|
| 167 |
+
- version_range: "transformers 4.36 -> 4.42"
|
| 168 |
+
affected_api: "TrainingArguments.report_to"
|
| 169 |
+
description: "Default report_to changed from 'all' to 'none'"
|
| 170 |
+
breakage_primitive: "ModifyConfigField"
|
| 171 |
+
params:
|
| 172 |
+
config_class: "TrainingArguments"
|
| 173 |
+
field_name: "report_to"
|
| 174 |
+
new_value: '"all"'
|
| 175 |
+
repair_primitive: "RestoreConfigField"
|
| 176 |
+
category: "config_drift"
|
| 177 |
+
|
| 178 |
+
- version_range: "transformers 4.40 -> 4.50"
|
| 179 |
+
affected_api: "imports"
|
| 180 |
+
description: "transformers.deepspeed moved to accelerate.utils.deepspeed"
|
| 181 |
+
breakage_primitive: "DeprecateImport"
|
| 182 |
+
params:
|
| 183 |
+
old_module: "from transformers.deepspeed"
|
| 184 |
+
new_module: "from accelerate.utils.deepspeed"
|
| 185 |
+
repair_primitive: "RestoreImport"
|
| 186 |
+
category: "import_drift"
|
| 187 |
+
|
| 188 |
+
- version_range: "transformers 4.45 -> 4.50"
|
| 189 |
+
affected_api: "Tokenizer return"
|
| 190 |
+
description: "Tokenizer call output now returns a BatchEncoding with .encodings attribute"
|
| 191 |
+
breakage_primitive: "ChangeReturnType"
|
| 192 |
+
params:
|
| 193 |
+
function_name: "tokenizer"
|
| 194 |
+
old_access: "tokenizer(text)"
|
| 195 |
+
new_access: "tokenizer(text).encodings"
|
| 196 |
+
repair_primitive: "RestoreReturnAccess"
|
| 197 |
+
category: "api_drift"
|
| 198 |
+
|
| 199 |
+
- version_range: "transformers 4.30 -> 4.40"
|
| 200 |
+
affected_api: "save_pretrained"
|
| 201 |
+
description: "save_pretrained -> save_pretrained_directory rename in some classes"
|
| 202 |
+
breakage_primitive: "RenameApiCall"
|
| 203 |
+
params:
|
| 204 |
+
old_name: "save_pretrained"
|
| 205 |
+
new_name: "save_pretrained_directory"
|
| 206 |
+
repair_primitive: "RestoreApiCall"
|
| 207 |
+
category: "api_drift"
|
| 208 |
+
|
| 209 |
+
- version_range: "transformers 4.45 -> 4.50"
|
| 210 |
+
affected_api: "TrainingArguments.no_cuda"
|
| 211 |
+
description: "no_cuda renamed to use_cpu (logic inverted)"
|
| 212 |
+
breakage_primitive: "RenameApiCall"
|
| 213 |
+
params:
|
| 214 |
+
old_name: "no_cuda"
|
| 215 |
+
new_name: "use_cpu"
|
| 216 |
+
repair_primitive: "RestoreApiCall"
|
| 217 |
+
category: "config_drift"
|
forgeenv-space/forgeenv/primitives/repair_primitives.py
ADDED
|
@@ -0,0 +1,241 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Repair primitives — direct inverses of the 8 breakage primitives.
|
| 2 |
+
|
| 3 |
+
Used during warm-start data generation: for every (script, breakage)
|
| 4 |
+
pair we know the canonical repair, so we can write SFT pairs.
|
| 5 |
+
|
| 6 |
+
These are also useful for unit-testing the breakage primitives:
|
| 7 |
+
apply(breakage) then apply(repair) should be (close to) the identity.
|
| 8 |
+
"""
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
import re
|
| 12 |
+
from abc import ABC, abstractmethod
|
| 13 |
+
from dataclasses import dataclass, field
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
@dataclass
class RepairPrimitive(ABC):
    """Abstract base class for all repair types (inverses of breakages)."""

    # Metadata each concrete subclass fills in via __post_init__;
    # excluded from __init__ so specs only carry real parameters.
    category: str = field(default="generic", init=False)
    name: str = field(default="RepairPrimitive", init=False)
    description: str = field(default="", init=False)

    @abstractmethod
    def apply(self, script: str) -> str:
        """Transform `script` to undo the corresponding breakage."""

    def to_spec(self) -> dict:
        """Serialize to a JSON-compatible spec."""
        spec: dict = {}
        spec["primitive_type"] = type(self).__name__
        spec["category"] = self.category
        spec["params"] = self._get_params()
        return spec

    @abstractmethod
    def _get_params(self) -> dict:
        """Return JSON-serializable constructor parameters."""
|
| 37 |
+
|
| 38 |
+
@dataclass
class RestoreApiCall(RepairPrimitive):
    """Rename a drifted identifier back to its original name."""

    new_name: str = ""
    old_name: str = ""

    def __post_init__(self) -> None:
        self.category = "api_drift"
        self.name = "RestoreApiCall"
        self.description = f"Rename {self.new_name} -> {self.old_name}"

    def apply(self, script: str) -> str:
        """Replace whole-word occurrences of ``new_name`` with ``old_name``."""
        if not self.new_name:
            return script
        # Lookarounds stop us from rewriting substrings of longer identifiers.
        whole_word = rf"(?<!\w){re.escape(self.new_name)}(?!\w)"
        return re.sub(whole_word, self.old_name, script)

    def _get_params(self) -> dict:
        return {"new_name": self.new_name, "old_name": self.old_name}
+
|
| 57 |
+
|
| 58 |
+
@dataclass
class RestoreImport(RepairPrimitive):
    """Rewrite a drifted import path back to the original module."""

    new_module: str = ""
    old_module: str = ""

    def __post_init__(self) -> None:
        self.category = "import_drift"
        self.name = "RestoreImport"
        self.description = f"Restore import {self.new_module} -> {self.old_module}"

    def apply(self, script: str) -> str:
        """Replace ``new_module`` with ``old_module`` everywhere in the script.

        Guards against an empty ``new_module`` (matching the other repair
        primitives): ``str.replace("", x)`` would otherwise insert
        ``old_module`` between every character of the script.
        """
        if not self.new_module:
            return script
        return script.replace(self.new_module, self.old_module)

    def _get_params(self) -> dict:
        return {"new_module": self.new_module, "old_module": self.old_module}
+
|
| 74 |
+
|
| 75 |
+
@dataclass
class RestoreArgument(RepairPrimitive):
    """Re-add a removed argument to a function call."""

    function_name: str = ""
    arg_name: str = ""
    arg_value: str = ""

    def __post_init__(self) -> None:
        self.category = "api_drift"
        self.name = "RestoreArgument"
        self.description = (
            f"Add {self.arg_name}={self.arg_value} to {self.function_name}()"
        )

    def apply(self, script: str) -> str:
        """Insert ``arg_name=arg_value`` as the first argument of the call.

        Fixes two issues with a plain template substitution: the kwarg is
        inserted without a dangling ``, `` when the call has no other
        arguments, and the inserted value is treated literally (a template
        replacement would interpret ``\\``/``\\g`` escapes in ``arg_value``).
        """
        if not self.function_name:
            return script
        # Capture opener, following whitespace, and (optionally) an immediate
        # closing paren so empty calls can be detected.
        pattern = re.compile(
            rf"({re.escape(self.function_name)}\s*\()(\s*)(\)?)"
        )

        def _insert(match: re.Match) -> str:
            opener, ws, closer = match.groups()
            separator = "" if closer else ", "
            return f"{opener}{self.arg_name}={self.arg_value}{separator}{ws}{closer}"

        return pattern.sub(_insert, script, count=1)

    def _get_params(self) -> dict:
        return {
            "function_name": self.function_name,
            "arg_name": self.arg_name,
            "arg_value": self.arg_value,
        }
|
| 105 |
+
|
| 106 |
+
@dataclass
class RestoreConfigField(RepairPrimitive):
    """Set a config field assignment back to its original value."""

    field_name: str = ""
    old_value: str = ""

    def __post_init__(self) -> None:
        self.category = "config_drift"
        self.name = "RestoreConfigField"
        self.description = f"Restore {self.field_name}={self.old_value}"

    def apply(self, script: str) -> str:
        """Rewrite every ``field_name=<value>`` assignment to ``old_value``.

        Uses a callable replacement so ``\\``/``\\g`` sequences in
        ``old_value`` are inserted literally instead of being interpreted
        as regex replacement escapes (a template would raise ``re.error``).
        """
        if not self.field_name:
            return script
        pattern = rf"({re.escape(self.field_name)}\s*=\s*)([^,)\n]+)"
        return re.sub(pattern, lambda m: m.group(1) + self.old_value, script)

    def _get_params(self) -> dict:
        return {"field_name": self.field_name, "old_value": self.old_value}
+
|
| 125 |
+
|
| 126 |
+
@dataclass
class RestoreColumn(RepairPrimitive):
    """Rename a drifted dataset column reference back to the original."""

    new_column: str = ""
    old_column: str = ""

    def __post_init__(self) -> None:
        self.category = "dataset_drift"
        self.name = "RestoreColumn"
        self.description = f"Rename column {self.new_column} -> {self.old_column}"

    def apply(self, script: str) -> str:
        """Rewrite quoted column references, in both quote styles."""
        for quote in ('"', "'"):
            script = script.replace(
                f"{quote}{self.new_column}{quote}",
                f"{quote}{self.old_column}{quote}",
            )
        return script

    def _get_params(self) -> dict:
        return {"new_column": self.new_column, "old_column": self.old_column}
+
|
| 146 |
+
|
| 147 |
+
@dataclass
class RestoreTokenizerKwarg(RepairPrimitive):
    """Rewrite a drifted tokenizer kwarg back to the original kwarg/value."""

    new_kwarg: str = ""
    new_value: str = ""
    old_kwarg: str = ""
    old_value: str = ""

    def __post_init__(self) -> None:
        self.category = "tokenizer_drift"
        self.name = "RestoreTokenizerKwarg"
        self.description = (
            f"Restore tokenizer {self.new_kwarg}={self.new_value} -> "
            f"{self.old_kwarg}={self.old_value}"
        )

    def apply(self, script: str) -> str:
        """Replace ``new_kwarg=new_value`` with ``old_kwarg=old_value``.

        Uses a callable replacement so the restored text is inserted
        literally: a template string would interpret ``\\`` escapes in
        ``old_value`` and raise ``re.error``.
        """
        if not self.new_kwarg:
            return script
        pattern = rf"{re.escape(self.new_kwarg)}\s*=\s*{re.escape(self.new_value)}"
        return re.sub(
            pattern, lambda _: f"{self.old_kwarg}={self.old_value}", script
        )

    def _get_params(self) -> dict:
        return {
            "new_kwarg": self.new_kwarg,
            "new_value": self.new_value,
            "old_kwarg": self.old_kwarg,
            "old_value": self.old_value,
        }
+
|
| 177 |
+
|
| 178 |
+
@dataclass
class RestoreMethod(RepairPrimitive):
    """Rename a ``_DEPRECATED`` sentinel call back to the real method."""

    method_name: str = ""

    def __post_init__(self) -> None:
        self.category = "api_drift"
        self.name = "RestoreMethod"
        self.description = f"Un-deprecate .{self.method_name}()"

    def apply(self, script: str) -> str:
        """Undo the sentinel rename applied by the breakage side."""
        if not self.method_name:
            return script
        sentinel = f".{self.method_name}_DEPRECATED("
        call_site = f".{self.method_name}("
        return script.replace(sentinel, call_site)

    def _get_params(self) -> dict:
        return {"method_name": self.method_name}
+
|
| 197 |
+
|
| 198 |
+
@dataclass
class RestoreReturnAccess(RepairPrimitive):
    """Swap a drifted return-access expression back to the original."""

    new_access: str = ""
    old_access: str = ""

    def __post_init__(self) -> None:
        self.category = "api_drift"
        self.name = "RestoreReturnAccess"
        self.description = f"Restore return-access {self.new_access} -> {self.old_access}"

    def apply(self, script: str) -> str:
        """Replace every occurrence of the new access expression."""
        if self.new_access:
            script = script.replace(self.new_access, self.old_access)
        return script

    def _get_params(self) -> dict:
        return {"new_access": self.new_access, "old_access": self.old_access}
+
|
| 216 |
+
|
| 217 |
+
# Name -> class lookup for deserializing repair specs.
REPAIR_REGISTRY: dict[str, type[RepairPrimitive]] = {
    cls.__name__: cls
    for cls in (
        RestoreApiCall,
        RestoreImport,
        RestoreArgument,
        RestoreConfigField,
        RestoreColumn,
        RestoreTokenizerKwarg,
        RestoreMethod,
        RestoreReturnAccess,
    )
}
+
|
| 228 |
+
|
| 229 |
+
# Map a breakage primitive's class name to the repair-primitive class that
# inverts it. Used by the warm-start pair generator and by the demo / repair
# library curator.
_INVERSE_PAIRS = [
    ("RenameApiCall", "RestoreApiCall"),
    ("DeprecateImport", "RestoreImport"),
    ("ChangeArgumentSignature", "RestoreArgument"),
    ("ModifyConfigField", "RestoreConfigField"),
    ("RestructureDatasetSchema", "RestoreColumn"),
    ("ChangeTokenizerBehavior", "RestoreTokenizerKwarg"),
    ("RemoveDeprecatedMethod", "RestoreMethod"),
    ("ChangeReturnType", "RestoreReturnAccess"),
]
BREAKAGE_TO_REPAIR: dict[str, str] = dict(_INVERSE_PAIRS)
|
forgeenv-space/forgeenv/roles/__init__.py
ADDED
|
File without changes
|
forgeenv-space/forgeenv/roles/drift_generator.py
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Drift Generator parser + a deterministic baseline policy.
|
| 2 |
+
|
| 3 |
+
In training the LLM produces a JSON breakage spec; we parse it. In rollouts
|
| 4 |
+
where we want a baseline (or a fallback when the LLM emits malformed JSON)
|
| 5 |
+
we use `BaselineDriftGenerator`, which samples from the per-category set of
|
| 6 |
+
known good primitive parameterisations.
|
| 7 |
+
"""
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import json
|
| 11 |
+
import random
|
| 12 |
+
import re
|
| 13 |
+
from dataclasses import dataclass
|
| 14 |
+
from typing import Optional
|
| 15 |
+
|
| 16 |
+
from forgeenv.primitives.breakage_primitives import (
|
| 17 |
+
PRIMITIVE_REGISTRY,
|
| 18 |
+
parse_breakage_spec,
|
| 19 |
+
BreakagePrimitive,
|
| 20 |
+
)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
_JSON_RE = re.compile(r"\{[\s\S]*\}")
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def parse_drift_output(text: str) -> Optional[dict]:
    """Extract a JSON object from possibly-noisy LLM output.

    Handles markdown fences, prose preamble, trailing commas (best-effort).
    Returns None on failure.
    """
    if not text:
        return None
    stripped = text.strip()
    if stripped.startswith("```"):
        # Peel off an optional opening ```lang fence and the closing fence.
        stripped = re.sub(r"^```[a-zA-Z]*\n?", "", stripped)
        stripped = re.sub(r"\n?```$", "", stripped)
    found = re.search(r"\{[\s\S]*\}", stripped)
    if found is None:
        return None
    candidate = found.group(0)
    # First attempt the raw blob, then retry with trailing commas removed.
    for attempt in (candidate, re.sub(r",\s*([}\]])", r"\1", candidate)):
        try:
            return json.loads(attempt)
        except json.JSONDecodeError:
            continue
    return None
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def parse_drift_to_primitive(text: str) -> Optional[BreakagePrimitive]:
    """End-to-end: LLM text -> validated BreakagePrimitive (or None)."""
    spec = parse_drift_output(text)
    if isinstance(spec, dict):
        try:
            return parse_breakage_spec(spec)
        except (ValueError, TypeError):
            # Structurally valid JSON but not a valid breakage spec.
            pass
    return None
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
# ---------------------------------------------------------------- baselines
|
| 64 |
+
# Known-good parameterisations per breakage-primitive type. The baseline
# generator samples from these; each entry's keys mirror that primitive's
# constructor params (see the prompt schemas in roles/prompts.py).
_DEFAULT_PARAMS_BY_TYPE: dict[str, list[dict]] = {
    # api_drift: a call site is renamed.
    "RenameApiCall": [
        {"old_name": "trainer.train", "new_name": "trainer.start_training"},
        {"old_name": "save_pretrained", "new_name": "save_to_hub"},
        {"old_name": "from_pretrained", "new_name": "load_from_hub"},
    ],
    # import_drift: a whole import line is rewritten.
    "DeprecateImport": [
        {
            "old_module": "from transformers import Trainer",
            "new_module": "from transformers.legacy import Trainer",
        },
        {
            "old_module": "from transformers import TrainingArguments",
            "new_module": "from transformers.training import TrainingArguments",
        },
    ],
    # api_drift: one kwarg removed, another added with a fixed value.
    "ChangeArgumentSignature": [
        {
            "function_name": "TrainingArguments",
            "removed_arg": "num_train_epochs",
            "added_arg": "max_steps",
            "added_value": "1000",
        },
        {
            "function_name": "TrainingArguments",
            "removed_arg": "evaluation_strategy",
            "added_arg": "eval_strategy",
            "added_value": '"steps"',
        },
    ],
    # config_drift: values are source-code strings, not parsed literals.
    "ModifyConfigField": [
        {"config_class": "TrainingArguments", "field_name": "learning_rate", "new_value": "5e-3"},
        {"config_class": "TrainingArguments", "field_name": "per_device_train_batch_size", "new_value": "1"},
    ],
    # dataset_drift: a referenced column name changes.
    "RestructureDatasetSchema": [
        {"old_column": "text", "new_column": "input_text"},
        {"old_column": "label", "new_column": "labels"},
        {"old_column": "tokens", "new_column": "words"},
    ],
    # tokenizer_drift: a tokenizer call kwarg is renamed.
    "ChangeTokenizerBehavior": [
        {"old_kwarg": "padding", "old_value": "True", "new_kwarg": "pad_to_max_length", "new_value": "True"},
        {"old_kwarg": "truncation", "old_value": "True", "new_kwarg": "truncate", "new_value": "True"},
    ],
    # api_drift: a method disappears in favour of a replacement name.
    "RemoveDeprecatedMethod": [
        {"class_name": "Trainer", "method_name": "evaluate", "replacement": "evaluation_loop"},
        {"class_name": "Trainer", "method_name": "save_model", "replacement": "save_to_hub"},
    ],
    # api_drift: the return structure's access pattern changes.
    "ChangeReturnType": [
        {"function_name": "Trainer.predict", "old_access": ".predictions", "new_access": "[0]"},
        {"function_name": "tokenizer", "old_access": '["input_ids"]', "new_access": ".input_ids"},
    ],
}
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
@dataclass
class BaselineDriftGenerator:
    """Deterministic stand-in for the LLM Drift Generator.

    Used for warm-start data, baseline rollouts, and unit tests.
    """

    seed: Optional[int] = None

    def __post_init__(self) -> None:
        # Dedicated seeded RNG for reproducibility; otherwise share the
        # module-level RNG.
        self._rng = random if self.seed is None else random.Random(self.seed)

    def propose(
        self, target_category: str = "", script: str = ""
    ) -> dict:
        """Produce a JSON-serializable breakage spec for `target_category`.

        Order of preference:
          1. A primitive of `target_category` whose default params apply to `script`.
          2. A primitive of any type whose default params apply to `script`.
          3. A primitive of `target_category` (no-op fallback).
        """
        preferred = (
            [target_category] if target_category in _DEFAULT_PARAMS_BY_TYPE else []
        )
        everything = list(_DEFAULT_PARAMS_BY_TYPE)

        for candidates in (preferred, everything):
            if not candidates:
                continue
            for ptype in self._rng.sample(candidates, len(candidates)):
                options = _DEFAULT_PARAMS_BY_TYPE[ptype]
                for params in self._rng.sample(options, len(options)):
                    if self._params_apply_to_script(ptype, params, script):
                        return {"primitive_type": ptype, "params": dict(params)}

        # Nothing applies: emit the first default for the preferred (or
        # first known) type as a no-op fallback.
        fallback = preferred[0] if preferred else everything[0]
        return {
            "primitive_type": fallback,
            "params": dict(_DEFAULT_PARAMS_BY_TYPE[fallback][0]),
        }

    @staticmethod
    def _params_apply_to_script(ptype: str, params: dict, script: str) -> bool:
        """Heuristic: would this primitive actually mutate `script`?"""
        if not script:
            return True
        probe_keys = (
            "old_name", "old_module", "removed_arg", "field_name",
            "old_column", "old_kwarg", "method_name", "old_access",
        )
        return any(params.get(key) and params[key] in script for key in probe_keys)
|
forgeenv-space/forgeenv/roles/prompts.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""System and user prompts for the two RL roles.
|
| 2 |
+
|
| 3 |
+
Both roles are trained from the same base policy (Qwen-2.5-Coder-7B) with
|
| 4 |
+
LoRA adapters per role, so role prompts are the only thing distinguishing
|
| 5 |
+
them at inference time. Keep them concise — every token is a token of GPU
|
| 6 |
+
budget during GRPO rollouts.
|
| 7 |
+
"""
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
from typing import Iterable
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# One-line human-readable summary per primitive type, with its drift
# category in parentheses. Rendered as bullets by list_primitive_descriptions.
PRIMITIVE_DESCRIPTIONS = {
    "RenameApiCall": "Rename a function/method call (api_drift)",
    "DeprecateImport": "Change an import path (import_drift)",
    "ChangeArgumentSignature": "Remove an expected kwarg from a call (api_drift)",
    "ModifyConfigField": "Change a config-class default (config_drift)",
    "RestructureDatasetSchema": "Rename a dataset column reference (dataset_drift)",
    "ChangeTokenizerBehavior": "Change tokenizer call kwargs (tokenizer_drift)",
    "RemoveDeprecatedMethod": "Remove a method, leaving a sentinel _DEPRECATED suffix (api_drift)",
    "ChangeReturnType": "Function returns a different structure (api_drift)",
}

# System prompt for the Drift Generator role: it must answer with a single
# JSON object matching one of the eight primitive schemas listed inside.
DRIFT_GENERATOR_SYSTEM_PROMPT = """You are the Drift Generator.
You see a working HuggingFace training script and the curriculum target category.
Output exactly one JSON object describing a breakage primitive that simulates
realistic library version drift. The primitive must:
1. Be PLAUSIBLE — match the kind of breakage that happens between real
   transformers/datasets/trl releases.
2. Be SOLVABLE — the Repair Agent should be able to fix it from the error trace alone.
3. Match the requested target_category.

Output schema:
{"primitive_type": "<one of the 8 types>", "params": { ... }}

Available primitive types and parameter schemas:
- RenameApiCall: {"old_name": str, "new_name": str}
- DeprecateImport: {"old_module": str, "new_module": str}
- ChangeArgumentSignature: {"function_name": str, "removed_arg": str, "added_arg": str, "added_value": str}
- ModifyConfigField: {"config_class": str, "field_name": str, "new_value": str}
- RestructureDatasetSchema: {"old_column": str, "new_column": str}
- ChangeTokenizerBehavior: {"old_kwarg": str, "old_value": str, "new_kwarg": str, "new_value": str}
- RemoveDeprecatedMethod: {"class_name": str, "method_name": str, "replacement": str}
- ChangeReturnType: {"function_name": str, "old_access": str, "new_access": str}

Output ONLY the JSON object — no commentary, no markdown fences.
"""


# System prompt for the Repair Agent role: it must answer with a unified
# diff only (parsed by roles/repair_agent.extract_diff).
REPAIR_AGENT_SYSTEM_PROMPT = """You are the Repair Agent.
You see a broken HuggingFace training script, an error trace, and the current
library version snapshot. Output ONLY a unified diff that fixes the script.

Rules:
1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`
   headers and `@@ ... @@` hunk markers.
2. Make the MINIMAL change that resolves the error AND preserves the original
   training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit
   calls.
3. Do NOT add any prose, markdown fences, or thinking output — diff only.
4. If the error is unfixable, output an empty diff.
"""
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def render_drift_generator_prompt(
    script: str, target_category: str, library_versions: dict
) -> str:
    """Fill the Drift Generator user prompt with the episode context."""
    versions = ", ".join(f"{key}={val}" for key, val in library_versions.items())
    parts = [
        f"Target category: {target_category}",
        f"Library versions: {versions}",
        "",
        "Working script:",
        "```python",
        script,
        "```",
        "",
        "Output JSON breakage primitive:",
    ]
    return "\n".join(parts)
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def render_repair_agent_prompt(
    broken_script: str,
    error_trace: str,
    library_versions: dict,
    target_category: str = "",
) -> str:
    """Fill the Repair Agent user prompt with the episode context."""
    versions = ", ".join(f"{key}={val}" for key, val in library_versions.items())
    hint = target_category or "unknown"
    parts = [
        f"Library versions: {versions}",
        f"Target category hint: {hint}",
        "",
        "Broken script:",
        "```python",
        broken_script,
        "```",
        "",
        "Error trace:",
        error_trace,
        "",
        "Output unified diff (no prose, no fences):",
    ]
    return "\n".join(parts)
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def list_primitive_descriptions() -> Iterable[str]:
    """Yield one '- name: description' bullet per known primitive type."""
    for name, description in PRIMITIVE_DESCRIPTIONS.items():
        yield f"- {name}: {description}"
|
forgeenv-space/forgeenv/roles/repair_agent.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Repair Agent helpers: response sanitisation + a deterministic baseline.
|
| 2 |
+
|
| 3 |
+
The Repair Agent's training output is a unified diff. LLMs frequently emit
|
| 4 |
+
prose / fences / chain-of-thought before the diff; this module strips that
|
| 5 |
+
preamble. The baseline policy uses the inverse-primitive map from
|
| 6 |
+
`repair_primitives.py` to produce ground-truth diffs for warm-start.
|
| 7 |
+
"""
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import re
|
| 11 |
+
from dataclasses import dataclass
|
| 12 |
+
from typing import Optional
|
| 13 |
+
|
| 14 |
+
from forgeenv.env.diff_utils import make_unified_diff
|
| 15 |
+
from forgeenv.primitives.breakage_primitives import (
|
| 16 |
+
parse_breakage_spec,
|
| 17 |
+
BreakagePrimitive,
|
| 18 |
+
)
|
| 19 |
+
from forgeenv.primitives.repair_primitives import (
|
| 20 |
+
BREAKAGE_TO_REPAIR,
|
| 21 |
+
REPAIR_REGISTRY,
|
| 22 |
+
RepairPrimitive,
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
_DIFF_HUNK_RE = re.compile(r"^@@.*@@", re.MULTILINE)
|
| 27 |
+
_FENCE_RE = re.compile(r"```[a-zA-Z]*\n([\s\S]*?)\n```")
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def extract_diff(raw_text: str) -> str:
    """Pull the unified diff out of an LLM response.

    Handles: code fences, leading prose / chain-of-thought, trailing notes.
    """
    if not raw_text:
        return ""
    text = raw_text.strip()

    # Prefer the content of the first fenced code block when present.
    fenced = re.search(r"```[a-zA-Z]*\n([\s\S]*?)\n```", text)
    if fenced is not None:
        text = fenced.group(1).strip()

    # Drop any prose before the first line that looks like diff syntax.
    lines = text.splitlines()
    first_diff_line = 0
    for idx, line in enumerate(lines):
        if line.startswith(("---", "+++", "@@")):
            first_diff_line = idx
            break
    return "\n".join(lines[first_diff_line:])
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def looks_like_diff(text: str) -> bool:
    """Cheap structural check that `text` resembles a unified diff."""
    if not text:
        return False
    has_header = "---" in text and "+++" in text
    has_hunk = re.search(r"^@@.*@@", text, re.MULTILINE) is not None
    has_changes = any(ln.startswith(("+", "-")) for ln in text.splitlines())
    # Headers may be stripped by sloppy generators, so a hunk marker plus
    # +/- lines is also accepted.
    return (has_header and has_hunk) or (has_hunk and has_changes)
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
# ---------------------------------------------------------------- baselines
|
| 63 |
+
@dataclass
class BaselineRepairAgent:
    """Deterministic Repair Agent that uses the primitive inverse map.

    Used for warm-start dataset generation and baseline rollout comparisons.
    """

    def repair(
        self,
        broken_script: str,
        breakage_spec: Optional[dict] = None,
        original_script: str = "",
    ) -> str:
        """Return a unified diff (or full replacement script) that fixes the
        broken script.

        Strategy preference:
          1. If `original_script` is provided, return a diff between the
             broken script and the original (oracle). This is the warm-start
             path — we always know the ground truth.
          2. Otherwise try to invert the structured breakage_spec via the
             repair-primitive registry.
          3. Otherwise return an empty diff.
        """
        if original_script and original_script != broken_script:
            return make_unified_diff(broken_script, original_script)

        if not breakage_spec:
            return ""
        try:
            breakage = parse_breakage_spec(breakage_spec)
        except (ValueError, TypeError):
            return ""
        repair = _invert_breakage(breakage)
        if repair is None:
            return ""
        repaired = repair.apply(broken_script)
        if repaired == broken_script:
            # The inverse primitive did not change anything — no fix to emit.
            return ""
        return make_unified_diff(broken_script, repaired)
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
# Per breakage class: breakage-primitive param name (key) -> the repair
# primitive's constructor field name (value). _invert_breakage carries over
# only the listed keys; any param absent from the remap is dropped.
_PARAM_REMAP: dict[str, dict[str, str]] = {
    "RenameApiCall": {"old_name": "old_name", "new_name": "new_name"},
    "DeprecateImport": {"old_module": "old_module", "new_module": "new_module"},
    "ChangeArgumentSignature": {
        "function_name": "function_name",
        # Only the removed arg is needed to restore the original call.
        "removed_arg": "arg_name",
    },
    "ModifyConfigField": {"field_name": "field_name"},
    "RestructureDatasetSchema": {
        "old_column": "old_column",
        "new_column": "new_column",
    },
    "ChangeTokenizerBehavior": {
        "old_kwarg": "old_kwarg",
        "old_value": "old_value",
        "new_kwarg": "new_kwarg",
        "new_value": "new_value",
    },
    "RemoveDeprecatedMethod": {"method_name": "method_name"},
    "ChangeReturnType": {"old_access": "old_access", "new_access": "new_access"},
}
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def _invert_breakage(breakage: BreakagePrimitive) -> Optional[RepairPrimitive]:
    """Build the repair primitive that undoes `breakage`, or None when no
    inverse mapping exists or construction fails."""
    source_name = type(breakage).__name__
    repair_name = BREAKAGE_TO_REPAIR.get(source_name)
    repair_cls = REPAIR_REGISTRY.get(repair_name) if repair_name else None
    if repair_cls is None:
        return None

    source_params = breakage._get_params()  # type: ignore[attr-defined]
    remap = _PARAM_REMAP.get(source_name, {})
    # Translate breakage param names into repair constructor field names.
    mapped: dict[str, str] = {
        dst_key: source_params[src_key]
        for src_key, dst_key in remap.items()
        if src_key in source_params
    }

    # Keep only kwargs the repair dataclass actually accepts.
    accepted = {
        f.name
        for f in repair_cls.__dataclass_fields__.values()  # type: ignore[attr-defined]
        if f.init
    }
    kwargs = {name: value for name, value in mapped.items() if name in accepted}
    try:
        return repair_cls(**kwargs)
    except TypeError:
        return None
|
forgeenv-space/forgeenv/roles/teacher.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Teacher (curriculum controller).
|
| 2 |
+
|
| 3 |
+
Deterministic — NOT an LLM. Maintains an EMA success rate per breakage
|
| 4 |
+
category and routes the next episode toward the category where the
|
| 5 |
+
Repair Agent is closest to a 50% success rate (R-Zero's difficulty band).
|
| 6 |
+
"""
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import random
|
| 10 |
+
from dataclasses import dataclass, field
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@dataclass
class Teacher:
    """Deterministic curriculum controller (not an LLM).

    Tracks an EMA of the Repair Agent's success rate per breakage
    category and steers the next episode toward categories nearest a
    50% success rate.
    """

    categories: list[str]
    alpha: float = 0.9
    success_counts: dict[str, int] = field(default_factory=dict)
    attempt_counts: dict[str, int] = field(default_factory=dict)
    ema_success: dict[str, float] = field(default_factory=dict)

    def __post_init__(self) -> None:
        # Seed every known category; 0.5 EMA = maximally uncertain start.
        for name in self.categories:
            self.success_counts.setdefault(name, 0)
            self.attempt_counts.setdefault(name, 0)
            self.ema_success.setdefault(name, 0.5)

    def update(self, category: str, success: bool) -> None:
        """Record one episode outcome and refresh the category's EMA."""
        if category not in self.ema_success:
            # Lazily register a category seen for the first time.
            self.categories.append(category)
            self.ema_success[category] = 0.5
            self.success_counts[category] = 0
            self.attempt_counts[category] = 0

        self.attempt_counts[category] += 1
        self.success_counts[category] += int(success)
        observed = self.success_counts[category] / max(1, self.attempt_counts[category])
        previous = self.ema_success[category]
        self.ema_success[category] = self.alpha * previous + (1 - self.alpha) * observed

    def select_next_category(self) -> str:
        """Pick a category whose EMA success sits in the 30-70% band,
        weighted toward 50%; otherwise the one closest to 50%."""
        band = {
            name: abs(rate - 0.5)
            for name, rate in self.ema_success.items()
            if 0.3 <= rate <= 0.7
        }
        if band:
            weights = [1.0 / (dist + 0.01) for dist in band.values()]
            return random.choices(list(band), weights=weights, k=1)[0]
        return min(self.ema_success, key=lambda name: abs(self.ema_success[name] - 0.5))

    def get_state(self) -> dict:
        """Snapshot per-category stats for logging / checkpointing."""
        state: dict = {}
        for name in self.categories:
            state[name] = {
                "ema_success": round(self.ema_success[name], 4),
                "attempts": self.attempt_counts[name],
                "successes": self.success_counts[name],
            }
        return state
|
forgeenv-space/forgeenv/sandbox/__init__.py
ADDED
|
File without changes
|
forgeenv-space/forgeenv/sandbox/ast_validator.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""AST-based script validator.
|
| 2 |
+
|
| 3 |
+
Catches forbidden imports and dangerous patterns BEFORE any execution
|
| 4 |
+
happens. This is a critical defense against reward hacking via system
|
| 5 |
+
calls, network access, or process manipulation.
|
| 6 |
+
"""
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import ast
|
| 10 |
+
|
| 11 |
+
from forgeenv.tasks.models import ValidationResult
|
| 12 |
+
|
| 13 |
+
# Module roots whose import is rejected outright. validate_script matches on
# the first dotted component, so e.g. "os.path" is caught via "os".
FORBIDDEN_MODULES = {
    "os",
    "subprocess",
    "socket",
    "urllib",
    "requests",
    "ctypes",
    "shutil",
    "signal",
    "multiprocessing",
    "threading",
}

# Callables rejected whether invoked by bare name or as an attribute access.
FORBIDDEN_FUNCTIONS = {"eval", "exec", "compile", "__import__"}
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def validate_script(script_content: str) -> ValidationResult:
    """Parse a script as AST and reject forbidden patterns.

    Returns a ValidationResult with `is_valid` and a list of `violations`.
    """
    try:
        tree = ast.parse(script_content)
    except SyntaxError as e:
        # Unparseable scripts are rejected with a single violation entry.
        return ValidationResult(is_valid=False, violations=[f"SyntaxError: {e}"])

    violations: list[str] = []
    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            violations.extend(
                f"Forbidden import: {alias.name}"
                for alias in node.names
                if alias.name.split(".")[0] in FORBIDDEN_MODULES
            )
        elif isinstance(node, ast.ImportFrom) and node.module:
            if node.module.split(".")[0] in FORBIDDEN_MODULES:
                violations.append(f"Forbidden import from: {node.module}")
        elif isinstance(node, ast.Call):
            # Catch both bare calls (eval(...)) and attribute calls (x.eval(...)).
            if isinstance(node.func, ast.Name) and node.func.id in FORBIDDEN_FUNCTIONS:
                violations.append(f"Forbidden call: {node.func.id}()")
            elif isinstance(node.func, ast.Attribute) and node.func.attr in FORBIDDEN_FUNCTIONS:
                violations.append(f"Forbidden call: .{node.func.attr}()")
        elif isinstance(node, ast.Assign):
            for target in node.targets:
                if isinstance(target, ast.Name) and target.id == "__builtins__":
                    violations.append("Forbidden: __builtins__ assignment")

    return ValidationResult(
        is_valid=not violations,
        violations=violations,
    )
|
forgeenv-space/forgeenv/sandbox/simulation_mode.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Fast simulation executor for development.
|
| 2 |
+
|
| 3 |
+
Static-analysis-based execution simulator. Sub-100ms per call. No Docker
|
| 4 |
+
required. The success probability of a simulated run depends on whether
|
| 5 |
+
the script contains expected HF training markers (model imports, training
|
| 6 |
+
calls, save calls). When the simulation succeeds, a synthetic decreasing
|
| 7 |
+
loss curve is emitted; when it fails, a representative HF error is raised.
|
| 8 |
+
"""
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
import random
|
| 12 |
+
import time
|
| 13 |
+
from typing import Optional
|
| 14 |
+
|
| 15 |
+
from forgeenv.sandbox.ast_validator import validate_script
|
| 16 |
+
from forgeenv.tasks.models import ExecutionResult, Task
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class SimulationExecutor:
|
| 20 |
+
"""Simulates script execution via static analysis.
|
| 21 |
+
|
| 22 |
+
Use this throughout development phases. Real Docker execution is added
|
| 23 |
+
later for grounded final-stage verification.
|
| 24 |
+
"""
|
| 25 |
+
|
| 26 |
+
def __init__(self, seed: Optional[int] = None) -> None:
|
| 27 |
+
self._rng = random.Random(seed) if seed is not None else random
|
| 28 |
+
|
| 29 |
+
def execute(
|
| 30 |
+
self, script_content: str, task: Optional[Task] = None
|
| 31 |
+
) -> ExecutionResult:
|
| 32 |
+
start = time.time()
|
| 33 |
+
|
| 34 |
+
validation = validate_script(script_content)
|
| 35 |
+
if not validation.is_valid:
|
| 36 |
+
return ExecutionResult(
|
| 37 |
+
exit_code=1,
|
| 38 |
+
stdout="",
|
| 39 |
+
stderr=f"Validation failed: {'; '.join(validation.violations)}",
|
| 40 |
+
wall_time_ms=int((time.time() - start) * 1000),
|
| 41 |
+
script_content=script_content,
|
| 42 |
+
)
|
| 43 |
+
|
| 44 |
+
try:
|
| 45 |
+
compile(script_content, "<forge_script>", "exec")
|
| 46 |
+
except SyntaxError as e:
|
| 47 |
+
return ExecutionResult(
|
| 48 |
+
exit_code=1,
|
| 49 |
+
stdout="",
|
| 50 |
+
stderr=f"SyntaxError: {e}",
|
| 51 |
+
wall_time_ms=int((time.time() - start) * 1000),
|
| 52 |
+
script_content=script_content,
|
| 53 |
+
)
|
| 54 |
+
|
| 55 |
+
has_model_import = any(
|
| 56 |
+
kw in script_content
|
| 57 |
+
for kw in ("from transformers", "import torch", "from datasets")
|
| 58 |
+
)
|
| 59 |
+
has_training_call = any(
|
| 60 |
+
kw in script_content
|
| 61 |
+
for kw in ("trainer.train()", ".fit(", "train_loop", "for epoch")
|
| 62 |
+
)
|
| 63 |
+
has_save = any(
|
| 64 |
+
kw in script_content
|
| 65 |
+
for kw in ("save_pretrained", "save_model", "torch.save")
|
| 66 |
+
)
|
| 67 |
+
|
| 68 |
+
success_prob = 0.3
|
| 69 |
+
if has_model_import:
|
| 70 |
+
success_prob += 0.3
|
| 71 |
+
if has_training_call:
|
| 72 |
+
success_prob += 0.2
|
| 73 |
+
if has_save:
|
| 74 |
+
success_prob += 0.1
|
| 75 |
+
|
| 76 |
+
# Mark obviously broken patterns as definite failures even when
|
| 77 |
+
# they pass syntactic compilation. The simulator pretends to be a
|
| 78 |
+
# static linter that catches AttributeError / ImportError signatures
|
| 79 |
+
# before they would fire at runtime.
|
| 80 |
+
broken_markers = (
|
| 81 |
+
"_DEPRECATED(",
|
| 82 |
+
"transformers.legacy",
|
| 83 |
+
"from transformers.training import",
|
| 84 |
+
".start_training(",
|
| 85 |
+
"load_from_hub(",
|
| 86 |
+
"save_to_hub(",
|
| 87 |
+
"pad_to_max_length=",
|
| 88 |
+
"evaluation_loop(",
|
| 89 |
+
)
|
| 90 |
+
if any(marker in script_content for marker in broken_markers):
|
| 91 |
+
success_prob = 0.0
|
| 92 |
+
# Patterns that look like dataset column drift: a renamed column
|
| 93 |
+
# that doesn't appear in real HF datasets.
|
| 94 |
+
import re as _re
|
| 95 |
+
|
| 96 |
+
if _re.search(r"['\"]input_text['\"]\s*[]:),]", script_content):
|
| 97 |
+
success_prob = min(success_prob, 0.05)
|
| 98 |
+
if _re.search(r"['\"]words['\"]\s*[]:),]", script_content):
|
| 99 |
+
success_prob = min(success_prob, 0.05)
|
| 100 |
+
# Tokenizer kwarg drift (truncate is not valid; truncation is).
|
| 101 |
+
if _re.search(r"\btruncate\s*=", script_content):
|
| 102 |
+
success_prob = min(success_prob, 0.05)
|
| 103 |
+
|
| 104 |
+
succeeded = self._rng.random() < success_prob
|
| 105 |
+
|
| 106 |
+
if succeeded:
|
| 107 |
+
steps = self._rng.randint(20, 50)
|
| 108 |
+
log_lines: list[str] = []
|
| 109 |
+
loss = self._rng.uniform(2.0, 4.0)
|
| 110 |
+
for step in range(1, steps + 1):
|
| 111 |
+
loss *= self._rng.uniform(0.92, 0.99)
|
| 112 |
+
log_lines.append(f"step={step} loss={loss:.4f}")
|
| 113 |
+
log_lines.append("eval_accuracy=0.78")
|
| 114 |
+
log_lines.append("TRAINING_COMPLETE")
|
| 115 |
+
|
| 116 |
+
return ExecutionResult(
|
| 117 |
+
exit_code=0,
|
| 118 |
+
stdout="\n".join(log_lines),
|
| 119 |
+
stderr="",
|
| 120 |
+
wall_time_ms=int((time.time() - start) * 1000)
|
| 121 |
+
+ self._rng.randint(1000, 5000),
|
| 122 |
+
checkpoint_exists=True,
|
| 123 |
+
peak_memory_mb=self._rng.uniform(500, 2000),
|
| 124 |
+
script_content=script_content,
|
| 125 |
+
)
|
| 126 |
+
|
| 127 |
+
error_types = [
|
| 128 |
+
"ImportError: cannot import name 'OldTrainer' from 'transformers'",
|
| 129 |
+
"AttributeError: 'Trainer' object has no attribute 'evaluate_model'",
|
| 130 |
+
"KeyError: 'text' column not found in dataset",
|
| 131 |
+
"TypeError: __init__() got an unexpected keyword argument 'num_epochs'",
|
| 132 |
+
"RuntimeError: Expected input batch_size (16) to match target batch_size (32)",
|
| 133 |
+
"ModuleNotFoundError: No module named 'transformers.legacy'",
|
| 134 |
+
]
|
| 135 |
+
return ExecutionResult(
|
| 136 |
+
exit_code=1,
|
| 137 |
+
stdout="",
|
| 138 |
+
stderr=self._rng.choice(error_types),
|
| 139 |
+
wall_time_ms=int((time.time() - start) * 1000)
|
| 140 |
+
+ self._rng.randint(100, 500),
|
| 141 |
+
script_content=script_content,
|
| 142 |
+
)
|
forgeenv-space/forgeenv/tasks/__init__.py
ADDED
|
File without changes
|
forgeenv-space/forgeenv/tasks/models.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Core data models for ForgeEnv tasks and execution results.
|
| 2 |
+
|
| 3 |
+
These are framework-internal dataclasses (not Pydantic) used throughout the
|
| 4 |
+
simulation, verifier, and primitive layers. The OpenEnv-facing Pydantic
|
| 5 |
+
models live in `forgeenv.env.actions` / `forgeenv.env.observations`.
|
| 6 |
+
"""
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
from dataclasses import dataclass, field
|
| 10 |
+
from typing import Optional
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@dataclass
class Task:
    """A HuggingFace training script with execution metadata.

    Bundles one runnable training script together with the bounds the
    verifier layer uses to judge its outputs.
    """

    task_id: str  # identifier for this task
    description: str  # human-readable summary of the task
    script_content: str  # full Python source of the training script
    difficulty: str  # "easy", "medium", "hard"
    category: str = "general"  # coarse grouping label for reporting
    # (min, max) bounds a plausible final training loss should fall within.
    expected_loss_range: tuple[float, float] = (0.0, 5.0)
    # (min, max) bounds for the reported eval accuracy.
    expected_accuracy_range: tuple[float, float] = (0.0, 1.0)
    # Path where the script is expected to write its checkpoint; matches the
    # output_dir used by the seed-corpus scripts.
    checkpoint_output_path: str = "/tmp/forge_output/checkpoint"
|
| 26 |
+
|
| 27 |
+
@dataclass
class ExecutionResult:
    """Result of executing a Python script in the sandbox.

    Produced by both the real sandbox and the simulation layer; the
    simulator fills exit_code 0 with step/loss log lines in stdout on
    success, and exit_code 1 with an error message in stderr on failure.
    """

    exit_code: int  # 0 on success, non-zero on failure
    stdout: str  # captured standard output (e.g. "step=N loss=..." lines)
    stderr: str  # captured standard error (error/traceback text)
    wall_time_ms: int  # elapsed wall-clock time in milliseconds
    checkpoint_exists: bool = False  # whether a checkpoint was produced
    peak_memory_mb: float = 0.0  # peak memory in MB; 0.0 when not measured
    script_content: str = ""  # source of the script that was executed
| 39 |
+
|
| 40 |
+
@dataclass
class ValidationResult:
    """Result of AST validation on a script."""

    is_valid: bool  # True when the script passed validation
    # Descriptions of each rule violation found; presumably empty when
    # is_valid is True — confirm against the validator.
    violations: list[str] = field(default_factory=list)
|
forgeenv-space/forgeenv/tasks/seed_corpus/__init__.py
ADDED
|
File without changes
|
forgeenv-space/forgeenv/tasks/seed_corpus/albert_qa.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""ALBERT-tiny extractive QA on 100-sample SQuAD subset."""
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    Trainer,
    TrainingArguments,
    DefaultDataCollator,
)
from datasets import load_dataset

# Small SQuAD slice keeps the run cheap enough for the sandbox.
dataset = load_dataset("squad", split="train[:100]")
tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")


def prepare(examples):
    """Tokenize (question, context) pairs and align answer spans to tokens."""
    # Offsets are requested so the character-level answer span can be
    # mapped onto token indices below.
    enc = tokenizer(
        examples["question"],
        examples["context"],
        max_length=128,
        truncation="only_second",
        padding="max_length",
        return_offsets_mapping=True,
    )
    start_positions, end_positions = [], []
    for i, offsets in enumerate(enc["offset_mapping"]):
        answer = examples["answers"][i]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        # First token whose character span contains the answer start;
        # falls back to index 0 when no token covers it.
        token_start = next(
            (idx for idx, (a, b) in enumerate(offsets) if a <= start_char < b), 0
        )
        # Last token covering the answer end; falls back to token_start.
        token_end = next(
            (idx for idx, (a, b) in enumerate(offsets) if a < end_char <= b), token_start
        )
        start_positions.append(token_start)
        end_positions.append(token_end)

    enc["start_positions"] = start_positions
    enc["end_positions"] = end_positions
    # Offsets were only needed for alignment; drop them from model inputs.
    enc.pop("offset_mapping")
    return enc


dataset = dataset.map(prepare, batched=True, remove_columns=dataset.column_names)

model = AutoModelForQuestionAnswering.from_pretrained("albert-base-v2")

training_args = TrainingArguments(
    output_dir="/tmp/forge_output/checkpoint",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    logging_steps=5,
    save_strategy="epoch",
    no_cuda=True,  # CPU-only sandbox
    report_to="none",  # disable experiment-tracker integrations
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=DefaultDataCollator(),
)
trainer.train()
trainer.save_model("/tmp/forge_output/checkpoint")
# Sentinel line marking successful completion.
print("TRAINING_COMPLETE")
|
forgeenv-space/forgeenv/tasks/seed_corpus/bert_ner.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Bert tiny NER fine-tuning on a 200-sample CoNLL-2003 subset."""
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    Trainer,
    TrainingArguments,
    DataCollatorForTokenClassification,
)
from datasets import load_dataset

# Tiny slice + bert-tiny keeps the run cheap enough for the sandbox.
dataset = load_dataset("conll2003", split="train[:200]")
tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")


def tokenize_and_align(example):
    """Tokenize one pre-split example and align NER tags to sub-tokens."""
    # Non-batched map: `example` is a single row, so word_ids() needs no
    # batch index.
    enc = tokenizer(example["tokens"], is_split_into_words=True, truncation=True, max_length=64)
    word_ids = enc.word_ids()
    labels = []
    prev_id = None
    for wid in word_ids:
        if wid is None:
            # Special tokens: -100 is the conventional loss-ignore index.
            labels.append(-100)
        elif wid != prev_id:
            # First sub-token of a word carries that word's NER tag.
            labels.append(example["ner_tags"][wid])
        else:
            # Continuation sub-tokens of the same word are ignored.
            labels.append(-100)
        prev_id = wid
    enc["labels"] = labels
    return enc


dataset = dataset.map(tokenize_and_align, remove_columns=dataset.column_names)

# num_labels=9 matches the CoNLL-2003 NER tag set.
model = AutoModelForTokenClassification.from_pretrained("prajjwal1/bert-tiny", num_labels=9)

training_args = TrainingArguments(
    output_dir="/tmp/forge_output/checkpoint",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    logging_steps=5,
    save_strategy="epoch",
    no_cuda=True,  # CPU-only sandbox
    report_to="none",  # disable experiment-tracker integrations
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=DataCollatorForTokenClassification(tokenizer),
)

trainer.train()
trainer.save_model("/tmp/forge_output/checkpoint")
# Sentinel line marking successful completion.
print("TRAINING_COMPLETE")
|
forgeenv-space/forgeenv/tasks/seed_corpus/distilbert_sst2.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""DistilBERT fine-tuning on a tiny SST-2 subset.

Minimal HuggingFace text-classification training script. Should complete
in ~60s on CPU.
"""
from transformers import (
    DistilBertTokenizer,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset

# 500-sample slice keeps the run short.
dataset = load_dataset("glue", "sst2", split="train[:500]")
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_function(examples):
    """Tokenize a batch of sentences to fixed-length (64) inputs."""
    return tokenizer(
        examples["sentence"],
        padding="max_length",
        truncation=True,
        max_length=64,
    )


dataset = dataset.map(tokenize_function, batched=True)
# Trainer expects the target column to be named "labels".
dataset = dataset.rename_column("label", "labels")
dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2
)

training_args = TrainingArguments(
    output_dir="/tmp/forge_output/checkpoint",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    logging_steps=5,
    save_strategy="epoch",
    no_cuda=True,  # CPU-only sandbox
    report_to="none",  # disable experiment-tracker integrations
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

trainer.train()
trainer.save_model("/tmp/forge_output/checkpoint")
# Sentinel line marking successful completion.
print("TRAINING_COMPLETE")
|
forgeenv-space/forgeenv/tasks/seed_corpus/electra_classification.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""ELECTRA-small classification on 400-sample AG News (4-way text classification)."""
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset

# 400-sample slice keeps the run short.
dataset = load_dataset("ag_news", split="train[:400]")
tokenizer = AutoTokenizer.from_pretrained("google/electra-small-discriminator")


def tokenize(examples):
    """Tokenize a batch of articles to fixed-length (64) inputs."""
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=64,
    )


dataset = dataset.map(tokenize, batched=True)
# Trainer expects the target column to be named "labels".
dataset = dataset.rename_column("label", "labels")
dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# num_labels=4 matches AG News' four topic classes.
model = AutoModelForSequenceClassification.from_pretrained(
    "google/electra-small-discriminator", num_labels=4
)

training_args = TrainingArguments(
    output_dir="/tmp/forge_output/checkpoint",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    logging_steps=5,
    save_strategy="epoch",
    no_cuda=True,  # CPU-only sandbox
    report_to="none",  # disable experiment-tracker integrations
)

trainer = Trainer(model=model, args=training_args, train_dataset=dataset)
trainer.train()
trainer.save_model("/tmp/forge_output/checkpoint")
# Sentinel line marking successful completion.
print("TRAINING_COMPLETE")
|
forgeenv-space/forgeenv/tasks/seed_corpus/gpt2_textgen.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""DistilGPT2 causal-LM fine-tuning on 300 lines of WikiText (text generation)."""
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset

# 300-line slice keeps the run short.
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:300]")
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
# GPT-2 tokenizers ship without a pad token; reuse EOS so the collator can pad.
tokenizer.pad_token = tokenizer.eos_token


def tokenize(examples):
    """Tokenize a batch of raw text lines (no padding; collator handles it)."""
    return tokenizer(examples["text"], truncation=True, max_length=64)


# Drop the raw text column so only model inputs remain.
dataset = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names)

model = AutoModelForCausalLM.from_pretrained("distilgpt2")

training_args = TrainingArguments(
    output_dir="/tmp/forge_output/checkpoint",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    logging_steps=5,
    save_strategy="epoch",
    no_cuda=True,  # CPU-only sandbox
    report_to="none",  # disable experiment-tracker integrations
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    # mlm=False -> plain causal-LM objective (labels derived from inputs).
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

trainer.train()
trainer.save_model("/tmp/forge_output/checkpoint")
# Sentinel line marking successful completion.
print("TRAINING_COMPLETE")
|
forgeenv-space/forgeenv/tasks/seed_corpus/logistic_classifier.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Sklearn logistic-regression baseline on a 500-sample tabular task.

Sanity baseline that doesn't require torch / transformers / datasets.
"""
import json
import pickle
from pathlib import Path

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Fixed random_state makes the synthetic task and split reproducible.
X, y = make_classification(
    n_samples=500, n_features=20, n_informative=10, random_state=0
)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

model = LogisticRegression(max_iter=200)
# Re-fit with a growing iteration budget purely to emit step/loss log
# lines in the same "step=N loss=..." format as the HF training scripts.
for step in range(1, 11):
    model.set_params(max_iter=step * 20)
    model.fit(X_train, y_train)
    # Mean negative log-likelihood of the true class, clamped away from log(0).
    train_loss = -np.mean(np.log(np.maximum(model.predict_proba(X_train)[np.arange(len(y_train)), y_train], 1e-9)))
    print(f"step={step} loss={train_loss:.4f}")

acc = model.score(X_test, y_test)
print(f"eval_accuracy={acc:.4f}")

# Persist a "checkpoint" at the path the environment expects.
ckpt_dir = Path("/tmp/forge_output/checkpoint")
ckpt_dir.mkdir(parents=True, exist_ok=True)
with open(ckpt_dir / "logreg.pkl", "wb") as f:
    pickle.dump(model, f)
with open(ckpt_dir / "metrics.json", "w") as f:
    json.dump({"accuracy": acc}, f)

# Sentinel line marking successful completion.
print("TRAINING_COMPLETE")
|
forgeenv-space/forgeenv/tasks/seed_corpus/roberta_sentiment.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""DistilRoberta sentiment classification on 400-sample IMDB subset."""
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset

# 400-sample slice keeps the run short.
dataset = load_dataset("imdb", split="train[:400]")
tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")


def tokenize(examples):
    """Tokenize a batch of reviews to fixed-length (64) inputs."""
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=64,
    )


dataset = dataset.map(tokenize, batched=True)
# Trainer expects the target column to be named "labels".
dataset = dataset.rename_column("label", "labels")
dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# Binary sentiment: num_labels=2.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilroberta-base", num_labels=2
)

training_args = TrainingArguments(
    output_dir="/tmp/forge_output/checkpoint",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    logging_steps=5,
    save_strategy="epoch",
    no_cuda=True,  # CPU-only sandbox
    report_to="none",  # disable experiment-tracker integrations
)

trainer = Trainer(model=model, args=training_args, train_dataset=dataset)
trainer.train()
trainer.save_model("/tmp/forge_output/checkpoint")
# Sentinel line marking successful completion.
print("TRAINING_COMPLETE")
|