anugrahhu committed (verified)
Commit f82f913 · 1 Parent(s): 0a6c641

fix: disable fast_inference (vLLM not installed) in training/training_unsloth.py

Files changed (1)
training/training_unsloth.py +342 -341
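
Context for the one-line change in the diff below: Unsloth's ``fast_inference=True`` routes generation through vLLM, which is not installed per the commit message, so the flag is now hard-coded off. A more defensive variant would probe for vLLM at runtime instead; the following is only an illustrative sketch (the literal values mirror the script's defaults), not part of this commit:

    # Hypothetical guard, not in this commit: enable Unsloth's vLLM-backed
    # generation only when vLLM is actually importable.
    import importlib.util

    from unsloth import FastLanguageModel  # must precede transformers / trl

    HAS_VLLM = importlib.util.find_spec("vllm") is not None

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="unsloth/Qwen2.5-3B-Instruct",
        max_seq_length=2048 + 384,  # max_prompt_length + max_completion_length
        load_in_4bit=True,
        fast_inference=HAS_VLLM,  # False falls back to plain transformers generation
    )
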
training/training_unsloth.py CHANGED
@@ -1,341 +1,342 @@
"""Unsloth + LoRA (Low-Rank Adaptation) GRPO training for CERNenv.

This is the recommended path for Colab / single- or multi-GPU runs because
Unsloth's fused kernels and 4-bit loading let us train 2B–8B models with
limited VRAM, while TRL's GRPO (Group-Relative Policy Optimization) loop
handles the policy-gradient math.

The trainer is wired up to produce **all** "training-progress evidence"
artifacts demanded by the OpenEnv hackathon's scoring rubric:

* per-step training log + reward/loss curve PNG (Portable Network Graphics)
* mid-training checkpoint evaluations + progression curve PNG
* (post-run) before/after summary + reward-distribution PNG

All artifacts land in ``--evidence_dir`` (default: ``evidence/``).

Run on Colab / single GPU:
    !python -m training.training_unsloth \
        --model_name unsloth/Qwen2.5-3B-Instruct \
        --total_episodes 400 --num_generations 4 --output_dir runs/unsloth-grpo

Run on a 4×A100 Hugging Face Space (multi-GPU via accelerate):
    accelerate launch --num_processes 4 -m training.training_unsloth \
        --total_episodes 1500 --num_generations 8 --output_dir runs/unsloth-grpo
"""

from __future__ import annotations

import argparse
import logging
import time
from pathlib import Path
from typing import Any, Dict, List, Optional


logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger(__name__)


def _build_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", default="unsloth/Qwen2.5-3B-Instruct")
    parser.add_argument("--scenario", default=None)
    parser.add_argument("--difficulty", choices=["easy", "medium", "hard"], default="easy")
    parser.add_argument(
        "--curriculum",
        action="store_true",
        help=(
            "Enable adaptive curriculum: start at --difficulty and promote "
            "to medium/hard once held-out success rate clears the threshold "
            "(see training/curriculum.py)."
        ),
    )
    parser.add_argument("--curriculum_promote", type=float, default=0.55)
    parser.add_argument("--curriculum_demote", type=float, default=0.10)
    parser.add_argument("--total_episodes", type=int, default=400)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--max_steps", type=int, default=18)
    parser.add_argument("--num_generations", type=int, default=4)
    parser.add_argument("--max_prompt_length", type=int, default=2048)
    parser.add_argument("--max_completion_length", type=int, default=384)
    parser.add_argument("--learning_rate", type=float, default=5e-6)
    parser.add_argument("--load_in_4bit", action="store_true", default=True)
    parser.add_argument("--lora_rank", type=int, default=16)
    parser.add_argument("--lora_alpha", type=int, default=16)
    parser.add_argument("--per_device_batch_size", type=int, default=1)
    parser.add_argument("--gradient_accumulation_steps", type=int, default=4)
    parser.add_argument("--logging_steps", type=int, default=2)
    parser.add_argument("--save_steps", type=int, default=50)
    parser.add_argument("--checkpoint_eval_steps", type=int, default=25,
                        help="Run a held-out eval every N updates for the progression curve.")
    parser.add_argument("--checkpoint_eval_episodes", type=int, default=8,
                        help="Number of held-out episodes per mid-training eval.")
    parser.add_argument("--output_dir", default="runs/unsloth-grpo")
    parser.add_argument("--evidence_dir", default="evidence")
    return parser.parse_args()


def main() -> None:  # pragma: no cover - heavy GPU path
    args = _build_args()

    # IMPORTANT: Unsloth MUST be imported before transformers / trl. It
    # patches transformers' lazy ``_import_structure`` to register a few
    # symbols (notably ``PreTrainedModel`` under torch-aware paths). If trl
    # loads transformers first, the lazy loader will fail with a confusing
    # ``ImportError: cannot import name 'PreTrainedModel' from 'transformers'``
    # at GRPOTrainer import time — which is exactly what we hit on the
    # trainer Space before this reorder.
    # See: https://github.com/unslothai/unsloth and the matching
    # transformers issue #42548 for the lazy-import root cause.
    from unsloth import FastLanguageModel
    from transformers import TrainerCallback
    from trl import GRPOConfig, GRPOTrainer

    from server.environment import CERNCollisionEnvironment
    from training.curriculum import CurriculumConfig, CurriculumManager
    from training.evidence import (
        CheckpointEvalWriter,
        EvidencePaths,
        RewardComponentLogWriter,
        TrainingLogWriter,
        render_checkpoint_progression,
        render_reward_components,
        render_training_curve,
    )
    from training.llm_agent import LLMAgentConfig
    from training.rollouts import collect_episode
    from training.training_script import (
        EpisodeContext,
        RewardComponentAccumulator,
    )

    paths = EvidencePaths(root=Path(args.evidence_dir))
    paths.ensure()
    log_writer = TrainingLogWriter(paths.training_log_csv)
    ckpt_writer = CheckpointEvalWriter(paths.checkpoint_evals_csv)
    component_writer = RewardComponentLogWriter(paths.reward_components_csv)
    component_accumulator = RewardComponentAccumulator()

    curriculum: Optional[CurriculumManager] = None
    if args.curriculum:
        curriculum = CurriculumManager(
            CurriculumConfig(
                start_difficulty=args.difficulty,
                promote_threshold=args.curriculum_promote,
                demote_threshold=args.curriculum_demote,
            )
        )
        logger.info("Curriculum enabled: start=%s promote≥%.2f demote≤%.2f",
                    args.difficulty, args.curriculum_promote, args.curriculum_demote)

    logger.info("Loading Unsloth model: %s", args.model_name)
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=args.model_name,
        max_seq_length=args.max_prompt_length + args.max_completion_length,
        load_in_4bit=args.load_in_4bit,
-        fast_inference=True,
+        # fast_inference requires vLLM, which is not in requirements; plain transformers generation is used instead. Re-enable after pinning vllm in space/training/requirements.txt.
+        fast_inference=False,
    )
    model = FastLanguageModel.get_peft_model(
        model,
        r=args.lora_rank,
        lora_alpha=args.lora_alpha,
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        use_gradient_checkpointing="unsloth",
    )
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    from training.training_script import build_dataset, make_reward_fn

    env = CERNCollisionEnvironment(max_steps=args.max_steps)
    dataset = build_dataset(
        tokenizer=tokenizer,
        n_prompts=args.total_episodes,
        seed=args.seed,
        scenario=args.scenario,
        difficulty=args.difficulty,
        curriculum=args.curriculum,
    )

    ctx = EpisodeContext(
        env=env, seed=args.seed,
        scenario=args.scenario, difficulty=args.difficulty,
    )
    reward_fn = make_reward_fn(ctx, accumulator=component_accumulator)

    cfg = GRPOConfig(
        output_dir=args.output_dir,
        per_device_train_batch_size=args.per_device_batch_size,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        num_generations=args.num_generations,
        learning_rate=args.learning_rate,
        max_prompt_length=args.max_prompt_length,
        max_completion_length=args.max_completion_length,
        logging_steps=args.logging_steps,
        save_steps=args.save_steps,
        seed=args.seed,
        bf16=True,
        report_to=[],
    )

    held_out_seeds = list(range(900_000, 900_000 + args.checkpoint_eval_episodes))

    class EvidenceCallback(TrainerCallback):
        """Stream training metrics + run periodic mid-training evals."""

        def __init__(self) -> None:
            self._t0 = time.time()
            self._last_eval_step = -1

        def on_log(self, _args, state, control, logs=None, **kw):
            logs = logs or {}
            row = {
                "step": state.global_step,
                "epoch": logs.get("epoch"),
                "loss": logs.get("loss"),
                "reward": logs.get("reward") or logs.get("rewards/mean"),
                "reward_std": logs.get("reward_std") or logs.get("rewards/std"),
                "kl": logs.get("kl"),
                "grad_norm": logs.get("grad_norm"),
                "learning_rate": logs.get("learning_rate"),
                "wall_time_s": round(time.time() - self._t0, 2),
            }
            if any(v is not None for k, v in row.items() if k != "step"):
                log_writer.append(row)
                render_training_curve(paths.training_log_csv, paths.training_curve_png)

            # Per-component reward summary (FAQ Q17, Q43, Q52: don't watch
            # only the mean reward; track terminal vs shaping, success
            # rates, and parse rate so verifier hacks become visible).
            drained = component_accumulator.drain()
            if drained:
                summary = RewardComponentAccumulator.summarise(drained)
                summary["step"] = state.global_step
                component_writer.append(summary)
                render_reward_components(
                    paths.reward_components_csv, paths.reward_components_png,
                )

        def on_step_end(self, _args, state, control, **kw):
            step = state.global_step
            if step <= 0 or step == self._last_eval_step:
                return control
            if step % args.checkpoint_eval_steps != 0:
                return control
            self._last_eval_step = step
            try:
                self._run_checkpoint_eval(step, state)
            except Exception as exc:
                logger.warning("checkpoint eval failed at step %d: %s", step, exc)
            return control

        def _run_checkpoint_eval(self, step: int, state) -> None:
            FastLanguageModel.for_inference(model)
            try:
                # When curriculum is enabled, evaluate at whatever tier the
                # adaptive manager currently considers appropriate. Otherwise
                # use the static --difficulty.
                eval_difficulty = (
                    curriculum.next_difficulty()
                    if curriculum is not None
                    else args.difficulty
                )
                episodes = []
                for s in held_out_seeds:
                    ep = self._rollout_one(seed=s, difficulty=eval_difficulty)
                    if ep is not None:
                        episodes.append(ep)
                if not episodes:
                    return
                rewards = [e.cumulative_reward for e in episodes]
                success_rate = sum(1 for e in episodes if e.discovered) / len(episodes)
                ckpt_writer.append(
                    step=step,
                    fraction_done=round(step / max(state.max_steps or step, 1), 4),
                    episodes=len(episodes),
                    mean_reward=round(sum(rewards) / len(rewards), 4),
                    success_rate=round(success_rate, 4),
                    mass_acc=round(sum(1 for e in episodes if e.correct_mass) / len(episodes), 4),
                    channel_acc=round(sum(1 for e in episodes if e.correct_channel) / len(episodes), 4),
                )
                render_checkpoint_progression(
                    paths.checkpoint_evals_csv,
                    paths.checkpoint_progression_png,
                )
                if curriculum is not None:
                    snap = curriculum.record(
                        success=success_rate >= 0.5,
                        reward=sum(rewards) / len(rewards),
                    )
                    curriculum.save(paths.root / "curriculum_state.json")
                    if snap.get("event"):
                        logger.info(
-                            "[curriculum] %s @ step=%d → tier=%s (rolling=%.2f)",
+                            "[curriculum] %s @ step=%d tier=%s (rolling=%.2f)",
                            snap["event"], step, snap["current"], snap["rolling_success"],
                        )
                logger.info(
                    "[checkpoint-eval step=%d difficulty=%s] reward=%.3f success=%.2f",
                    step, eval_difficulty,
                    rewards and (sum(rewards) / len(rewards)) or 0.0,
                    success_rate,
                )
            finally:
                FastLanguageModel.for_training(model)

        def _rollout_one(self, seed: int, difficulty: Optional[str] = None):
            def prompt_fn(chat):
                return tokenizer.apply_chat_template(chat, add_generation_prompt=True, tokenize=False)

            def generate_fn(prompt: str, _config) -> str:
                inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=args.max_completion_length,
                    do_sample=True, temperature=0.7, top_p=0.95,
                    pad_token_id=tokenizer.pad_token_id,
                )
                gen = outputs[0][inputs["input_ids"].shape[1]:]
                return tokenizer.decode(gen, skip_special_tokens=True)

            return collect_episode(
                env=env, seed=seed,
                scenario=args.scenario,
                difficulty=difficulty or args.difficulty,
                prompt_fn=prompt_fn, generate_fn=generate_fn,
                config=LLMAgentConfig(),
            )

    trainer = GRPOTrainer(
        model=model,
        processing_class=tokenizer,
        train_dataset=dataset,
        reward_funcs=[reward_fn],
        args=cfg,
        callbacks=[EvidenceCallback()],
    )
    logger.info("Starting Unsloth + LoRA GRPO training")
    trainer.train()

    # Drain whatever rollouts the final on_log didn't catch so the last
    # row of reward_components.csv is correct.
    final_drain = component_accumulator.drain()
    if final_drain:
        summary = RewardComponentAccumulator.summarise(final_drain)
        summary["step"] = trainer.state.global_step
        component_writer.append(summary)
        render_reward_components(
            paths.reward_components_csv, paths.reward_components_png,
        )

    trainer.save_model(args.output_dir)
    tokenizer.save_pretrained(args.output_dir)
    logger.info("Saved adapters to %s", args.output_dir)
    logger.info("Evidence artifacts in %s", paths.root)


if __name__ == "__main__":  # pragma: no cover
    main()
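
A closing note on the import-order comment in ``main()``: the constraint applies to any entry point that touches this stack, not just this script. A minimal standalone sketch of the safe ordering (assuming the pinned unsloth / transformers / trl versions behave as that comment describes):

    # unsloth patches transformers' lazy ``_import_structure`` on import,
    # so it must come first; otherwise ``from trl import GRPOTrainer`` can
    # fail with: ImportError: cannot import name 'PreTrainedModel'.
    import unsloth  # noqa: F401  (keep this import first)
    import transformers
    from trl import GRPOTrainer  # resolves cleanly after unsloth's patch

    print("transformers", transformers.__version__)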