anugrah55 committed on
Commit
125c829
·
verified ·
1 Parent(s): 5598d1c

fix: import unsloth before transformers/trl to avoid lazy-import bug

Browse files
Files changed (1) hide show
  1. training/training_unsloth.py +107 -27
training/training_unsloth.py CHANGED
@@ -42,6 +42,17 @@ def _build_args() -> argparse.Namespace:
42
  parser.add_argument("--model_name", default="unsloth/Qwen2.5-3B-Instruct")
43
  parser.add_argument("--scenario", default=None)
44
  parser.add_argument("--difficulty", choices=["easy", "medium", "hard"], default="easy")
 
 
 
 
 
 
 
 
 
 
 
45
  parser.add_argument("--total_episodes", type=int, default=400)
46
  parser.add_argument("--seed", type=int, default=42)
47
  parser.add_argument("--max_steps", type=int, default=18)
@@ -68,27 +79,55 @@ def _build_args() -> argparse.Namespace:
68
  def main() -> None: # pragma: no cover - heavy GPU path
69
  args = _build_args()
70
 
71
- from datasets import Dataset
 
 
 
 
 
 
 
 
 
72
  from transformers import TrainerCallback
73
  from trl import GRPOConfig, GRPOTrainer
74
- from unsloth import FastLanguageModel
75
 
76
  from server.environment import CERNCollisionEnvironment
 
77
  from training.evidence import (
78
  CheckpointEvalWriter,
79
  EvidencePaths,
 
80
  TrainingLogWriter,
81
  render_checkpoint_progression,
 
82
  render_training_curve,
83
  )
84
- from training.llm_agent import LLMAgentConfig, build_chat
85
  from training.rollouts import collect_episode
86
- from training.training_script import EpisodeContext, _format_validity_bonus, _stepwise_reward
 
 
 
87
 
88
  paths = EvidencePaths(root=Path(args.evidence_dir))
89
  paths.ensure()
90
  log_writer = TrainingLogWriter(paths.training_log_csv)
91
  ckpt_writer = CheckpointEvalWriter(paths.checkpoint_evals_csv)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
  logger.info("Loading Unsloth model: %s", args.model_name)
94
  model, tokenizer = FastLanguageModel.from_pretrained(
@@ -110,28 +149,23 @@ def main() -> None: # pragma: no cover - heavy GPU path
110
  if tokenizer.pad_token is None:
111
  tokenizer.pad_token = tokenizer.eos_token
112
 
 
 
113
  env = CERNCollisionEnvironment(max_steps=args.max_steps)
114
- prompts: List[str] = []
115
- for i in range(args.total_episodes):
116
- obs = env.reset(seed=args.seed + i, scenario=args.scenario, difficulty=args.difficulty)
117
- chat = build_chat(obs)
118
- prompts.append(
119
- tokenizer.apply_chat_template(chat, add_generation_prompt=True, tokenize=False)
120
- )
121
- dataset = Dataset.from_dict({"prompt": prompts})
122
 
123
  ctx = EpisodeContext(
124
  env=env, seed=args.seed,
125
  scenario=args.scenario, difficulty=args.difficulty,
126
  )
127
-
128
- def reward_fn(prompts: List[str], completions: List[str], **kwargs: Any) -> List[float]:
129
- rewards: List[float] = []
130
- for completion in completions:
131
- r = _stepwise_reward(completion_text=completion, ctx=ctx)
132
- r += _format_validity_bonus(completion)
133
- rewards.append(float(r))
134
- return rewards
135
 
136
  cfg = GRPOConfig(
137
  output_dir=args.output_dir,
@@ -174,6 +208,18 @@ def main() -> None: # pragma: no cover - heavy GPU path
174
  log_writer.append(row)
175
  render_training_curve(paths.training_log_csv, paths.training_curve_png)
176
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  def on_step_end(self, _args, state, control, **kw):
178
  step = state.global_step
179
  if step <= 0 or step == self._last_eval_step:
@@ -190,20 +236,29 @@ def main() -> None: # pragma: no cover - heavy GPU path
190
  def _run_checkpoint_eval(self, step: int, state) -> None:
191
  FastLanguageModel.for_inference(model)
192
  try:
 
 
 
 
 
 
 
 
193
  episodes = []
194
  for s in held_out_seeds:
195
- ep = self._rollout_one(seed=s)
196
  if ep is not None:
197
  episodes.append(ep)
198
  if not episodes:
199
  return
200
  rewards = [e.cumulative_reward for e in episodes]
 
201
  ckpt_writer.append(
202
  step=step,
203
  fraction_done=round(step / max(state.max_steps or step, 1), 4),
204
  episodes=len(episodes),
205
  mean_reward=round(sum(rewards) / len(rewards), 4),
206
- success_rate=round(sum(1 for e in episodes if e.discovered) / len(episodes), 4),
207
  mass_acc=round(sum(1 for e in episodes if e.correct_mass) / len(episodes), 4),
208
  channel_acc=round(sum(1 for e in episodes if e.correct_channel) / len(episodes), 4),
209
  )
@@ -211,15 +266,27 @@ def main() -> None: # pragma: no cover - heavy GPU path
211
  paths.checkpoint_evals_csv,
212
  paths.checkpoint_progression_png,
213
  )
 
 
 
 
 
 
 
 
 
 
 
214
  logger.info(
215
- "[checkpoint-eval step=%d] reward=%.3f success=%.2f",
216
- step, rewards and (sum(rewards) / len(rewards)) or 0.0,
217
- sum(1 for e in episodes if e.discovered) / len(episodes),
 
218
  )
219
  finally:
220
  FastLanguageModel.for_training(model)
221
 
222
- def _rollout_one(self, seed: int):
223
  def prompt_fn(chat):
224
  return tokenizer.apply_chat_template(chat, add_generation_prompt=True, tokenize=False)
225
 
@@ -236,7 +303,8 @@ def main() -> None: # pragma: no cover - heavy GPU path
236
 
237
  return collect_episode(
238
  env=env, seed=seed,
239
- scenario=args.scenario, difficulty=args.difficulty,
 
240
  prompt_fn=prompt_fn, generate_fn=generate_fn,
241
  config=LLMAgentConfig(),
242
  )
@@ -251,6 +319,18 @@ def main() -> None: # pragma: no cover - heavy GPU path
251
  )
252
  logger.info("Starting Unsloth + LoRA GRPO training")
253
  trainer.train()
 
 
 
 
 
 
 
 
 
 
 
 
254
  trainer.save_model(args.output_dir)
255
  tokenizer.save_pretrained(args.output_dir)
256
  logger.info("Saved adapters to %s", args.output_dir)
 
42
  parser.add_argument("--model_name", default="unsloth/Qwen2.5-3B-Instruct")
43
  parser.add_argument("--scenario", default=None)
44
  parser.add_argument("--difficulty", choices=["easy", "medium", "hard"], default="easy")
45
+ parser.add_argument(
46
+ "--curriculum",
47
+ action="store_true",
48
+ help=(
49
+ "Enable adaptive curriculum: start at --difficulty and promote "
50
+ "to medium/hard once held-out success rate clears the threshold "
51
+ "(see training/curriculum.py)."
52
+ ),
53
+ )
54
+ parser.add_argument("--curriculum_promote", type=float, default=0.55)
55
+ parser.add_argument("--curriculum_demote", type=float, default=0.10)
56
  parser.add_argument("--total_episodes", type=int, default=400)
57
  parser.add_argument("--seed", type=int, default=42)
58
  parser.add_argument("--max_steps", type=int, default=18)
 
79
  def main() -> None: # pragma: no cover - heavy GPU path
80
  args = _build_args()
81
 
82
+ # IMPORTANT: Unsloth MUST be imported before transformers / trl. It
83
+ # patches transformers' lazy ``_import_structure`` to register a few
84
+ # symbols (notably ``PreTrainedModel`` under torch-aware paths). If trl
85
+ # loads transformers first, the lazy loader will fail with a confusing
86
+ # ``ImportError: cannot import name 'PreTrainedModel' from 'transformers'``
87
+ # at GRPOTrainer import time — which is exactly what we hit on the
88
+ # trainer Space before this reorder.
89
+ # See: https://github.com/unslothai/unsloth and the matching
90
+ # transformers issue #42548 for the lazy-import root cause.
91
+ from unsloth import FastLanguageModel
92
  from transformers import TrainerCallback
93
  from trl import GRPOConfig, GRPOTrainer
 
94
 
95
  from server.environment import CERNCollisionEnvironment
96
+ from training.curriculum import CurriculumConfig, CurriculumManager
97
  from training.evidence import (
98
  CheckpointEvalWriter,
99
  EvidencePaths,
100
+ RewardComponentLogWriter,
101
  TrainingLogWriter,
102
  render_checkpoint_progression,
103
+ render_reward_components,
104
  render_training_curve,
105
  )
106
+ from training.llm_agent import LLMAgentConfig
107
  from training.rollouts import collect_episode
108
+ from training.training_script import (
109
+ EpisodeContext,
110
+ RewardComponentAccumulator,
111
+ )
112
 
113
  paths = EvidencePaths(root=Path(args.evidence_dir))
114
  paths.ensure()
115
  log_writer = TrainingLogWriter(paths.training_log_csv)
116
  ckpt_writer = CheckpointEvalWriter(paths.checkpoint_evals_csv)
117
+ component_writer = RewardComponentLogWriter(paths.reward_components_csv)
118
+ component_accumulator = RewardComponentAccumulator()
119
+
120
+ curriculum: Optional[CurriculumManager] = None
121
+ if args.curriculum:
122
+ curriculum = CurriculumManager(
123
+ CurriculumConfig(
124
+ start_difficulty=args.difficulty,
125
+ promote_threshold=args.curriculum_promote,
126
+ demote_threshold=args.curriculum_demote,
127
+ )
128
+ )
129
+ logger.info("Curriculum enabled: start=%s promote≥%.2f demote≤%.2f",
130
+ args.difficulty, args.curriculum_promote, args.curriculum_demote)
131
 
132
  logger.info("Loading Unsloth model: %s", args.model_name)
133
  model, tokenizer = FastLanguageModel.from_pretrained(
 
149
  if tokenizer.pad_token is None:
150
  tokenizer.pad_token = tokenizer.eos_token
151
 
152
+ from training.training_script import build_dataset, make_reward_fn
153
+
154
  env = CERNCollisionEnvironment(max_steps=args.max_steps)
155
+ dataset = build_dataset(
156
+ tokenizer=tokenizer,
157
+ n_prompts=args.total_episodes,
158
+ seed=args.seed,
159
+ scenario=args.scenario,
160
+ difficulty=args.difficulty,
161
+ curriculum=args.curriculum,
162
+ )
163
 
164
  ctx = EpisodeContext(
165
  env=env, seed=args.seed,
166
  scenario=args.scenario, difficulty=args.difficulty,
167
  )
168
+ reward_fn = make_reward_fn(ctx, accumulator=component_accumulator)
 
 
 
 
 
 
 
169
 
170
  cfg = GRPOConfig(
171
  output_dir=args.output_dir,
 
208
  log_writer.append(row)
209
  render_training_curve(paths.training_log_csv, paths.training_curve_png)
210
 
211
+ # Per-component reward summary (FAQ Q17, Q43, Q52: don't watch
212
+ # only the mean reward — track terminal vs shaping, success
213
+ # rates, and parse rate so verifier hacks become visible).
214
+ drained = component_accumulator.drain()
215
+ if drained:
216
+ summary = RewardComponentAccumulator.summarise(drained)
217
+ summary["step"] = state.global_step
218
+ component_writer.append(summary)
219
+ render_reward_components(
220
+ paths.reward_components_csv, paths.reward_components_png,
221
+ )
222
+
223
  def on_step_end(self, _args, state, control, **kw):
224
  step = state.global_step
225
  if step <= 0 or step == self._last_eval_step:
 
236
  def _run_checkpoint_eval(self, step: int, state) -> None:
237
  FastLanguageModel.for_inference(model)
238
  try:
239
+ # When curriculum is enabled, evaluate at whatever tier the
240
+ # adaptive manager currently considers appropriate. Otherwise
241
+ # use the static --difficulty.
242
+ eval_difficulty = (
243
+ curriculum.next_difficulty()
244
+ if curriculum is not None
245
+ else args.difficulty
246
+ )
247
  episodes = []
248
  for s in held_out_seeds:
249
+ ep = self._rollout_one(seed=s, difficulty=eval_difficulty)
250
  if ep is not None:
251
  episodes.append(ep)
252
  if not episodes:
253
  return
254
  rewards = [e.cumulative_reward for e in episodes]
255
+ success_rate = sum(1 for e in episodes if e.discovered) / len(episodes)
256
  ckpt_writer.append(
257
  step=step,
258
  fraction_done=round(step / max(state.max_steps or step, 1), 4),
259
  episodes=len(episodes),
260
  mean_reward=round(sum(rewards) / len(rewards), 4),
261
+ success_rate=round(success_rate, 4),
262
  mass_acc=round(sum(1 for e in episodes if e.correct_mass) / len(episodes), 4),
263
  channel_acc=round(sum(1 for e in episodes if e.correct_channel) / len(episodes), 4),
264
  )
 
266
  paths.checkpoint_evals_csv,
267
  paths.checkpoint_progression_png,
268
  )
269
+ if curriculum is not None:
270
+ snap = curriculum.record(
271
+ success=success_rate >= 0.5,
272
+ reward=sum(rewards) / len(rewards),
273
+ )
274
+ curriculum.save(paths.root / "curriculum_state.json")
275
+ if snap.get("event"):
276
+ logger.info(
277
+ "[curriculum] %s @ step=%d → tier=%s (rolling=%.2f)",
278
+ snap["event"], step, snap["current"], snap["rolling_success"],
279
+ )
280
  logger.info(
281
+ "[checkpoint-eval step=%d difficulty=%s] reward=%.3f success=%.2f",
282
+ step, eval_difficulty,
283
+ rewards and (sum(rewards) / len(rewards)) or 0.0,
284
+ success_rate,
285
  )
286
  finally:
287
  FastLanguageModel.for_training(model)
288
 
289
+ def _rollout_one(self, seed: int, difficulty: Optional[str] = None):
290
  def prompt_fn(chat):
291
  return tokenizer.apply_chat_template(chat, add_generation_prompt=True, tokenize=False)
292
 
 
303
 
304
  return collect_episode(
305
  env=env, seed=seed,
306
+ scenario=args.scenario,
307
+ difficulty=difficulty or args.difficulty,
308
  prompt_fn=prompt_fn, generate_fn=generate_fn,
309
  config=LLMAgentConfig(),
310
  )
 
319
  )
320
  logger.info("Starting Unsloth + LoRA GRPO training")
321
  trainer.train()
322
+
323
+ # Drain whatever rollouts the final on_log didn't catch so the last
324
+ # row of reward_components.csv is correct.
325
+ final_drain = component_accumulator.drain()
326
+ if final_drain:
327
+ summary = RewardComponentAccumulator.summarise(final_drain)
328
+ summary["step"] = trainer.state.global_step
329
+ component_writer.append(summary)
330
+ render_reward_components(
331
+ paths.reward_components_csv, paths.reward_components_png,
332
+ )
333
+
334
  trainer.save_model(args.output_dir)
335
  tokenizer.save_pretrained(args.output_dir)
336
  logger.info("Saved adapters to %s", args.output_dir)