| """Unsloth + LoRA (Low-Rank Adaptation) GRPO training for CERNenv. |
| |
| This is the recommended path for Colab / single- or multi-GPU runs because |
| Unsloth's fused kernels and 4-bit loading let us train 2B–8B models with |
| limited VRAM, while TRL's GRPO (Group-Relative Policy Optimization) loop |
| handles the policy-gradient math. |
| |
| The trainer is wired up to produce **all** "training-progress evidence" |
| artifacts demanded by the OpenEnv hackathon's scoring rubric: |
| |
| * per-step training log + reward/loss curve PNG (Portable Network Graphics) |
| * mid-training checkpoint evaluations + progression curve PNG |
| * (post-run) before/after summary + reward-distribution PNG |
| |
| All artifacts land in ``--evidence_dir`` (default: ``evidence/``). |
| |
| Run on Colab / single GPU: |
| !python -m training.training_unsloth \ |
| --model_name unsloth/Qwen2.5-3B-Instruct \ |
| --total_episodes 400 --num_generations 4 --output_dir runs/unsloth-grpo |
| |
| Run on a 4×A100 Hugging Face Space (multi-GPU via accelerate): |
| accelerate launch --num_processes 4 -m training.training_unsloth \ |
| --total_episodes 1500 --num_generations 8 --output_dir runs/unsloth-grpo |
| """ |
|
|
from __future__ import annotations

import argparse
import logging
import time
from pathlib import Path
from typing import List, Optional
|
|
|
|
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger(__name__)
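

# Minimal sketch of the "group-relative" advantage at the heart of GRPO, for
# readers new to the algorithm: each completion's reward is normalised against
# its own ``num_generations``-sized group. TRL computes this internally; this
# helper is illustrative only and is never called by the trainer below.
def _group_relative_advantages(rewards: List[float], eps: float = 1e-4) -> List[float]:
    """Return a_i = (r_i - mean(r)) / (std(r) + eps) for one generation group."""
    mean = sum(rewards) / len(rewards)
    std = (sum((r - mean) ** 2 for r in rewards) / len(rewards)) ** 0.5
    return [(r - mean) / (std + eps) for r in rewards]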
|
|
|
|
def _build_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", default="unsloth/Qwen2.5-3B-Instruct")
    parser.add_argument("--scenario", default=None)
    parser.add_argument("--difficulty", choices=["easy", "medium", "hard"], default="easy")
    parser.add_argument(
        "--curriculum",
        action="store_true",
        help=(
            "Enable adaptive curriculum: start at --difficulty and promote "
            "to medium/hard once held-out success rate clears the threshold "
            "(see training/curriculum.py)."
        ),
    )
    parser.add_argument("--curriculum_promote", type=float, default=0.55)
    parser.add_argument("--curriculum_demote", type=float, default=0.10)
    parser.add_argument("--total_episodes", type=int, default=400)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--max_steps", type=int, default=18)
    parser.add_argument("--num_generations", type=int, default=4)
    parser.add_argument("--max_prompt_length", type=int, default=2048)
    parser.add_argument("--max_completion_length", type=int, default=384)
    parser.add_argument("--learning_rate", type=float, default=5e-6)
    # ``store_true`` with ``default=True`` could never be switched off;
    # BooleanOptionalAction adds a working --no-load_in_4bit flag.
    parser.add_argument(
        "--load_in_4bit",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="Load the base model in 4-bit (disable with --no-load_in_4bit).",
    )
    parser.add_argument("--lora_rank", type=int, default=16)
    parser.add_argument("--lora_alpha", type=int, default=16)
    parser.add_argument("--per_device_batch_size", type=int, default=1)
    parser.add_argument("--gradient_accumulation_steps", type=int, default=4)
    parser.add_argument("--logging_steps", type=int, default=2)
    parser.add_argument("--save_steps", type=int, default=50)
    parser.add_argument("--checkpoint_eval_steps", type=int, default=25,
                        help="Run a held-out eval every N updates for the progression curve.")
    parser.add_argument("--checkpoint_eval_episodes", type=int, default=8,
                        help="Number of held-out episodes per mid-training eval.")
    parser.add_argument("--output_dir", default="runs/unsloth-grpo")
    parser.add_argument("--evidence_dir", default="evidence")
    return parser.parse_args()
|
|
|
|
def main() -> None:
    args = _build_args()

    # Heavy ML imports are deferred until after argument parsing so that
    # ``--help`` stays fast. Unsloth must be imported before transformers/TRL
    # so its patches are applied to the model and trainer classes.
    from unsloth import FastLanguageModel, is_bfloat16_supported
    from transformers import TrainerCallback
    from trl import GRPOConfig, GRPOTrainer

    from server.environment import CERNCollisionEnvironment
    from training.curriculum import CurriculumConfig, CurriculumManager
    from training.evidence import (
        CheckpointEvalWriter,
        EvidencePaths,
        RewardComponentLogWriter,
        TrainingLogWriter,
        render_checkpoint_progression,
        render_reward_components,
        render_training_curve,
    )
    from training.llm_agent import LLMAgentConfig
    from training.rollouts import collect_episode
    from training.training_script import (
        EpisodeContext,
        RewardComponentAccumulator,
        build_dataset,
        make_reward_fn,
    )
|
    paths = EvidencePaths(root=Path(args.evidence_dir))
    paths.ensure()
    log_writer = TrainingLogWriter(paths.training_log_csv)
    ckpt_writer = CheckpointEvalWriter(paths.checkpoint_evals_csv)
    component_writer = RewardComponentLogWriter(paths.reward_components_csv)
    component_accumulator = RewardComponentAccumulator()

    curriculum: Optional[CurriculumManager] = None
    if args.curriculum:
        curriculum = CurriculumManager(
            CurriculumConfig(
                start_difficulty=args.difficulty,
                promote_threshold=args.curriculum_promote,
                demote_threshold=args.curriculum_demote,
            )
        )
        logger.info("Curriculum enabled: start=%s promote≥%.2f demote≤%.2f",
                    args.difficulty, args.curriculum_promote, args.curriculum_demote)
|
|
| logger.info("Loading Unsloth model: %s", args.model_name) |
| model, tokenizer = FastLanguageModel.from_pretrained( |
| model_name=args.model_name, |
| max_seq_length=args.max_prompt_length + args.max_completion_length, |
| load_in_4bit=args.load_in_4bit, |
| |
| fast_inference=False, |
| ) |
| model = FastLanguageModel.get_peft_model( |
| model, |
| r=args.lora_rank, |
| lora_alpha=args.lora_alpha, |
| target_modules=[ |
| "q_proj", "k_proj", "v_proj", "o_proj", |
| "gate_proj", "up_proj", "down_proj", |
| ], |
| use_gradient_checkpointing="unsloth", |
| ) |
| if tokenizer.pad_token is None: |
| tokenizer.pad_token = tokenizer.eos_token |
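
    # Sanity check: with rank-16 adapters on the seven projection matrices,
    # only a small fraction of parameters should be trainable. (Unsloth's
    # ``get_peft_model`` returns a PEFT-wrapped model, so PEFT's
    # ``print_trainable_parameters`` is available.)
    model.print_trainable_parameters()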
|
|
    env = CERNCollisionEnvironment(max_steps=args.max_steps)
    dataset = build_dataset(
        tokenizer=tokenizer,
        n_prompts=args.total_episodes,
        seed=args.seed,
        scenario=args.scenario,
        difficulty=args.difficulty,
        curriculum=args.curriculum,
    )

    ctx = EpisodeContext(
        env=env, seed=args.seed,
        scenario=args.scenario, difficulty=args.difficulty,
    )
    reward_fn = make_reward_fn(ctx, accumulator=component_accumulator)
|
|
    cfg = GRPOConfig(
        output_dir=args.output_dir,
        per_device_train_batch_size=args.per_device_batch_size,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        num_generations=args.num_generations,
        learning_rate=args.learning_rate,
        max_prompt_length=args.max_prompt_length,
        max_completion_length=args.max_completion_length,
        logging_steps=args.logging_steps,
        save_steps=args.save_steps,
        seed=args.seed,
        # Hard-coding bf16 breaks Colab T4s; pick the precision dynamically.
        bf16=is_bfloat16_supported(),
        fp16=not is_bfloat16_supported(),
        report_to=[],
    )
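
    # TRL's GRPOTrainer requires the global train batch
    # (num_processes × per_device_train_batch_size) to be divisible by
    # ``num_generations``; the exact check varies across TRL releases, so we
    # warn early rather than fail mid-run. WORLD_SIZE is the env var that
    # torchrun / accelerate set; it defaults to 1 for single-process runs.
    import os

    world_size = int(os.environ.get("WORLD_SIZE", "1"))
    if (world_size * args.per_device_batch_size) % args.num_generations != 0:
        logger.warning(
            "global batch %d is not divisible by num_generations=%d; "
            "GRPOTrainer may reject this configuration",
            world_size * args.per_device_batch_size,
            args.num_generations,
        )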
|
|
    # Seeds far above the training range keep the mid-training evals held out.
    held_out_seeds = list(range(900_000, 900_000 + args.checkpoint_eval_episodes))
|
|
    class EvidenceCallback(TrainerCallback):
        """Stream training metrics + run periodic mid-training evals."""

        def __init__(self) -> None:
            self._t0 = time.time()
            self._last_eval_step = -1

        def on_log(self, _args, state, control, logs=None, **kw):
            logs = logs or {}

            def _first(*keys):
                # ``logs.get(a) or logs.get(b)`` would drop legitimate 0.0
                # values, so check against None explicitly.
                for key in keys:
                    if logs.get(key) is not None:
                        return logs[key]
                return None

            row = {
                "step": state.global_step,
                "epoch": logs.get("epoch"),
                "loss": logs.get("loss"),
                "reward": _first("reward", "rewards/mean"),
                "reward_std": _first("reward_std", "rewards/std"),
                "kl": logs.get("kl"),
                "grad_norm": logs.get("grad_norm"),
                "learning_rate": logs.get("learning_rate"),
                "wall_time_s": round(time.time() - self._t0, 2),
            }
            # Skip rows carrying no metrics (step and wall time are always set).
            if any(v is not None for k, v in row.items() if k not in ("step", "wall_time_s")):
                log_writer.append(row)
                render_training_curve(paths.training_log_csv, paths.training_curve_png)

            # Flush per-component reward stats gathered since the last log.
            drained = component_accumulator.drain()
            if drained:
                summary = RewardComponentAccumulator.summarise(drained)
                summary["step"] = state.global_step
                component_writer.append(summary)
                render_reward_components(
                    paths.reward_components_csv, paths.reward_components_png,
                )

        def on_step_end(self, _args, state, control, **kw):
            step = state.global_step
            if step <= 0 or step == self._last_eval_step:
                return control
            if step % args.checkpoint_eval_steps != 0:
                return control
            self._last_eval_step = step
            try:
                self._run_checkpoint_eval(step, state)
            except Exception as exc:
                logger.warning("checkpoint eval failed at step %d: %s", step, exc)
            return control
|
|
        def _run_checkpoint_eval(self, step: int, state) -> None:
            # Switch Unsloth to inference mode for fast generation; the
            # ``finally`` clause restores training mode even if the eval fails.
            FastLanguageModel.for_inference(model)
            try:
                eval_difficulty = (
                    curriculum.next_difficulty()
                    if curriculum is not None
                    else args.difficulty
                )
                episodes = []
                for s in held_out_seeds:
                    ep = self._rollout_one(seed=s, difficulty=eval_difficulty)
                    if ep is not None:
                        episodes.append(ep)
                if not episodes:
                    return
                rewards = [e.cumulative_reward for e in episodes]
                mean_reward = sum(rewards) / len(rewards)
                success_rate = sum(1 for e in episodes if e.discovered) / len(episodes)
                ckpt_writer.append(
                    step=step,
                    fraction_done=round(step / max(state.max_steps or step, 1), 4),
                    episodes=len(episodes),
                    mean_reward=round(mean_reward, 4),
                    success_rate=round(success_rate, 4),
                    mass_acc=round(sum(1 for e in episodes if e.correct_mass) / len(episodes), 4),
                    channel_acc=round(sum(1 for e in episodes if e.correct_channel) / len(episodes), 4),
                )
                render_checkpoint_progression(
                    paths.checkpoint_evals_csv,
                    paths.checkpoint_progression_png,
                )
                if curriculum is not None:
                    snap = curriculum.record(
                        success=success_rate >= 0.5,
                        reward=mean_reward,
                    )
                    curriculum.save(paths.root / "curriculum_state.json")
                    if snap.get("event"):
                        logger.info(
                            "[curriculum] %s @ step=%d → tier=%s (rolling=%.2f)",
                            snap["event"], step, snap["current"], snap["rolling_success"],
                        )
                logger.info(
                    "[checkpoint-eval step=%d difficulty=%s] reward=%.3f success=%.2f",
                    step, eval_difficulty, mean_reward, success_rate,
                )
            finally:
                FastLanguageModel.for_training(model)
|
|
        def _rollout_one(self, seed: int, difficulty: Optional[str] = None):
            def prompt_fn(chat):
                return tokenizer.apply_chat_template(chat, add_generation_prompt=True, tokenize=False)

            def generate_fn(prompt: str, _config) -> str:
                inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=args.max_completion_length,
                    do_sample=True, temperature=0.7, top_p=0.95,
                    pad_token_id=tokenizer.pad_token_id,
                )
                # Decode only the newly generated tokens, not the echoed prompt.
                gen = outputs[0][inputs["input_ids"].shape[1]:]
                return tokenizer.decode(gen, skip_special_tokens=True)

            return collect_episode(
                env=env, seed=seed,
                scenario=args.scenario,
                difficulty=difficulty or args.difficulty,
                prompt_fn=prompt_fn, generate_fn=generate_fn,
                config=LLMAgentConfig(),
            )
|
|
    trainer = GRPOTrainer(
        model=model,
        processing_class=tokenizer,
        train_dataset=dataset,
        reward_funcs=[reward_fn],
        args=cfg,
        callbacks=[EvidenceCallback()],
    )
    logger.info("Starting Unsloth + LoRA GRPO training")
    trainer.train()
|
|
    # Flush any reward components accumulated after the final on_log call.
    final_drain = component_accumulator.drain()
    if final_drain:
        summary = RewardComponentAccumulator.summarise(final_drain)
        summary["step"] = trainer.state.global_step
        component_writer.append(summary)
        render_reward_components(
            paths.reward_components_csv, paths.reward_components_png,
        )
|
|
    trainer.save_model(args.output_dir)
    tokenizer.save_pretrained(args.output_dir)
    logger.info("Saved adapters to %s", args.output_dir)
    logger.info("Evidence artifacts in %s", paths.root)
|
|
|
|
if __name__ == "__main__":
    main()
|
|