{
  "model": "Qwen/Qwen2.5-1.5B-Instruct",
  "train_task": "task_karnataka",
  "train_time_minutes": 159.6,
  "num_prompts": 600,
  "num_epochs": 3,
  "num_steps": 449,
  "gpu": "NVIDIA A10G (23.9 GB)",
  "lora_rank": 16,
  "framework": "TRL GRPOTrainer + bitsandbytes 4-bit",
  "reward_start": -0.2308,
  "reward_end": 0.6638,
  "reward_peak": 0.6883,
  "note": "Post-training eval OOM'd during model save; reward values from training log",
  "baseline": {
    "task_easy": {
      "avg": 31.99,
      "std": 0.0
    },
    "task_medium": {
      "avg": 46.69,
      "std": 0.36
    },
    "karnataka_easy": {
      "avg": 56.33,
      "std": 0.25
    },
    "karnataka_medium": {
      "avg": 49.57,
      "std": 0.21
    },
    "karnataka_hard": {
      "avg": -417.15,
      "std": 63.02
    },
    "task_karnataka": {
      "avg": 49.43,
      "std": 0.21
    }
  },
  "training_reward": {
    "initial_avg_5steps": -0.2308,
    "mid_avg_steps100_150": 0.6266,
    "final_avg_last50steps": 0.6634
  }
}