final-iteration/run-output/plots/training_summary.json
ycwhencpp's picture
HF Job: train_grpo run output
17149c8 verified
{
"model": "Qwen/Qwen2.5-3B-Instruct",
"training": "Two-phase LoRA SFT (timing -> content) with hardcoded peak-hours hint on round 1 of each phase",
"phases": [],
"rounds_per_phase": 3,
"episodes_per_round": 6,
"before": {
"monthly_engage": 1.0,
"monthly_strategic": 0.8357,
"monthly_competitive": 0.9414
},
"after": {
"monthly_engage": 0.999,
"monthly_strategic": 0.9321439559505211,
"monthly_competitive": 0.999
},
"smart_heuristic": {
"monthly_engage": 0.7519,
"monthly_strategic": 0.9101,
"monthly_competitive": 0.9141
},
"improvement": {
"monthly_engage": -0.0010000000000000009,
"monthly_strategic": 0.09644395595052113,
"monthly_competitive": 0.057599999999999985
},
"training_log": {
"phase": [],
"round": [],
"global_step": [],
"use_hint": [],
"avg_episode_reward": [],
"max_episode_reward": [],
"min_episode_reward": [],
"avg_grader": [],
"max_grader": [],
"n_training_samples": [],
"train_loss": []
}
}