{ "model": "Qwen/Qwen2.5-3B-Instruct", "training": "Two-phase LoRA SFT (timing -> content) with hardcoded peak-hours hint on round 1 of each phase", "phases": [ "phase1_timing", "phase2_content" ], "rounds_per_phase": 3, "episodes_per_round": 6, "before": { "monthly_engage": 0.0, "monthly_strategic": 0.175, "monthly_competitive": 0.035 }, "after": { "monthly_engage": 0.0, "monthly_strategic": 0.175, "monthly_competitive": 0.035 }, "smart_heuristic": { "monthly_engage": 0.7519, "monthly_strategic": 0.9101, "monthly_competitive": 0.9141 }, "improvement": { "monthly_engage": 0.0, "monthly_strategic": 0.0, "monthly_competitive": 0.0 }, "training_log": { "phase": [ "phase1_timing", "phase1_timing", "phase1_timing", "phase2_content", "phase2_content", "phase2_content" ], "round": [ 1, 2, 3, 1, 2, 3 ], "global_step": [ 1, 2, 3, 4, 5, 6 ], "use_hint": [ true, false, false, true, false, false ], "avg_episode_reward": [ 5.127, 3.04, 2.867, 3.538, 2.15, 1.924 ], "max_episode_reward": [ 5.315, 3.303, 3.016, 3.837, 2.807, 2.609 ], "min_episode_reward": [ 4.96, 2.6, 2.555, 3.338, 1.587, 1.375 ], "avg_grader": [ 0.9498, 0.259, 0.2083, 0.8697, 0.3763, 0.2855 ], "max_grader": [ 1.0, 0.3614, 0.3042, 1.0, 0.5979, 0.5027 ], "n_training_samples": [ 81, 96, 102, 77, 90, 76 ], "train_loss": [ 2.833, 3.1413, 3.1255, 2.8381, 2.9281, 2.9184 ] } }