{ "base_label": "./sft_checkpoint (PEFT on unsloth/Qwen2.5-1.5B-Instruct)", "trained_label": "./grpo_checkpoint (PEFT on unsloth/Qwen2.5-1.5B-Instruct)", "seed": 1234, "episodes_per_level": 4, "base": { "1": { "violations_per_episode": 0.25, "ordering_rate": 0.9166666666666666, "close_rate": 0.75, "correct_disqual_rate": 0.0, "mean_final_reward": 0.24625000000000002, "mean_cum_reward": 0.9545833333333333, "mean_turns": 6.75, "n_episodes": 4, "n_disqual_cases": 0 }, "2": { "violations_per_episode": 1.0, "ordering_rate": 0.8, "close_rate": 0.0, "correct_disqual_rate": 0.0, "mean_final_reward": -0.08000000000000002, "mean_cum_reward": 0.5800000000000001, "mean_turns": 6.0, "n_episodes": 4, "n_disqual_cases": 0 }, "3": { "violations_per_episode": 1.0, "ordering_rate": 0.5714285714285714, "close_rate": 0.0, "correct_disqual_rate": 0.0, "mean_final_reward": -0.08000000000000002, "mean_cum_reward": 0.5342857142857143, "mean_turns": 6.0, "n_episodes": 4, "n_disqual_cases": 0 }, "4": { "violations_per_episode": 0.0, "ordering_rate": 0.75, "close_rate": 0.0, "correct_disqual_rate": 0.75, "mean_final_reward": 0.07500000000000001, "mean_cum_reward": -0.6749999999999996, "mean_turns": 9.5, "n_episodes": 4, "n_disqual_cases": 4 } }, "trained": { "1": { "violations_per_episode": 0.0, "ordering_rate": 1.0, "close_rate": 1.0, "correct_disqual_rate": 0.0, "mean_final_reward": 0.3566666666666667, "mean_cum_reward": 1.09, "mean_turns": 7.0, "n_episodes": 4, "n_disqual_cases": 0 }, "2": { "violations_per_episode": 0.75, "ordering_rate": 0.8500000000000001, "close_rate": 0.25, "correct_disqual_rate": 0.0, "mean_final_reward": 0.024999999999999994, "mean_cum_reward": 0.7100000000000001, "mean_turns": 6.25, "n_episodes": 4, "n_disqual_cases": 0 }, "3": { "violations_per_episode": 1.25, "ordering_rate": 0.5714285714285714, "close_rate": 0.0, "correct_disqual_rate": 0.0, "mean_final_reward": -0.08000000000000002, "mean_cum_reward": 0.5392857142857143, "mean_turns": 6.25, "n_episodes": 4, "n_disqual_cases": 0 }, "4": { "violations_per_episode": 0.25, "ordering_rate": 1.0, "close_rate": 0.0, "correct_disqual_rate": 1.0, "mean_final_reward": 0.2, "mean_cum_reward": 0.7050000000000001, "mean_turns": 6.25, "n_episodes": 4, "n_disqual_cases": 4 } } }