{ "steps": [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200 ], "rewards": [ 0.184, 0.1201, 0.1201, 0.0333, 0.1145, 0.1035, 0.244, 0.1729, 0.1007, 0.1063, 0.1174, 0.3363, 0.18, 0.1736, 0.2347, 0.0333, 0.1063, 0.0416, 0.1174, 0.2712, 0.2014, 0.1736, 0.1736, 0.1174, 0.0444, 0.1763, 0.1792, 0.2069, 0.1736, 0.1673, 0.2014, 0.2018, 0.3584, 0.1856, 0.2347, 0.1991, 0.193, 0.1229, 0.2513, 0.2201, 0.2347, 0.0333, 0.1645, 0.1736, 0.2597, 0.2708, 0.2485, 0.2014, 0.1847, 0.1847, 0.2907, 0.1063, 0.1903, 0.1736, 0.1945, 0.1173, 0.1063, 0.293, 0.2847, 0.2763, 0.1173, 0.2347, 0.2145, 0.3002, 0.1145, 0.1035, 0.2569, 0.1173, 0.2996, 0.2903, 0.3751, 0.0333, 0.2347, 0.1903, 0.1146, 0.0333, 0.109, 0.3341, 0.2224, 0.2347, 0.2702, 0.1812, 0.1903, 0.2224, 0.3013, 0.1903, 0.1118, 0.1646, 0.179, 0.2375, 0.209, 0.3885, 0.2796, 0.2846, 0.1145, 0.2903, 0.1903, 0.1763, 0.1007, 0.1736, 0.2168, 0.2435, 0.2146, 0.2958, 0.263, 0.1903, 0.3647, 0.2569, 0.1257, 0.0333, 0.2501, 0.2907, 0.2173, 0.2935, 0.3485, 0.3264, 0.368, 0.1007, 0.1201, 0.109, 0.3207, 0.2324, 0.2542, 0.2946, 0.3514, 0.2597, 0.399, 0.4013, 0.3701, 0.4363, 0.025, 0.0333, 0.368, 0.0333, 0.1958, 0.3046, 0.3208, 0.2401, 0.3013, 0.2553, 0.3074, 0.2347, 0.368, 0.2344, 0.2708, 0.3335, 0.2819, 0.3241, 0.3813, 0.0333, 0.0361, 0.1145, 0.1174, 0.293, 0.2769, 0.0472, 0.5063, 0.1874, 0.3625, 0.1862, 0.1945, 0.3051, 0.1173, 0.3541, 0.1007, 0.2784, 0.0217, 0.1173, 0.184, 0.184, 0.2347, 0.3374, 0.1955, 0.3514, 0.2206, 0.3546, 0.109, 0.2824, 0.1708, 0.3514, 0.1958, 0.3958, 0.3013, 0.2485, 0.0979, 0.2875, 0.3013, 0.3124, 0.4051, 0.2764, 0.2542, 0.1285, 0.4053, 0.1895, 0.2375, 0.3196, 0.2625, 0.3735, 0.1874, 0.3462 ], "reward_std": [ 0.2209, 0.0756, 0.1148, 0.0078, 0.1227, 0.0992, 0.2901, 0.1895, 0.1031, 0.0952, 0.1109, 0.1909, 0.1995, 0.0, 0.0864, 0.0078, 0.0952, 0.0039, 0.1109, 0.3442, 0.0078, 0.0, 0.0078, 0.1109, 0.0235, 0.0039, 0.0078, 0.0157, 0.0, 0.1895, 0.0078, 0.0007, 0.2614, 0.2484, 0.0864, 0.0046, 0.0275, 0.1345, 0.0629, 0.272, 0.0864, 0.0078, 0.217, 0.0, 0.1218, 0.1061, 0.0668, 0.0078, 0.0157, 0.0157, 0.1407, 0.1266, 0.0235, 0.0, 0.2358, 0.1188, 0.0952, 0.1689, 0.1571, 0.1061, 0.1188, 0.0864, 0.2484, 0.1588, 0.1227, 0.0913, 0.1178, 0.1266, 0.3843, 0.165, 0.2615, 0.0078, 0.0864, 0.0235, 0.107, 0.0078, 0.0914, 0.0385, 0.2751, 0.0864, 0.1366, 0.2013, 0.0235, 0.2751, 0.1336, 0.0235, 0.1188, 0.1856, 0.239, 0.0903, 0.2563, 0.0684, 0.0936, 0.165, 0.1227, 0.1336, 0.0235, 0.0039, 0.1031, 0.0, 0.2925, 0.0989, 0.2641, 0.0, 0.0936, 0.0235, 0.0047, 0.1178, 0.1306, 0.0078, 0.1083, 0.3717, 0.2445, 0.1604, 0.0668, 0.0196, 0.0393, 0.1031, 0.1306, 0.0914, 0.0432, 0.0204, 0.0589, 0.0062, 0.0786, 0.1218, 0.0832, 0.0078, 0.0284, 0.0572, 0.0117, 0.0078, 0.0393, 0.0078, 0.0, 0.1211, 0.0354, 0.1714, 0.1414, 0.1155, 0.0164, 0.0864, 0.0393, 0.2765, 0.1061, 0.0062, 0.1532, 0.0308, 0.0283, 0.0078, 0.004, 0.1227, 0.1109, 0.1375, 0.0975, 0.0275, 0.0982, 0.0196, 0.0235, 0.2476, 0.2358, 0.3922, 0.1266, 0.0668, 0.1031, 0.1045, 0.0165, 0.1188, 0.2131, 0.2209, 0.0864, 0.0589, 0.0075, 0.0786, 0.2726, 0.074, 0.1148, 0.36, 0.004, 0.0786, 0.0236, 0.0236, 0.0078, 0.0668, 0.0991, 0.161, 0.1336, 0.0235, 0.0668, 0.1139, 0.1139, 0.0952, 0.0448, 0.2131, 0.0903, 0.2065, 0.0472, 0.0235, 0.0196, 0.0622 ], "kl": [ 0.0, 0.0, 3e-06, 5e-06, 2e-06, 4e-06, 2e-06, 3e-06, 1e-05, 3e-06, 0.001917, 7e-06, 0.001059, 5e-06, 1.3e-05, 3e-06, 5e-06, 6e-06, 8e-06, 8e-06, 1.2e-05, 4.7e-05, 4.8e-05, 3.2e-05, 3.3e-05, 0.000108, 0.00018, 4.2e-05, 0.000189, 0.00012, 9.3e-05, 0.000155, 0.00196, 0.000149, 0.000252, 0.000233, 0.000311, 0.000286, 0.000172, 0.000169, 0.000146, 0.005918, 0.000294, 0.000185, 0.000377, 9.5e-05, 9.7e-05, 0.000185, 0.000191, 0.000213, 0.000139, 0.000185, 0.000328, 0.000269, 0.000423, 0.000312, 0.000403, 0.000629, 0.000958, 0.000654, 0.000763, 0.000818, 0.000679, 0.001007, 0.000409, 0.000742, 0.001032, 0.000433, 0.000424, 0.001175, 0.065512, 0.000491, 0.001283, 0.001551, 0.000743, 0.00205, 0.001048, 0.003152, 0.001422, 0.002206, 0.001268, 0.001376, 0.0006, 0.000647, 0.002288, 0.000986, 0.003312, 0.00304, 0.001072, 0.003339, 0.000917, 0.001449, 0.001279, 0.003114, 0.008185, 0.001049, 0.000395, 0.005086, 0.004066, 0.007162, 0.00197, 0.001734, 0.003092, 0.002859, 0.001129, 0.000954, 0.0037, 0.008848, 0.003338, 0.003114, 0.013662, 0.00386, 0.001231, 0.002798, 0.001819, 0.006505, 0.002929, 0.002063, 0.003029, 0.053483, 0.007063, 0.002513, 0.015775, 0.004121, 0.004982, 0.007406, 0.004541, 0.005338, 0.051157, 0.003495, 0.003043, 0.005987, 0.008458, 0.001201, 0.002248, 0.079172, 0.005182, 0.003687, 0.002551, 0.192144, 0.005151, 0.004355, 0.00832, 0.002969, 0.003038, 0.000624, 0.008018, 0.003553, 0.005504, 0.012273, 0.001665, 0.002202, 0.001595, 0.001366, 0.001963, 0.00278, 0.007543, 0.016324, 0.013537, 0.002584, 0.001686, 0.003062, 0.008756, 0.006232, 0.007933, 0.151755, 0.004479, 0.001787, 0.004011, 0.007719, 0.002736, 0.004371, 0.121158, 0.009865, 0.111958, 0.001871, 0.003641, 0.002508, 0.004169, 0.005104, 0.001672, 0.003376, 0.008028, 0.003024, 0.004631, 0.003015, 0.005141, 0.004173, 0.001354, 0.004468, 0.005965, 0.000716, 0.003562, 0.004835, 0.004605, 0.144598, 0.004515, 0.004581, 0.002088, 0.006087 ], "completion_length": [ 317.0, 311.5, 313.0, 337.5, 203.0, 245.0, 277.0, 292.5, 462.0, 322.5, 364.5, 491.0, 413.5, 243.0, 272.0, 286.5, 161.0, 512.0, 224.0, 343.5, 298.0, 120.0, 247.0, 301.0, 512.0, 193.5, 188.0, 478.5, 131.0, 411.5, 344.5, 327.0, 399.0, 223.0, 167.0, 327.0, 285.5, 203.0, 211.5, 322.5, 230.5, 362.5, 372.0, 137.0, 156.0, 354.0, 482.0, 386.0, 274.5, 261.5, 439.5, 438.5, 207.0, 290.5, 196.0, 449.0, 159.0, 272.0, 172.0, 239.0, 337.5, 368.5, 225.0, 424.5, 341.5, 177.5, 179.0, 273.5, 440.5, 362.5, 401.5, 220.0, 295.0, 223.0, 228.0, 350.5, 321.5, 237.5, 350.5, 290.0, 315.0, 341.5, 189.0, 414.0, 270.0, 200.5, 230.5, 267.0, 202.0, 174.5, 376.5, 372.5, 251.0, 322.5, 156.0, 323.5, 252.5, 125.0, 140.5, 121.5, 236.5, 307.5, 198.0, 275.0, 353.0, 211.0, 303.0, 191.0, 205.0, 323.5, 311.5, 343.5, 359.0, 372.5, 394.5, 337.0, 398.0, 267.5, 426.5, 309.5, 231.5, 430.5, 416.0, 369.0, 388.5, 435.5, 382.5, 261.0, 466.0, 425.0, 144.0, 310.5, 302.5, 318.0, 334.5, 512.0, 184.5, 228.5, 310.5, 322.5, 329.0, 187.0, 291.0, 354.0, 280.5, 512.0, 183.0, 357.5, 262.5, 169.5, 343.5, 401.5, 441.5, 426.0, 512.0, 322.5, 456.0, 212.0, 321.0, 324.0, 413.0, 431.0, 273.0, 236.5, 123.5, 496.0, 335.5, 279.5, 322.0, 243.0, 313.5, 227.0, 428.0, 318.5, 389.5, 413.0, 364.0, 488.5, 176.5, 243.0, 262.0, 512.0, 241.0, 244.0, 248.5, 217.0, 310.0, 206.5, 453.0, 284.0, 250.5, 427.5, 391.5, 344.0, 210.0, 330.5, 313.0, 391.0, 222.0, 334.5 ], "loss": [ -0.2943, 0.4545, -0.3589, -0.361, -0.0626, -0.3143, -0.2067, -0.2332, -0.0765, 0.4151, -0.2859, -0.0259, -0.1684, 0.0, 0.1844, -0.1791, 0.1228, -0.0, -0.0347, -0.3468, -0.075, 0.0, 0.1498, -0.2065, 0.0, -0.2298, -0.26, 0.0492, 0.0, 0.0163, 0.0902, 0.0303, -0.2002, 0.1141, -0.1438, 0.0212, -0.3862, -0.1845, 0.0317, -0.4153, 0.0904, 0.288, -0.266, 0.0, -0.2219, 0.2973, 0.0439, 0.1357, 0.096, -0.3453, -0.1166, -0.1184, -0.2653, 0.0, -0.119, 0.0897, 0.1155, -0.1689, -0.1602, 0.1478, -0.3653, 0.275, -0.0911, 0.1457, -0.3527, 0.2368, -0.1816, -0.1511, -0.1147, 0.2914, -0.1945, -0.0666, -0.4405, -0.3663, -0.0991, 0.3217, 0.4185, 0.1411, -0.3257, 0.0901, -0.4172, -0.2866, -0.1863, -0.1554, 0.0183, -0.2581, -0.2866, 0.1006, -0.5773, -0.2449, -0.2544, -0.2644, 0.1295, -0.0887, -0.1993, -0.1671, -0.0851, -0.0441, 0.1131, 0.0, -0.5963, -0.4238, -0.1249, 0.0, -0.032, -0.2603, -0.2651, -0.2293, -0.2482, -0.0745, -0.4547, -0.3468, -0.1457, 0.1584, 0.0421, 0.0501, 0.1506, 0.3948, 0.1416, 0.4622, -0.0655, 0.1332, -0.1323, -0.132, 0.1336, -0.1192, -0.2391, 0.0054, 0.0696, -0.0183, -0.073, 0.0821, -0.1154, -0.1383, 0.0, 0.0001, -0.0783, 0.0016, -0.0011, -0.415, -0.1666, -0.2606, -0.0921, -0.3155, 0.2833, 0.0, -0.1931, 0.3007, -0.0013, -0.0638, -0.3383, 0.1945, 0.1128, -0.0365, 0.0, -0.414, -0.0867, -0.2621, 0.4124, 0.0022, 0.1694, -0.1329, -0.2148, -0.0672, 0.0543, 0.0229, 0.3698, 0.1479, 0.2458, -0.1687, -0.0845, 0.1057, -0.1368, -0.1452, -0.2222, 0.0137, 0.2873, 0.034, 0.1075, -0.0407, -0.1478, 0.0, 0.0232, 0.055, -0.1805, -0.3354, 0.0023, 0.1245, -0.092, -0.1219, -0.1001, 0.1396, -0.2172, -0.3041, -0.259, -0.388, 0.1804, 0.0378, -0.3296, 0.3155 ], "grad_norm": [ 0.1961, 0.1958, 0.167, 0.1481, 0.1628, 0.2135, 0.1081, 0.1042, 0.172, 0.189, NaN, 0.1705, NaN, 0.0, 0.2238, 0.1466, 0.1339, 0.1383, 0.14, 0.097, 0.1361, 0.0, 0.285, 0.0758, 0.1444, 0.1839, 0.2411, 0.1278, 0.0, 0.1871, 0.0803, 0.0971, NaN, 0.2251, 0.2385, 0.1113, 0.2192, 0.2427, 0.1562, 0.1537, 0.2454, 7.313, 0.1808, 0.0, 0.2211, 0.1802, 0.1352, 0.1725, 0.2209, 0.1537, 0.0943, 0.1949, 0.1484, 0.0, 0.1429, 0.1697, 0.1782, 0.1379, 0.1811, 0.1497, 0.2006, 0.2164, 0.1604, 0.2584, 0.1887, 0.1719, 0.1787, 0.1932, 0.162, 0.1837, 6.6024, 0.0937, 0.1919, 0.2166, 0.0878, 0.1349, 0.1933, 0.1453, 0.0689, 0.23, 0.1865, 0.1643, 0.1729, 0.1429, 0.1652, 0.2979, 0.2033, 0.2461, 0.2547, 0.2016, 0.106, 0.115, 0.1593, 0.1303, 0.2509, 0.1655, 0.2473, 0.398, 0.2547, 0.0009, 0.2819, 0.1811, 0.1835, 0.0001, 0.1994, 0.1029, 0.1525, 0.2811, 0.1642, 0.1278, 0.1342, 0.123, 0.2076, 0.1391, 0.2146, 0.2434, 0.136, 0.265, 0.1568, 8.4723, 0.136, 0.1408, 0.1997, 0.1428, 0.1957, 0.153, 0.1334, 0.0977, 5.4034, 0.2068, 0.3558, 0.182, 0.2308, 0.138, 0.0001, 3.9199, 0.1769, 0.207, 0.2048, 9.8743, 0.0885, 0.2152, 0.2054, 0.0973, 0.1709, 0.1017, 0.2027, 0.1274, 0.2776, 0.1704, 0.1746, 0.1885, 0.2155, 0.1581, 0.1066, 0.203, 0.1085, 0.2463, 0.1368, 0.2517, 0.1909, 0.1723, 0.2901, 0.1457, 0.4256, 4.2154, 0.1762, 0.1586, 0.2176, 0.2246, 0.215, 0.2202, 4.6015, 0.2281, 6.0358, 0.1576, 0.2039, 0.1561, 0.2154, 0.1517, 0.1233, 0.1314, 0.2584, 0.1628, 0.2852, 0.2748, 0.1448, 0.2371, 0.1317, 0.222, 0.256, 0.1449, 0.208, 0.1329, 0.1631, 8.4834, 0.176, 0.2077, 0.2308, 0.1885 ], "model": "Qwen/Qwen2.5-3B-Instruct", "method": "GRPO_200_steps", "runtime_seconds": 8412, "peak_reward": 0.5063, "peak_step": 157, "final_reward": 0.3462, "compute_cost": "$0 (free Colab T4)" }