| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.9092975676290066, |
| "eval_steps": 10, |
| "global_step": 500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.009092975676290065, |
| "grad_norm": 1.0088555812835693, |
| "learning_rate": 0.00019800000000000002, |
| "loss": 2.2722, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.01818595135258013, |
| "grad_norm": 0.9372844099998474, |
| "learning_rate": 0.000196, |
| "loss": 1.6351, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.01818595135258013, |
| "eval_loss": 1.5618833303451538, |
| "eval_runtime": 7.6668, |
| "eval_samples_per_second": 3.913, |
| "eval_steps_per_second": 1.956, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.0272789270288702, |
| "grad_norm": 1.3525443077087402, |
| "learning_rate": 0.000194, |
| "loss": 1.5293, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.03637190270516026, |
| "grad_norm": 0.8991140723228455, |
| "learning_rate": 0.000192, |
| "loss": 1.4111, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.03637190270516026, |
| "eval_loss": 1.3718944787979126, |
| "eval_runtime": 7.8196, |
| "eval_samples_per_second": 3.837, |
| "eval_steps_per_second": 1.918, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.04546487838145033, |
| "grad_norm": 1.98069429397583, |
| "learning_rate": 0.00019, |
| "loss": 1.3139, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.0545578540577404, |
| "grad_norm": 0.6621396541595459, |
| "learning_rate": 0.000188, |
| "loss": 1.4428, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.0545578540577404, |
| "eval_loss": 1.2937129735946655, |
| "eval_runtime": 7.4563, |
| "eval_samples_per_second": 4.023, |
| "eval_steps_per_second": 2.012, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.06365082973403047, |
| "grad_norm": 0.896124005317688, |
| "learning_rate": 0.00018600000000000002, |
| "loss": 1.3239, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.07274380541032052, |
| "grad_norm": 1.9882720708847046, |
| "learning_rate": 0.00018400000000000003, |
| "loss": 1.279, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.07274380541032052, |
| "eval_loss": 1.2551789283752441, |
| "eval_runtime": 7.9884, |
| "eval_samples_per_second": 3.755, |
| "eval_steps_per_second": 1.878, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.08183678108661059, |
| "grad_norm": 0.7292985320091248, |
| "learning_rate": 0.000182, |
| "loss": 1.2615, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.09092975676290066, |
| "grad_norm": 0.7677621245384216, |
| "learning_rate": 0.00018, |
| "loss": 1.2903, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.09092975676290066, |
| "eval_loss": 1.2139209508895874, |
| "eval_runtime": 7.6731, |
| "eval_samples_per_second": 3.91, |
| "eval_steps_per_second": 1.955, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.10002273243919073, |
| "grad_norm": 0.781851589679718, |
| "learning_rate": 0.00017800000000000002, |
| "loss": 1.1273, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.1091157081154808, |
| "grad_norm": 0.7166887521743774, |
| "learning_rate": 0.00017600000000000002, |
| "loss": 1.3067, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.1091157081154808, |
| "eval_loss": 1.2027504444122314, |
| "eval_runtime": 7.6999, |
| "eval_samples_per_second": 3.896, |
| "eval_steps_per_second": 1.948, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.11820868379177085, |
| "grad_norm": 0.7799960970878601, |
| "learning_rate": 0.000174, |
| "loss": 1.1987, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.12730165946806093, |
| "grad_norm": 0.6864632964134216, |
| "learning_rate": 0.000172, |
| "loss": 1.2013, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.12730165946806093, |
| "eval_loss": 1.1920855045318604, |
| "eval_runtime": 7.8738, |
| "eval_samples_per_second": 3.81, |
| "eval_steps_per_second": 1.905, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.136394635144351, |
| "grad_norm": 0.774085283279419, |
| "learning_rate": 0.00017, |
| "loss": 1.1184, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.14548761082064104, |
| "grad_norm": 0.6681156158447266, |
| "learning_rate": 0.000168, |
| "loss": 1.2931, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.14548761082064104, |
| "eval_loss": 1.1745468378067017, |
| "eval_runtime": 7.4956, |
| "eval_samples_per_second": 4.002, |
| "eval_steps_per_second": 2.001, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.15458058649693113, |
| "grad_norm": 0.7310240864753723, |
| "learning_rate": 0.000166, |
| "loss": 1.1426, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.16367356217322118, |
| "grad_norm": 0.8338828682899475, |
| "learning_rate": 0.000164, |
| "loss": 1.1719, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.16367356217322118, |
| "eval_loss": 1.1653213500976562, |
| "eval_runtime": 7.8929, |
| "eval_samples_per_second": 3.801, |
| "eval_steps_per_second": 1.9, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.17276653784951126, |
| "grad_norm": 0.732770562171936, |
| "learning_rate": 0.000162, |
| "loss": 1.2321, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.18185951352580132, |
| "grad_norm": 0.7523607611656189, |
| "learning_rate": 0.00016, |
| "loss": 1.2331, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.18185951352580132, |
| "eval_loss": 1.1490192413330078, |
| "eval_runtime": 7.6199, |
| "eval_samples_per_second": 3.937, |
| "eval_steps_per_second": 1.969, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.19095248920209137, |
| "grad_norm": 0.7681267261505127, |
| "learning_rate": 0.00015800000000000002, |
| "loss": 1.1277, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.20004546487838146, |
| "grad_norm": 0.7249591946601868, |
| "learning_rate": 0.00015600000000000002, |
| "loss": 1.142, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.20004546487838146, |
| "eval_loss": 1.137698769569397, |
| "eval_runtime": 7.8436, |
| "eval_samples_per_second": 3.825, |
| "eval_steps_per_second": 1.912, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.2091384405546715, |
| "grad_norm": 0.6904309391975403, |
| "learning_rate": 0.000154, |
| "loss": 1.2033, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.2182314162309616, |
| "grad_norm": 0.7456697821617126, |
| "learning_rate": 0.000152, |
| "loss": 1.1777, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.2182314162309616, |
| "eval_loss": 1.1293922662734985, |
| "eval_runtime": 7.2963, |
| "eval_samples_per_second": 4.112, |
| "eval_steps_per_second": 2.056, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.22732439190725165, |
| "grad_norm": 0.6743273735046387, |
| "learning_rate": 0.00015000000000000001, |
| "loss": 1.1582, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.2364173675835417, |
| "grad_norm": 0.6429440379142761, |
| "learning_rate": 0.000148, |
| "loss": 1.1064, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.2364173675835417, |
| "eval_loss": 1.119972825050354, |
| "eval_runtime": 7.7787, |
| "eval_samples_per_second": 3.857, |
| "eval_steps_per_second": 1.928, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.2455103432598318, |
| "grad_norm": 0.6626828908920288, |
| "learning_rate": 0.000146, |
| "loss": 1.1741, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.25460331893612187, |
| "grad_norm": 0.8786306381225586, |
| "learning_rate": 0.000144, |
| "loss": 0.9836, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.25460331893612187, |
| "eval_loss": 1.1226236820220947, |
| "eval_runtime": 7.3222, |
| "eval_samples_per_second": 4.097, |
| "eval_steps_per_second": 2.049, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.2636962946124119, |
| "grad_norm": 0.7686639428138733, |
| "learning_rate": 0.000142, |
| "loss": 1.0945, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.272789270288702, |
| "grad_norm": 0.795609712600708, |
| "learning_rate": 0.00014, |
| "loss": 0.9761, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.272789270288702, |
| "eval_loss": 1.0910608768463135, |
| "eval_runtime": 7.8761, |
| "eval_samples_per_second": 3.809, |
| "eval_steps_per_second": 1.905, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.28188224596499206, |
| "grad_norm": 0.8161769509315491, |
| "learning_rate": 0.000138, |
| "loss": 1.0516, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.2909752216412821, |
| "grad_norm": 0.7441025972366333, |
| "learning_rate": 0.00013600000000000003, |
| "loss": 1.0843, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.2909752216412821, |
| "eval_loss": 1.0994905233383179, |
| "eval_runtime": 7.3248, |
| "eval_samples_per_second": 4.096, |
| "eval_steps_per_second": 2.048, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.30006819731757217, |
| "grad_norm": 0.8015936613082886, |
| "learning_rate": 0.000134, |
| "loss": 1.2283, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.30916117299386225, |
| "grad_norm": 0.7653372287750244, |
| "learning_rate": 0.000132, |
| "loss": 1.0927, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.30916117299386225, |
| "eval_loss": 1.0781885385513306, |
| "eval_runtime": 7.7433, |
| "eval_samples_per_second": 3.874, |
| "eval_steps_per_second": 1.937, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.3182541486701523, |
| "grad_norm": 0.7825664281845093, |
| "learning_rate": 0.00013000000000000002, |
| "loss": 1.106, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.32734712434644236, |
| "grad_norm": 0.7554489970207214, |
| "learning_rate": 0.00012800000000000002, |
| "loss": 1.0999, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.32734712434644236, |
| "eval_loss": 1.0733944177627563, |
| "eval_runtime": 7.5964, |
| "eval_samples_per_second": 3.949, |
| "eval_steps_per_second": 1.975, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.33644010002273245, |
| "grad_norm": 0.8089460730552673, |
| "learning_rate": 0.000126, |
| "loss": 1.2226, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.34553307569902253, |
| "grad_norm": 0.7402002215385437, |
| "learning_rate": 0.000124, |
| "loss": 1.1182, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.34553307569902253, |
| "eval_loss": 1.0658830404281616, |
| "eval_runtime": 7.8865, |
| "eval_samples_per_second": 3.804, |
| "eval_steps_per_second": 1.902, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.35462605137531256, |
| "grad_norm": 0.6649179458618164, |
| "learning_rate": 0.000122, |
| "loss": 1.0671, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.36371902705160264, |
| "grad_norm": 0.7573872804641724, |
| "learning_rate": 0.00012, |
| "loss": 1.0291, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.36371902705160264, |
| "eval_loss": 1.0471783876419067, |
| "eval_runtime": 7.9526, |
| "eval_samples_per_second": 3.772, |
| "eval_steps_per_second": 1.886, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.3728120027278927, |
| "grad_norm": 0.8243398666381836, |
| "learning_rate": 0.000118, |
| "loss": 1.1096, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.38190497840418275, |
| "grad_norm": 0.721502423286438, |
| "learning_rate": 0.000116, |
| "loss": 1.2158, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.38190497840418275, |
| "eval_loss": 1.0554709434509277, |
| "eval_runtime": 7.3409, |
| "eval_samples_per_second": 4.087, |
| "eval_steps_per_second": 2.043, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.39099795408047283, |
| "grad_norm": 0.7591432332992554, |
| "learning_rate": 0.00011399999999999999, |
| "loss": 1.0817, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.4000909297567629, |
| "grad_norm": 0.7596343755722046, |
| "learning_rate": 0.00011200000000000001, |
| "loss": 1.0873, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.4000909297567629, |
| "eval_loss": 1.0482908487319946, |
| "eval_runtime": 7.8536, |
| "eval_samples_per_second": 3.82, |
| "eval_steps_per_second": 1.91, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.40918390543305294, |
| "grad_norm": 0.8296840190887451, |
| "learning_rate": 0.00011000000000000002, |
| "loss": 1.0252, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.418276881109343, |
| "grad_norm": 0.9094285368919373, |
| "learning_rate": 0.00010800000000000001, |
| "loss": 1.0978, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.418276881109343, |
| "eval_loss": 1.046170711517334, |
| "eval_runtime": 7.4472, |
| "eval_samples_per_second": 4.028, |
| "eval_steps_per_second": 2.014, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.4273698567856331, |
| "grad_norm": 0.8471206426620483, |
| "learning_rate": 0.00010600000000000002, |
| "loss": 1.0371, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.4364628324619232, |
| "grad_norm": 0.8168342113494873, |
| "learning_rate": 0.00010400000000000001, |
| "loss": 1.0352, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.4364628324619232, |
| "eval_loss": 1.0409115552902222, |
| "eval_runtime": 7.8502, |
| "eval_samples_per_second": 3.822, |
| "eval_steps_per_second": 1.911, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.4455558081382132, |
| "grad_norm": 0.7482770681381226, |
| "learning_rate": 0.00010200000000000001, |
| "loss": 1.0812, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.4546487838145033, |
| "grad_norm": 0.7300863862037659, |
| "learning_rate": 0.0001, |
| "loss": 1.1762, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.4546487838145033, |
| "eval_loss": 1.0410172939300537, |
| "eval_runtime": 7.4872, |
| "eval_samples_per_second": 4.007, |
| "eval_steps_per_second": 2.003, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.4637417594907934, |
| "grad_norm": 0.7066290378570557, |
| "learning_rate": 9.8e-05, |
| "loss": 1.1054, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.4728347351670834, |
| "grad_norm": 0.8214625716209412, |
| "learning_rate": 9.6e-05, |
| "loss": 1.0563, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.4728347351670834, |
| "eval_loss": 1.03702974319458, |
| "eval_runtime": 7.8723, |
| "eval_samples_per_second": 3.811, |
| "eval_steps_per_second": 1.905, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.4819277108433735, |
| "grad_norm": 0.8834312558174133, |
| "learning_rate": 9.4e-05, |
| "loss": 1.1071, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.4910206865196636, |
| "grad_norm": 0.768332302570343, |
| "learning_rate": 9.200000000000001e-05, |
| "loss": 1.0537, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.4910206865196636, |
| "eval_loss": 1.033887267112732, |
| "eval_runtime": 7.7503, |
| "eval_samples_per_second": 3.871, |
| "eval_steps_per_second": 1.935, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.5001136621959537, |
| "grad_norm": 0.805924654006958, |
| "learning_rate": 9e-05, |
| "loss": 1.1193, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.5092066378722437, |
| "grad_norm": 0.8571528792381287, |
| "learning_rate": 8.800000000000001e-05, |
| "loss": 1.0951, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.5092066378722437, |
| "eval_loss": 1.0283806324005127, |
| "eval_runtime": 7.6361, |
| "eval_samples_per_second": 3.929, |
| "eval_steps_per_second": 1.964, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.5182996135485337, |
| "grad_norm": 0.8743025064468384, |
| "learning_rate": 8.6e-05, |
| "loss": 0.9861, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.5273925892248238, |
| "grad_norm": 0.8119250535964966, |
| "learning_rate": 8.4e-05, |
| "loss": 1.0458, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.5273925892248238, |
| "eval_loss": 1.0257965326309204, |
| "eval_runtime": 7.8945, |
| "eval_samples_per_second": 3.8, |
| "eval_steps_per_second": 1.9, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.5364855649011139, |
| "grad_norm": 0.9032679796218872, |
| "learning_rate": 8.2e-05, |
| "loss": 1.0145, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.545578540577404, |
| "grad_norm": 0.8125148415565491, |
| "learning_rate": 8e-05, |
| "loss": 1.0212, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.545578540577404, |
| "eval_loss": 1.018557071685791, |
| "eval_runtime": 7.5438, |
| "eval_samples_per_second": 3.977, |
| "eval_steps_per_second": 1.988, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.554671516253694, |
| "grad_norm": 0.77150958776474, |
| "learning_rate": 7.800000000000001e-05, |
| "loss": 1.0901, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.5637644919299841, |
| "grad_norm": 0.8303976058959961, |
| "learning_rate": 7.6e-05, |
| "loss": 1.0535, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.5637644919299841, |
| "eval_loss": 1.019250750541687, |
| "eval_runtime": 7.9264, |
| "eval_samples_per_second": 3.785, |
| "eval_steps_per_second": 1.892, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.5728574676062742, |
| "grad_norm": 0.8433631658554077, |
| "learning_rate": 7.4e-05, |
| "loss": 1.1187, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.5819504432825642, |
| "grad_norm": 0.8279653787612915, |
| "learning_rate": 7.2e-05, |
| "loss": 1.1483, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.5819504432825642, |
| "eval_loss": 1.0166659355163574, |
| "eval_runtime": 7.3093, |
| "eval_samples_per_second": 4.104, |
| "eval_steps_per_second": 2.052, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.5910434189588543, |
| "grad_norm": 0.6873704791069031, |
| "learning_rate": 7e-05, |
| "loss": 1.0573, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.6001363946351443, |
| "grad_norm": 0.7217792868614197, |
| "learning_rate": 6.800000000000001e-05, |
| "loss": 1.0225, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.6001363946351443, |
| "eval_loss": 1.0203421115875244, |
| "eval_runtime": 7.9938, |
| "eval_samples_per_second": 3.753, |
| "eval_steps_per_second": 1.876, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.6092293703114344, |
| "grad_norm": 0.828619122505188, |
| "learning_rate": 6.6e-05, |
| "loss": 1.0272, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.6183223459877245, |
| "grad_norm": 0.7822660207748413, |
| "learning_rate": 6.400000000000001e-05, |
| "loss": 0.9776, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.6183223459877245, |
| "eval_loss": 1.0186898708343506, |
| "eval_runtime": 7.3434, |
| "eval_samples_per_second": 4.085, |
| "eval_steps_per_second": 2.043, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.6274153216640146, |
| "grad_norm": 0.7307916283607483, |
| "learning_rate": 6.2e-05, |
| "loss": 1.0637, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.6365082973403046, |
| "grad_norm": 0.8595789670944214, |
| "learning_rate": 6e-05, |
| "loss": 1.0571, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.6365082973403046, |
| "eval_loss": 1.008802056312561, |
| "eval_runtime": 7.6422, |
| "eval_samples_per_second": 3.926, |
| "eval_steps_per_second": 1.963, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.6456012730165946, |
| "grad_norm": 1.0007542371749878, |
| "learning_rate": 5.8e-05, |
| "loss": 1.1277, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.6546942486928847, |
| "grad_norm": 0.8014799356460571, |
| "learning_rate": 5.6000000000000006e-05, |
| "loss": 1.2342, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.6546942486928847, |
| "eval_loss": 1.0054609775543213, |
| "eval_runtime": 7.5443, |
| "eval_samples_per_second": 3.977, |
| "eval_steps_per_second": 1.988, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.6637872243691748, |
| "grad_norm": 0.8301798105239868, |
| "learning_rate": 5.4000000000000005e-05, |
| "loss": 1.0886, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.6728802000454649, |
| "grad_norm": 0.8582270741462708, |
| "learning_rate": 5.2000000000000004e-05, |
| "loss": 1.0834, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.6728802000454649, |
| "eval_loss": 0.9980356693267822, |
| "eval_runtime": 7.3182, |
| "eval_samples_per_second": 4.099, |
| "eval_steps_per_second": 2.05, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.681973175721755, |
| "grad_norm": 0.9084227085113525, |
| "learning_rate": 5e-05, |
| "loss": 1.0517, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.6910661513980451, |
| "grad_norm": 0.8120643496513367, |
| "learning_rate": 4.8e-05, |
| "loss": 1.0931, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.6910661513980451, |
| "eval_loss": 0.9912369847297668, |
| "eval_runtime": 7.8444, |
| "eval_samples_per_second": 3.824, |
| "eval_steps_per_second": 1.912, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.700159127074335, |
| "grad_norm": 0.8523077964782715, |
| "learning_rate": 4.600000000000001e-05, |
| "loss": 1.0883, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.7092521027506251, |
| "grad_norm": 0.8379296660423279, |
| "learning_rate": 4.4000000000000006e-05, |
| "loss": 1.1041, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.7092521027506251, |
| "eval_loss": 0.9924930334091187, |
| "eval_runtime": 7.364, |
| "eval_samples_per_second": 4.074, |
| "eval_steps_per_second": 2.037, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.7183450784269152, |
| "grad_norm": 0.9272042512893677, |
| "learning_rate": 4.2e-05, |
| "loss": 1.0839, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.7274380541032053, |
| "grad_norm": 0.8774125576019287, |
| "learning_rate": 4e-05, |
| "loss": 0.9889, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.7274380541032053, |
| "eval_loss": 0.9954690337181091, |
| "eval_runtime": 7.8404, |
| "eval_samples_per_second": 3.826, |
| "eval_steps_per_second": 1.913, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.7365310297794954, |
| "grad_norm": 0.7553389072418213, |
| "learning_rate": 3.8e-05, |
| "loss": 1.0906, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.7456240054557854, |
| "grad_norm": 0.7866451740264893, |
| "learning_rate": 3.6e-05, |
| "loss": 1.0219, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.7456240054557854, |
| "eval_loss": 0.994717001914978, |
| "eval_runtime": 7.8266, |
| "eval_samples_per_second": 3.833, |
| "eval_steps_per_second": 1.917, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.7547169811320755, |
| "grad_norm": 0.8554181456565857, |
| "learning_rate": 3.4000000000000007e-05, |
| "loss": 1.0598, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.7638099568083655, |
| "grad_norm": 0.9773761034011841, |
| "learning_rate": 3.2000000000000005e-05, |
| "loss": 1.033, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.7638099568083655, |
| "eval_loss": 0.9926409125328064, |
| "eval_runtime": 7.2819, |
| "eval_samples_per_second": 4.12, |
| "eval_steps_per_second": 2.06, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.7729029324846556, |
| "grad_norm": 0.8768495917320251, |
| "learning_rate": 3e-05, |
| "loss": 1.054, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.7819959081609457, |
| "grad_norm": 0.787002682685852, |
| "learning_rate": 2.8000000000000003e-05, |
| "loss": 1.0548, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.7819959081609457, |
| "eval_loss": 0.9910202622413635, |
| "eval_runtime": 7.8704, |
| "eval_samples_per_second": 3.812, |
| "eval_steps_per_second": 1.906, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.7910888838372357, |
| "grad_norm": 0.843839704990387, |
| "learning_rate": 2.6000000000000002e-05, |
| "loss": 1.0936, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.8001818595135258, |
| "grad_norm": 0.9202592968940735, |
| "learning_rate": 2.4e-05, |
| "loss": 1.0684, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.8001818595135258, |
| "eval_loss": 0.9879806637763977, |
| "eval_runtime": 7.292, |
| "eval_samples_per_second": 4.114, |
| "eval_steps_per_second": 2.057, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.8092748351898159, |
| "grad_norm": 0.8747548460960388, |
| "learning_rate": 2.2000000000000003e-05, |
| "loss": 1.0185, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.8183678108661059, |
| "grad_norm": 0.8311501145362854, |
| "learning_rate": 2e-05, |
| "loss": 1.0874, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.8183678108661059, |
| "eval_loss": 0.9860556125640869, |
| "eval_runtime": 7.7936, |
| "eval_samples_per_second": 3.849, |
| "eval_steps_per_second": 1.925, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.827460786542396, |
| "grad_norm": 0.8813076615333557, |
| "learning_rate": 1.8e-05, |
| "loss": 1.0209, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.836553762218686, |
| "grad_norm": 0.9480300545692444, |
| "learning_rate": 1.6000000000000003e-05, |
| "loss": 1.0878, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.836553762218686, |
| "eval_loss": 0.9852551817893982, |
| "eval_runtime": 7.2978, |
| "eval_samples_per_second": 4.111, |
| "eval_steps_per_second": 2.055, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.8456467378949761, |
| "grad_norm": 0.8942534923553467, |
| "learning_rate": 1.4000000000000001e-05, |
| "loss": 0.9746, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.8547397135712662, |
| "grad_norm": 0.9491382837295532, |
| "learning_rate": 1.2e-05, |
| "loss": 0.9443, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.8547397135712662, |
| "eval_loss": 0.9845015406608582, |
| "eval_runtime": 7.7967, |
| "eval_samples_per_second": 3.848, |
| "eval_steps_per_second": 1.924, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.8638326892475563, |
| "grad_norm": 0.9191480278968811, |
| "learning_rate": 1e-05, |
| "loss": 1.0311, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.8729256649238464, |
| "grad_norm": 0.8474745750427246, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 1.1006, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.8729256649238464, |
| "eval_loss": 0.9836694002151489, |
| "eval_runtime": 7.3154, |
| "eval_samples_per_second": 4.101, |
| "eval_steps_per_second": 2.05, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.8820186406001363, |
| "grad_norm": 0.8463994860649109, |
| "learning_rate": 6e-06, |
| "loss": 1.0196, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.8911116162764264, |
| "grad_norm": 0.8902223706245422, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 1.0447, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.8911116162764264, |
| "eval_loss": 0.9838915467262268, |
| "eval_runtime": 7.7764, |
| "eval_samples_per_second": 3.858, |
| "eval_steps_per_second": 1.929, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.9002045919527165, |
| "grad_norm": 0.8993239998817444, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 1.0981, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.9092975676290066, |
| "grad_norm": 0.8118588924407959, |
| "learning_rate": 0.0, |
| "loss": 1.0078, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.9092975676290066, |
| "eval_loss": 0.9837616086006165, |
| "eval_runtime": 7.456, |
| "eval_samples_per_second": 4.024, |
| "eval_steps_per_second": 2.012, |
| "step": 500 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 500, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 20, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.068977650017795e+16, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|