| { |
| "best_global_step": 20000, |
| "best_metric": 0.8734950423240662, |
| "best_model_checkpoint": "/scratch-shared/gwijngaard/outputs/qwen2_5-sft-nolora-nonewtoken-nofreeze_20250930_231031/checkpoint-20000", |
| "epoch": 1.0, |
| "eval_steps": 5000, |
| "global_step": 22500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 2.9206753075122833, |
| "epoch": 4.4444444444444447e-05, |
| "grad_norm": 8.4375, |
| "learning_rate": 0.0, |
| "loss": 1.1975, |
| "mean_token_accuracy": 0.7345979571342468, |
| "num_tokens": 29463751.0, |
| "step": 1 |
| }, |
| { |
| "entropy": 2.8330495211542868, |
| "epoch": 0.0022222222222222222, |
| "grad_norm": 8.5625, |
| "learning_rate": 1.451851851851852e-06, |
| "loss": 0.8656, |
| "mean_token_accuracy": 0.7418258153662389, |
| "num_tokens": 29529779.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 2.836770453453064, |
| "epoch": 0.0044444444444444444, |
| "grad_norm": 6.625, |
| "learning_rate": 2.9333333333333338e-06, |
| "loss": 0.8785, |
| "mean_token_accuracy": 0.7382896327972412, |
| "num_tokens": 29596644.0, |
| "step": 100 |
| }, |
| { |
| "entropy": 2.9122318506240843, |
| "epoch": 0.006666666666666667, |
| "grad_norm": 8.3125, |
| "learning_rate": 4.4148148148148154e-06, |
| "loss": 0.8733, |
| "mean_token_accuracy": 0.7413022458553314, |
| "num_tokens": 29660760.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 2.900973844528198, |
| "epoch": 0.008888888888888889, |
| "grad_norm": 7.65625, |
| "learning_rate": 5.896296296296296e-06, |
| "loss": 0.897, |
| "mean_token_accuracy": 0.737481083869934, |
| "num_tokens": 29728064.0, |
| "step": 200 |
| }, |
| { |
| "entropy": 2.8783673429489136, |
| "epoch": 0.011111111111111112, |
| "grad_norm": 7.53125, |
| "learning_rate": 7.377777777777778e-06, |
| "loss": 0.8858, |
| "mean_token_accuracy": 0.7386987507343292, |
| "num_tokens": 29793159.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 2.855666689872742, |
| "epoch": 0.013333333333333334, |
| "grad_norm": 6.71875, |
| "learning_rate": 8.85925925925926e-06, |
| "loss": 0.8958, |
| "mean_token_accuracy": 0.736550339460373, |
| "num_tokens": 29858642.0, |
| "step": 300 |
| }, |
| { |
| "entropy": 2.7782880878448486, |
| "epoch": 0.015555555555555555, |
| "grad_norm": 9.125, |
| "learning_rate": 1.0340740740740743e-05, |
| "loss": 0.8976, |
| "mean_token_accuracy": 0.7359859848022461, |
| "num_tokens": 29927577.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 2.747315707206726, |
| "epoch": 0.017777777777777778, |
| "grad_norm": 9.125, |
| "learning_rate": 1.1822222222222225e-05, |
| "loss": 0.851, |
| "mean_token_accuracy": 0.7458417963981628, |
| "num_tokens": 29993260.0, |
| "step": 400 |
| }, |
| { |
| "entropy": 2.777502555847168, |
| "epoch": 0.02, |
| "grad_norm": 6.28125, |
| "learning_rate": 1.3303703703703705e-05, |
| "loss": 0.8894, |
| "mean_token_accuracy": 0.7404855847358703, |
| "num_tokens": 30061485.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 2.879872035980225, |
| "epoch": 0.022222222222222223, |
| "grad_norm": 8.4375, |
| "learning_rate": 1.4785185185185186e-05, |
| "loss": 0.9081, |
| "mean_token_accuracy": 0.7336488461494446, |
| "num_tokens": 30126648.0, |
| "step": 500 |
| }, |
| { |
| "entropy": 2.7876138353347777, |
| "epoch": 0.024444444444444446, |
| "grad_norm": 9.6875, |
| "learning_rate": 1.6266666666666668e-05, |
| "loss": 0.8461, |
| "mean_token_accuracy": 0.7486303246021271, |
| "num_tokens": 30192879.0, |
| "step": 550 |
| }, |
| { |
| "entropy": 2.9612947702407837, |
| "epoch": 0.02666666666666667, |
| "grad_norm": 8.0, |
| "learning_rate": 1.774814814814815e-05, |
| "loss": 0.8981, |
| "mean_token_accuracy": 0.7390279281139374, |
| "num_tokens": 30257537.0, |
| "step": 600 |
| }, |
| { |
| "entropy": 2.7916600465774537, |
| "epoch": 0.028888888888888888, |
| "grad_norm": 7.9375, |
| "learning_rate": 1.922962962962963e-05, |
| "loss": 0.8675, |
| "mean_token_accuracy": 0.7437423765659332, |
| "num_tokens": 30323879.0, |
| "step": 650 |
| }, |
| { |
| "entropy": 2.8081023025512697, |
| "epoch": 0.03111111111111111, |
| "grad_norm": 7.28125, |
| "learning_rate": 1.9978006872852237e-05, |
| "loss": 0.8574, |
| "mean_token_accuracy": 0.7459037184715271, |
| "num_tokens": 30393385.0, |
| "step": 700 |
| }, |
| { |
| "entropy": 2.9191632890701293, |
| "epoch": 0.03333333333333333, |
| "grad_norm": 8.3125, |
| "learning_rate": 1.9932187857961058e-05, |
| "loss": 0.9081, |
| "mean_token_accuracy": 0.7324011015892029, |
| "num_tokens": 30458516.0, |
| "step": 750 |
| }, |
| { |
| "entropy": 2.783015651702881, |
| "epoch": 0.035555555555555556, |
| "grad_norm": 6.875, |
| "learning_rate": 1.9886368843069876e-05, |
| "loss": 0.8403, |
| "mean_token_accuracy": 0.7504636573791504, |
| "num_tokens": 30522811.0, |
| "step": 800 |
| }, |
| { |
| "entropy": 2.782901654243469, |
| "epoch": 0.03777777777777778, |
| "grad_norm": 7.90625, |
| "learning_rate": 1.9840549828178697e-05, |
| "loss": 0.8633, |
| "mean_token_accuracy": 0.7460731649398804, |
| "num_tokens": 30588800.0, |
| "step": 850 |
| }, |
| { |
| "entropy": 2.852466220855713, |
| "epoch": 0.04, |
| "grad_norm": 7.875, |
| "learning_rate": 1.9794730813287515e-05, |
| "loss": 0.8939, |
| "mean_token_accuracy": 0.7356749868392944, |
| "num_tokens": 30654866.0, |
| "step": 900 |
| }, |
| { |
| "entropy": 2.8751544713974, |
| "epoch": 0.042222222222222223, |
| "grad_norm": 6.5625, |
| "learning_rate": 1.9748911798396336e-05, |
| "loss": 0.8526, |
| "mean_token_accuracy": 0.7440717363357544, |
| "num_tokens": 30718125.0, |
| "step": 950 |
| }, |
| { |
| "entropy": 2.791564016342163, |
| "epoch": 0.044444444444444446, |
| "grad_norm": 8.5625, |
| "learning_rate": 1.9703092783505157e-05, |
| "loss": 0.8528, |
| "mean_token_accuracy": 0.743047387599945, |
| "num_tokens": 30784326.0, |
| "step": 1000 |
| }, |
| { |
| "entropy": 2.7872579193115232, |
| "epoch": 0.04666666666666667, |
| "grad_norm": 10.3125, |
| "learning_rate": 1.9657273768613975e-05, |
| "loss": 0.8516, |
| "mean_token_accuracy": 0.747126750946045, |
| "num_tokens": 30851638.0, |
| "step": 1050 |
| }, |
| { |
| "entropy": 2.7552728176116945, |
| "epoch": 0.04888888888888889, |
| "grad_norm": 8.4375, |
| "learning_rate": 1.9611454753722796e-05, |
| "loss": 0.8552, |
| "mean_token_accuracy": 0.7450509345531464, |
| "num_tokens": 30917218.0, |
| "step": 1100 |
| }, |
| { |
| "entropy": 2.7746379947662354, |
| "epoch": 0.051111111111111114, |
| "grad_norm": 9.3125, |
| "learning_rate": 1.9565635738831617e-05, |
| "loss": 0.8364, |
| "mean_token_accuracy": 0.754111316204071, |
| "num_tokens": 30981770.0, |
| "step": 1150 |
| }, |
| { |
| "entropy": 2.6211288261413572, |
| "epoch": 0.05333333333333334, |
| "grad_norm": 7.03125, |
| "learning_rate": 1.9519816723940438e-05, |
| "loss": 0.8351, |
| "mean_token_accuracy": 0.7522010815143585, |
| "num_tokens": 31048479.0, |
| "step": 1200 |
| }, |
| { |
| "entropy": 2.6207118034362793, |
| "epoch": 0.05555555555555555, |
| "grad_norm": 8.5625, |
| "learning_rate": 1.9473997709049256e-05, |
| "loss": 0.8428, |
| "mean_token_accuracy": 0.7495422863960266, |
| "num_tokens": 31116595.0, |
| "step": 1250 |
| }, |
| { |
| "entropy": 2.6855200052261354, |
| "epoch": 0.057777777777777775, |
| "grad_norm": 9.1875, |
| "learning_rate": 1.9428178694158077e-05, |
| "loss": 0.8629, |
| "mean_token_accuracy": 0.7421471869945526, |
| "num_tokens": 31183980.0, |
| "step": 1300 |
| }, |
| { |
| "entropy": 2.528144361972809, |
| "epoch": 0.06, |
| "grad_norm": 6.0625, |
| "learning_rate": 1.93823596792669e-05, |
| "loss": 0.7948, |
| "mean_token_accuracy": 0.7594469666481019, |
| "num_tokens": 31250261.0, |
| "step": 1350 |
| }, |
| { |
| "entropy": 2.5050185775756835, |
| "epoch": 0.06222222222222222, |
| "grad_norm": 8.5, |
| "learning_rate": 1.933654066437572e-05, |
| "loss": 0.8032, |
| "mean_token_accuracy": 0.7555232620239258, |
| "num_tokens": 31316852.0, |
| "step": 1400 |
| }, |
| { |
| "entropy": 2.511464171409607, |
| "epoch": 0.06444444444444444, |
| "grad_norm": 8.4375, |
| "learning_rate": 1.9290721649484537e-05, |
| "loss": 0.8122, |
| "mean_token_accuracy": 0.7525858426094055, |
| "num_tokens": 31381166.0, |
| "step": 1450 |
| }, |
| { |
| "entropy": 2.47177143573761, |
| "epoch": 0.06666666666666667, |
| "grad_norm": 7.59375, |
| "learning_rate": 1.924490263459336e-05, |
| "loss": 0.8022, |
| "mean_token_accuracy": 0.7576224994659424, |
| "num_tokens": 31450722.0, |
| "step": 1500 |
| }, |
| { |
| "entropy": 2.451930871009827, |
| "epoch": 0.06888888888888889, |
| "grad_norm": 12.3125, |
| "learning_rate": 1.919908361970218e-05, |
| "loss": 0.8025, |
| "mean_token_accuracy": 0.7531751859188079, |
| "num_tokens": 31517729.0, |
| "step": 1550 |
| }, |
| { |
| "entropy": 2.46038672208786, |
| "epoch": 0.07111111111111111, |
| "grad_norm": 6.84375, |
| "learning_rate": 1.9153264604810998e-05, |
| "loss": 0.8188, |
| "mean_token_accuracy": 0.7543287444114685, |
| "num_tokens": 31584935.0, |
| "step": 1600 |
| }, |
| { |
| "entropy": 2.5046206855773927, |
| "epoch": 0.07333333333333333, |
| "grad_norm": 7.40625, |
| "learning_rate": 1.910744558991982e-05, |
| "loss": 0.8076, |
| "mean_token_accuracy": 0.7573912596702576, |
| "num_tokens": 31651819.0, |
| "step": 1650 |
| }, |
| { |
| "entropy": 2.587780613899231, |
| "epoch": 0.07555555555555556, |
| "grad_norm": 6.0625, |
| "learning_rate": 1.9061626575028637e-05, |
| "loss": 0.7887, |
| "mean_token_accuracy": 0.7603560221195221, |
| "num_tokens": 31716593.0, |
| "step": 1700 |
| }, |
| { |
| "entropy": 2.471900417804718, |
| "epoch": 0.07777777777777778, |
| "grad_norm": 7.0, |
| "learning_rate": 1.9015807560137458e-05, |
| "loss": 0.7814, |
| "mean_token_accuracy": 0.7607843494415283, |
| "num_tokens": 31783153.0, |
| "step": 1750 |
| }, |
| { |
| "entropy": 2.4517657709121705, |
| "epoch": 0.08, |
| "grad_norm": 7.1875, |
| "learning_rate": 1.896998854524628e-05, |
| "loss": 0.8497, |
| "mean_token_accuracy": 0.7433587527275085, |
| "num_tokens": 31851501.0, |
| "step": 1800 |
| }, |
| { |
| "entropy": 2.4117108368873597, |
| "epoch": 0.08222222222222222, |
| "grad_norm": 7.0625, |
| "learning_rate": 1.89241695303551e-05, |
| "loss": 0.799, |
| "mean_token_accuracy": 0.7592541599273681, |
| "num_tokens": 31916822.0, |
| "step": 1850 |
| }, |
| { |
| "entropy": 2.35961368560791, |
| "epoch": 0.08444444444444445, |
| "grad_norm": 12.4375, |
| "learning_rate": 1.8878350515463918e-05, |
| "loss": 0.8093, |
| "mean_token_accuracy": 0.7549574375152588, |
| "num_tokens": 31984311.0, |
| "step": 1900 |
| }, |
| { |
| "entropy": 2.444758574962616, |
| "epoch": 0.08666666666666667, |
| "grad_norm": 7.0, |
| "learning_rate": 1.883253150057274e-05, |
| "loss": 0.7778, |
| "mean_token_accuracy": 0.7624431896209717, |
| "num_tokens": 32049197.0, |
| "step": 1950 |
| }, |
| { |
| "entropy": 2.4109366965293884, |
| "epoch": 0.08888888888888889, |
| "grad_norm": 8.6875, |
| "learning_rate": 1.878671248568156e-05, |
| "loss": 0.8078, |
| "mean_token_accuracy": 0.7562625980377198, |
| "num_tokens": 32114585.0, |
| "step": 2000 |
| }, |
| { |
| "entropy": 2.338375616073608, |
| "epoch": 0.09111111111111111, |
| "grad_norm": 7.8125, |
| "learning_rate": 1.874089347079038e-05, |
| "loss": 0.8051, |
| "mean_token_accuracy": 0.7559153795242309, |
| "num_tokens": 32182189.0, |
| "step": 2050 |
| }, |
| { |
| "entropy": 2.3647463655471803, |
| "epoch": 0.09333333333333334, |
| "grad_norm": 8.5625, |
| "learning_rate": 1.86950744558992e-05, |
| "loss": 0.7964, |
| "mean_token_accuracy": 0.7597098231315613, |
| "num_tokens": 32246491.0, |
| "step": 2100 |
| }, |
| { |
| "entropy": 2.3017068338394164, |
| "epoch": 0.09555555555555556, |
| "grad_norm": 11.0625, |
| "learning_rate": 1.864925544100802e-05, |
| "loss": 0.8237, |
| "mean_token_accuracy": 0.749286116361618, |
| "num_tokens": 32313665.0, |
| "step": 2150 |
| }, |
| { |
| "entropy": 2.3421465373039245, |
| "epoch": 0.09777777777777778, |
| "grad_norm": 8.6875, |
| "learning_rate": 1.860343642611684e-05, |
| "loss": 0.7969, |
| "mean_token_accuracy": 0.757558046579361, |
| "num_tokens": 32377681.0, |
| "step": 2200 |
| }, |
| { |
| "entropy": 2.2409315156936644, |
| "epoch": 0.1, |
| "grad_norm": 8.25, |
| "learning_rate": 1.855761741122566e-05, |
| "loss": 0.782, |
| "mean_token_accuracy": 0.7633419442176819, |
| "num_tokens": 32445533.0, |
| "step": 2250 |
| }, |
| { |
| "entropy": 2.2817729759216308, |
| "epoch": 0.10222222222222223, |
| "grad_norm": 8.5625, |
| "learning_rate": 1.851179839633448e-05, |
| "loss": 0.803, |
| "mean_token_accuracy": 0.7561051642894745, |
| "num_tokens": 32514116.0, |
| "step": 2300 |
| }, |
| { |
| "entropy": 2.151771836280823, |
| "epoch": 0.10444444444444445, |
| "grad_norm": 7.125, |
| "learning_rate": 1.84659793814433e-05, |
| "loss": 0.7906, |
| "mean_token_accuracy": 0.7589080274105072, |
| "num_tokens": 32583284.0, |
| "step": 2350 |
| }, |
| { |
| "entropy": 2.184410240650177, |
| "epoch": 0.10666666666666667, |
| "grad_norm": 10.4375, |
| "learning_rate": 1.842016036655212e-05, |
| "loss": 0.8398, |
| "mean_token_accuracy": 0.7480292344093322, |
| "num_tokens": 32650513.0, |
| "step": 2400 |
| }, |
| { |
| "entropy": 2.18844452381134, |
| "epoch": 0.10888888888888888, |
| "grad_norm": 10.5, |
| "learning_rate": 1.837434135166094e-05, |
| "loss": 0.801, |
| "mean_token_accuracy": 0.7543882429599762, |
| "num_tokens": 32716407.0, |
| "step": 2450 |
| }, |
| { |
| "entropy": 2.172247538566589, |
| "epoch": 0.1111111111111111, |
| "grad_norm": 10.5625, |
| "learning_rate": 1.8328522336769762e-05, |
| "loss": 0.7963, |
| "mean_token_accuracy": 0.7586643397808075, |
| "num_tokens": 32781494.0, |
| "step": 2500 |
| }, |
| { |
| "entropy": 2.241757538318634, |
| "epoch": 0.11333333333333333, |
| "grad_norm": 9.9375, |
| "learning_rate": 1.828270332187858e-05, |
| "loss": 0.8141, |
| "mean_token_accuracy": 0.7552792716026306, |
| "num_tokens": 32846884.0, |
| "step": 2550 |
| }, |
| { |
| "entropy": 2.2868755722045897, |
| "epoch": 0.11555555555555555, |
| "grad_norm": 7.8125, |
| "learning_rate": 1.82368843069874e-05, |
| "loss": 0.8164, |
| "mean_token_accuracy": 0.7541338610649109, |
| "num_tokens": 32912538.0, |
| "step": 2600 |
| }, |
| { |
| "entropy": 2.2125061202049254, |
| "epoch": 0.11777777777777777, |
| "grad_norm": 7.1875, |
| "learning_rate": 1.8191065292096222e-05, |
| "loss": 0.7896, |
| "mean_token_accuracy": 0.7596218919754029, |
| "num_tokens": 32980319.0, |
| "step": 2650 |
| }, |
| { |
| "entropy": 2.155822920799255, |
| "epoch": 0.12, |
| "grad_norm": 6.78125, |
| "learning_rate": 1.8145246277205043e-05, |
| "loss": 0.8207, |
| "mean_token_accuracy": 0.754329582452774, |
| "num_tokens": 33049961.0, |
| "step": 2700 |
| }, |
| { |
| "entropy": 2.1824621748924256, |
| "epoch": 0.12222222222222222, |
| "grad_norm": 7.59375, |
| "learning_rate": 1.809942726231386e-05, |
| "loss": 0.7661, |
| "mean_token_accuracy": 0.7682196378707886, |
| "num_tokens": 33116366.0, |
| "step": 2750 |
| }, |
| { |
| "entropy": 2.231255145072937, |
| "epoch": 0.12444444444444444, |
| "grad_norm": 8.875, |
| "learning_rate": 1.8053608247422682e-05, |
| "loss": 0.8048, |
| "mean_token_accuracy": 0.7537202823162079, |
| "num_tokens": 33185374.0, |
| "step": 2800 |
| }, |
| { |
| "entropy": 2.2291252851486205, |
| "epoch": 0.12666666666666668, |
| "grad_norm": 8.125, |
| "learning_rate": 1.8007789232531504e-05, |
| "loss": 0.7598, |
| "mean_token_accuracy": 0.766055703163147, |
| "num_tokens": 33252044.0, |
| "step": 2850 |
| }, |
| { |
| "entropy": 2.1579408121109007, |
| "epoch": 0.1288888888888889, |
| "grad_norm": 8.3125, |
| "learning_rate": 1.7961970217640325e-05, |
| "loss": 0.7862, |
| "mean_token_accuracy": 0.7643947994709015, |
| "num_tokens": 33319654.0, |
| "step": 2900 |
| }, |
| { |
| "entropy": 2.1827985763549806, |
| "epoch": 0.13111111111111112, |
| "grad_norm": 7.375, |
| "learning_rate": 1.7916151202749143e-05, |
| "loss": 0.8002, |
| "mean_token_accuracy": 0.7550258612632752, |
| "num_tokens": 33388120.0, |
| "step": 2950 |
| }, |
| { |
| "entropy": 2.2125383615493774, |
| "epoch": 0.13333333333333333, |
| "grad_norm": 8.0, |
| "learning_rate": 1.787033218785796e-05, |
| "loss": 0.7784, |
| "mean_token_accuracy": 0.7626236033439636, |
| "num_tokens": 33454419.0, |
| "step": 3000 |
| }, |
| { |
| "entropy": 2.1113911986351015, |
| "epoch": 0.13555555555555557, |
| "grad_norm": 6.6875, |
| "learning_rate": 1.782451317296678e-05, |
| "loss": 0.7674, |
| "mean_token_accuracy": 0.7665451729297638, |
| "num_tokens": 33520682.0, |
| "step": 3050 |
| }, |
| { |
| "entropy": 2.1803041291236878, |
| "epoch": 0.13777777777777778, |
| "grad_norm": 7.34375, |
| "learning_rate": 1.7778694158075603e-05, |
| "loss": 0.7898, |
| "mean_token_accuracy": 0.7653110408782959, |
| "num_tokens": 33588492.0, |
| "step": 3100 |
| }, |
| { |
| "entropy": 2.2334672379493714, |
| "epoch": 0.14, |
| "grad_norm": 7.46875, |
| "learning_rate": 1.7732875143184424e-05, |
| "loss": 0.7731, |
| "mean_token_accuracy": 0.7676819574832916, |
| "num_tokens": 33654548.0, |
| "step": 3150 |
| }, |
| { |
| "entropy": 2.1736029314994814, |
| "epoch": 0.14222222222222222, |
| "grad_norm": 9.3125, |
| "learning_rate": 1.768705612829324e-05, |
| "loss": 0.8031, |
| "mean_token_accuracy": 0.7566261160373687, |
| "num_tokens": 33722789.0, |
| "step": 3200 |
| }, |
| { |
| "entropy": 2.2151904988288877, |
| "epoch": 0.14444444444444443, |
| "grad_norm": 7.625, |
| "learning_rate": 1.7641237113402063e-05, |
| "loss": 0.7979, |
| "mean_token_accuracy": 0.7582576811313629, |
| "num_tokens": 33789866.0, |
| "step": 3250 |
| }, |
| { |
| "entropy": 2.213924946784973, |
| "epoch": 0.14666666666666667, |
| "grad_norm": 9.3125, |
| "learning_rate": 1.7595418098510884e-05, |
| "loss": 0.7889, |
| "mean_token_accuracy": 0.7583772194385529, |
| "num_tokens": 33857885.0, |
| "step": 3300 |
| }, |
| { |
| "entropy": 2.144984345436096, |
| "epoch": 0.14888888888888888, |
| "grad_norm": 7.78125, |
| "learning_rate": 1.7549599083619705e-05, |
| "loss": 0.8019, |
| "mean_token_accuracy": 0.7535349237918854, |
| "num_tokens": 33923487.0, |
| "step": 3350 |
| }, |
| { |
| "entropy": 2.1649972796440125, |
| "epoch": 0.1511111111111111, |
| "grad_norm": 8.125, |
| "learning_rate": 1.7503780068728523e-05, |
| "loss": 0.7989, |
| "mean_token_accuracy": 0.757453374862671, |
| "num_tokens": 33989482.0, |
| "step": 3400 |
| }, |
| { |
| "entropy": 2.2169123339653014, |
| "epoch": 0.15333333333333332, |
| "grad_norm": 6.5625, |
| "learning_rate": 1.7457961053837344e-05, |
| "loss": 0.8161, |
| "mean_token_accuracy": 0.7528200805187225, |
| "num_tokens": 34059798.0, |
| "step": 3450 |
| }, |
| { |
| "entropy": 2.2231992268562317, |
| "epoch": 0.15555555555555556, |
| "grad_norm": 7.78125, |
| "learning_rate": 1.7412142038946165e-05, |
| "loss": 0.8242, |
| "mean_token_accuracy": 0.7535739564895629, |
| "num_tokens": 34125686.0, |
| "step": 3500 |
| }, |
| { |
| "entropy": 2.1861121082305908, |
| "epoch": 0.15777777777777777, |
| "grad_norm": 7.125, |
| "learning_rate": 1.7366323024054987e-05, |
| "loss": 0.777, |
| "mean_token_accuracy": 0.7635022902488708, |
| "num_tokens": 34192858.0, |
| "step": 3550 |
| }, |
| { |
| "entropy": 2.1724994444847106, |
| "epoch": 0.16, |
| "grad_norm": 8.0625, |
| "learning_rate": 1.7320504009163804e-05, |
| "loss": 0.7834, |
| "mean_token_accuracy": 0.76077321767807, |
| "num_tokens": 34259641.0, |
| "step": 3600 |
| }, |
| { |
| "entropy": 2.182099552154541, |
| "epoch": 0.1622222222222222, |
| "grad_norm": 7.09375, |
| "learning_rate": 1.7274684994272622e-05, |
| "loss": 0.7983, |
| "mean_token_accuracy": 0.7584855699539185, |
| "num_tokens": 34328061.0, |
| "step": 3650 |
| }, |
| { |
| "entropy": 2.1422414350509644, |
| "epoch": 0.16444444444444445, |
| "grad_norm": 8.375, |
| "learning_rate": 1.7228865979381443e-05, |
| "loss": 0.7817, |
| "mean_token_accuracy": 0.7643873155117035, |
| "num_tokens": 34397172.0, |
| "step": 3700 |
| }, |
| { |
| "entropy": 2.1676920294761657, |
| "epoch": 0.16666666666666666, |
| "grad_norm": 7.34375, |
| "learning_rate": 1.7183046964490265e-05, |
| "loss": 0.8121, |
| "mean_token_accuracy": 0.7518685030937194, |
| "num_tokens": 34462090.0, |
| "step": 3750 |
| }, |
| { |
| "entropy": 2.0866295003890993, |
| "epoch": 0.1688888888888889, |
| "grad_norm": 7.5, |
| "learning_rate": 1.7137227949599086e-05, |
| "loss": 0.7967, |
| "mean_token_accuracy": 0.7572396492958069, |
| "num_tokens": 34531839.0, |
| "step": 3800 |
| }, |
| { |
| "entropy": 2.1308164954185487, |
| "epoch": 0.1711111111111111, |
| "grad_norm": 6.9375, |
| "learning_rate": 1.7091408934707904e-05, |
| "loss": 0.7656, |
| "mean_token_accuracy": 0.7690932631492615, |
| "num_tokens": 34597073.0, |
| "step": 3850 |
| }, |
| { |
| "entropy": 2.1494863390922547, |
| "epoch": 0.17333333333333334, |
| "grad_norm": 8.5625, |
| "learning_rate": 1.7045589919816725e-05, |
| "loss": 0.8129, |
| "mean_token_accuracy": 0.7566492486000062, |
| "num_tokens": 34667098.0, |
| "step": 3900 |
| }, |
| { |
| "entropy": 2.098915767669678, |
| "epoch": 0.17555555555555555, |
| "grad_norm": 7.9375, |
| "learning_rate": 1.6999770904925546e-05, |
| "loss": 0.7681, |
| "mean_token_accuracy": 0.764438933134079, |
| "num_tokens": 34734238.0, |
| "step": 3950 |
| }, |
| { |
| "entropy": 2.2060401248931885, |
| "epoch": 0.17777777777777778, |
| "grad_norm": 6.6875, |
| "learning_rate": 1.6953951890034367e-05, |
| "loss": 0.7993, |
| "mean_token_accuracy": 0.7594318461418151, |
| "num_tokens": 34801484.0, |
| "step": 4000 |
| }, |
| { |
| "entropy": 2.1855486106872557, |
| "epoch": 0.18, |
| "grad_norm": 7.96875, |
| "learning_rate": 1.6908132875143185e-05, |
| "loss": 0.792, |
| "mean_token_accuracy": 0.7597147989273071, |
| "num_tokens": 34869674.0, |
| "step": 4050 |
| }, |
| { |
| "entropy": 2.0849170541763304, |
| "epoch": 0.18222222222222223, |
| "grad_norm": 7.21875, |
| "learning_rate": 1.6862313860252006e-05, |
| "loss": 0.8095, |
| "mean_token_accuracy": 0.7556380236148834, |
| "num_tokens": 34939372.0, |
| "step": 4100 |
| }, |
| { |
| "entropy": 2.1427531123161314, |
| "epoch": 0.18444444444444444, |
| "grad_norm": 7.4375, |
| "learning_rate": 1.6816494845360827e-05, |
| "loss": 0.8103, |
| "mean_token_accuracy": 0.7562667608261109, |
| "num_tokens": 35007892.0, |
| "step": 4150 |
| }, |
| { |
| "entropy": 2.0929103469848633, |
| "epoch": 0.18666666666666668, |
| "grad_norm": 10.3125, |
| "learning_rate": 1.677067583046965e-05, |
| "loss": 0.8292, |
| "mean_token_accuracy": 0.7510438573360443, |
| "num_tokens": 35078997.0, |
| "step": 4200 |
| }, |
| { |
| "entropy": 2.049334568977356, |
| "epoch": 0.18888888888888888, |
| "grad_norm": 7.875, |
| "learning_rate": 1.6724856815578466e-05, |
| "loss": 0.7729, |
| "mean_token_accuracy": 0.7638446021080018, |
| "num_tokens": 35148350.0, |
| "step": 4250 |
| }, |
| { |
| "entropy": 2.099260985851288, |
| "epoch": 0.19111111111111112, |
| "grad_norm": 6.8125, |
| "learning_rate": 1.6679037800687284e-05, |
| "loss": 0.7974, |
| "mean_token_accuracy": 0.7584287643432617, |
| "num_tokens": 35216731.0, |
| "step": 4300 |
| }, |
| { |
| "entropy": 2.112381303310394, |
| "epoch": 0.19333333333333333, |
| "grad_norm": 7.0, |
| "learning_rate": 1.6633218785796105e-05, |
| "loss": 0.743, |
| "mean_token_accuracy": 0.7701405155658722, |
| "num_tokens": 35282400.0, |
| "step": 4350 |
| }, |
| { |
| "entropy": 2.152976577281952, |
| "epoch": 0.19555555555555557, |
| "grad_norm": 7.21875, |
| "learning_rate": 1.6587399770904926e-05, |
| "loss": 0.79, |
| "mean_token_accuracy": 0.7612162494659424, |
| "num_tokens": 35348623.0, |
| "step": 4400 |
| }, |
| { |
| "entropy": 2.1815468096733093, |
| "epoch": 0.19777777777777777, |
| "grad_norm": 7.125, |
| "learning_rate": 1.6541580756013748e-05, |
| "loss": 0.8279, |
| "mean_token_accuracy": 0.7511774361133575, |
| "num_tokens": 35417105.0, |
| "step": 4450 |
| }, |
| { |
| "entropy": 2.2018524050712585, |
| "epoch": 0.2, |
| "grad_norm": 8.8125, |
| "learning_rate": 1.6495761741122565e-05, |
| "loss": 0.7781, |
| "mean_token_accuracy": 0.763760222196579, |
| "num_tokens": 35483468.0, |
| "step": 4500 |
| }, |
| { |
| "entropy": 2.1301321840286254, |
| "epoch": 0.20222222222222222, |
| "grad_norm": 7.28125, |
| "learning_rate": 1.6449942726231387e-05, |
| "loss": 0.8019, |
| "mean_token_accuracy": 0.7569781160354614, |
| "num_tokens": 35550517.0, |
| "step": 4550 |
| }, |
| { |
| "entropy": 2.187158074378967, |
| "epoch": 0.20444444444444446, |
| "grad_norm": 8.25, |
| "learning_rate": 1.6404123711340208e-05, |
| "loss": 0.8104, |
| "mean_token_accuracy": 0.7544570553302765, |
| "num_tokens": 35617223.0, |
| "step": 4600 |
| }, |
| { |
| "entropy": 2.106666340827942, |
| "epoch": 0.20666666666666667, |
| "grad_norm": 7.53125, |
| "learning_rate": 1.635830469644903e-05, |
| "loss": 0.755, |
| "mean_token_accuracy": 0.7678434896469116, |
| "num_tokens": 35682799.0, |
| "step": 4650 |
| }, |
| { |
| "entropy": 2.109342801570892, |
| "epoch": 0.2088888888888889, |
| "grad_norm": 7.09375, |
| "learning_rate": 1.6312485681557847e-05, |
| "loss": 0.809, |
| "mean_token_accuracy": 0.7529355835914612, |
| "num_tokens": 35749281.0, |
| "step": 4700 |
| }, |
| { |
| "entropy": 2.0876813530921936, |
| "epoch": 0.2111111111111111, |
| "grad_norm": 8.5625, |
| "learning_rate": 1.6266666666666668e-05, |
| "loss": 0.7859, |
| "mean_token_accuracy": 0.760038149356842, |
| "num_tokens": 35817971.0, |
| "step": 4750 |
| }, |
| { |
| "entropy": 2.109708137512207, |
| "epoch": 0.21333333333333335, |
| "grad_norm": 8.875, |
| "learning_rate": 1.622084765177549e-05, |
| "loss": 0.8217, |
| "mean_token_accuracy": 0.7479185128211975, |
| "num_tokens": 35884686.0, |
| "step": 4800 |
| }, |
| { |
| "entropy": 2.181152358055115, |
| "epoch": 0.21555555555555556, |
| "grad_norm": 11.375, |
| "learning_rate": 1.617502863688431e-05, |
| "loss": 0.8565, |
| "mean_token_accuracy": 0.7458688330650329, |
| "num_tokens": 35950287.0, |
| "step": 4850 |
| }, |
| { |
| "entropy": 2.1443709397315978, |
| "epoch": 0.21777777777777776, |
| "grad_norm": 9.4375, |
| "learning_rate": 1.6129209621993128e-05, |
| "loss": 0.8138, |
| "mean_token_accuracy": 0.7534062623977661, |
| "num_tokens": 36015777.0, |
| "step": 4900 |
| }, |
| { |
| "entropy": 2.0954318118095396, |
| "epoch": 0.22, |
| "grad_norm": 7.84375, |
| "learning_rate": 1.608339060710195e-05, |
| "loss": 0.7995, |
| "mean_token_accuracy": 0.7556386387348175, |
| "num_tokens": 36084969.0, |
| "step": 4950 |
| }, |
| { |
| "entropy": 2.1733604526519774, |
| "epoch": 0.2222222222222222, |
| "grad_norm": 7.3125, |
| "learning_rate": 1.6037571592210767e-05, |
| "loss": 0.7857, |
| "mean_token_accuracy": 0.7591150319576263, |
| "num_tokens": 36150062.0, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.2222222222222222, |
| "eval_entropy": 2.060102254152298, |
| "eval_loss": 0.9120854735374451, |
| "eval_mean_token_accuracy": 0.7374376058578491, |
| "eval_num_tokens": 36150062.0, |
| "eval_runtime": 5.8568, |
| "eval_samples_per_second": 2.22, |
| "eval_steps_per_second": 0.683, |
| "step": 5000 |
| }, |
| { |
| "entropy": 2.12566153049469, |
| "epoch": 0.22444444444444445, |
| "grad_norm": 6.28125, |
| "learning_rate": 1.5991752577319588e-05, |
| "loss": 0.7834, |
| "mean_token_accuracy": 0.7630481898784638, |
| "num_tokens": 36218190.0, |
| "step": 5050 |
| }, |
| { |
| "entropy": 2.1597506642341613, |
| "epoch": 0.22666666666666666, |
| "grad_norm": 7.96875, |
| "learning_rate": 1.594593356242841e-05, |
| "loss": 0.8013, |
| "mean_token_accuracy": 0.7572294700145722, |
| "num_tokens": 36284871.0, |
| "step": 5100 |
| }, |
| { |
| "entropy": 2.2783447027206423, |
| "epoch": 0.2288888888888889, |
| "grad_norm": 6.90625, |
| "learning_rate": 1.5900114547537227e-05, |
| "loss": 0.8446, |
| "mean_token_accuracy": 0.7509990322589875, |
| "num_tokens": 36352201.0, |
| "step": 5150 |
| }, |
| { |
| "entropy": 2.0995934796333313, |
| "epoch": 0.2311111111111111, |
| "grad_norm": 7.21875, |
| "learning_rate": 1.585429553264605e-05, |
| "loss": 0.7758, |
| "mean_token_accuracy": 0.7649268102645874, |
| "num_tokens": 36422636.0, |
| "step": 5200 |
| }, |
| { |
| "entropy": 2.2191508650779723, |
| "epoch": 0.23333333333333334, |
| "grad_norm": 8.9375, |
| "learning_rate": 1.580847651775487e-05, |
| "loss": 0.8183, |
| "mean_token_accuracy": 0.7507635414600372, |
| "num_tokens": 36489811.0, |
| "step": 5250 |
| }, |
| { |
| "entropy": 2.158704442977905, |
| "epoch": 0.23555555555555555, |
| "grad_norm": 11.625, |
| "learning_rate": 1.576265750286369e-05, |
| "loss": 0.7919, |
| "mean_token_accuracy": 0.7578798067569733, |
| "num_tokens": 36558093.0, |
| "step": 5300 |
| }, |
| { |
| "entropy": 2.2435457158088683, |
| "epoch": 0.23777777777777778, |
| "grad_norm": 10.1875, |
| "learning_rate": 1.571683848797251e-05, |
| "loss": 0.8026, |
| "mean_token_accuracy": 0.7596748018264771, |
| "num_tokens": 36623942.0, |
| "step": 5350 |
| }, |
| { |
| "entropy": 2.26217898607254, |
| "epoch": 0.24, |
| "grad_norm": 6.6875, |
| "learning_rate": 1.567101947308133e-05, |
| "loss": 0.7801, |
| "mean_token_accuracy": 0.7630024456977844, |
| "num_tokens": 36693576.0, |
| "step": 5400 |
| }, |
| { |
| "entropy": 2.1895520281791687, |
| "epoch": 0.24222222222222223, |
| "grad_norm": 6.96875, |
| "learning_rate": 1.562520045819015e-05, |
| "loss": 0.8052, |
| "mean_token_accuracy": 0.7553786933422089, |
| "num_tokens": 36761921.0, |
| "step": 5450 |
| }, |
| { |
| "entropy": 2.242999081611633, |
| "epoch": 0.24444444444444444, |
| "grad_norm": 7.90625, |
| "learning_rate": 1.5579381443298972e-05, |
| "loss": 0.7577, |
| "mean_token_accuracy": 0.7669945132732391, |
| "num_tokens": 36829085.0, |
| "step": 5500 |
| }, |
| { |
| "entropy": 2.168731119632721, |
| "epoch": 0.24666666666666667, |
| "grad_norm": 7.71875, |
| "learning_rate": 1.553356242840779e-05, |
| "loss": 0.7417, |
| "mean_token_accuracy": 0.7682712721824646, |
| "num_tokens": 36894333.0, |
| "step": 5550 |
| }, |
| { |
| "entropy": 2.209793448448181, |
| "epoch": 0.24888888888888888, |
| "grad_norm": 10.875, |
| "learning_rate": 1.548774341351661e-05, |
| "loss": 0.787, |
| "mean_token_accuracy": 0.7617891025543213, |
| "num_tokens": 36964522.0, |
| "step": 5600 |
| }, |
| { |
| "entropy": 2.2578547739982606, |
| "epoch": 0.2511111111111111, |
| "grad_norm": 7.59375, |
| "learning_rate": 1.5441924398625432e-05, |
| "loss": 0.8494, |
| "mean_token_accuracy": 0.7462784695625305, |
| "num_tokens": 37031831.0, |
| "step": 5650 |
| }, |
| { |
| "entropy": 2.2453298950195313, |
| "epoch": 0.25333333333333335, |
| "grad_norm": 7.09375, |
| "learning_rate": 1.539610538373425e-05, |
| "loss": 0.7898, |
| "mean_token_accuracy": 0.7597798085212708, |
| "num_tokens": 37096360.0, |
| "step": 5700 |
| }, |
| { |
| "entropy": 2.1777192115783692, |
| "epoch": 0.25555555555555554, |
| "grad_norm": 10.0, |
| "learning_rate": 1.535028636884307e-05, |
| "loss": 0.8147, |
| "mean_token_accuracy": 0.7537575364112854, |
| "num_tokens": 37161633.0, |
| "step": 5750 |
| }, |
| { |
| "entropy": 2.171867892742157, |
| "epoch": 0.2577777777777778, |
| "grad_norm": 7.71875, |
| "learning_rate": 1.530446735395189e-05, |
| "loss": 0.8001, |
| "mean_token_accuracy": 0.7575827407836914, |
| "num_tokens": 37228323.0, |
| "step": 5800 |
| }, |
| { |
| "entropy": 2.1830092740058897, |
| "epoch": 0.26, |
| "grad_norm": 8.8125, |
| "learning_rate": 1.525864833906071e-05, |
| "loss": 0.7722, |
| "mean_token_accuracy": 0.7659056377410889, |
| "num_tokens": 37295036.0, |
| "step": 5850 |
| }, |
| { |
| "entropy": 2.2247348022460938, |
| "epoch": 0.26222222222222225, |
| "grad_norm": 7.75, |
| "learning_rate": 1.5212829324169531e-05, |
| "loss": 0.786, |
| "mean_token_accuracy": 0.7639349389076233, |
| "num_tokens": 37363275.0, |
| "step": 5900 |
| }, |
| { |
| "entropy": 2.1976384329795837, |
| "epoch": 0.2644444444444444, |
| "grad_norm": 7.21875, |
| "learning_rate": 1.5167010309278351e-05, |
| "loss": 0.7832, |
| "mean_token_accuracy": 0.7637232232093811, |
| "num_tokens": 37429362.0, |
| "step": 5950 |
| }, |
| { |
| "entropy": 2.1915416312217713, |
| "epoch": 0.26666666666666666, |
| "grad_norm": 7.53125, |
| "learning_rate": 1.5121191294387172e-05, |
| "loss": 0.7911, |
| "mean_token_accuracy": 0.7595293188095092, |
| "num_tokens": 37496221.0, |
| "step": 6000 |
| }, |
| { |
| "entropy": 2.1550126123428344, |
| "epoch": 0.2688888888888889, |
| "grad_norm": 7.90625, |
| "learning_rate": 1.5075372279495992e-05, |
| "loss": 0.7866, |
| "mean_token_accuracy": 0.7628938972949981, |
| "num_tokens": 37563946.0, |
| "step": 6050 |
| }, |
| { |
| "entropy": 2.1578181266784666, |
| "epoch": 0.27111111111111114, |
| "grad_norm": 11.625, |
| "learning_rate": 1.5029553264604813e-05, |
| "loss": 0.8002, |
| "mean_token_accuracy": 0.7602377796173095, |
| "num_tokens": 37632154.0, |
| "step": 6100 |
| }, |
| { |
| "entropy": 2.1704964351654055, |
| "epoch": 0.2733333333333333, |
| "grad_norm": 7.09375, |
| "learning_rate": 1.4983734249713632e-05, |
| "loss": 0.7968, |
| "mean_token_accuracy": 0.7589374840259552, |
| "num_tokens": 37700403.0, |
| "step": 6150 |
| }, |
| { |
| "entropy": 2.1958404278755186, |
| "epoch": 0.27555555555555555, |
| "grad_norm": 7.71875, |
| "learning_rate": 1.4937915234822453e-05, |
| "loss": 0.8021, |
| "mean_token_accuracy": 0.7585455322265625, |
| "num_tokens": 37763144.0, |
| "step": 6200 |
| }, |
| { |
| "entropy": 2.2267961406707766, |
| "epoch": 0.2777777777777778, |
| "grad_norm": 9.375, |
| "learning_rate": 1.4892096219931273e-05, |
| "loss": 0.8, |
| "mean_token_accuracy": 0.756108273267746, |
| "num_tokens": 37828258.0, |
| "step": 6250 |
| }, |
| { |
| "entropy": 2.2654725027084353, |
| "epoch": 0.28, |
| "grad_norm": 8.875, |
| "learning_rate": 1.4846277205040094e-05, |
| "loss": 0.8398, |
| "mean_token_accuracy": 0.7474746882915497, |
| "num_tokens": 37894712.0, |
| "step": 6300 |
| }, |
| { |
| "entropy": 2.2119923210144044, |
| "epoch": 0.2822222222222222, |
| "grad_norm": 7.125, |
| "learning_rate": 1.4800458190148912e-05, |
| "loss": 0.8204, |
| "mean_token_accuracy": 0.7523449230194091, |
| "num_tokens": 37961686.0, |
| "step": 6350 |
| }, |
| { |
| "entropy": 2.176643466949463, |
| "epoch": 0.28444444444444444, |
| "grad_norm": 8.5625, |
| "learning_rate": 1.4754639175257731e-05, |
| "loss": 0.7743, |
| "mean_token_accuracy": 0.7683572423458099, |
| "num_tokens": 38028734.0, |
| "step": 6400 |
| }, |
| { |
| "entropy": 2.175390179157257, |
| "epoch": 0.2866666666666667, |
| "grad_norm": 7.03125, |
| "learning_rate": 1.4708820160366553e-05, |
| "loss": 0.8181, |
| "mean_token_accuracy": 0.7544300401210785, |
| "num_tokens": 38093844.0, |
| "step": 6450 |
| }, |
| { |
| "entropy": 2.2051888942718505, |
| "epoch": 0.28888888888888886, |
| "grad_norm": 6.5, |
| "learning_rate": 1.4663001145475372e-05, |
| "loss": 0.8126, |
| "mean_token_accuracy": 0.7564982330799103, |
| "num_tokens": 38159450.0, |
| "step": 6500 |
| }, |
| { |
| "entropy": 2.217521970272064, |
| "epoch": 0.2911111111111111, |
| "grad_norm": 7.90625, |
| "learning_rate": 1.4617182130584193e-05, |
| "loss": 0.816, |
| "mean_token_accuracy": 0.7553452157974243, |
| "num_tokens": 38224878.0, |
| "step": 6550 |
| }, |
| { |
| "entropy": 2.190767750740051, |
| "epoch": 0.29333333333333333, |
| "grad_norm": 8.875, |
| "learning_rate": 1.4571363115693013e-05, |
| "loss": 0.8144, |
| "mean_token_accuracy": 0.7556489539146424, |
| "num_tokens": 38291865.0, |
| "step": 6600 |
| }, |
| { |
| "entropy": 2.2169429779052736, |
| "epoch": 0.29555555555555557, |
| "grad_norm": 7.0, |
| "learning_rate": 1.4525544100801834e-05, |
| "loss": 0.8167, |
| "mean_token_accuracy": 0.7550825190544128, |
| "num_tokens": 38360809.0, |
| "step": 6650 |
| }, |
| { |
| "entropy": 2.2201628613471986, |
| "epoch": 0.29777777777777775, |
| "grad_norm": 8.375, |
| "learning_rate": 1.4479725085910653e-05, |
| "loss": 0.8098, |
| "mean_token_accuracy": 0.7543143463134766, |
| "num_tokens": 38428799.0, |
| "step": 6700 |
| }, |
| { |
| "entropy": 2.198468723297119, |
| "epoch": 0.3, |
| "grad_norm": 7.59375, |
| "learning_rate": 1.4433906071019475e-05, |
| "loss": 0.841, |
| "mean_token_accuracy": 0.7505165827274323, |
| "num_tokens": 38496550.0, |
| "step": 6750 |
| }, |
| { |
| "entropy": 2.1353413486480712, |
| "epoch": 0.3022222222222222, |
| "grad_norm": 7.625, |
| "learning_rate": 1.4388087056128294e-05, |
| "loss": 0.794, |
| "mean_token_accuracy": 0.7610986518859864, |
| "num_tokens": 38564172.0, |
| "step": 6800 |
| }, |
| { |
| "entropy": 2.249865219593048, |
| "epoch": 0.30444444444444446, |
| "grad_norm": 7.5, |
| "learning_rate": 1.4342268041237115e-05, |
| "loss": 0.8153, |
| "mean_token_accuracy": 0.7510960531234742, |
| "num_tokens": 38630875.0, |
| "step": 6850 |
| }, |
| { |
| "entropy": 2.1702062487602234, |
| "epoch": 0.30666666666666664, |
| "grad_norm": 10.6875, |
| "learning_rate": 1.4296449026345935e-05, |
| "loss": 0.7737, |
| "mean_token_accuracy": 0.763483716249466, |
| "num_tokens": 38694435.0, |
| "step": 6900 |
| }, |
| { |
| "entropy": 2.2601536536216735, |
| "epoch": 0.3088888888888889, |
| "grad_norm": 10.375, |
| "learning_rate": 1.4250630011454756e-05, |
| "loss": 0.8578, |
| "mean_token_accuracy": 0.7417992842197418, |
| "num_tokens": 38759789.0, |
| "step": 6950 |
| }, |
| { |
| "entropy": 2.276434664726257, |
| "epoch": 0.3111111111111111, |
| "grad_norm": 6.8125, |
| "learning_rate": 1.4204810996563575e-05, |
| "loss": 0.7678, |
| "mean_token_accuracy": 0.7661231434345246, |
| "num_tokens": 38826171.0, |
| "step": 7000 |
| }, |
| { |
| "entropy": 2.2015094637870787, |
| "epoch": 0.31333333333333335, |
| "grad_norm": 7.9375, |
| "learning_rate": 1.4158991981672395e-05, |
| "loss": 0.8136, |
| "mean_token_accuracy": 0.7545084583759308, |
| "num_tokens": 38890825.0, |
| "step": 7050 |
| }, |
| { |
| "entropy": 2.1954299902915952, |
| "epoch": 0.31555555555555553, |
| "grad_norm": 7.875, |
| "learning_rate": 1.4113172966781214e-05, |
| "loss": 0.8102, |
| "mean_token_accuracy": 0.7536708974838257, |
| "num_tokens": 38957597.0, |
| "step": 7100 |
| }, |
| { |
| "entropy": 2.1553618788719175, |
| "epoch": 0.31777777777777777, |
| "grad_norm": 10.625, |
| "learning_rate": 1.4067353951890036e-05, |
| "loss": 0.806, |
| "mean_token_accuracy": 0.756889488697052, |
| "num_tokens": 39024094.0, |
| "step": 7150 |
| }, |
| { |
| "entropy": 2.1930714321136473, |
| "epoch": 0.32, |
| "grad_norm": 7.34375, |
| "learning_rate": 1.4021534936998855e-05, |
| "loss": 0.8191, |
| "mean_token_accuracy": 0.7526458752155304, |
| "num_tokens": 39088926.0, |
| "step": 7200 |
| }, |
| { |
| "entropy": 2.1648390030860902, |
| "epoch": 0.32222222222222224, |
| "grad_norm": 6.40625, |
| "learning_rate": 1.3975715922107675e-05, |
| "loss": 0.8018, |
| "mean_token_accuracy": 0.7559799265861511, |
| "num_tokens": 39155958.0, |
| "step": 7250 |
| }, |
| { |
| "entropy": 2.163375446796417, |
| "epoch": 0.3244444444444444, |
| "grad_norm": 7.8125, |
| "learning_rate": 1.3929896907216496e-05, |
| "loss": 0.8065, |
| "mean_token_accuracy": 0.7592847204208374, |
| "num_tokens": 39224450.0, |
| "step": 7300 |
| }, |
| { |
| "entropy": 2.1846578884124757, |
| "epoch": 0.32666666666666666, |
| "grad_norm": 8.0625, |
| "learning_rate": 1.3884077892325315e-05, |
| "loss": 0.7933, |
| "mean_token_accuracy": 0.761990624666214, |
| "num_tokens": 39288896.0, |
| "step": 7350 |
| }, |
| { |
| "entropy": 2.2050635480880736, |
| "epoch": 0.3288888888888889, |
| "grad_norm": 6.84375, |
| "learning_rate": 1.3838258877434137e-05, |
| "loss": 0.8317, |
| "mean_token_accuracy": 0.7510676419734955, |
| "num_tokens": 39354824.0, |
| "step": 7400 |
| }, |
| { |
| "entropy": 2.1733724021911622, |
| "epoch": 0.33111111111111113, |
| "grad_norm": 10.5, |
| "learning_rate": 1.3792439862542956e-05, |
| "loss": 0.8164, |
| "mean_token_accuracy": 0.755161405801773, |
| "num_tokens": 39423233.0, |
| "step": 7450 |
| }, |
| { |
| "entropy": 2.1624359107017517, |
| "epoch": 0.3333333333333333, |
| "grad_norm": 8.875, |
| "learning_rate": 1.3746620847651777e-05, |
| "loss": 0.8105, |
| "mean_token_accuracy": 0.7563036870956421, |
| "num_tokens": 39488844.0, |
| "step": 7500 |
| }, |
| { |
| "entropy": 2.2275446701049804, |
| "epoch": 0.33555555555555555, |
| "grad_norm": 8.625, |
| "learning_rate": 1.3700801832760597e-05, |
| "loss": 0.8569, |
| "mean_token_accuracy": 0.7444409322738648, |
| "num_tokens": 39557151.0, |
| "step": 7550 |
| }, |
| { |
| "entropy": 2.160377633571625, |
| "epoch": 0.3377777777777778, |
| "grad_norm": 8.0, |
| "learning_rate": 1.3654982817869418e-05, |
| "loss": 0.7943, |
| "mean_token_accuracy": 0.7591172540187836, |
| "num_tokens": 39622145.0, |
| "step": 7600 |
| }, |
| { |
| "entropy": 2.1659918451309204, |
| "epoch": 0.34, |
| "grad_norm": 8.8125, |
| "learning_rate": 1.3609163802978237e-05, |
| "loss": 0.8312, |
| "mean_token_accuracy": 0.7525255751609802, |
| "num_tokens": 39692633.0, |
| "step": 7650 |
| }, |
| { |
| "entropy": 2.15194406747818, |
| "epoch": 0.3422222222222222, |
| "grad_norm": 9.8125, |
| "learning_rate": 1.3563344788087059e-05, |
| "loss": 0.7778, |
| "mean_token_accuracy": 0.7636078727245331, |
| "num_tokens": 39759159.0, |
| "step": 7700 |
| }, |
| { |
| "entropy": 2.217858202457428, |
| "epoch": 0.34444444444444444, |
| "grad_norm": 7.09375, |
| "learning_rate": 1.3517525773195876e-05, |
| "loss": 0.8241, |
| "mean_token_accuracy": 0.7522653472423554, |
| "num_tokens": 39826283.0, |
| "step": 7750 |
| }, |
| { |
| "entropy": 2.1696537256240847, |
| "epoch": 0.3466666666666667, |
| "grad_norm": 9.0625, |
| "learning_rate": 1.3471706758304698e-05, |
| "loss": 0.815, |
| "mean_token_accuracy": 0.7559839642047882, |
| "num_tokens": 39895721.0, |
| "step": 7800 |
| }, |
| { |
| "entropy": 2.1727401876449584, |
| "epoch": 0.3488888888888889, |
| "grad_norm": 8.375, |
| "learning_rate": 1.3425887743413517e-05, |
| "loss": 0.8366, |
| "mean_token_accuracy": 0.7508484077453613, |
| "num_tokens": 39963266.0, |
| "step": 7850 |
| }, |
| { |
| "entropy": 2.1629985857009886, |
| "epoch": 0.3511111111111111, |
| "grad_norm": 7.25, |
| "learning_rate": 1.3380068728522338e-05, |
| "loss": 0.782, |
| "mean_token_accuracy": 0.761427184343338, |
| "num_tokens": 40031913.0, |
| "step": 7900 |
| }, |
| { |
| "entropy": 2.157064917087555, |
| "epoch": 0.35333333333333333, |
| "grad_norm": 14.125, |
| "learning_rate": 1.3334249713631158e-05, |
| "loss": 0.7747, |
| "mean_token_accuracy": 0.7661045718193055, |
| "num_tokens": 40098930.0, |
| "step": 7950 |
| }, |
| { |
| "entropy": 2.0971979546546935, |
| "epoch": 0.35555555555555557, |
| "grad_norm": 8.75, |
| "learning_rate": 1.3288430698739979e-05, |
| "loss": 0.8186, |
| "mean_token_accuracy": 0.7531020665168762, |
| "num_tokens": 40165756.0, |
| "step": 8000 |
| }, |
| { |
| "entropy": 2.1666273212432863, |
| "epoch": 0.35777777777777775, |
| "grad_norm": 7.03125, |
| "learning_rate": 1.3242611683848798e-05, |
| "loss": 0.842, |
| "mean_token_accuracy": 0.7474488174915314, |
| "num_tokens": 40233071.0, |
| "step": 8050 |
| }, |
| { |
| "entropy": 2.2938923120498655, |
| "epoch": 0.36, |
| "grad_norm": 11.875, |
| "learning_rate": 1.3196792668957618e-05, |
| "loss": 0.8804, |
| "mean_token_accuracy": 0.7407302105426788, |
| "num_tokens": 40297749.0, |
| "step": 8100 |
| }, |
| { |
| "entropy": 2.2303052496910096, |
| "epoch": 0.3622222222222222, |
| "grad_norm": 8.125, |
| "learning_rate": 1.3150973654066439e-05, |
| "loss": 0.8033, |
| "mean_token_accuracy": 0.7578864073753357, |
| "num_tokens": 40366213.0, |
| "step": 8150 |
| }, |
| { |
| "entropy": 2.2161930847167968, |
| "epoch": 0.36444444444444446, |
| "grad_norm": 9.6875, |
| "learning_rate": 1.3105154639175259e-05, |
| "loss": 0.7947, |
| "mean_token_accuracy": 0.7593982243537902, |
| "num_tokens": 40433288.0, |
| "step": 8200 |
| }, |
| { |
| "entropy": 2.1040697479248047, |
| "epoch": 0.36666666666666664, |
| "grad_norm": 7.21875, |
| "learning_rate": 1.305933562428408e-05, |
| "loss": 0.7959, |
| "mean_token_accuracy": 0.7588922154903411, |
| "num_tokens": 40499720.0, |
| "step": 8250 |
| }, |
| { |
| "entropy": 2.1772580099105836, |
| "epoch": 0.3688888888888889, |
| "grad_norm": 7.25, |
| "learning_rate": 1.30135166093929e-05, |
| "loss": 0.8284, |
| "mean_token_accuracy": 0.7538638985157013, |
| "num_tokens": 40565376.0, |
| "step": 8300 |
| }, |
| { |
| "entropy": 2.128792498111725, |
| "epoch": 0.3711111111111111, |
| "grad_norm": 8.6875, |
| "learning_rate": 1.296769759450172e-05, |
| "loss": 0.7672, |
| "mean_token_accuracy": 0.7641964781284333, |
| "num_tokens": 40631675.0, |
| "step": 8350 |
| }, |
| { |
| "entropy": 2.1719995403289794, |
| "epoch": 0.37333333333333335, |
| "grad_norm": 9.25, |
| "learning_rate": 1.2921878579610538e-05, |
| "loss": 0.801, |
| "mean_token_accuracy": 0.76089714884758, |
| "num_tokens": 40697335.0, |
| "step": 8400 |
| }, |
| { |
| "entropy": 2.156131479740143, |
| "epoch": 0.37555555555555553, |
| "grad_norm": 8.75, |
| "learning_rate": 1.287605956471936e-05, |
| "loss": 0.8384, |
| "mean_token_accuracy": 0.7503017449378967, |
| "num_tokens": 40764922.0, |
| "step": 8450 |
| }, |
| { |
| "entropy": 2.284397015571594, |
| "epoch": 0.37777777777777777, |
| "grad_norm": 9.25, |
| "learning_rate": 1.2830240549828179e-05, |
| "loss": 0.8418, |
| "mean_token_accuracy": 0.7493084251880646, |
| "num_tokens": 40828443.0, |
| "step": 8500 |
| }, |
| { |
| "entropy": 2.147591190338135, |
| "epoch": 0.38, |
| "grad_norm": 8.625, |
| "learning_rate": 1.2784421534937e-05, |
| "loss": 0.8173, |
| "mean_token_accuracy": 0.756924353837967, |
| "num_tokens": 40894414.0, |
| "step": 8550 |
| }, |
| { |
| "entropy": 2.142817313671112, |
| "epoch": 0.38222222222222224, |
| "grad_norm": 8.5625, |
| "learning_rate": 1.273860252004582e-05, |
| "loss": 0.7962, |
| "mean_token_accuracy": 0.7598996949195862, |
| "num_tokens": 40962274.0, |
| "step": 8600 |
| }, |
| { |
| "entropy": 2.1330949759483335, |
| "epoch": 0.3844444444444444, |
| "grad_norm": 7.0625, |
| "learning_rate": 1.269278350515464e-05, |
| "loss": 0.7939, |
| "mean_token_accuracy": 0.7612218356132507, |
| "num_tokens": 41030193.0, |
| "step": 8650 |
| }, |
| { |
| "entropy": 2.166394736766815, |
| "epoch": 0.38666666666666666, |
| "grad_norm": 7.40625, |
| "learning_rate": 1.264696449026346e-05, |
| "loss": 0.8336, |
| "mean_token_accuracy": 0.7500250363349914, |
| "num_tokens": 41099766.0, |
| "step": 8700 |
| }, |
| { |
| "entropy": 2.2078594040870665, |
| "epoch": 0.3888888888888889, |
| "grad_norm": 8.625, |
| "learning_rate": 1.2601145475372281e-05, |
| "loss": 0.803, |
| "mean_token_accuracy": 0.7575648665428162, |
| "num_tokens": 41164744.0, |
| "step": 8750 |
| }, |
| { |
| "entropy": 2.2065504598617554, |
| "epoch": 0.39111111111111113, |
| "grad_norm": 6.9375, |
| "learning_rate": 1.2555326460481101e-05, |
| "loss": 0.816, |
| "mean_token_accuracy": 0.7564821767807007, |
| "num_tokens": 41232194.0, |
| "step": 8800 |
| }, |
| { |
| "entropy": 2.1662226915359497, |
| "epoch": 0.3933333333333333, |
| "grad_norm": 8.0625, |
| "learning_rate": 1.2509507445589922e-05, |
| "loss": 0.8317, |
| "mean_token_accuracy": 0.7503245520591736, |
| "num_tokens": 41297353.0, |
| "step": 8850 |
| }, |
| { |
| "entropy": 2.236915967464447, |
| "epoch": 0.39555555555555555, |
| "grad_norm": 9.875, |
| "learning_rate": 1.2463688430698742e-05, |
| "loss": 0.7977, |
| "mean_token_accuracy": 0.759481954574585, |
| "num_tokens": 41361134.0, |
| "step": 8900 |
| }, |
| { |
| "entropy": 2.1871555137634275, |
| "epoch": 0.3977777777777778, |
| "grad_norm": 7.90625, |
| "learning_rate": 1.2417869415807561e-05, |
| "loss": 0.8034, |
| "mean_token_accuracy": 0.757278825044632, |
| "num_tokens": 41427995.0, |
| "step": 8950 |
| }, |
| { |
| "entropy": 2.2592362785339355, |
| "epoch": 0.4, |
| "grad_norm": 9.875, |
| "learning_rate": 1.2372050400916382e-05, |
| "loss": 0.8186, |
| "mean_token_accuracy": 0.7548012447357177, |
| "num_tokens": 41494201.0, |
| "step": 9000 |
| }, |
| { |
| "entropy": 2.196383099555969, |
| "epoch": 0.4022222222222222, |
| "grad_norm": 7.625, |
| "learning_rate": 1.2326231386025202e-05, |
| "loss": 0.8138, |
| "mean_token_accuracy": 0.7561362779140473, |
| "num_tokens": 41559989.0, |
| "step": 9050 |
| }, |
| { |
| "entropy": 2.1516851663589476, |
| "epoch": 0.40444444444444444, |
| "grad_norm": 13.75, |
| "learning_rate": 1.2280412371134021e-05, |
| "loss": 0.7951, |
| "mean_token_accuracy": 0.7594835031032562, |
| "num_tokens": 41626379.0, |
| "step": 9100 |
| }, |
| { |
| "entropy": 2.1745046091079714, |
| "epoch": 0.4066666666666667, |
| "grad_norm": 8.0, |
| "learning_rate": 1.223459335624284e-05, |
| "loss": 0.8048, |
| "mean_token_accuracy": 0.7599712920188904, |
| "num_tokens": 41692143.0, |
| "step": 9150 |
| }, |
| { |
| "entropy": 2.154158115386963, |
| "epoch": 0.4088888888888889, |
| "grad_norm": 8.25, |
| "learning_rate": 1.2188774341351662e-05, |
| "loss": 0.7877, |
| "mean_token_accuracy": 0.7622714912891388, |
| "num_tokens": 41759805.0, |
| "step": 9200 |
| }, |
| { |
| "entropy": 2.2480538749694823, |
| "epoch": 0.4111111111111111, |
| "grad_norm": 8.4375, |
| "learning_rate": 1.2142955326460481e-05, |
| "loss": 0.8052, |
| "mean_token_accuracy": 0.7560270345211029, |
| "num_tokens": 41823593.0, |
| "step": 9250 |
| }, |
| { |
| "entropy": 2.2491649985313416, |
| "epoch": 0.41333333333333333, |
| "grad_norm": 9.125, |
| "learning_rate": 1.2097136311569303e-05, |
| "loss": 0.8696, |
| "mean_token_accuracy": 0.7419601881504059, |
| "num_tokens": 41888809.0, |
| "step": 9300 |
| }, |
| { |
| "entropy": 2.258921926021576, |
| "epoch": 0.41555555555555557, |
| "grad_norm": 10.0625, |
| "learning_rate": 1.2051317296678122e-05, |
| "loss": 0.8289, |
| "mean_token_accuracy": 0.75118199467659, |
| "num_tokens": 41952730.0, |
| "step": 9350 |
| }, |
| { |
| "entropy": 2.163720915317535, |
| "epoch": 0.4177777777777778, |
| "grad_norm": 9.25, |
| "learning_rate": 1.2005498281786943e-05, |
| "loss": 0.8323, |
| "mean_token_accuracy": 0.7530957090854645, |
| "num_tokens": 42019573.0, |
| "step": 9400 |
| }, |
| { |
| "entropy": 2.2212659883499146, |
| "epoch": 0.42, |
| "grad_norm": 7.40625, |
| "learning_rate": 1.1959679266895763e-05, |
| "loss": 0.8361, |
| "mean_token_accuracy": 0.7518267011642457, |
| "num_tokens": 42088291.0, |
| "step": 9450 |
| }, |
| { |
| "entropy": 2.2050712847709657, |
| "epoch": 0.4222222222222222, |
| "grad_norm": 9.75, |
| "learning_rate": 1.1913860252004584e-05, |
| "loss": 0.8456, |
| "mean_token_accuracy": 0.7448002827167511, |
| "num_tokens": 42158120.0, |
| "step": 9500 |
| }, |
| { |
| "entropy": 2.216916351318359, |
| "epoch": 0.42444444444444446, |
| "grad_norm": 6.8125, |
| "learning_rate": 1.1868041237113403e-05, |
| "loss": 0.8415, |
| "mean_token_accuracy": 0.7452654683589935, |
| "num_tokens": 42226742.0, |
| "step": 9550 |
| }, |
| { |
| "entropy": 2.1955419325828553, |
| "epoch": 0.4266666666666667, |
| "grad_norm": 8.1875, |
| "learning_rate": 1.1822222222222225e-05, |
| "loss": 0.8166, |
| "mean_token_accuracy": 0.7562698805332184, |
| "num_tokens": 42292822.0, |
| "step": 9600 |
| }, |
| { |
| "entropy": 2.2368349695205687, |
| "epoch": 0.4288888888888889, |
| "grad_norm": 7.34375, |
| "learning_rate": 1.1776403207331044e-05, |
| "loss": 0.8089, |
| "mean_token_accuracy": 0.757049810886383, |
| "num_tokens": 42356424.0, |
| "step": 9650 |
| }, |
| { |
| "entropy": 2.178975234031677, |
| "epoch": 0.4311111111111111, |
| "grad_norm": 8.375, |
| "learning_rate": 1.1730584192439865e-05, |
| "loss": 0.8438, |
| "mean_token_accuracy": 0.7456352376937866, |
| "num_tokens": 42422979.0, |
| "step": 9700 |
| }, |
| { |
| "entropy": 2.0868096995353698, |
| "epoch": 0.43333333333333335, |
| "grad_norm": 7.53125, |
| "learning_rate": 1.1684765177548683e-05, |
| "loss": 0.8235, |
| "mean_token_accuracy": 0.7529381263256073, |
| "num_tokens": 42490375.0, |
| "step": 9750 |
| }, |
| { |
| "entropy": 2.147622332572937, |
| "epoch": 0.43555555555555553, |
| "grad_norm": 9.75, |
| "learning_rate": 1.1638946162657503e-05, |
| "loss": 0.7728, |
| "mean_token_accuracy": 0.7673224699497223, |
| "num_tokens": 42557143.0, |
| "step": 9800 |
| }, |
| { |
| "entropy": 2.114510886669159, |
| "epoch": 0.43777777777777777, |
| "grad_norm": 9.375, |
| "learning_rate": 1.1593127147766324e-05, |
| "loss": 0.8238, |
| "mean_token_accuracy": 0.7501462066173553, |
| "num_tokens": 42626026.0, |
| "step": 9850 |
| }, |
| { |
| "entropy": 2.133998863697052, |
| "epoch": 0.44, |
| "grad_norm": 7.4375, |
| "learning_rate": 1.1547308132875143e-05, |
| "loss": 0.8005, |
| "mean_token_accuracy": 0.7565454721450806, |
| "num_tokens": 42688911.0, |
| "step": 9900 |
| }, |
| { |
| "entropy": 2.146713092327118, |
| "epoch": 0.44222222222222224, |
| "grad_norm": 8.625, |
| "learning_rate": 1.1501489117983964e-05, |
| "loss": 0.801, |
| "mean_token_accuracy": 0.7562527394294739, |
| "num_tokens": 42756274.0, |
| "step": 9950 |
| }, |
| { |
| "entropy": 2.205234091281891, |
| "epoch": 0.4444444444444444, |
| "grad_norm": 6.875, |
| "learning_rate": 1.1455670103092784e-05, |
| "loss": 0.8242, |
| "mean_token_accuracy": 0.7537857472896576, |
| "num_tokens": 42825342.0, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.4444444444444444, |
| "eval_entropy": 2.0749987959861755, |
| "eval_loss": 0.8938310146331787, |
| "eval_mean_token_accuracy": 0.7446183115243912, |
| "eval_num_tokens": 42825342.0, |
| "eval_runtime": 7.8855, |
| "eval_samples_per_second": 1.649, |
| "eval_steps_per_second": 0.507, |
| "step": 10000 |
| }, |
| { |
| "entropy": 2.232636868953705, |
| "epoch": 0.44666666666666666, |
| "grad_norm": 6.3125, |
| "learning_rate": 1.1409851088201605e-05, |
| "loss": 0.8088, |
| "mean_token_accuracy": 0.7571802771091461, |
| "num_tokens": 42892615.0, |
| "step": 10050 |
| }, |
| { |
| "entropy": 2.1724118828773498, |
| "epoch": 0.4488888888888889, |
| "grad_norm": 7.4375, |
| "learning_rate": 1.1364032073310425e-05, |
| "loss": 0.7938, |
| "mean_token_accuracy": 0.7575406277179718, |
| "num_tokens": 42961116.0, |
| "step": 10100 |
| }, |
| { |
| "entropy": 2.2688467741012572, |
| "epoch": 0.45111111111111113, |
| "grad_norm": 8.25, |
| "learning_rate": 1.1318213058419246e-05, |
| "loss": 0.8109, |
| "mean_token_accuracy": 0.7592162156105041, |
| "num_tokens": 43026002.0, |
| "step": 10150 |
| }, |
| { |
| "entropy": 2.189846224784851, |
| "epoch": 0.4533333333333333, |
| "grad_norm": 7.4375, |
| "learning_rate": 1.1272394043528065e-05, |
| "loss": 0.8393, |
| "mean_token_accuracy": 0.750606085062027, |
| "num_tokens": 43096450.0, |
| "step": 10200 |
| }, |
| { |
| "entropy": 2.1485534167289733, |
| "epoch": 0.45555555555555555, |
| "grad_norm": 6.96875, |
| "learning_rate": 1.1226575028636886e-05, |
| "loss": 0.8005, |
| "mean_token_accuracy": 0.7586869549751282, |
| "num_tokens": 43164974.0, |
| "step": 10250 |
| }, |
| { |
| "entropy": 2.114973647594452, |
| "epoch": 0.4577777777777778, |
| "grad_norm": 7.71875, |
| "learning_rate": 1.1180756013745706e-05, |
| "loss": 0.81, |
| "mean_token_accuracy": 0.7530654954910279, |
| "num_tokens": 43230676.0, |
| "step": 10300 |
| }, |
| { |
| "entropy": 2.210615482330322, |
| "epoch": 0.46, |
| "grad_norm": 11.6875, |
| "learning_rate": 1.1134936998854527e-05, |
| "loss": 0.8133, |
| "mean_token_accuracy": 0.7568694865703582, |
| "num_tokens": 43295393.0, |
| "step": 10350 |
| }, |
| { |
| "entropy": 2.1622642707824706, |
| "epoch": 0.4622222222222222, |
| "grad_norm": 7.03125, |
| "learning_rate": 1.1089117983963347e-05, |
| "loss": 0.7942, |
| "mean_token_accuracy": 0.7580956876277923, |
| "num_tokens": 43361602.0, |
| "step": 10400 |
| }, |
| { |
| "entropy": 2.164878299236298, |
| "epoch": 0.46444444444444444, |
| "grad_norm": 6.6875, |
| "learning_rate": 1.1043298969072164e-05, |
| "loss": 0.7759, |
| "mean_token_accuracy": 0.7644217216968536, |
| "num_tokens": 43427335.0, |
| "step": 10450 |
| }, |
| { |
| "entropy": 2.1177299284934996, |
| "epoch": 0.4666666666666667, |
| "grad_norm": 10.0, |
| "learning_rate": 1.0997479954180986e-05, |
| "loss": 0.85, |
| "mean_token_accuracy": 0.7469760966300965, |
| "num_tokens": 43494368.0, |
| "step": 10500 |
| }, |
| { |
| "entropy": 2.151867859363556, |
| "epoch": 0.4688888888888889, |
| "grad_norm": 9.25, |
| "learning_rate": 1.0951660939289805e-05, |
| "loss": 0.8053, |
| "mean_token_accuracy": 0.7571006786823272, |
| "num_tokens": 43560267.0, |
| "step": 10550 |
| }, |
| { |
| "entropy": 2.1284614849090575, |
| "epoch": 0.4711111111111111, |
| "grad_norm": 6.6875, |
| "learning_rate": 1.0905841924398626e-05, |
| "loss": 0.8309, |
| "mean_token_accuracy": 0.752720388174057, |
| "num_tokens": 43626812.0, |
| "step": 10600 |
| }, |
| { |
| "entropy": 2.1522640681266783, |
| "epoch": 0.47333333333333333, |
| "grad_norm": 8.4375, |
| "learning_rate": 1.0860022909507446e-05, |
| "loss": 0.8633, |
| "mean_token_accuracy": 0.7454240775108337, |
| "num_tokens": 43693459.0, |
| "step": 10650 |
| }, |
| { |
| "entropy": 2.1508279252052307, |
| "epoch": 0.47555555555555556, |
| "grad_norm": 7.78125, |
| "learning_rate": 1.0814203894616267e-05, |
| "loss": 0.7701, |
| "mean_token_accuracy": 0.767644681930542, |
| "num_tokens": 43758441.0, |
| "step": 10700 |
| }, |
| { |
| "entropy": 2.1406164813041686, |
| "epoch": 0.4777777777777778, |
| "grad_norm": 8.3125, |
| "learning_rate": 1.0768384879725086e-05, |
| "loss": 0.8299, |
| "mean_token_accuracy": 0.7507808887958527, |
| "num_tokens": 43827587.0, |
| "step": 10750 |
| }, |
| { |
| "entropy": 2.0773234295845033, |
| "epoch": 0.48, |
| "grad_norm": 6.65625, |
| "learning_rate": 1.0722565864833908e-05, |
| "loss": 0.8059, |
| "mean_token_accuracy": 0.7575164914131165, |
| "num_tokens": 43893454.0, |
| "step": 10800 |
| }, |
| { |
| "entropy": 2.0964699006080627, |
| "epoch": 0.4822222222222222, |
| "grad_norm": 7.09375, |
| "learning_rate": 1.0676746849942727e-05, |
| "loss": 0.8538, |
| "mean_token_accuracy": 0.7474709522724151, |
| "num_tokens": 43961221.0, |
| "step": 10850 |
| }, |
| { |
| "entropy": 2.127323613166809, |
| "epoch": 0.48444444444444446, |
| "grad_norm": 8.25, |
| "learning_rate": 1.0630927835051548e-05, |
| "loss": 0.823, |
| "mean_token_accuracy": 0.7566704392433167, |
| "num_tokens": 44030179.0, |
| "step": 10900 |
| }, |
| { |
| "entropy": 2.1277793073654174, |
| "epoch": 0.4866666666666667, |
| "grad_norm": 8.0, |
| "learning_rate": 1.0585108820160368e-05, |
| "loss": 0.7951, |
| "mean_token_accuracy": 0.7619584739208222, |
| "num_tokens": 44096190.0, |
| "step": 10950 |
| }, |
| { |
| "entropy": 2.100730609893799, |
| "epoch": 0.4888888888888889, |
| "grad_norm": 7.5625, |
| "learning_rate": 1.0539289805269189e-05, |
| "loss": 0.8355, |
| "mean_token_accuracy": 0.7507687473297119, |
| "num_tokens": 44164149.0, |
| "step": 11000 |
| }, |
| { |
| "entropy": 2.1630342602729797, |
| "epoch": 0.4911111111111111, |
| "grad_norm": 9.0, |
| "learning_rate": 1.0493470790378008e-05, |
| "loss": 0.8394, |
| "mean_token_accuracy": 0.7504077112674713, |
| "num_tokens": 44232826.0, |
| "step": 11050 |
| }, |
| { |
| "entropy": 2.135989320278168, |
| "epoch": 0.49333333333333335, |
| "grad_norm": 7.25, |
| "learning_rate": 1.044765177548683e-05, |
| "loss": 0.826, |
| "mean_token_accuracy": 0.7503847754001618, |
| "num_tokens": 44300785.0, |
| "step": 11100 |
| }, |
| { |
| "entropy": 2.1245554232597352, |
| "epoch": 0.4955555555555556, |
| "grad_norm": 13.125, |
| "learning_rate": 1.0401832760595647e-05, |
| "loss": 0.8276, |
| "mean_token_accuracy": 0.7498899948596954, |
| "num_tokens": 44366272.0, |
| "step": 11150 |
| }, |
| { |
| "entropy": 2.0881606268882753, |
| "epoch": 0.49777777777777776, |
| "grad_norm": 8.5625, |
| "learning_rate": 1.0356013745704467e-05, |
| "loss": 0.8272, |
| "mean_token_accuracy": 0.7556144452095032, |
| "num_tokens": 44432287.0, |
| "step": 11200 |
| }, |
| { |
| "entropy": 2.1587840700149536, |
| "epoch": 0.5, |
| "grad_norm": 22.625, |
| "learning_rate": 1.0310194730813288e-05, |
| "loss": 0.8169, |
| "mean_token_accuracy": 0.7587198996543885, |
| "num_tokens": 44497099.0, |
| "step": 11250 |
| }, |
| { |
| "entropy": 2.059075405597687, |
| "epoch": 0.5022222222222222, |
| "grad_norm": 6.15625, |
| "learning_rate": 1.0264375715922108e-05, |
| "loss": 0.8128, |
| "mean_token_accuracy": 0.7540943920612335, |
| "num_tokens": 44568074.0, |
| "step": 11300 |
| }, |
| { |
| "entropy": 2.132368493080139, |
| "epoch": 0.5044444444444445, |
| "grad_norm": 5.65625, |
| "learning_rate": 1.0218556701030929e-05, |
| "loss": 0.7915, |
| "mean_token_accuracy": 0.7632870030403137, |
| "num_tokens": 44634710.0, |
| "step": 11350 |
| }, |
| { |
| "entropy": 2.075038847923279, |
| "epoch": 0.5066666666666667, |
| "grad_norm": 11.3125, |
| "learning_rate": 1.0172737686139748e-05, |
| "loss": 0.8615, |
| "mean_token_accuracy": 0.7465237331390381, |
| "num_tokens": 44705095.0, |
| "step": 11400 |
| }, |
| { |
| "entropy": 2.11560346364975, |
| "epoch": 0.5088888888888888, |
| "grad_norm": 7.3125, |
| "learning_rate": 1.012691867124857e-05, |
| "loss": 0.8016, |
| "mean_token_accuracy": 0.758952580690384, |
| "num_tokens": 44772217.0, |
| "step": 11450 |
| }, |
| { |
| "entropy": 2.1024736380577087, |
| "epoch": 0.5111111111111111, |
| "grad_norm": 13.5625, |
| "learning_rate": 1.0081099656357389e-05, |
| "loss": 0.7976, |
| "mean_token_accuracy": 0.7605179595947266, |
| "num_tokens": 44840575.0, |
| "step": 11500 |
| }, |
| { |
| "entropy": 2.145506045818329, |
| "epoch": 0.5133333333333333, |
| "grad_norm": 7.03125, |
| "learning_rate": 1.003528064146621e-05, |
| "loss": 0.8572, |
| "mean_token_accuracy": 0.7433122813701629, |
| "num_tokens": 44906721.0, |
| "step": 11550 |
| }, |
| { |
| "entropy": 2.1501906871795655, |
| "epoch": 0.5155555555555555, |
| "grad_norm": 7.9375, |
| "learning_rate": 9.98946162657503e-06, |
| "loss": 0.8199, |
| "mean_token_accuracy": 0.7566520750522614, |
| "num_tokens": 44973401.0, |
| "step": 11600 |
| }, |
| { |
| "entropy": 2.180145356655121, |
| "epoch": 0.5177777777777778, |
| "grad_norm": 7.96875, |
| "learning_rate": 9.943642611683849e-06, |
| "loss": 0.8497, |
| "mean_token_accuracy": 0.747704974412918, |
| "num_tokens": 45038114.0, |
| "step": 11650 |
| }, |
| { |
| "entropy": 2.1444980192184446, |
| "epoch": 0.52, |
| "grad_norm": 7.3125, |
| "learning_rate": 9.897823596792669e-06, |
| "loss": 0.8054, |
| "mean_token_accuracy": 0.7571915662288666, |
| "num_tokens": 45105740.0, |
| "step": 11700 |
| }, |
| { |
| "entropy": 2.1226493763923644, |
| "epoch": 0.5222222222222223, |
| "grad_norm": 8.6875, |
| "learning_rate": 9.85200458190149e-06, |
| "loss": 0.8543, |
| "mean_token_accuracy": 0.74732057929039, |
| "num_tokens": 45173497.0, |
| "step": 11750 |
| }, |
| { |
| "entropy": 2.105964336395264, |
| "epoch": 0.5244444444444445, |
| "grad_norm": 6.90625, |
| "learning_rate": 9.80618556701031e-06, |
| "loss": 0.8664, |
| "mean_token_accuracy": 0.7433609414100647, |
| "num_tokens": 45240881.0, |
| "step": 11800 |
| }, |
| { |
| "entropy": 2.1655562567710875, |
| "epoch": 0.5266666666666666, |
| "grad_norm": 7.78125, |
| "learning_rate": 9.76036655211913e-06, |
| "loss": 0.8134, |
| "mean_token_accuracy": 0.7605512762069702, |
| "num_tokens": 45306709.0, |
| "step": 11850 |
| }, |
| { |
| "entropy": 2.115200798511505, |
| "epoch": 0.5288888888888889, |
| "grad_norm": 8.125, |
| "learning_rate": 9.71454753722795e-06, |
| "loss": 0.7942, |
| "mean_token_accuracy": 0.76005859375, |
| "num_tokens": 45373791.0, |
| "step": 11900 |
| }, |
| { |
| "entropy": 2.1086077737808226, |
| "epoch": 0.5311111111111111, |
| "grad_norm": 8.375, |
| "learning_rate": 9.668728522336771e-06, |
| "loss": 0.832, |
| "mean_token_accuracy": 0.7541840577125549, |
| "num_tokens": 45442329.0, |
| "step": 11950 |
| }, |
| { |
| "entropy": 2.1453971552848814, |
| "epoch": 0.5333333333333333, |
| "grad_norm": 9.625, |
| "learning_rate": 9.62290950744559e-06, |
| "loss": 0.8618, |
| "mean_token_accuracy": 0.74840935587883, |
| "num_tokens": 45509133.0, |
| "step": 12000 |
| }, |
| { |
| "entropy": 2.185004472732544, |
| "epoch": 0.5355555555555556, |
| "grad_norm": 6.46875, |
| "learning_rate": 9.57709049255441e-06, |
| "loss": 0.8269, |
| "mean_token_accuracy": 0.75218052983284, |
| "num_tokens": 45574414.0, |
| "step": 12050 |
| }, |
| { |
| "entropy": 2.0952596497535705, |
| "epoch": 0.5377777777777778, |
| "grad_norm": 7.59375, |
| "learning_rate": 9.531271477663231e-06, |
| "loss": 0.8312, |
| "mean_token_accuracy": 0.7515332329273224, |
| "num_tokens": 45645759.0, |
| "step": 12100 |
| }, |
| { |
| "entropy": 2.1197858357429507, |
| "epoch": 0.54, |
| "grad_norm": 8.0, |
| "learning_rate": 9.48545246277205e-06, |
| "loss": 0.8165, |
| "mean_token_accuracy": 0.7548327159881592, |
| "num_tokens": 45711167.0, |
| "step": 12150 |
| }, |
| { |
| "entropy": 2.0839271712303162, |
| "epoch": 0.5422222222222223, |
| "grad_norm": 7.34375, |
| "learning_rate": 9.439633447880872e-06, |
| "loss": 0.8007, |
| "mean_token_accuracy": 0.7602275168895721, |
| "num_tokens": 45781410.0, |
| "step": 12200 |
| }, |
| { |
| "entropy": 2.1347982597351076, |
| "epoch": 0.5444444444444444, |
| "grad_norm": 7.09375, |
| "learning_rate": 9.393814432989692e-06, |
| "loss": 0.8163, |
| "mean_token_accuracy": 0.752629029750824, |
| "num_tokens": 45845816.0, |
| "step": 12250 |
| }, |
| { |
| "entropy": 2.124971535205841, |
| "epoch": 0.5466666666666666, |
| "grad_norm": 8.1875, |
| "learning_rate": 9.347995418098513e-06, |
| "loss": 0.8166, |
| "mean_token_accuracy": 0.7571777141094208, |
| "num_tokens": 45911093.0, |
| "step": 12300 |
| }, |
| { |
| "entropy": 2.133582751750946, |
| "epoch": 0.5488888888888889, |
| "grad_norm": 7.8125, |
| "learning_rate": 9.302176403207332e-06, |
| "loss": 0.8724, |
| "mean_token_accuracy": 0.7431828391551971, |
| "num_tokens": 45980429.0, |
| "step": 12350 |
| }, |
| { |
| "entropy": 2.102145965099335, |
| "epoch": 0.5511111111111111, |
| "grad_norm": 6.8125, |
| "learning_rate": 9.256357388316152e-06, |
| "loss": 0.799, |
| "mean_token_accuracy": 0.7613983142375946, |
| "num_tokens": 46047747.0, |
| "step": 12400 |
| }, |
| { |
| "entropy": 2.19835401058197, |
| "epoch": 0.5533333333333333, |
| "grad_norm": 8.25, |
| "learning_rate": 9.210538373424973e-06, |
| "loss": 0.8433, |
| "mean_token_accuracy": 0.750613443851471, |
| "num_tokens": 46109196.0, |
| "step": 12450 |
| }, |
| { |
| "entropy": 2.192390067577362, |
| "epoch": 0.5555555555555556, |
| "grad_norm": 6.6875, |
| "learning_rate": 9.164719358533792e-06, |
| "loss": 0.8233, |
| "mean_token_accuracy": 0.7548783671855926, |
| "num_tokens": 46175945.0, |
| "step": 12500 |
| }, |
| { |
| "entropy": 2.1229864621162413, |
| "epoch": 0.5577777777777778, |
| "grad_norm": 7.0625, |
| "learning_rate": 9.118900343642612e-06, |
| "loss": 0.8412, |
| "mean_token_accuracy": 0.7494572842121124, |
| "num_tokens": 46241375.0, |
| "step": 12550 |
| }, |
| { |
| "entropy": 2.1183197784423826, |
| "epoch": 0.56, |
| "grad_norm": 6.875, |
| "learning_rate": 9.073081328751433e-06, |
| "loss": 0.7941, |
| "mean_token_accuracy": 0.7617311882972717, |
| "num_tokens": 46309203.0, |
| "step": 12600 |
| }, |
| { |
| "entropy": 2.1563652181625366, |
| "epoch": 0.5622222222222222, |
| "grad_norm": 7.34375, |
| "learning_rate": 9.027262313860253e-06, |
| "loss": 0.8333, |
| "mean_token_accuracy": 0.7494297361373902, |
| "num_tokens": 46377337.0, |
| "step": 12650 |
| }, |
| { |
| "entropy": 2.149436049461365, |
| "epoch": 0.5644444444444444, |
| "grad_norm": 9.0, |
| "learning_rate": 8.981443298969072e-06, |
| "loss": 0.8053, |
| "mean_token_accuracy": 0.7584555840492249, |
| "num_tokens": 46444416.0, |
| "step": 12700 |
| }, |
| { |
| "entropy": 2.1461888194084167, |
| "epoch": 0.5666666666666667, |
| "grad_norm": 6.6875, |
| "learning_rate": 8.935624284077893e-06, |
| "loss": 0.8286, |
| "mean_token_accuracy": 0.7490329504013061, |
| "num_tokens": 46513786.0, |
| "step": 12750 |
| }, |
| { |
| "entropy": 2.149775302410126, |
| "epoch": 0.5688888888888889, |
| "grad_norm": 7.5625, |
| "learning_rate": 8.889805269186713e-06, |
| "loss": 0.8296, |
| "mean_token_accuracy": 0.7498440980911255, |
| "num_tokens": 46580058.0, |
| "step": 12800 |
| }, |
| { |
| "entropy": 2.1283646035194397, |
| "epoch": 0.5711111111111111, |
| "grad_norm": 6.96875, |
| "learning_rate": 8.843986254295534e-06, |
| "loss": 0.8538, |
| "mean_token_accuracy": 0.7479376924037934, |
| "num_tokens": 46645685.0, |
| "step": 12850 |
| }, |
| { |
| "entropy": 2.136440978050232, |
| "epoch": 0.5733333333333334, |
| "grad_norm": 10.1875, |
| "learning_rate": 8.798167239404353e-06, |
| "loss": 0.8558, |
| "mean_token_accuracy": 0.7462117183208465, |
| "num_tokens": 46713622.0, |
| "step": 12900 |
| }, |
| { |
| "entropy": 2.044099681377411, |
| "epoch": 0.5755555555555556, |
| "grad_norm": 7.65625, |
| "learning_rate": 8.752348224513175e-06, |
| "loss": 0.8078, |
| "mean_token_accuracy": 0.755551530122757, |
| "num_tokens": 46784415.0, |
| "step": 12950 |
| }, |
| { |
| "entropy": 2.1400314664840696, |
| "epoch": 0.5777777777777777, |
| "grad_norm": 9.1875, |
| "learning_rate": 8.706529209621994e-06, |
| "loss": 0.7706, |
| "mean_token_accuracy": 0.7651760971546173, |
| "num_tokens": 46849509.0, |
| "step": 13000 |
| }, |
| { |
| "entropy": 2.1189517760276795, |
| "epoch": 0.58, |
| "grad_norm": 8.0, |
| "learning_rate": 8.660710194730814e-06, |
| "loss": 0.8118, |
| "mean_token_accuracy": 0.7566381883621216, |
| "num_tokens": 46916191.0, |
| "step": 13050 |
| }, |
| { |
| "entropy": 2.107829716205597, |
| "epoch": 0.5822222222222222, |
| "grad_norm": 6.5625, |
| "learning_rate": 8.614891179839635e-06, |
| "loss": 0.7543, |
| "mean_token_accuracy": 0.7710553121566772, |
| "num_tokens": 46984133.0, |
| "step": 13100 |
| }, |
| { |
| "entropy": 2.2209577679634096, |
| "epoch": 0.5844444444444444, |
| "grad_norm": 10.4375, |
| "learning_rate": 8.569072164948454e-06, |
| "loss": 0.8554, |
| "mean_token_accuracy": 0.7479790794849396, |
| "num_tokens": 47046937.0, |
| "step": 13150 |
| }, |
| { |
| "entropy": 2.1490628361701964, |
| "epoch": 0.5866666666666667, |
| "grad_norm": 7.40625, |
| "learning_rate": 8.523253150057275e-06, |
| "loss": 0.8488, |
| "mean_token_accuracy": 0.7476764714717865, |
| "num_tokens": 47115124.0, |
| "step": 13200 |
| }, |
| { |
| "entropy": 2.1350867557525635, |
| "epoch": 0.5888888888888889, |
| "grad_norm": 7.4375, |
| "learning_rate": 8.477434135166095e-06, |
| "loss": 0.8266, |
| "mean_token_accuracy": 0.7515107154846191, |
| "num_tokens": 47183436.0, |
| "step": 13250 |
| }, |
| { |
| "entropy": 2.203930015563965, |
| "epoch": 0.5911111111111111, |
| "grad_norm": 7.375, |
| "learning_rate": 8.431615120274916e-06, |
| "loss": 0.8206, |
| "mean_token_accuracy": 0.758274735212326, |
| "num_tokens": 47250973.0, |
| "step": 13300 |
| }, |
| { |
| "entropy": 2.17663028717041, |
| "epoch": 0.5933333333333334, |
| "grad_norm": 7.125, |
| "learning_rate": 8.385796105383734e-06, |
| "loss": 0.8627, |
| "mean_token_accuracy": 0.7431224703788757, |
| "num_tokens": 47319629.0, |
| "step": 13350 |
| }, |
| { |
| "entropy": 2.1186418867111207, |
| "epoch": 0.5955555555555555, |
| "grad_norm": 8.0625, |
| "learning_rate": 8.339977090492555e-06, |
| "loss": 0.8079, |
| "mean_token_accuracy": 0.7619412040710449, |
| "num_tokens": 47387168.0, |
| "step": 13400 |
| }, |
| { |
| "entropy": 2.1040466976165773, |
| "epoch": 0.5977777777777777, |
| "grad_norm": 8.0625, |
| "learning_rate": 8.294158075601375e-06, |
| "loss": 0.8375, |
| "mean_token_accuracy": 0.7491306090354919, |
| "num_tokens": 47458980.0, |
| "step": 13450 |
| }, |
| { |
| "entropy": 2.1847296571731567, |
| "epoch": 0.6, |
| "grad_norm": 7.75, |
| "learning_rate": 8.248339060710196e-06, |
| "loss": 0.8656, |
| "mean_token_accuracy": 0.7442897510528564, |
| "num_tokens": 47527776.0, |
| "step": 13500 |
| }, |
| { |
| "entropy": 2.1727059721946715, |
| "epoch": 0.6022222222222222, |
| "grad_norm": 8.875, |
| "learning_rate": 8.202520045819015e-06, |
| "loss": 0.8416, |
| "mean_token_accuracy": 0.7498923110961914, |
| "num_tokens": 47592067.0, |
| "step": 13550 |
| }, |
| { |
| "entropy": 2.146069450378418, |
| "epoch": 0.6044444444444445, |
| "grad_norm": 8.1875, |
| "learning_rate": 8.156701030927836e-06, |
| "loss": 0.8304, |
| "mean_token_accuracy": 0.7521639728546142, |
| "num_tokens": 47660520.0, |
| "step": 13600 |
| }, |
| { |
| "entropy": 2.1906869626045227, |
| "epoch": 0.6066666666666667, |
| "grad_norm": 8.0625, |
| "learning_rate": 8.110882016036656e-06, |
| "loss": 0.8126, |
| "mean_token_accuracy": 0.7562307071685791, |
| "num_tokens": 47726291.0, |
| "step": 13650 |
| }, |
| { |
| "entropy": 2.256641490459442, |
| "epoch": 0.6088888888888889, |
| "grad_norm": 8.6875, |
| "learning_rate": 8.065063001145475e-06, |
| "loss": 0.8229, |
| "mean_token_accuracy": 0.7553273499011993, |
| "num_tokens": 47789796.0, |
| "step": 13700 |
| }, |
| { |
| "entropy": 2.15241749048233, |
| "epoch": 0.6111111111111112, |
| "grad_norm": 7.625, |
| "learning_rate": 8.019243986254297e-06, |
| "loss": 0.7972, |
| "mean_token_accuracy": 0.7611013460159302, |
| "num_tokens": 47854791.0, |
| "step": 13750 |
| }, |
| { |
| "entropy": 2.1587498426437377, |
| "epoch": 0.6133333333333333, |
| "grad_norm": 10.1875, |
| "learning_rate": 7.973424971363116e-06, |
| "loss": 0.8219, |
| "mean_token_accuracy": 0.7551308906078339, |
| "num_tokens": 47922964.0, |
| "step": 13800 |
| }, |
| { |
| "entropy": 2.141474585533142, |
| "epoch": 0.6155555555555555, |
| "grad_norm": 8.875, |
| "learning_rate": 7.927605956471937e-06, |
| "loss": 0.8584, |
| "mean_token_accuracy": 0.7452862620353699, |
| "num_tokens": 47989985.0, |
| "step": 13850 |
| }, |
| { |
| "entropy": 2.116554036140442, |
| "epoch": 0.6177777777777778, |
| "grad_norm": 7.0625, |
| "learning_rate": 7.881786941580757e-06, |
| "loss": 0.7575, |
| "mean_token_accuracy": 0.7697462677955628, |
| "num_tokens": 48055485.0, |
| "step": 13900 |
| }, |
| { |
| "entropy": 2.145723969936371, |
| "epoch": 0.62, |
| "grad_norm": 7.21875, |
| "learning_rate": 7.835967926689578e-06, |
| "loss": 0.8424, |
| "mean_token_accuracy": 0.7527009093761444, |
| "num_tokens": 48123791.0, |
| "step": 13950 |
| }, |
| { |
| "entropy": 2.228086581230164, |
| "epoch": 0.6222222222222222, |
| "grad_norm": 16.25, |
| "learning_rate": 7.790148911798397e-06, |
| "loss": 0.8609, |
| "mean_token_accuracy": 0.7481016480922699, |
| "num_tokens": 48188322.0, |
| "step": 14000 |
| }, |
| { |
| "entropy": 2.1463445258140563, |
| "epoch": 0.6244444444444445, |
| "grad_norm": 8.0, |
| "learning_rate": 7.744329896907217e-06, |
| "loss": 0.7909, |
| "mean_token_accuracy": 0.7623829674720765, |
| "num_tokens": 48256913.0, |
| "step": 14050 |
| }, |
| { |
| "entropy": 2.1033449459075926, |
| "epoch": 0.6266666666666667, |
| "grad_norm": 6.5, |
| "learning_rate": 7.698510882016036e-06, |
| "loss": 0.8107, |
| "mean_token_accuracy": 0.7558232533931732, |
| "num_tokens": 48322383.0, |
| "step": 14100 |
| }, |
| { |
| "entropy": 2.1505328583717347, |
| "epoch": 0.6288888888888889, |
| "grad_norm": 8.0, |
| "learning_rate": 7.652691867124858e-06, |
| "loss": 0.8591, |
| "mean_token_accuracy": 0.7444049680233001, |
| "num_tokens": 48390728.0, |
| "step": 14150 |
| }, |
| { |
| "entropy": 2.1623160433769226, |
| "epoch": 0.6311111111111111, |
| "grad_norm": 7.875, |
| "learning_rate": 7.606872852233678e-06, |
| "loss": 0.783, |
| "mean_token_accuracy": 0.7643996751308442, |
| "num_tokens": 48457254.0, |
| "step": 14200 |
| }, |
| { |
| "entropy": 2.1291307163238526, |
| "epoch": 0.6333333333333333, |
| "grad_norm": 9.125, |
| "learning_rate": 7.561053837342498e-06, |
| "loss": 0.8297, |
| "mean_token_accuracy": 0.755406551361084, |
| "num_tokens": 48526165.0, |
| "step": 14250 |
| }, |
| { |
| "entropy": 2.0992329573631285, |
| "epoch": 0.6355555555555555, |
| "grad_norm": 9.5625, |
| "learning_rate": 7.515234822451319e-06, |
| "loss": 0.8357, |
| "mean_token_accuracy": 0.7475246119499207, |
| "num_tokens": 48594409.0, |
| "step": 14300 |
| }, |
| { |
| "entropy": 2.096374764442444, |
| "epoch": 0.6377777777777778, |
| "grad_norm": 7.5, |
| "learning_rate": 7.469415807560139e-06, |
| "loss": 0.7929, |
| "mean_token_accuracy": 0.7609642744064331, |
| "num_tokens": 48662016.0, |
| "step": 14350 |
| }, |
| { |
| "entropy": 2.1556865262985228, |
| "epoch": 0.64, |
| "grad_norm": 7.96875, |
| "learning_rate": 7.4235967926689576e-06, |
| "loss": 0.8432, |
| "mean_token_accuracy": 0.7489691793918609, |
| "num_tokens": 48727783.0, |
| "step": 14400 |
| }, |
| { |
| "entropy": 2.2297734808921814, |
| "epoch": 0.6422222222222222, |
| "grad_norm": 7.25, |
| "learning_rate": 7.377777777777778e-06, |
| "loss": 0.845, |
| "mean_token_accuracy": 0.7471547079086304, |
| "num_tokens": 48795872.0, |
| "step": 14450 |
| }, |
| { |
| "entropy": 2.093806471824646, |
| "epoch": 0.6444444444444445, |
| "grad_norm": 9.875, |
| "learning_rate": 7.331958762886598e-06, |
| "loss": 0.7843, |
| "mean_token_accuracy": 0.764157919883728, |
| "num_tokens": 48864940.0, |
| "step": 14500 |
| }, |
| { |
| "entropy": 2.1970014452934263, |
| "epoch": 0.6466666666666666, |
| "grad_norm": 6.96875, |
| "learning_rate": 7.286139747995419e-06, |
| "loss": 0.8726, |
| "mean_token_accuracy": 0.7447559666633606, |
| "num_tokens": 48932155.0, |
| "step": 14550 |
| }, |
| { |
| "entropy": 2.173217701911926, |
| "epoch": 0.6488888888888888, |
| "grad_norm": 7.0, |
| "learning_rate": 7.240320733104239e-06, |
| "loss": 0.8209, |
| "mean_token_accuracy": 0.7533725011348724, |
| "num_tokens": 48996865.0, |
| "step": 14600 |
| }, |
| { |
| "entropy": 2.1182384943962096, |
| "epoch": 0.6511111111111111, |
| "grad_norm": 7.25, |
| "learning_rate": 7.194501718213059e-06, |
| "loss": 0.8433, |
| "mean_token_accuracy": 0.743918879032135, |
| "num_tokens": 49062177.0, |
| "step": 14650 |
| }, |
| { |
| "entropy": 2.182834794521332, |
| "epoch": 0.6533333333333333, |
| "grad_norm": 15.5, |
| "learning_rate": 7.148682703321879e-06, |
| "loss": 0.8163, |
| "mean_token_accuracy": 0.7540112996101379, |
| "num_tokens": 49126666.0, |
| "step": 14700 |
| }, |
| { |
| "entropy": 2.1658547282218934, |
| "epoch": 0.6555555555555556, |
| "grad_norm": 7.0625, |
| "learning_rate": 7.102863688430699e-06, |
| "loss": 0.8318, |
| "mean_token_accuracy": 0.7548858106136322, |
| "num_tokens": 49193802.0, |
| "step": 14750 |
| }, |
| { |
| "entropy": 2.22178261756897, |
| "epoch": 0.6577777777777778, |
| "grad_norm": 7.34375, |
| "learning_rate": 7.0570446735395194e-06, |
| "loss": 0.8171, |
| "mean_token_accuracy": 0.755473929643631, |
| "num_tokens": 49256927.0, |
| "step": 14800 |
| }, |
| { |
| "entropy": 2.1972863483428955, |
| "epoch": 0.66, |
| "grad_norm": 17.25, |
| "learning_rate": 7.01122565864834e-06, |
| "loss": 0.868, |
| "mean_token_accuracy": 0.7451527345180512, |
| "num_tokens": 49324296.0, |
| "step": 14850 |
| }, |
| { |
| "entropy": 2.13280175447464, |
| "epoch": 0.6622222222222223, |
| "grad_norm": 7.28125, |
| "learning_rate": 6.96540664375716e-06, |
| "loss": 0.8565, |
| "mean_token_accuracy": 0.7481217682361603, |
| "num_tokens": 49393359.0, |
| "step": 14900 |
| }, |
| { |
| "entropy": 2.223068025112152, |
| "epoch": 0.6644444444444444, |
| "grad_norm": 9.9375, |
| "learning_rate": 6.9195876288659804e-06, |
| "loss": 0.8344, |
| "mean_token_accuracy": 0.7528722262382508, |
| "num_tokens": 49455849.0, |
| "step": 14950 |
| }, |
| { |
| "entropy": 2.1984812426567077, |
| "epoch": 0.6666666666666666, |
| "grad_norm": 7.875, |
| "learning_rate": 6.873768613974801e-06, |
| "loss": 0.8534, |
| "mean_token_accuracy": 0.7472201347351074, |
| "num_tokens": 49521825.0, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.6666666666666666, |
| "eval_entropy": 2.0471889674663544, |
| "eval_loss": 0.8776370286941528, |
| "eval_mean_token_accuracy": 0.7463443726301193, |
| "eval_num_tokens": 49521825.0, |
| "eval_runtime": 5.8695, |
| "eval_samples_per_second": 2.215, |
| "eval_steps_per_second": 0.681, |
| "step": 15000 |
| }, |
| { |
| "entropy": 2.1753278255462645, |
| "epoch": 0.6688888888888889, |
| "grad_norm": 8.6875, |
| "learning_rate": 6.8279495990836194e-06, |
| "loss": 0.8559, |
| "mean_token_accuracy": 0.7446865785121918, |
| "num_tokens": 49586773.0, |
| "step": 15050 |
| }, |
| { |
| "entropy": 2.137886700630188, |
| "epoch": 0.6711111111111111, |
| "grad_norm": 8.6875, |
| "learning_rate": 6.78213058419244e-06, |
| "loss": 0.8128, |
| "mean_token_accuracy": 0.757747368812561, |
| "num_tokens": 49654785.0, |
| "step": 15100 |
| }, |
| { |
| "entropy": 2.1542887043952943, |
| "epoch": 0.6733333333333333, |
| "grad_norm": 9.5625, |
| "learning_rate": 6.73631156930126e-06, |
| "loss": 0.8498, |
| "mean_token_accuracy": 0.7489660286903381, |
| "num_tokens": 49720666.0, |
| "step": 15150 |
| }, |
| { |
| "entropy": 2.1714869332313538, |
| "epoch": 0.6755555555555556, |
| "grad_norm": 8.25, |
| "learning_rate": 6.6904925544100804e-06, |
| "loss": 0.8183, |
| "mean_token_accuracy": 0.7538860237598419, |
| "num_tokens": 49790606.0, |
| "step": 15200 |
| }, |
| { |
| "entropy": 2.1753248810768127, |
| "epoch": 0.6777777777777778, |
| "grad_norm": 7.96875, |
| "learning_rate": 6.644673539518901e-06, |
| "loss": 0.8017, |
| "mean_token_accuracy": 0.7590576016902923, |
| "num_tokens": 49855120.0, |
| "step": 15250 |
| }, |
| { |
| "entropy": 2.1427893018722535, |
| "epoch": 0.68, |
| "grad_norm": 10.25, |
| "learning_rate": 6.598854524627721e-06, |
| "loss": 0.8308, |
| "mean_token_accuracy": 0.7554869890213013, |
| "num_tokens": 49923395.0, |
| "step": 15300 |
| }, |
| { |
| "entropy": 2.0930906391143798, |
| "epoch": 0.6822222222222222, |
| "grad_norm": 8.1875, |
| "learning_rate": 6.5530355097365415e-06, |
| "loss": 0.8226, |
| "mean_token_accuracy": 0.7525417697429657, |
| "num_tokens": 49988632.0, |
| "step": 15350 |
| }, |
| { |
| "entropy": 2.183457748889923, |
| "epoch": 0.6844444444444444, |
| "grad_norm": 7.34375, |
| "learning_rate": 6.507216494845361e-06, |
| "loss": 0.8382, |
| "mean_token_accuracy": 0.7489526784420013, |
| "num_tokens": 50055473.0, |
| "step": 15400 |
| }, |
| { |
| "entropy": 2.1905418968200685, |
| "epoch": 0.6866666666666666, |
| "grad_norm": 8.25, |
| "learning_rate": 6.461397479954181e-06, |
| "loss": 0.8143, |
| "mean_token_accuracy": 0.7581993770599366, |
| "num_tokens": 50122557.0, |
| "step": 15450 |
| }, |
| { |
| "entropy": 2.1051335525512695, |
| "epoch": 0.6888888888888889, |
| "grad_norm": 8.25, |
| "learning_rate": 6.415578465063002e-06, |
| "loss": 0.8459, |
| "mean_token_accuracy": 0.7460039758682251, |
| "num_tokens": 50190091.0, |
| "step": 15500 |
| }, |
| { |
| "entropy": 2.191064128875732, |
| "epoch": 0.6911111111111111, |
| "grad_norm": 6.96875, |
| "learning_rate": 6.369759450171822e-06, |
| "loss": 0.8458, |
| "mean_token_accuracy": 0.7485956978797913, |
| "num_tokens": 50255412.0, |
| "step": 15550 |
| }, |
| { |
| "entropy": 2.1825055122375487, |
| "epoch": 0.6933333333333334, |
| "grad_norm": 8.25, |
| "learning_rate": 6.323940435280642e-06, |
| "loss": 0.8686, |
| "mean_token_accuracy": 0.7439832353591919, |
| "num_tokens": 50323487.0, |
| "step": 15600 |
| }, |
| { |
| "entropy": 2.2044737410545348, |
| "epoch": 0.6955555555555556, |
| "grad_norm": 6.71875, |
| "learning_rate": 6.278121420389463e-06, |
| "loss": 0.8422, |
| "mean_token_accuracy": 0.7480260360240937, |
| "num_tokens": 50389804.0, |
| "step": 15650 |
| }, |
| { |
| "entropy": 2.194586501121521, |
| "epoch": 0.6977777777777778, |
| "grad_norm": 7.65625, |
| "learning_rate": 6.232302405498283e-06, |
| "loss": 0.8351, |
| "mean_token_accuracy": 0.7537850439548492, |
| "num_tokens": 50452603.0, |
| "step": 15700 |
| }, |
| { |
| "entropy": 2.1760614275932313, |
| "epoch": 0.7, |
| "grad_norm": 7.90625, |
| "learning_rate": 6.186483390607102e-06, |
| "loss": 0.8462, |
| "mean_token_accuracy": 0.7481569695472717, |
| "num_tokens": 50522865.0, |
| "step": 15750 |
| }, |
| { |
| "entropy": 2.1524909257888796, |
| "epoch": 0.7022222222222222, |
| "grad_norm": 15.375, |
| "learning_rate": 6.140664375715922e-06, |
| "loss": 0.8255, |
| "mean_token_accuracy": 0.7523048520088196, |
| "num_tokens": 50592440.0, |
| "step": 15800 |
| }, |
| { |
| "entropy": 2.160615997314453, |
| "epoch": 0.7044444444444444, |
| "grad_norm": 8.1875, |
| "learning_rate": 6.094845360824742e-06, |
| "loss": 0.8195, |
| "mean_token_accuracy": 0.7542084169387817, |
| "num_tokens": 50661169.0, |
| "step": 15850 |
| }, |
| { |
| "entropy": 2.1588108134269715, |
| "epoch": 0.7066666666666667, |
| "grad_norm": 7.28125, |
| "learning_rate": 6.049026345933563e-06, |
| "loss": 0.8343, |
| "mean_token_accuracy": 0.7516414785385132, |
| "num_tokens": 50730313.0, |
| "step": 15900 |
| }, |
| { |
| "entropy": 2.2916067910194395, |
| "epoch": 0.7088888888888889, |
| "grad_norm": 8.9375, |
| "learning_rate": 6.003207331042383e-06, |
| "loss": 0.8779, |
| "mean_token_accuracy": 0.7434639298915863, |
| "num_tokens": 50793896.0, |
| "step": 15950 |
| }, |
| { |
| "entropy": 2.165027015209198, |
| "epoch": 0.7111111111111111, |
| "grad_norm": 9.75, |
| "learning_rate": 5.957388316151203e-06, |
| "loss": 0.8546, |
| "mean_token_accuracy": 0.7454061865806579, |
| "num_tokens": 50861288.0, |
| "step": 16000 |
| }, |
| { |
| "entropy": 2.0912844824790953, |
| "epoch": 0.7133333333333334, |
| "grad_norm": 8.375, |
| "learning_rate": 5.911569301260024e-06, |
| "loss": 0.7921, |
| "mean_token_accuracy": 0.7610292685031891, |
| "num_tokens": 50928175.0, |
| "step": 16050 |
| }, |
| { |
| "entropy": 2.193835806846619, |
| "epoch": 0.7155555555555555, |
| "grad_norm": 9.375, |
| "learning_rate": 5.865750286368843e-06, |
| "loss": 0.8429, |
| "mean_token_accuracy": 0.748592312335968, |
| "num_tokens": 50991843.0, |
| "step": 16100 |
| }, |
| { |
| "entropy": 2.150286679267883, |
| "epoch": 0.7177777777777777, |
| "grad_norm": 8.0, |
| "learning_rate": 5.8199312714776635e-06, |
| "loss": 0.8731, |
| "mean_token_accuracy": 0.7444507765769959, |
| "num_tokens": 51057738.0, |
| "step": 16150 |
| }, |
| { |
| "entropy": 2.2080190443992613, |
| "epoch": 0.72, |
| "grad_norm": 6.59375, |
| "learning_rate": 5.774112256586484e-06, |
| "loss": 0.8394, |
| "mean_token_accuracy": 0.7506636822223663, |
| "num_tokens": 51122791.0, |
| "step": 16200 |
| }, |
| { |
| "entropy": 2.1831669211387634, |
| "epoch": 0.7222222222222222, |
| "grad_norm": 6.78125, |
| "learning_rate": 5.728293241695304e-06, |
| "loss": 0.8655, |
| "mean_token_accuracy": 0.7454936730861664, |
| "num_tokens": 51189529.0, |
| "step": 16250 |
| }, |
| { |
| "entropy": 2.187736349105835, |
| "epoch": 0.7244444444444444, |
| "grad_norm": 7.15625, |
| "learning_rate": 5.6824742268041245e-06, |
| "loss": 0.8503, |
| "mean_token_accuracy": 0.7467346620559693, |
| "num_tokens": 51255571.0, |
| "step": 16300 |
| }, |
| { |
| "entropy": 2.228825159072876, |
| "epoch": 0.7266666666666667, |
| "grad_norm": 7.09375, |
| "learning_rate": 5.636655211912945e-06, |
| "loss": 0.8646, |
| "mean_token_accuracy": 0.7439834475517273, |
| "num_tokens": 51321902.0, |
| "step": 16350 |
| }, |
| { |
| "entropy": 2.1766190052032472, |
| "epoch": 0.7288888888888889, |
| "grad_norm": 8.0, |
| "learning_rate": 5.590836197021764e-06, |
| "loss": 0.8582, |
| "mean_token_accuracy": 0.7422134637832641, |
| "num_tokens": 51387236.0, |
| "step": 16400 |
| }, |
| { |
| "entropy": 2.1550874876976014, |
| "epoch": 0.7311111111111112, |
| "grad_norm": 6.84375, |
| "learning_rate": 5.545017182130585e-06, |
| "loss": 0.803, |
| "mean_token_accuracy": 0.7592871415615082, |
| "num_tokens": 51454340.0, |
| "step": 16450 |
| }, |
| { |
| "entropy": 2.1886205792427065, |
| "epoch": 0.7333333333333333, |
| "grad_norm": 8.5625, |
| "learning_rate": 5.499198167239405e-06, |
| "loss": 0.8312, |
| "mean_token_accuracy": 0.7522400307655335, |
| "num_tokens": 51521289.0, |
| "step": 16500 |
| }, |
| { |
| "entropy": 2.1630620193481445, |
| "epoch": 0.7355555555555555, |
| "grad_norm": 11.125, |
| "learning_rate": 5.4533791523482245e-06, |
| "loss": 0.8429, |
| "mean_token_accuracy": 0.7494550979137421, |
| "num_tokens": 51590453.0, |
| "step": 16550 |
| }, |
| { |
| "entropy": 2.2003714513778685, |
| "epoch": 0.7377777777777778, |
| "grad_norm": 7.90625, |
| "learning_rate": 5.407560137457045e-06, |
| "loss": 0.8594, |
| "mean_token_accuracy": 0.7431814324855804, |
| "num_tokens": 51659077.0, |
| "step": 16600 |
| }, |
| { |
| "entropy": 2.159868106842041, |
| "epoch": 0.74, |
| "grad_norm": 7.09375, |
| "learning_rate": 5.361741122565865e-06, |
| "loss": 0.8511, |
| "mean_token_accuracy": 0.7491097986698151, |
| "num_tokens": 51724327.0, |
| "step": 16650 |
| }, |
| { |
| "entropy": 2.1559503626823426, |
| "epoch": 0.7422222222222222, |
| "grad_norm": 7.03125, |
| "learning_rate": 5.3159221076746855e-06, |
| "loss": 0.866, |
| "mean_token_accuracy": 0.7423543095588684, |
| "num_tokens": 51793571.0, |
| "step": 16700 |
| }, |
| { |
| "entropy": 2.1431314539909363, |
| "epoch": 0.7444444444444445, |
| "grad_norm": 6.96875, |
| "learning_rate": 5.270103092783505e-06, |
| "loss": 0.8175, |
| "mean_token_accuracy": 0.7550452971458435, |
| "num_tokens": 51862535.0, |
| "step": 16750 |
| }, |
| { |
| "entropy": 2.1364932513237, |
| "epoch": 0.7466666666666667, |
| "grad_norm": 9.5, |
| "learning_rate": 5.224284077892325e-06, |
| "loss": 0.8622, |
| "mean_token_accuracy": 0.7470258843898773, |
| "num_tokens": 51930936.0, |
| "step": 16800 |
| }, |
| { |
| "entropy": 2.213224956989288, |
| "epoch": 0.7488888888888889, |
| "grad_norm": 7.0, |
| "learning_rate": 5.178465063001146e-06, |
| "loss": 0.8664, |
| "mean_token_accuracy": 0.7476742577552795, |
| "num_tokens": 51996312.0, |
| "step": 16850 |
| }, |
| { |
| "entropy": 2.3157504415512085, |
| "epoch": 0.7511111111111111, |
| "grad_norm": 7.53125, |
| "learning_rate": 5.132646048109966e-06, |
| "loss": 0.821, |
| "mean_token_accuracy": 0.7552366864681244, |
| "num_tokens": 52061146.0, |
| "step": 16900 |
| }, |
| { |
| "entropy": 2.258919379711151, |
| "epoch": 0.7533333333333333, |
| "grad_norm": 7.1875, |
| "learning_rate": 5.086827033218786e-06, |
| "loss": 0.8122, |
| "mean_token_accuracy": 0.7566414999961854, |
| "num_tokens": 52126822.0, |
| "step": 16950 |
| }, |
| { |
| "entropy": 2.2003828430175782, |
| "epoch": 0.7555555555555555, |
| "grad_norm": 8.6875, |
| "learning_rate": 5.041008018327607e-06, |
| "loss": 0.8321, |
| "mean_token_accuracy": 0.7529778003692627, |
| "num_tokens": 52192915.0, |
| "step": 17000 |
| }, |
| { |
| "entropy": 2.2184296226501465, |
| "epoch": 0.7577777777777778, |
| "grad_norm": 9.1875, |
| "learning_rate": 4.995189003436426e-06, |
| "loss": 0.8577, |
| "mean_token_accuracy": 0.7449932956695556, |
| "num_tokens": 52261692.0, |
| "step": 17050 |
| }, |
| { |
| "entropy": 2.171595447063446, |
| "epoch": 0.76, |
| "grad_norm": 9.125, |
| "learning_rate": 4.9493699885452465e-06, |
| "loss": 0.8545, |
| "mean_token_accuracy": 0.7480556511878967, |
| "num_tokens": 52329194.0, |
| "step": 17100 |
| }, |
| { |
| "entropy": 2.2223637771606444, |
| "epoch": 0.7622222222222222, |
| "grad_norm": 7.375, |
| "learning_rate": 4.903550973654067e-06, |
| "loss": 0.8367, |
| "mean_token_accuracy": 0.7503479218482971, |
| "num_tokens": 52395383.0, |
| "step": 17150 |
| }, |
| { |
| "entropy": 2.2046579623222353, |
| "epoch": 0.7644444444444445, |
| "grad_norm": 7.53125, |
| "learning_rate": 4.857731958762887e-06, |
| "loss": 0.8635, |
| "mean_token_accuracy": 0.7457800447940827, |
| "num_tokens": 52463232.0, |
| "step": 17200 |
| }, |
| { |
| "entropy": 2.173711452484131, |
| "epoch": 0.7666666666666667, |
| "grad_norm": 6.625, |
| "learning_rate": 4.8119129438717075e-06, |
| "loss": 0.8209, |
| "mean_token_accuracy": 0.7542583394050598, |
| "num_tokens": 52531656.0, |
| "step": 17250 |
| }, |
| { |
| "entropy": 2.2189766001701354, |
| "epoch": 0.7688888888888888, |
| "grad_norm": 10.6875, |
| "learning_rate": 4.766093928980528e-06, |
| "loss": 0.8685, |
| "mean_token_accuracy": 0.7441653323173523, |
| "num_tokens": 52595389.0, |
| "step": 17300 |
| }, |
| { |
| "entropy": 2.1452325272560118, |
| "epoch": 0.7711111111111111, |
| "grad_norm": 9.625, |
| "learning_rate": 4.720274914089347e-06, |
| "loss": 0.8348, |
| "mean_token_accuracy": 0.7506163120269775, |
| "num_tokens": 52663028.0, |
| "step": 17350 |
| }, |
| { |
| "entropy": 2.2454160952568056, |
| "epoch": 0.7733333333333333, |
| "grad_norm": 7.75, |
| "learning_rate": 4.674455899198168e-06, |
| "loss": 0.8597, |
| "mean_token_accuracy": 0.7458804631233216, |
| "num_tokens": 52728656.0, |
| "step": 17400 |
| }, |
| { |
| "entropy": 2.292165369987488, |
| "epoch": 0.7755555555555556, |
| "grad_norm": 6.65625, |
| "learning_rate": 4.628636884306988e-06, |
| "loss": 0.8405, |
| "mean_token_accuracy": 0.7525258159637451, |
| "num_tokens": 52794230.0, |
| "step": 17450 |
| }, |
| { |
| "entropy": 2.1589057970047, |
| "epoch": 0.7777777777777778, |
| "grad_norm": 7.9375, |
| "learning_rate": 4.582817869415808e-06, |
| "loss": 0.8481, |
| "mean_token_accuracy": 0.7463726258277893, |
| "num_tokens": 52862165.0, |
| "step": 17500 |
| }, |
| { |
| "entropy": 2.169076681137085, |
| "epoch": 0.78, |
| "grad_norm": 7.78125, |
| "learning_rate": 4.536998854524628e-06, |
| "loss": 0.8224, |
| "mean_token_accuracy": 0.7554120934009552, |
| "num_tokens": 52930052.0, |
| "step": 17550 |
| }, |
| { |
| "entropy": 2.21035982131958, |
| "epoch": 0.7822222222222223, |
| "grad_norm": 8.625, |
| "learning_rate": 4.491179839633448e-06, |
| "loss": 0.7947, |
| "mean_token_accuracy": 0.7617706823348999, |
| "num_tokens": 52997539.0, |
| "step": 17600 |
| }, |
| { |
| "entropy": 2.285029878616333, |
| "epoch": 0.7844444444444445, |
| "grad_norm": 6.90625, |
| "learning_rate": 4.4453608247422685e-06, |
| "loss": 0.8251, |
| "mean_token_accuracy": 0.7533777153491974, |
| "num_tokens": 53062000.0, |
| "step": 17650 |
| }, |
| { |
| "entropy": 2.169939410686493, |
| "epoch": 0.7866666666666666, |
| "grad_norm": 6.875, |
| "learning_rate": 4.399541809851088e-06, |
| "loss": 0.8256, |
| "mean_token_accuracy": 0.7538401031494141, |
| "num_tokens": 53128912.0, |
| "step": 17700 |
| }, |
| { |
| "entropy": 2.2041222214698792, |
| "epoch": 0.7888888888888889, |
| "grad_norm": 10.375, |
| "learning_rate": 4.353722794959908e-06, |
| "loss": 0.7965, |
| "mean_token_accuracy": 0.7599800097942352, |
| "num_tokens": 53195055.0, |
| "step": 17750 |
| }, |
| { |
| "entropy": 2.2029996418952944, |
| "epoch": 0.7911111111111111, |
| "grad_norm": 8.25, |
| "learning_rate": 4.307903780068729e-06, |
| "loss": 0.8447, |
| "mean_token_accuracy": 0.748561098575592, |
| "num_tokens": 53259948.0, |
| "step": 17800 |
| }, |
| { |
| "entropy": 2.139230773448944, |
| "epoch": 0.7933333333333333, |
| "grad_norm": 6.15625, |
| "learning_rate": 4.262084765177549e-06, |
| "loss": 0.8634, |
| "mean_token_accuracy": 0.7443192017078399, |
| "num_tokens": 53330653.0, |
| "step": 17850 |
| }, |
| { |
| "entropy": 2.1335135221481325, |
| "epoch": 0.7955555555555556, |
| "grad_norm": 7.09375, |
| "learning_rate": 4.216265750286369e-06, |
| "loss": 0.8332, |
| "mean_token_accuracy": 0.7517373490333558, |
| "num_tokens": 53398724.0, |
| "step": 17900 |
| }, |
| { |
| "entropy": 2.164382312297821, |
| "epoch": 0.7977777777777778, |
| "grad_norm": 8.375, |
| "learning_rate": 4.17044673539519e-06, |
| "loss": 0.8511, |
| "mean_token_accuracy": 0.7499736166000366, |
| "num_tokens": 53466863.0, |
| "step": 17950 |
| }, |
| { |
| "entropy": 2.190858428478241, |
| "epoch": 0.8, |
| "grad_norm": 6.59375, |
| "learning_rate": 4.12462772050401e-06, |
| "loss": 0.8523, |
| "mean_token_accuracy": 0.7486334586143494, |
| "num_tokens": 53532021.0, |
| "step": 18000 |
| }, |
| { |
| "entropy": 2.22938738822937, |
| "epoch": 0.8022222222222222, |
| "grad_norm": 8.3125, |
| "learning_rate": 4.0788087056128295e-06, |
| "loss": 0.8667, |
| "mean_token_accuracy": 0.7432076168060303, |
| "num_tokens": 53600120.0, |
| "step": 18050 |
| }, |
| { |
| "entropy": 2.233106544017792, |
| "epoch": 0.8044444444444444, |
| "grad_norm": 9.375, |
| "learning_rate": 4.03298969072165e-06, |
| "loss": 0.8267, |
| "mean_token_accuracy": 0.7550076794624329, |
| "num_tokens": 53665356.0, |
| "step": 18100 |
| }, |
| { |
| "entropy": 2.214742834568024, |
| "epoch": 0.8066666666666666, |
| "grad_norm": 10.375, |
| "learning_rate": 3.98717067583047e-06, |
| "loss": 0.8315, |
| "mean_token_accuracy": 0.752135591506958, |
| "num_tokens": 53732051.0, |
| "step": 18150 |
| }, |
| { |
| "entropy": 2.1795053052902222, |
| "epoch": 0.8088888888888889, |
| "grad_norm": 8.0625, |
| "learning_rate": 3.94135166093929e-06, |
| "loss": 0.8391, |
| "mean_token_accuracy": 0.7515090310573578, |
| "num_tokens": 53798222.0, |
| "step": 18200 |
| }, |
| { |
| "entropy": 2.1936265587806703, |
| "epoch": 0.8111111111111111, |
| "grad_norm": 7.875, |
| "learning_rate": 3.89553264604811e-06, |
| "loss": 0.8642, |
| "mean_token_accuracy": 0.7441390895843506, |
| "num_tokens": 53861370.0, |
| "step": 18250 |
| }, |
| { |
| "entropy": 2.261586802005768, |
| "epoch": 0.8133333333333334, |
| "grad_norm": 8.0625, |
| "learning_rate": 3.84971363115693e-06, |
| "loss": 0.8219, |
| "mean_token_accuracy": 0.7518996393680573, |
| "num_tokens": 53927168.0, |
| "step": 18300 |
| }, |
| { |
| "entropy": 2.1711619758605956, |
| "epoch": 0.8155555555555556, |
| "grad_norm": 7.125, |
| "learning_rate": 3.8038946162657507e-06, |
| "loss": 0.854, |
| "mean_token_accuracy": 0.7472757256031036, |
| "num_tokens": 53995523.0, |
| "step": 18350 |
| }, |
| { |
| "entropy": 2.1286602544784547, |
| "epoch": 0.8177777777777778, |
| "grad_norm": 10.0625, |
| "learning_rate": 3.7580756013745706e-06, |
| "loss": 0.8391, |
| "mean_token_accuracy": 0.7513591229915619, |
| "num_tokens": 54064144.0, |
| "step": 18400 |
| }, |
| { |
| "entropy": 2.227732105255127, |
| "epoch": 0.82, |
| "grad_norm": 8.5, |
| "learning_rate": 3.712256586483391e-06, |
| "loss": 0.8569, |
| "mean_token_accuracy": 0.7463644111156463, |
| "num_tokens": 54130199.0, |
| "step": 18450 |
| }, |
| { |
| "entropy": 2.242250292301178, |
| "epoch": 0.8222222222222222, |
| "grad_norm": 7.4375, |
| "learning_rate": 3.6664375715922113e-06, |
| "loss": 0.8458, |
| "mean_token_accuracy": 0.7489724898338318, |
| "num_tokens": 54193967.0, |
| "step": 18500 |
| }, |
| { |
| "entropy": 2.2216247129440307, |
| "epoch": 0.8244444444444444, |
| "grad_norm": 9.5, |
| "learning_rate": 3.620618556701031e-06, |
| "loss": 0.8631, |
| "mean_token_accuracy": 0.7474298918247223, |
| "num_tokens": 54258735.0, |
| "step": 18550 |
| }, |
| { |
| "entropy": 2.19566166639328, |
| "epoch": 0.8266666666666667, |
| "grad_norm": 8.875, |
| "learning_rate": 3.574799541809851e-06, |
| "loss": 0.8383, |
| "mean_token_accuracy": 0.7505941188335419, |
| "num_tokens": 54325869.0, |
| "step": 18600 |
| }, |
| { |
| "entropy": 2.2029261493682863, |
| "epoch": 0.8288888888888889, |
| "grad_norm": 6.71875, |
| "learning_rate": 3.5289805269186715e-06, |
| "loss": 0.8685, |
| "mean_token_accuracy": 0.7453620088100433, |
| "num_tokens": 54391495.0, |
| "step": 18650 |
| }, |
| { |
| "entropy": 2.321768162250519, |
| "epoch": 0.8311111111111111, |
| "grad_norm": 8.1875, |
| "learning_rate": 3.4831615120274914e-06, |
| "loss": 0.8268, |
| "mean_token_accuracy": 0.7540508365631103, |
| "num_tokens": 54457662.0, |
| "step": 18700 |
| }, |
| { |
| "entropy": 2.204611828327179, |
| "epoch": 0.8333333333333334, |
| "grad_norm": 7.03125, |
| "learning_rate": 3.4373424971363117e-06, |
| "loss": 0.8534, |
| "mean_token_accuracy": 0.7491276848316193, |
| "num_tokens": 54522465.0, |
| "step": 18750 |
| }, |
| { |
| "entropy": 2.180729539394379, |
| "epoch": 0.8355555555555556, |
| "grad_norm": 7.96875, |
| "learning_rate": 3.391523482245132e-06, |
| "loss": 0.8335, |
| "mean_token_accuracy": 0.7529357576370239, |
| "num_tokens": 54590053.0, |
| "step": 18800 |
| }, |
| { |
| "entropy": 2.205647897720337, |
| "epoch": 0.8377777777777777, |
| "grad_norm": 7.53125, |
| "learning_rate": 3.3457044673539524e-06, |
| "loss": 0.8733, |
| "mean_token_accuracy": 0.7421969878673553, |
| "num_tokens": 54656959.0, |
| "step": 18850 |
| }, |
| { |
| "entropy": 2.22350492477417, |
| "epoch": 0.84, |
| "grad_norm": 7.96875, |
| "learning_rate": 3.2998854524627723e-06, |
| "loss": 0.8657, |
| "mean_token_accuracy": 0.7474096655845642, |
| "num_tokens": 54722156.0, |
| "step": 18900 |
| }, |
| { |
| "entropy": 2.1877153444290163, |
| "epoch": 0.8422222222222222, |
| "grad_norm": 6.15625, |
| "learning_rate": 3.2540664375715927e-06, |
| "loss": 0.8894, |
| "mean_token_accuracy": 0.7431522738933564, |
| "num_tokens": 54789152.0, |
| "step": 18950 |
| }, |
| { |
| "entropy": 2.225923342704773, |
| "epoch": 0.8444444444444444, |
| "grad_norm": 11.125, |
| "learning_rate": 3.2082474226804126e-06, |
| "loss": 0.8555, |
| "mean_token_accuracy": 0.7505220258235932, |
| "num_tokens": 54858257.0, |
| "step": 19000 |
| }, |
| { |
| "entropy": 2.1716776871681214, |
| "epoch": 0.8466666666666667, |
| "grad_norm": 7.75, |
| "learning_rate": 3.1624284077892325e-06, |
| "loss": 0.8272, |
| "mean_token_accuracy": 0.7512453496456146, |
| "num_tokens": 54927221.0, |
| "step": 19050 |
| }, |
| { |
| "entropy": 2.2240468645095826, |
| "epoch": 0.8488888888888889, |
| "grad_norm": 12.5, |
| "learning_rate": 3.116609392898053e-06, |
| "loss": 0.8363, |
| "mean_token_accuracy": 0.7510754656791687, |
| "num_tokens": 54995938.0, |
| "step": 19100 |
| }, |
| { |
| "entropy": 2.1904780864715576, |
| "epoch": 0.8511111111111112, |
| "grad_norm": 9.75, |
| "learning_rate": 3.070790378006873e-06, |
| "loss": 0.834, |
| "mean_token_accuracy": 0.75088552236557, |
| "num_tokens": 55058163.0, |
| "step": 19150 |
| }, |
| { |
| "entropy": 2.215365264415741, |
| "epoch": 0.8533333333333334, |
| "grad_norm": 8.25, |
| "learning_rate": 3.0249713631156935e-06, |
| "loss": 0.8597, |
| "mean_token_accuracy": 0.7446792745590209, |
| "num_tokens": 55124026.0, |
| "step": 19200 |
| }, |
| { |
| "entropy": 2.202007007598877, |
| "epoch": 0.8555555555555555, |
| "grad_norm": 9.0625, |
| "learning_rate": 2.9791523482245134e-06, |
| "loss": 0.8144, |
| "mean_token_accuracy": 0.7558258211612702, |
| "num_tokens": 55192976.0, |
| "step": 19250 |
| }, |
| { |
| "entropy": 2.2235484766960143, |
| "epoch": 0.8577777777777778, |
| "grad_norm": 8.3125, |
| "learning_rate": 2.9333333333333338e-06, |
| "loss": 0.8662, |
| "mean_token_accuracy": 0.7427694439888001, |
| "num_tokens": 55260374.0, |
| "step": 19300 |
| }, |
| { |
| "entropy": 2.256188449859619, |
| "epoch": 0.86, |
| "grad_norm": 8.5625, |
| "learning_rate": 2.887514318442154e-06, |
| "loss": 0.8496, |
| "mean_token_accuracy": 0.7467195224761963, |
| "num_tokens": 55326060.0, |
| "step": 19350 |
| }, |
| { |
| "entropy": 2.2128705596923828, |
| "epoch": 0.8622222222222222, |
| "grad_norm": 11.75, |
| "learning_rate": 2.8416953035509736e-06, |
| "loss": 0.8645, |
| "mean_token_accuracy": 0.7435001564025879, |
| "num_tokens": 55391552.0, |
| "step": 19400 |
| }, |
| { |
| "entropy": 2.1930198001861574, |
| "epoch": 0.8644444444444445, |
| "grad_norm": 8.125, |
| "learning_rate": 2.795876288659794e-06, |
| "loss": 0.8781, |
| "mean_token_accuracy": 0.7410417079925538, |
| "num_tokens": 55461963.0, |
| "step": 19450 |
| }, |
| { |
| "entropy": 2.1576480197906496, |
| "epoch": 0.8666666666666667, |
| "grad_norm": 7.75, |
| "learning_rate": 2.7500572737686143e-06, |
| "loss": 0.832, |
| "mean_token_accuracy": 0.7539917016029358, |
| "num_tokens": 55529650.0, |
| "step": 19500 |
| }, |
| { |
| "entropy": 2.1880473136901855, |
| "epoch": 0.8688888888888889, |
| "grad_norm": 7.75, |
| "learning_rate": 2.7042382588774346e-06, |
| "loss": 0.868, |
| "mean_token_accuracy": 0.7416273355484009, |
| "num_tokens": 55596036.0, |
| "step": 19550 |
| }, |
| { |
| "entropy": 2.2236318159103394, |
| "epoch": 0.8711111111111111, |
| "grad_norm": 8.1875, |
| "learning_rate": 2.6584192439862545e-06, |
| "loss": 0.8535, |
| "mean_token_accuracy": 0.7471865510940552, |
| "num_tokens": 55662244.0, |
| "step": 19600 |
| }, |
| { |
| "entropy": 2.2074507117271422, |
| "epoch": 0.8733333333333333, |
| "grad_norm": 8.125, |
| "learning_rate": 2.612600229095075e-06, |
| "loss": 0.857, |
| "mean_token_accuracy": 0.746492406129837, |
| "num_tokens": 55730056.0, |
| "step": 19650 |
| }, |
| { |
| "entropy": 2.175764639377594, |
| "epoch": 0.8755555555555555, |
| "grad_norm": 7.875, |
| "learning_rate": 2.566781214203895e-06, |
| "loss": 0.8402, |
| "mean_token_accuracy": 0.7526404368877411, |
| "num_tokens": 55796368.0, |
| "step": 19700 |
| }, |
| { |
| "entropy": 2.2133652138710023, |
| "epoch": 0.8777777777777778, |
| "grad_norm": 7.40625, |
| "learning_rate": 2.5209621993127147e-06, |
| "loss": 0.8401, |
| "mean_token_accuracy": 0.7528079390525818, |
| "num_tokens": 55862412.0, |
| "step": 19750 |
| }, |
| { |
| "entropy": 2.16952513217926, |
| "epoch": 0.88, |
| "grad_norm": 11.125, |
| "learning_rate": 2.475143184421535e-06, |
| "loss": 0.8162, |
| "mean_token_accuracy": 0.7567255461215973, |
| "num_tokens": 55929780.0, |
| "step": 19800 |
| }, |
| { |
| "entropy": 2.2304827785491943, |
| "epoch": 0.8822222222222222, |
| "grad_norm": 9.125, |
| "learning_rate": 2.4293241695303554e-06, |
| "loss": 0.8747, |
| "mean_token_accuracy": 0.7438322114944458, |
| "num_tokens": 55997356.0, |
| "step": 19850 |
| }, |
| { |
| "entropy": 2.1662241196632386, |
| "epoch": 0.8844444444444445, |
| "grad_norm": 7.71875, |
| "learning_rate": 2.3835051546391753e-06, |
| "loss": 0.8738, |
| "mean_token_accuracy": 0.7397397947311402, |
| "num_tokens": 56066573.0, |
| "step": 19900 |
| }, |
| { |
| "entropy": 2.1665696811676027, |
| "epoch": 0.8866666666666667, |
| "grad_norm": 7.15625, |
| "learning_rate": 2.3376861397479956e-06, |
| "loss": 0.8558, |
| "mean_token_accuracy": 0.7482295572757721, |
| "num_tokens": 56136956.0, |
| "step": 19950 |
| }, |
| { |
| "entropy": 2.2371082639694215, |
| "epoch": 0.8888888888888888, |
| "grad_norm": 8.75, |
| "learning_rate": 2.291867124856816e-06, |
| "loss": 0.8136, |
| "mean_token_accuracy": 0.7589144480228424, |
| "num_tokens": 56202618.0, |
| "step": 20000 |
| }, |
| { |
| "epoch": 0.8888888888888888, |
| "eval_entropy": 2.108439266681671, |
| "eval_loss": 0.8734950423240662, |
| "eval_mean_token_accuracy": 0.7479927390813828, |
| "eval_num_tokens": 56202618.0, |
| "eval_runtime": 5.499, |
| "eval_samples_per_second": 2.364, |
| "eval_steps_per_second": 0.727, |
| "step": 20000 |
| }, |
| { |
| "entropy": 2.2510220575332642, |
| "epoch": 0.8911111111111111, |
| "grad_norm": 9.5, |
| "learning_rate": 2.246048109965636e-06, |
| "loss": 0.8871, |
| "mean_token_accuracy": 0.7370022451877594, |
| "num_tokens": 56266792.0, |
| "step": 20050 |
| }, |
| { |
| "entropy": 2.1528950190544127, |
| "epoch": 0.8933333333333333, |
| "grad_norm": 9.75, |
| "learning_rate": 2.200229095074456e-06, |
| "loss": 0.8586, |
| "mean_token_accuracy": 0.7451235044002533, |
| "num_tokens": 56334124.0, |
| "step": 20100 |
| }, |
| { |
| "entropy": 2.1923975205421447, |
| "epoch": 0.8955555555555555, |
| "grad_norm": 6.71875, |
| "learning_rate": 2.154410080183276e-06, |
| "loss": 0.874, |
| "mean_token_accuracy": 0.7444364356994629, |
| "num_tokens": 56401545.0, |
| "step": 20150 |
| }, |
| { |
| "entropy": 2.222883083820343, |
| "epoch": 0.8977777777777778, |
| "grad_norm": 9.1875, |
| "learning_rate": 2.1085910652920965e-06, |
| "loss": 0.8376, |
| "mean_token_accuracy": 0.7498513388633729, |
| "num_tokens": 56471261.0, |
| "step": 20200 |
| }, |
| { |
| "entropy": 2.228672001361847, |
| "epoch": 0.9, |
| "grad_norm": 7.21875, |
| "learning_rate": 2.062772050400917e-06, |
| "loss": 0.8231, |
| "mean_token_accuracy": 0.755576502084732, |
| "num_tokens": 56535034.0, |
| "step": 20250 |
| }, |
| { |
| "entropy": 2.196291310787201, |
| "epoch": 0.9022222222222223, |
| "grad_norm": 7.21875, |
| "learning_rate": 2.0169530355097367e-06, |
| "loss": 0.8711, |
| "mean_token_accuracy": 0.7452339708805085, |
| "num_tokens": 56605038.0, |
| "step": 20300 |
| }, |
| { |
| "entropy": 2.206197905540466, |
| "epoch": 0.9044444444444445, |
| "grad_norm": 9.0, |
| "learning_rate": 1.971134020618557e-06, |
| "loss": 0.8707, |
| "mean_token_accuracy": 0.7420957183837891, |
| "num_tokens": 56670283.0, |
| "step": 20350 |
| }, |
| { |
| "entropy": 2.1762799286842345, |
| "epoch": 0.9066666666666666, |
| "grad_norm": 11.0, |
| "learning_rate": 1.925315005727377e-06, |
| "loss": 0.8375, |
| "mean_token_accuracy": 0.751966392993927, |
| "num_tokens": 56735920.0, |
| "step": 20400 |
| }, |
| { |
| "entropy": 2.218523108959198, |
| "epoch": 0.9088888888888889, |
| "grad_norm": 9.25, |
| "learning_rate": 1.879495990836197e-06, |
| "loss": 0.8468, |
| "mean_token_accuracy": 0.7500053834915161, |
| "num_tokens": 56801907.0, |
| "step": 20450 |
| }, |
| { |
| "entropy": 2.289003756046295, |
| "epoch": 0.9111111111111111, |
| "grad_norm": 11.75, |
| "learning_rate": 1.8336769759450174e-06, |
| "loss": 0.8643, |
| "mean_token_accuracy": 0.7460965967178345, |
| "num_tokens": 56867853.0, |
| "step": 20500 |
| }, |
| { |
| "entropy": 2.1768972492218017, |
| "epoch": 0.9133333333333333, |
| "grad_norm": 9.1875, |
| "learning_rate": 1.7878579610538373e-06, |
| "loss": 0.8613, |
| "mean_token_accuracy": 0.7483590936660767, |
| "num_tokens": 56936042.0, |
| "step": 20550 |
| }, |
| { |
| "entropy": 2.211531710624695, |
| "epoch": 0.9155555555555556, |
| "grad_norm": 12.125, |
| "learning_rate": 1.7420389461626577e-06, |
| "loss": 0.8518, |
| "mean_token_accuracy": 0.7475169038772583, |
| "num_tokens": 57003712.0, |
| "step": 20600 |
| }, |
| { |
| "entropy": 2.2285032725334166, |
| "epoch": 0.9177777777777778, |
| "grad_norm": 9.0, |
| "learning_rate": 1.6962199312714778e-06, |
| "loss": 0.8805, |
| "mean_token_accuracy": 0.7423696160316468, |
| "num_tokens": 57071218.0, |
| "step": 20650 |
| }, |
| { |
| "entropy": 2.2252227783203127, |
| "epoch": 0.92, |
| "grad_norm": 13.1875, |
| "learning_rate": 1.6504009163802981e-06, |
| "loss": 0.8401, |
| "mean_token_accuracy": 0.751958976984024, |
| "num_tokens": 57136636.0, |
| "step": 20700 |
| }, |
| { |
| "entropy": 2.220773038864136, |
| "epoch": 0.9222222222222223, |
| "grad_norm": 6.9375, |
| "learning_rate": 1.604581901489118e-06, |
| "loss": 0.8293, |
| "mean_token_accuracy": 0.7526602661609649, |
| "num_tokens": 57201653.0, |
| "step": 20750 |
| }, |
| { |
| "entropy": 2.2865464878082276, |
| "epoch": 0.9244444444444444, |
| "grad_norm": 7.3125, |
| "learning_rate": 1.5587628865979382e-06, |
| "loss": 0.8497, |
| "mean_token_accuracy": 0.7494020068645477, |
| "num_tokens": 57266565.0, |
| "step": 20800 |
| }, |
| { |
| "entropy": 2.2102810382843017, |
| "epoch": 0.9266666666666666, |
| "grad_norm": 6.8125, |
| "learning_rate": 1.5129438717067585e-06, |
| "loss": 0.8608, |
| "mean_token_accuracy": 0.748868852853775, |
| "num_tokens": 57334744.0, |
| "step": 20850 |
| }, |
| { |
| "entropy": 2.1809313702583313, |
| "epoch": 0.9288888888888889, |
| "grad_norm": 15.625, |
| "learning_rate": 1.4671248568155784e-06, |
| "loss": 0.8679, |
| "mean_token_accuracy": 0.7432131195068359, |
| "num_tokens": 57403405.0, |
| "step": 20900 |
| }, |
| { |
| "entropy": 2.2084882354736326, |
| "epoch": 0.9311111111111111, |
| "grad_norm": 6.90625, |
| "learning_rate": 1.4213058419243988e-06, |
| "loss": 0.8544, |
| "mean_token_accuracy": 0.7461783969402314, |
| "num_tokens": 57470110.0, |
| "step": 20950 |
| }, |
| { |
| "entropy": 2.143686933517456, |
| "epoch": 0.9333333333333333, |
| "grad_norm": 7.375, |
| "learning_rate": 1.375486827033219e-06, |
| "loss": 0.8695, |
| "mean_token_accuracy": 0.7416430413722992, |
| "num_tokens": 57537171.0, |
| "step": 21000 |
| }, |
| { |
| "entropy": 2.210377869606018, |
| "epoch": 0.9355555555555556, |
| "grad_norm": 6.75, |
| "learning_rate": 1.329667812142039e-06, |
| "loss": 0.8413, |
| "mean_token_accuracy": 0.7495397543907165, |
| "num_tokens": 57602216.0, |
| "step": 21050 |
| }, |
| { |
| "entropy": 2.186688220500946, |
| "epoch": 0.9377777777777778, |
| "grad_norm": 6.96875, |
| "learning_rate": 1.2838487972508592e-06, |
| "loss": 0.8411, |
| "mean_token_accuracy": 0.7513405895233154, |
| "num_tokens": 57669645.0, |
| "step": 21100 |
| }, |
| { |
| "entropy": 2.1919109773635865, |
| "epoch": 0.94, |
| "grad_norm": 6.96875, |
| "learning_rate": 1.2380297823596793e-06, |
| "loss": 0.8498, |
| "mean_token_accuracy": 0.7503270518779754, |
| "num_tokens": 57738922.0, |
| "step": 21150 |
| }, |
| { |
| "entropy": 2.2736938905715944, |
| "epoch": 0.9422222222222222, |
| "grad_norm": 9.0625, |
| "learning_rate": 1.1922107674684994e-06, |
| "loss": 0.8467, |
| "mean_token_accuracy": 0.7496572816371918, |
| "num_tokens": 57805121.0, |
| "step": 21200 |
| }, |
| { |
| "entropy": 2.2604011583328245, |
| "epoch": 0.9444444444444444, |
| "grad_norm": 7.28125, |
| "learning_rate": 1.1463917525773197e-06, |
| "loss": 0.8553, |
| "mean_token_accuracy": 0.7471484684944153, |
| "num_tokens": 57869045.0, |
| "step": 21250 |
| }, |
| { |
| "entropy": 2.236326413154602, |
| "epoch": 0.9466666666666667, |
| "grad_norm": 8.625, |
| "learning_rate": 1.1005727376861399e-06, |
| "loss": 0.8684, |
| "mean_token_accuracy": 0.7434519910812378, |
| "num_tokens": 57937773.0, |
| "step": 21300 |
| }, |
| { |
| "entropy": 2.2119676208496095, |
| "epoch": 0.9488888888888889, |
| "grad_norm": 7.5, |
| "learning_rate": 1.05475372279496e-06, |
| "loss": 0.8414, |
| "mean_token_accuracy": 0.7505972480773926, |
| "num_tokens": 57999509.0, |
| "step": 21350 |
| }, |
| { |
| "entropy": 2.19934800863266, |
| "epoch": 0.9511111111111111, |
| "grad_norm": 10.0, |
| "learning_rate": 1.0089347079037801e-06, |
| "loss": 0.8283, |
| "mean_token_accuracy": 0.7531795799732208, |
| "num_tokens": 58066324.0, |
| "step": 21400 |
| }, |
| { |
| "entropy": 2.189887263774872, |
| "epoch": 0.9533333333333334, |
| "grad_norm": 8.6875, |
| "learning_rate": 9.631156930126003e-07, |
| "loss": 0.8438, |
| "mean_token_accuracy": 0.7486509644985199, |
| "num_tokens": 58133522.0, |
| "step": 21450 |
| }, |
| { |
| "entropy": 2.2022621488571166, |
| "epoch": 0.9555555555555556, |
| "grad_norm": 12.0, |
| "learning_rate": 9.172966781214204e-07, |
| "loss": 0.8816, |
| "mean_token_accuracy": 0.7386648190021515, |
| "num_tokens": 58201032.0, |
| "step": 21500 |
| }, |
| { |
| "entropy": 2.2088757705688478, |
| "epoch": 0.9577777777777777, |
| "grad_norm": 7.65625, |
| "learning_rate": 8.714776632302406e-07, |
| "loss": 0.8685, |
| "mean_token_accuracy": 0.745578328371048, |
| "num_tokens": 58266051.0, |
| "step": 21550 |
| }, |
| { |
| "entropy": 2.2617397117614746, |
| "epoch": 0.96, |
| "grad_norm": 7.5625, |
| "learning_rate": 8.256586483390607e-07, |
| "loss": 0.857, |
| "mean_token_accuracy": 0.7448407518863678, |
| "num_tokens": 58332021.0, |
| "step": 21600 |
| }, |
| { |
| "entropy": 2.2229527854919433, |
| "epoch": 0.9622222222222222, |
| "grad_norm": 6.625, |
| "learning_rate": 7.79839633447881e-07, |
| "loss": 0.8636, |
| "mean_token_accuracy": 0.7455604159832001, |
| "num_tokens": 58396998.0, |
| "step": 21650 |
| }, |
| { |
| "entropy": 2.179794452190399, |
| "epoch": 0.9644444444444444, |
| "grad_norm": 7.71875, |
| "learning_rate": 7.340206185567011e-07, |
| "loss": 0.8293, |
| "mean_token_accuracy": 0.754916387796402, |
| "num_tokens": 58464526.0, |
| "step": 21700 |
| }, |
| { |
| "entropy": 2.2393770956993104, |
| "epoch": 0.9666666666666667, |
| "grad_norm": 12.4375, |
| "learning_rate": 6.882016036655212e-07, |
| "loss": 0.8434, |
| "mean_token_accuracy": 0.7503564131259918, |
| "num_tokens": 58530361.0, |
| "step": 21750 |
| }, |
| { |
| "entropy": 2.263950316905975, |
| "epoch": 0.9688888888888889, |
| "grad_norm": 6.40625, |
| "learning_rate": 6.423825887743414e-07, |
| "loss": 0.8466, |
| "mean_token_accuracy": 0.748320734500885, |
| "num_tokens": 58595415.0, |
| "step": 21800 |
| }, |
| { |
| "entropy": 2.3069614815711974, |
| "epoch": 0.9711111111111111, |
| "grad_norm": 8.4375, |
| "learning_rate": 5.965635738831616e-07, |
| "loss": 0.8578, |
| "mean_token_accuracy": 0.7452424871921539, |
| "num_tokens": 58658566.0, |
| "step": 21850 |
| }, |
| { |
| "entropy": 2.2174816036224367, |
| "epoch": 0.9733333333333334, |
| "grad_norm": 8.875, |
| "learning_rate": 5.507445589919817e-07, |
| "loss": 0.8814, |
| "mean_token_accuracy": 0.743362922668457, |
| "num_tokens": 58728364.0, |
| "step": 21900 |
| }, |
| { |
| "entropy": 2.2033449959754945, |
| "epoch": 0.9755555555555555, |
| "grad_norm": 7.15625, |
| "learning_rate": 5.049255441008018e-07, |
| "loss": 0.8709, |
| "mean_token_accuracy": 0.7446471619606018, |
| "num_tokens": 58796528.0, |
| "step": 21950 |
| }, |
| { |
| "entropy": 2.262139241695404, |
| "epoch": 0.9777777777777777, |
| "grad_norm": 12.75, |
| "learning_rate": 4.59106529209622e-07, |
| "loss": 0.8688, |
| "mean_token_accuracy": 0.7436162507534028, |
| "num_tokens": 58862439.0, |
| "step": 22000 |
| }, |
| { |
| "entropy": 2.2621299147605898, |
| "epoch": 0.98, |
| "grad_norm": 6.25, |
| "learning_rate": 4.132875143184422e-07, |
| "loss": 0.866, |
| "mean_token_accuracy": 0.7452659630775451, |
| "num_tokens": 58930357.0, |
| "step": 22050 |
| }, |
| { |
| "entropy": 2.2046650099754332, |
| "epoch": 0.9822222222222222, |
| "grad_norm": 8.0, |
| "learning_rate": 3.674684994272623e-07, |
| "loss": 0.8945, |
| "mean_token_accuracy": 0.7410675585269928, |
| "num_tokens": 58997717.0, |
| "step": 22100 |
| }, |
| { |
| "entropy": 2.210694715976715, |
| "epoch": 0.9844444444444445, |
| "grad_norm": 7.71875, |
| "learning_rate": 3.216494845360825e-07, |
| "loss": 0.8986, |
| "mean_token_accuracy": 0.7420401775836944, |
| "num_tokens": 59064018.0, |
| "step": 22150 |
| }, |
| { |
| "entropy": 2.196241397857666, |
| "epoch": 0.9866666666666667, |
| "grad_norm": 7.34375, |
| "learning_rate": 2.758304696449026e-07, |
| "loss": 0.9046, |
| "mean_token_accuracy": 0.7343410396575928, |
| "num_tokens": 59131839.0, |
| "step": 22200 |
| }, |
| { |
| "entropy": 2.1624761509895323, |
| "epoch": 0.9888888888888889, |
| "grad_norm": 8.6875, |
| "learning_rate": 2.3001145475372283e-07, |
| "loss": 0.8625, |
| "mean_token_accuracy": 0.749553587436676, |
| "num_tokens": 59199340.0, |
| "step": 22250 |
| }, |
| { |
| "entropy": 2.214530596733093, |
| "epoch": 0.9911111111111112, |
| "grad_norm": 7.21875, |
| "learning_rate": 1.8419243986254296e-07, |
| "loss": 0.9018, |
| "mean_token_accuracy": 0.7367948520183564, |
| "num_tokens": 59265961.0, |
| "step": 22300 |
| }, |
| { |
| "entropy": 2.189036545753479, |
| "epoch": 0.9933333333333333, |
| "grad_norm": 7.0625, |
| "learning_rate": 1.3837342497136314e-07, |
| "loss": 0.9118, |
| "mean_token_accuracy": 0.7363495242595672, |
| "num_tokens": 59335350.0, |
| "step": 22350 |
| }, |
| { |
| "entropy": 2.2046823930740356, |
| "epoch": 0.9955555555555555, |
| "grad_norm": 7.65625, |
| "learning_rate": 9.255441008018328e-08, |
| "loss": 0.8634, |
| "mean_token_accuracy": 0.7454369437694549, |
| "num_tokens": 59401509.0, |
| "step": 22400 |
| }, |
| { |
| "entropy": 2.1603779411315918, |
| "epoch": 0.9977777777777778, |
| "grad_norm": 8.25, |
| "learning_rate": 4.673539518900344e-08, |
| "loss": 0.8957, |
| "mean_token_accuracy": 0.741243121623993, |
| "num_tokens": 59470524.0, |
| "step": 22450 |
| }, |
| { |
| "entropy": 2.1624831557273865, |
| "epoch": 1.0, |
| "grad_norm": 9.0625, |
| "learning_rate": 9.163802978235968e-10, |
| "loss": 0.8587, |
| "mean_token_accuracy": 0.7448894453048706, |
| "num_tokens": 59536445.0, |
| "step": 22500 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 22500, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 9223372036854775807, |
| "save_steps": 5000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.515437740795392e+18, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|