{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 950, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.79375, "epoch": 0.021052631578947368, "grad_norm": 6.71875, "learning_rate": 4.9526315789473685e-05, "loss": 1.7312183380126953, "mean_token_accuracy": 0.6867853045463562, "num_tokens": 840.0, "step": 10 }, { "entropy": 1.52421875, "epoch": 0.042105263157894736, "grad_norm": 3.90625, "learning_rate": 4.9e-05, "loss": 1.3260906219482422, "mean_token_accuracy": 0.7451180815696716, "num_tokens": 1573.0, "step": 20 }, { "entropy": 1.69921875, "epoch": 0.06315789473684211, "grad_norm": 6.5625, "learning_rate": 4.847368421052632e-05, "loss": 1.6661336898803711, "mean_token_accuracy": 0.7098163902759552, "num_tokens": 2338.0, "step": 30 }, { "entropy": 1.5484375, "epoch": 0.08421052631578947, "grad_norm": 7.4375, "learning_rate": 4.794736842105264e-05, "loss": 1.5569435119628907, "mean_token_accuracy": 0.7198988318443298, "num_tokens": 3038.0, "step": 40 }, { "entropy": 1.59296875, "epoch": 0.10526315789473684, "grad_norm": 8.1875, "learning_rate": 4.742105263157895e-05, "loss": 1.5420040130615233, "mean_token_accuracy": 0.7149245262145996, "num_tokens": 3682.0, "step": 50 }, { "entropy": 1.80390625, "epoch": 0.12631578947368421, "grad_norm": 9.9375, "learning_rate": 4.6894736842105264e-05, "loss": 1.6832901000976563, "mean_token_accuracy": 0.7065998375415802, "num_tokens": 4617.0, "step": 60 }, { "entropy": 1.66796875, "epoch": 0.14736842105263157, "grad_norm": 8.375, "learning_rate": 4.6368421052631584e-05, "loss": 1.6378042221069335, "mean_token_accuracy": 0.7060423612594604, "num_tokens": 5349.0, "step": 70 }, { "entropy": 1.5109375, "epoch": 0.16842105263157894, "grad_norm": 6.03125, "learning_rate": 4.58421052631579e-05, "loss": 1.5179196357727052, "mean_token_accuracy": 0.7291716754436492, "num_tokens": 6122.0, "step": 80 }, { "entropy": 1.721875, "epoch": 0.18947368421052632, "grad_norm": 4.0625, "learning_rate": 4.531578947368421e-05, "loss": 1.8176845550537108, "mean_token_accuracy": 0.6842875778675079, "num_tokens": 7197.0, "step": 90 }, { "entropy": 1.52265625, "epoch": 0.21052631578947367, "grad_norm": 6.46875, "learning_rate": 4.478947368421053e-05, "loss": 1.5252375602722168, "mean_token_accuracy": 0.711300152540207, "num_tokens": 8046.0, "step": 100 }, { "entropy": 1.6515625, "epoch": 0.23157894736842105, "grad_norm": 7.0, "learning_rate": 4.426315789473684e-05, "loss": 1.611851119995117, "mean_token_accuracy": 0.7179319798946381, "num_tokens": 8698.0, "step": 110 }, { "entropy": 1.54140625, "epoch": 0.25263157894736843, "grad_norm": 6.90625, "learning_rate": 4.373684210526316e-05, "loss": 1.5053813934326172, "mean_token_accuracy": 0.7282716870307923, "num_tokens": 9455.0, "step": 120 }, { "entropy": 1.5375, "epoch": 0.2736842105263158, "grad_norm": 7.25, "learning_rate": 4.3210526315789475e-05, "loss": 1.5904606819152831, "mean_token_accuracy": 0.7234093546867371, "num_tokens": 10191.0, "step": 130 }, { "entropy": 1.58984375, "epoch": 0.29473684210526313, "grad_norm": 9.375, "learning_rate": 4.2684210526315795e-05, "loss": 1.6070585250854492, "mean_token_accuracy": 0.7228378415107727, "num_tokens": 10859.0, "step": 140 }, { "entropy": 1.80234375, "epoch": 0.3157894736842105, "grad_norm": 8.3125, "learning_rate": 4.215789473684211e-05, "loss": 1.7423076629638672, "mean_token_accuracy": 0.6761326909065246, "num_tokens": 11846.0, "step": 150 }, { "entropy": 1.592578125, "epoch": 0.3368421052631579, "grad_norm": 5.1875, "learning_rate": 4.163157894736842e-05, "loss": 1.6300687789916992, "mean_token_accuracy": 0.7145272672176362, "num_tokens": 12722.0, "step": 160 }, { "entropy": 1.5953125, "epoch": 0.35789473684210527, "grad_norm": 6.125, "learning_rate": 4.110526315789474e-05, "loss": 1.44827880859375, "mean_token_accuracy": 0.725699108839035, "num_tokens": 13458.0, "step": 170 }, { "entropy": 1.8171875, "epoch": 0.37894736842105264, "grad_norm": 7.125, "learning_rate": 4.0578947368421054e-05, "loss": 1.7392475128173828, "mean_token_accuracy": 0.6949776589870453, "num_tokens": 14197.0, "step": 180 }, { "entropy": 1.78671875, "epoch": 0.4, "grad_norm": 7.8125, "learning_rate": 4.0052631578947367e-05, "loss": 1.8162834167480468, "mean_token_accuracy": 0.6562099277973175, "num_tokens": 15056.0, "step": 190 }, { "entropy": 1.77421875, "epoch": 0.42105263157894735, "grad_norm": 7.53125, "learning_rate": 3.9526315789473686e-05, "loss": 1.7153417587280273, "mean_token_accuracy": 0.6948422849178314, "num_tokens": 15807.0, "step": 200 }, { "entropy": 1.6875, "epoch": 0.4421052631578947, "grad_norm": 9.125, "learning_rate": 3.9000000000000006e-05, "loss": 1.6782726287841796, "mean_token_accuracy": 0.7238920211791993, "num_tokens": 16432.0, "step": 210 }, { "entropy": 1.446484375, "epoch": 0.4631578947368421, "grad_norm": 5.84375, "learning_rate": 3.847368421052632e-05, "loss": 1.4408910751342774, "mean_token_accuracy": 0.7403214454650879, "num_tokens": 17103.0, "step": 220 }, { "entropy": 1.76171875, "epoch": 0.4842105263157895, "grad_norm": 6.375, "learning_rate": 3.794736842105263e-05, "loss": 1.7578521728515626, "mean_token_accuracy": 0.6848492741584777, "num_tokens": 18038.0, "step": 230 }, { "entropy": 1.53515625, "epoch": 0.5052631578947369, "grad_norm": 6.71875, "learning_rate": 3.742105263157895e-05, "loss": 1.48400821685791, "mean_token_accuracy": 0.7131125509738923, "num_tokens": 18720.0, "step": 240 }, { "entropy": 1.49140625, "epoch": 0.5263157894736842, "grad_norm": 6.375, "learning_rate": 3.6894736842105265e-05, "loss": 1.5496106147766113, "mean_token_accuracy": 0.7365909218788147, "num_tokens": 19308.0, "step": 250 }, { "entropy": 1.46796875, "epoch": 0.5473684210526316, "grad_norm": 8.125, "learning_rate": 3.636842105263158e-05, "loss": 1.5240853309631348, "mean_token_accuracy": 0.736497437953949, "num_tokens": 19967.0, "step": 260 }, { "entropy": 1.578125, "epoch": 0.5684210526315789, "grad_norm": 8.3125, "learning_rate": 3.58421052631579e-05, "loss": 1.5680004119873048, "mean_token_accuracy": 0.7332142323255539, "num_tokens": 20769.0, "step": 270 }, { "entropy": 1.501953125, "epoch": 0.5894736842105263, "grad_norm": 8.375, "learning_rate": 3.531578947368421e-05, "loss": 1.5029385566711426, "mean_token_accuracy": 0.7254173457622528, "num_tokens": 22503.0, "step": 280 }, { "entropy": 1.5921875, "epoch": 0.6105263157894737, "grad_norm": 7.0, "learning_rate": 3.478947368421053e-05, "loss": 1.6090343475341797, "mean_token_accuracy": 0.7082146763801574, "num_tokens": 23439.0, "step": 290 }, { "entropy": 1.75078125, "epoch": 0.631578947368421, "grad_norm": 8.25, "learning_rate": 3.426315789473684e-05, "loss": 1.6250024795532227, "mean_token_accuracy": 0.6883616149425507, "num_tokens": 24161.0, "step": 300 }, { "entropy": 1.269140625, "epoch": 0.6526315789473685, "grad_norm": 7.46875, "learning_rate": 3.373684210526316e-05, "loss": 1.3148769378662108, "mean_token_accuracy": 0.7618813216686249, "num_tokens": 24716.0, "step": 310 }, { "entropy": 1.809375, "epoch": 0.6736842105263158, "grad_norm": 7.40625, "learning_rate": 3.3210526315789476e-05, "loss": 1.9381757736206056, "mean_token_accuracy": 0.667089307308197, "num_tokens": 25481.0, "step": 320 }, { "entropy": 1.73515625, "epoch": 0.6947368421052632, "grad_norm": 11.5625, "learning_rate": 3.268421052631579e-05, "loss": 1.76402587890625, "mean_token_accuracy": 0.6992617845535278, "num_tokens": 26159.0, "step": 330 }, { "entropy": 1.644140625, "epoch": 0.7157894736842105, "grad_norm": 7.625, "learning_rate": 3.215789473684211e-05, "loss": 1.6910707473754882, "mean_token_accuracy": 0.6974358975887298, "num_tokens": 26983.0, "step": 340 }, { "entropy": 1.44140625, "epoch": 0.7368421052631579, "grad_norm": 7.84375, "learning_rate": 3.163157894736842e-05, "loss": 1.3823105812072753, "mean_token_accuracy": 0.7533604800701141, "num_tokens": 27564.0, "step": 350 }, { "entropy": 1.41953125, "epoch": 0.7578947368421053, "grad_norm": 9.5, "learning_rate": 3.1105263157894735e-05, "loss": 1.488726806640625, "mean_token_accuracy": 0.733670562505722, "num_tokens": 28200.0, "step": 360 }, { "entropy": 1.753125, "epoch": 0.7789473684210526, "grad_norm": 5.9375, "learning_rate": 3.0578947368421054e-05, "loss": 1.8948373794555664, "mean_token_accuracy": 0.675409197807312, "num_tokens": 28919.0, "step": 370 }, { "entropy": 1.98828125, "epoch": 0.8, "grad_norm": 8.5, "learning_rate": 3.005263157894737e-05, "loss": 2.146737289428711, "mean_token_accuracy": 0.6334406793117523, "num_tokens": 29894.0, "step": 380 }, { "entropy": 1.85625, "epoch": 0.8210526315789474, "grad_norm": 6.3125, "learning_rate": 2.9526315789473684e-05, "loss": 1.858269500732422, "mean_token_accuracy": 0.6439402669668197, "num_tokens": 30652.0, "step": 390 }, { "entropy": 1.727734375, "epoch": 0.8421052631578947, "grad_norm": 7.96875, "learning_rate": 2.9e-05, "loss": 1.7343599319458007, "mean_token_accuracy": 0.700012594461441, "num_tokens": 31372.0, "step": 400 }, { "entropy": 1.65, "epoch": 0.8631578947368421, "grad_norm": 8.0625, "learning_rate": 2.847368421052632e-05, "loss": 1.6553001403808594, "mean_token_accuracy": 0.7165175020694733, "num_tokens": 32149.0, "step": 410 }, { "entropy": 1.596875, "epoch": 0.8842105263157894, "grad_norm": 6.59375, "learning_rate": 2.7947368421052633e-05, "loss": 1.5661128997802733, "mean_token_accuracy": 0.7166097521781921, "num_tokens": 33203.0, "step": 420 }, { "entropy": 1.6015625, "epoch": 0.9052631578947369, "grad_norm": 7.1875, "learning_rate": 2.7421052631578946e-05, "loss": 1.6179162979125976, "mean_token_accuracy": 0.7093785464763641, "num_tokens": 33889.0, "step": 430 }, { "entropy": 1.75546875, "epoch": 0.9263157894736842, "grad_norm": 7.21875, "learning_rate": 2.6894736842105266e-05, "loss": 1.7675729751586915, "mean_token_accuracy": 0.6769322335720063, "num_tokens": 34862.0, "step": 440 }, { "entropy": 1.92109375, "epoch": 0.9473684210526315, "grad_norm": 11.0625, "learning_rate": 2.6368421052631582e-05, "loss": 1.9548963546752929, "mean_token_accuracy": 0.648491358757019, "num_tokens": 35710.0, "step": 450 }, { "entropy": 1.699609375, "epoch": 0.968421052631579, "grad_norm": 6.03125, "learning_rate": 2.5842105263157895e-05, "loss": 1.6432403564453124, "mean_token_accuracy": 0.6918732106685639, "num_tokens": 36374.0, "step": 460 }, { "entropy": 1.84453125, "epoch": 0.9894736842105263, "grad_norm": 7.03125, "learning_rate": 2.5315789473684208e-05, "loss": 1.7591934204101562, "mean_token_accuracy": 0.6581568241119384, "num_tokens": 37139.0, "step": 470 }, { "entropy": 1.75859375, "epoch": 1.0105263157894737, "grad_norm": 7.15625, "learning_rate": 2.4789473684210528e-05, "loss": 1.7067642211914062, "mean_token_accuracy": 0.7008034646511078, "num_tokens": 38037.0, "step": 480 }, { "entropy": 1.4203125, "epoch": 1.0315789473684212, "grad_norm": 6.4375, "learning_rate": 2.4263157894736844e-05, "loss": 1.378176498413086, "mean_token_accuracy": 0.7486959993839264, "num_tokens": 38668.0, "step": 490 }, { "entropy": 1.56328125, "epoch": 1.0526315789473684, "grad_norm": 7.15625, "learning_rate": 2.373684210526316e-05, "loss": 1.4762983322143555, "mean_token_accuracy": 0.7175065577030182, "num_tokens": 39424.0, "step": 500 }, { "entropy": 1.630078125, "epoch": 1.0736842105263158, "grad_norm": 4.84375, "learning_rate": 2.3210526315789473e-05, "loss": 1.5906378746032714, "mean_token_accuracy": 0.6876044690608978, "num_tokens": 40317.0, "step": 510 }, { "entropy": 1.556640625, "epoch": 1.0947368421052632, "grad_norm": 6.59375, "learning_rate": 2.268421052631579e-05, "loss": 1.6246864318847656, "mean_token_accuracy": 0.7021546125411987, "num_tokens": 41101.0, "step": 520 }, { "entropy": 1.86015625, "epoch": 1.1157894736842104, "grad_norm": 6.3125, "learning_rate": 2.2157894736842106e-05, "loss": 1.832110595703125, "mean_token_accuracy": 0.6659113824367523, "num_tokens": 42080.0, "step": 530 }, { "entropy": 1.50859375, "epoch": 1.1368421052631579, "grad_norm": 4.75, "learning_rate": 2.1631578947368423e-05, "loss": 1.495261001586914, "mean_token_accuracy": 0.7114485323429107, "num_tokens": 43064.0, "step": 540 }, { "entropy": 1.379296875, "epoch": 1.1578947368421053, "grad_norm": 7.90625, "learning_rate": 2.110526315789474e-05, "loss": 1.3109845161437987, "mean_token_accuracy": 0.7768619418144226, "num_tokens": 43638.0, "step": 550 }, { "entropy": 1.671875, "epoch": 1.1789473684210527, "grad_norm": 8.1875, "learning_rate": 2.0578947368421052e-05, "loss": 1.7390716552734375, "mean_token_accuracy": 0.6966245353221894, "num_tokens": 44538.0, "step": 560 }, { "entropy": 1.45390625, "epoch": 1.2, "grad_norm": 8.5, "learning_rate": 2.005263157894737e-05, "loss": 1.3690235137939453, "mean_token_accuracy": 0.742480456829071, "num_tokens": 45159.0, "step": 570 }, { "entropy": 1.8234375, "epoch": 1.2210526315789474, "grad_norm": 6.65625, "learning_rate": 1.9526315789473685e-05, "loss": 1.890069580078125, "mean_token_accuracy": 0.6900433540344239, "num_tokens": 45918.0, "step": 580 }, { "entropy": 1.8640625, "epoch": 1.2421052631578948, "grad_norm": 8.125, "learning_rate": 1.9e-05, "loss": 1.7881799697875977, "mean_token_accuracy": 0.6537608683109284, "num_tokens": 46795.0, "step": 590 }, { "entropy": 1.59453125, "epoch": 1.263157894736842, "grad_norm": 6.5, "learning_rate": 1.8473684210526317e-05, "loss": 1.5872214317321778, "mean_token_accuracy": 0.722487497329712, "num_tokens": 47423.0, "step": 600 }, { "entropy": 1.78828125, "epoch": 1.2842105263157895, "grad_norm": 7.3125, "learning_rate": 1.7947368421052634e-05, "loss": 1.6725593566894532, "mean_token_accuracy": 0.6980259001255036, "num_tokens": 48158.0, "step": 610 }, { "entropy": 1.61015625, "epoch": 1.305263157894737, "grad_norm": 6.5, "learning_rate": 1.742105263157895e-05, "loss": 1.5390226364135742, "mean_token_accuracy": 0.725999391078949, "num_tokens": 48875.0, "step": 620 }, { "entropy": 1.71953125, "epoch": 1.3263157894736843, "grad_norm": 8.25, "learning_rate": 1.6894736842105263e-05, "loss": 1.6228282928466797, "mean_token_accuracy": 0.7085169553756714, "num_tokens": 49697.0, "step": 630 }, { "entropy": 1.437109375, "epoch": 1.3473684210526315, "grad_norm": 7.5625, "learning_rate": 1.636842105263158e-05, "loss": 1.4906560897827148, "mean_token_accuracy": 0.7471937596797943, "num_tokens": 50300.0, "step": 640 }, { "entropy": 1.73359375, "epoch": 1.368421052631579, "grad_norm": 4.375, "learning_rate": 1.5842105263157896e-05, "loss": 1.5986823081970214, "mean_token_accuracy": 0.699000483751297, "num_tokens": 51332.0, "step": 650 }, { "entropy": 1.2671875, "epoch": 1.3894736842105262, "grad_norm": 11.8125, "learning_rate": 1.5315789473684212e-05, "loss": 1.234378719329834, "mean_token_accuracy": 0.7714344441890717, "num_tokens": 51900.0, "step": 660 }, { "entropy": 1.90859375, "epoch": 1.4105263157894736, "grad_norm": 6.875, "learning_rate": 1.4789473684210529e-05, "loss": 1.815606689453125, "mean_token_accuracy": 0.6656625211238861, "num_tokens": 52883.0, "step": 670 }, { "entropy": 1.558203125, "epoch": 1.431578947368421, "grad_norm": 9.375, "learning_rate": 1.4263157894736842e-05, "loss": 1.4858011245727538, "mean_token_accuracy": 0.7443289816379547, "num_tokens": 53576.0, "step": 680 }, { "entropy": 1.64765625, "epoch": 1.4526315789473685, "grad_norm": 8.9375, "learning_rate": 1.373684210526316e-05, "loss": 1.534531307220459, "mean_token_accuracy": 0.7204049170017243, "num_tokens": 54450.0, "step": 690 }, { "entropy": 1.58359375, "epoch": 1.4736842105263157, "grad_norm": 6.46875, "learning_rate": 1.3210526315789473e-05, "loss": 1.5365165710449218, "mean_token_accuracy": 0.7069519102573395, "num_tokens": 55160.0, "step": 700 }, { "entropy": 1.4171875, "epoch": 1.4947368421052631, "grad_norm": 7.1875, "learning_rate": 1.268421052631579e-05, "loss": 1.307802391052246, "mean_token_accuracy": 0.7627157270908356, "num_tokens": 55929.0, "step": 710 }, { "entropy": 1.678125, "epoch": 1.5157894736842106, "grad_norm": 6.34375, "learning_rate": 1.2157894736842105e-05, "loss": 1.5420659065246582, "mean_token_accuracy": 0.7097006201744079, "num_tokens": 56725.0, "step": 720 }, { "entropy": 1.58125, "epoch": 1.5368421052631578, "grad_norm": 7.0625, "learning_rate": 1.1631578947368422e-05, "loss": 1.4044910430908204, "mean_token_accuracy": 0.7465328335762024, "num_tokens": 57398.0, "step": 730 }, { "entropy": 1.576171875, "epoch": 1.5578947368421052, "grad_norm": 6.03125, "learning_rate": 1.1105263157894738e-05, "loss": 1.605686569213867, "mean_token_accuracy": 0.7333506286144257, "num_tokens": 58072.0, "step": 740 }, { "entropy": 1.59140625, "epoch": 1.5789473684210527, "grad_norm": 6.03125, "learning_rate": 1.0578947368421053e-05, "loss": 1.4928099632263183, "mean_token_accuracy": 0.7072650909423828, "num_tokens": 58748.0, "step": 750 }, { "entropy": 1.63046875, "epoch": 1.6, "grad_norm": 7.1875, "learning_rate": 1.005263157894737e-05, "loss": 1.624325942993164, "mean_token_accuracy": 0.690889635682106, "num_tokens": 59658.0, "step": 760 }, { "entropy": 1.803125, "epoch": 1.6210526315789475, "grad_norm": 6.03125, "learning_rate": 9.526315789473684e-06, "loss": 1.7750425338745117, "mean_token_accuracy": 0.6651369571685791, "num_tokens": 60859.0, "step": 770 }, { "entropy": 1.4703125, "epoch": 1.6421052631578947, "grad_norm": 7.03125, "learning_rate": 9e-06, "loss": 1.4258437156677246, "mean_token_accuracy": 0.7347829401493072, "num_tokens": 62439.0, "step": 780 }, { "entropy": 1.671875, "epoch": 1.663157894736842, "grad_norm": 7.71875, "learning_rate": 8.473684210526315e-06, "loss": 1.721219825744629, "mean_token_accuracy": 0.7035934925079346, "num_tokens": 63290.0, "step": 790 }, { "entropy": 1.554296875, "epoch": 1.6842105263157894, "grad_norm": 9.0625, "learning_rate": 7.947368421052633e-06, "loss": 1.5237810134887695, "mean_token_accuracy": 0.7325670003890992, "num_tokens": 64010.0, "step": 800 }, { "entropy": 1.7859375, "epoch": 1.7052631578947368, "grad_norm": 8.5, "learning_rate": 7.421052631578948e-06, "loss": 1.7944671630859375, "mean_token_accuracy": 0.683906614780426, "num_tokens": 64966.0, "step": 810 }, { "entropy": 1.840625, "epoch": 1.7263157894736842, "grad_norm": 7.53125, "learning_rate": 6.894736842105263e-06, "loss": 1.787227249145508, "mean_token_accuracy": 0.6670031368732452, "num_tokens": 65703.0, "step": 820 }, { "entropy": 1.29765625, "epoch": 1.7473684210526317, "grad_norm": 9.0625, "learning_rate": 6.368421052631579e-06, "loss": 1.2813177108764648, "mean_token_accuracy": 0.7610228896141052, "num_tokens": 66421.0, "step": 830 }, { "entropy": 1.504296875, "epoch": 1.768421052631579, "grad_norm": 6.5, "learning_rate": 5.842105263157895e-06, "loss": 1.477138137817383, "mean_token_accuracy": 0.7455608665943145, "num_tokens": 67005.0, "step": 840 }, { "entropy": 1.60859375, "epoch": 1.7894736842105263, "grad_norm": 5.03125, "learning_rate": 5.315789473684211e-06, "loss": 1.6519662857055664, "mean_token_accuracy": 0.7005816400051117, "num_tokens": 67975.0, "step": 850 }, { "entropy": 1.6765625, "epoch": 1.8105263157894735, "grad_norm": 6.53125, "learning_rate": 4.789473684210526e-06, "loss": 1.6701608657836915, "mean_token_accuracy": 0.7136963486671448, "num_tokens": 68650.0, "step": 860 }, { "entropy": 1.44609375, "epoch": 1.831578947368421, "grad_norm": 7.1875, "learning_rate": 4.2631578947368425e-06, "loss": 1.3475713729858398, "mean_token_accuracy": 0.7566476047039032, "num_tokens": 69317.0, "step": 870 }, { "entropy": 1.71328125, "epoch": 1.8526315789473684, "grad_norm": 6.90625, "learning_rate": 3.736842105263158e-06, "loss": 1.7077770233154297, "mean_token_accuracy": 0.6919207274913788, "num_tokens": 70169.0, "step": 880 }, { "entropy": 1.646875, "epoch": 1.8736842105263158, "grad_norm": 7.8125, "learning_rate": 3.2105263157894735e-06, "loss": 1.5933343887329101, "mean_token_accuracy": 0.708821702003479, "num_tokens": 71138.0, "step": 890 }, { "entropy": 1.437109375, "epoch": 1.8947368421052633, "grad_norm": 8.625, "learning_rate": 2.68421052631579e-06, "loss": 1.4426955223083495, "mean_token_accuracy": 0.7344056785106658, "num_tokens": 71776.0, "step": 900 }, { "entropy": 1.591015625, "epoch": 1.9157894736842105, "grad_norm": 8.125, "learning_rate": 2.1578947368421054e-06, "loss": 1.4523811340332031, "mean_token_accuracy": 0.7294364452362061, "num_tokens": 72390.0, "step": 910 }, { "entropy": 1.48984375, "epoch": 1.936842105263158, "grad_norm": 8.375, "learning_rate": 1.6315789473684212e-06, "loss": 1.4202921867370606, "mean_token_accuracy": 0.7283547043800354, "num_tokens": 73109.0, "step": 920 }, { "entropy": 1.37578125, "epoch": 1.9578947368421051, "grad_norm": 7.34375, "learning_rate": 1.1052631578947369e-06, "loss": 1.3212156295776367, "mean_token_accuracy": 0.765373581647873, "num_tokens": 73665.0, "step": 930 }, { "entropy": 1.69140625, "epoch": 1.9789473684210526, "grad_norm": 7.5625, "learning_rate": 5.789473684210527e-07, "loss": 1.5747876167297363, "mean_token_accuracy": 0.7353746354579925, "num_tokens": 74320.0, "step": 940 }, { "entropy": 1.75390625, "epoch": 2.0, "grad_norm": 6.46875, "learning_rate": 5.263157894736842e-08, "loss": 1.7055414199829102, "mean_token_accuracy": 0.6862038552761078, "num_tokens": 75304.0, "step": 950 } ], "logging_steps": 10, "max_steps": 950, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 207562104419328.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }