diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,22690 @@ +{ + "best_global_step": 3200, + "best_metric": 9.030352592468262, + "best_model_checkpoint": "./qwen3moe_tinystories_sft/checkpoint-3200", + "epoch": 0.8136024153821707, + "eval_steps": 100, + "global_step": 3200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00025425075480692836, + "grad_norm": 17874.9453125, + "learning_rate": 0.0, + "loss": 5.9147, + "step": 1 + }, + { + "epoch": 0.0005085015096138567, + "grad_norm": 17123.0625, + "learning_rate": 1.2690355329949238e-07, + "loss": 5.9008, + "step": 2 + }, + { + "epoch": 0.000762752264420785, + "grad_norm": 17448.5234375, + "learning_rate": 2.5380710659898475e-07, + "loss": 5.91, + "step": 3 + }, + { + "epoch": 0.0010170030192277134, + "grad_norm": 18966.7109375, + "learning_rate": 3.807106598984772e-07, + "loss": 5.9248, + "step": 4 + }, + { + "epoch": 0.0012712537740346417, + "grad_norm": 17238.7421875, + "learning_rate": 5.076142131979695e-07, + "loss": 5.9275, + "step": 5 + }, + { + "epoch": 0.00152550452884157, + "grad_norm": 18854.775390625, + "learning_rate": 6.345177664974619e-07, + "loss": 5.9297, + "step": 6 + }, + { + "epoch": 0.0017797552836484984, + "grad_norm": 18089.51171875, + "learning_rate": 7.614213197969544e-07, + "loss": 5.9428, + "step": 7 + }, + { + "epoch": 0.002034006038455427, + "grad_norm": 17900.16796875, + "learning_rate": 8.883248730964468e-07, + "loss": 5.9239, + "step": 8 + }, + { + "epoch": 0.002288256793262355, + "grad_norm": 18685.57421875, + "learning_rate": 1.015228426395939e-06, + "loss": 5.9148, + "step": 9 + }, + { + "epoch": 0.0025425075480692834, + "grad_norm": 18439.728515625, + "learning_rate": 1.1421319796954315e-06, + "loss": 5.9132, + "step": 10 + }, + { + "epoch": 0.002796758302876212, + "grad_norm": 18592.375, + "learning_rate": 1.2690355329949238e-06, + "loss": 5.9367, + "step": 11 + }, + { + "epoch": 0.00305100905768314, + "grad_norm": 18665.373046875, + "learning_rate": 1.3959390862944163e-06, + "loss": 5.9124, + "step": 12 + }, + { + "epoch": 0.0033052598124900683, + "grad_norm": 18325.0234375, + "learning_rate": 1.5228426395939088e-06, + "loss": 5.9293, + "step": 13 + }, + { + "epoch": 0.003559510567296997, + "grad_norm": 18158.3203125, + "learning_rate": 1.6497461928934011e-06, + "loss": 5.9115, + "step": 14 + }, + { + "epoch": 0.003813761322103925, + "grad_norm": 18622.93359375, + "learning_rate": 1.7766497461928936e-06, + "loss": 5.9264, + "step": 15 + }, + { + "epoch": 0.004068012076910854, + "grad_norm": 17586.322265625, + "learning_rate": 1.903553299492386e-06, + "loss": 5.9095, + "step": 16 + }, + { + "epoch": 0.004322262831717782, + "grad_norm": 18946.81640625, + "learning_rate": 2.030456852791878e-06, + "loss": 5.9144, + "step": 17 + }, + { + "epoch": 0.00457651358652471, + "grad_norm": 17792.5234375, + "learning_rate": 2.1573604060913707e-06, + "loss": 5.9174, + "step": 18 + }, + { + "epoch": 0.004830764341331639, + "grad_norm": 18125.546875, + "learning_rate": 2.284263959390863e-06, + "loss": 5.9149, + "step": 19 + }, + { + "epoch": 0.005085015096138567, + "grad_norm": 19357.408203125, + "learning_rate": 2.4111675126903553e-06, + "loss": 5.921, + "step": 20 + }, + { + "epoch": 0.005339265850945495, + "grad_norm": 18574.943359375, + "learning_rate": 2.5380710659898476e-06, + "loss": 5.8929, + "step": 21 + }, + { + "epoch": 0.005593516605752424, + "grad_norm": 19672.7578125, + "learning_rate": 2.6649746192893404e-06, + "loss": 5.9338, + "step": 22 + }, + { + "epoch": 0.005847767360559352, + "grad_norm": 19014.935546875, + "learning_rate": 2.7918781725888327e-06, + "loss": 5.9172, + "step": 23 + }, + { + "epoch": 0.00610201811536628, + "grad_norm": 18349.404296875, + "learning_rate": 2.918781725888325e-06, + "loss": 5.9241, + "step": 24 + }, + { + "epoch": 0.006356268870173209, + "grad_norm": 19636.85546875, + "learning_rate": 3.0456852791878177e-06, + "loss": 5.9213, + "step": 25 + }, + { + "epoch": 0.006610519624980137, + "grad_norm": 19006.40234375, + "learning_rate": 3.1725888324873095e-06, + "loss": 5.909, + "step": 26 + }, + { + "epoch": 0.006864770379787065, + "grad_norm": 18447.80859375, + "learning_rate": 3.2994923857868023e-06, + "loss": 5.9011, + "step": 27 + }, + { + "epoch": 0.007119021134593994, + "grad_norm": 18491.94921875, + "learning_rate": 3.4263959390862946e-06, + "loss": 5.9242, + "step": 28 + }, + { + "epoch": 0.007373271889400922, + "grad_norm": 18376.041015625, + "learning_rate": 3.5532994923857873e-06, + "loss": 5.9213, + "step": 29 + }, + { + "epoch": 0.00762752264420785, + "grad_norm": 18408.8359375, + "learning_rate": 3.680203045685279e-06, + "loss": 5.8996, + "step": 30 + }, + { + "epoch": 0.007881773399014778, + "grad_norm": 17806.453125, + "learning_rate": 3.807106598984772e-06, + "loss": 5.9194, + "step": 31 + }, + { + "epoch": 0.008136024153821707, + "grad_norm": 19786.220703125, + "learning_rate": 3.934010152284264e-06, + "loss": 5.93, + "step": 32 + }, + { + "epoch": 0.008390274908628636, + "grad_norm": 19096.904296875, + "learning_rate": 4.060913705583756e-06, + "loss": 5.922, + "step": 33 + }, + { + "epoch": 0.008644525663435564, + "grad_norm": 19384.958984375, + "learning_rate": 4.187817258883249e-06, + "loss": 5.9004, + "step": 34 + }, + { + "epoch": 0.008898776418242492, + "grad_norm": 18427.359375, + "learning_rate": 4.3147208121827415e-06, + "loss": 5.9233, + "step": 35 + }, + { + "epoch": 0.00915302717304942, + "grad_norm": 19002.884765625, + "learning_rate": 4.441624365482234e-06, + "loss": 5.9167, + "step": 36 + }, + { + "epoch": 0.009407277927856348, + "grad_norm": 17368.919921875, + "learning_rate": 4.568527918781726e-06, + "loss": 5.9175, + "step": 37 + }, + { + "epoch": 0.009661528682663277, + "grad_norm": 18952.740234375, + "learning_rate": 4.695431472081218e-06, + "loss": 5.9195, + "step": 38 + }, + { + "epoch": 0.009915779437470205, + "grad_norm": 20026.333984375, + "learning_rate": 4.822335025380711e-06, + "loss": 5.9014, + "step": 39 + }, + { + "epoch": 0.010170030192277133, + "grad_norm": 18163.779296875, + "learning_rate": 4.949238578680204e-06, + "loss": 5.9036, + "step": 40 + }, + { + "epoch": 0.010424280947084062, + "grad_norm": 18968.326171875, + "learning_rate": 5.076142131979695e-06, + "loss": 5.9211, + "step": 41 + }, + { + "epoch": 0.01067853170189099, + "grad_norm": 18830.423828125, + "learning_rate": 5.203045685279188e-06, + "loss": 5.9474, + "step": 42 + }, + { + "epoch": 0.010932782456697918, + "grad_norm": 17818.533203125, + "learning_rate": 5.329949238578681e-06, + "loss": 5.9221, + "step": 43 + }, + { + "epoch": 0.011187033211504847, + "grad_norm": 17642.869140625, + "learning_rate": 5.456852791878172e-06, + "loss": 5.9297, + "step": 44 + }, + { + "epoch": 0.011441283966311775, + "grad_norm": 17480.1171875, + "learning_rate": 5.583756345177665e-06, + "loss": 5.9168, + "step": 45 + }, + { + "epoch": 0.011695534721118703, + "grad_norm": 17172.3515625, + "learning_rate": 5.710659898477158e-06, + "loss": 5.913, + "step": 46 + }, + { + "epoch": 0.011949785475925631, + "grad_norm": 17409.3046875, + "learning_rate": 5.83756345177665e-06, + "loss": 5.9063, + "step": 47 + }, + { + "epoch": 0.01220403623073256, + "grad_norm": 17079.10546875, + "learning_rate": 5.964467005076142e-06, + "loss": 5.9117, + "step": 48 + }, + { + "epoch": 0.012458286985539488, + "grad_norm": 17173.505859375, + "learning_rate": 6.091370558375635e-06, + "loss": 5.8861, + "step": 49 + }, + { + "epoch": 0.012712537740346417, + "grad_norm": 16594.908203125, + "learning_rate": 6.218274111675127e-06, + "loss": 5.9148, + "step": 50 + }, + { + "epoch": 0.012966788495153345, + "grad_norm": 16416.0859375, + "learning_rate": 6.345177664974619e-06, + "loss": 5.9133, + "step": 51 + }, + { + "epoch": 0.013221039249960273, + "grad_norm": 16740.3359375, + "learning_rate": 6.472081218274112e-06, + "loss": 5.9171, + "step": 52 + }, + { + "epoch": 0.013475290004767201, + "grad_norm": 16300.2470703125, + "learning_rate": 6.5989847715736045e-06, + "loss": 5.9075, + "step": 53 + }, + { + "epoch": 0.01372954075957413, + "grad_norm": 15715.08984375, + "learning_rate": 6.725888324873096e-06, + "loss": 5.9175, + "step": 54 + }, + { + "epoch": 0.013983791514381057, + "grad_norm": 14736.154296875, + "learning_rate": 6.852791878172589e-06, + "loss": 5.9288, + "step": 55 + }, + { + "epoch": 0.014238042269187987, + "grad_norm": 15556.548828125, + "learning_rate": 6.9796954314720814e-06, + "loss": 5.9124, + "step": 56 + }, + { + "epoch": 0.014492293023994915, + "grad_norm": 13768.126953125, + "learning_rate": 7.106598984771575e-06, + "loss": 5.9091, + "step": 57 + }, + { + "epoch": 0.014746543778801843, + "grad_norm": 14840.53515625, + "learning_rate": 7.233502538071066e-06, + "loss": 5.9124, + "step": 58 + }, + { + "epoch": 0.015000794533608771, + "grad_norm": 15261.126953125, + "learning_rate": 7.360406091370558e-06, + "loss": 5.9162, + "step": 59 + }, + { + "epoch": 0.0152550452884157, + "grad_norm": 14289.3095703125, + "learning_rate": 7.4873096446700515e-06, + "loss": 5.9206, + "step": 60 + }, + { + "epoch": 0.01550929604322263, + "grad_norm": 14206.84375, + "learning_rate": 7.614213197969544e-06, + "loss": 5.9084, + "step": 61 + }, + { + "epoch": 0.015763546798029555, + "grad_norm": 13316.6611328125, + "learning_rate": 7.741116751269035e-06, + "loss": 5.9273, + "step": 62 + }, + { + "epoch": 0.016017797552836485, + "grad_norm": 13425.056640625, + "learning_rate": 7.868020304568528e-06, + "loss": 5.9029, + "step": 63 + }, + { + "epoch": 0.016272048307643415, + "grad_norm": 13660.4619140625, + "learning_rate": 7.994923857868022e-06, + "loss": 5.9054, + "step": 64 + }, + { + "epoch": 0.01652629906245034, + "grad_norm": 13360.3232421875, + "learning_rate": 8.121827411167512e-06, + "loss": 5.905, + "step": 65 + }, + { + "epoch": 0.01678054981725727, + "grad_norm": 13411.6171875, + "learning_rate": 8.248730964467004e-06, + "loss": 5.9125, + "step": 66 + }, + { + "epoch": 0.017034800572064197, + "grad_norm": 13302.5546875, + "learning_rate": 8.375634517766498e-06, + "loss": 5.9007, + "step": 67 + }, + { + "epoch": 0.017289051326871127, + "grad_norm": 13521.859375, + "learning_rate": 8.50253807106599e-06, + "loss": 5.916, + "step": 68 + }, + { + "epoch": 0.017543302081678053, + "grad_norm": 13756.3876953125, + "learning_rate": 8.629441624365483e-06, + "loss": 5.8867, + "step": 69 + }, + { + "epoch": 0.017797552836484983, + "grad_norm": 13790.0654296875, + "learning_rate": 8.756345177664975e-06, + "loss": 5.9113, + "step": 70 + }, + { + "epoch": 0.018051803591291913, + "grad_norm": 13175.75390625, + "learning_rate": 8.883248730964468e-06, + "loss": 5.8928, + "step": 71 + }, + { + "epoch": 0.01830605434609884, + "grad_norm": 13981.7958984375, + "learning_rate": 9.01015228426396e-06, + "loss": 5.904, + "step": 72 + }, + { + "epoch": 0.01856030510090577, + "grad_norm": 13376.328125, + "learning_rate": 9.137055837563452e-06, + "loss": 5.9054, + "step": 73 + }, + { + "epoch": 0.018814555855712695, + "grad_norm": 13306.1318359375, + "learning_rate": 9.263959390862944e-06, + "loss": 5.9076, + "step": 74 + }, + { + "epoch": 0.019068806610519625, + "grad_norm": 13207.986328125, + "learning_rate": 9.390862944162437e-06, + "loss": 5.9027, + "step": 75 + }, + { + "epoch": 0.019323057365326555, + "grad_norm": 13185.794921875, + "learning_rate": 9.517766497461929e-06, + "loss": 5.9094, + "step": 76 + }, + { + "epoch": 0.01957730812013348, + "grad_norm": 13483.1728515625, + "learning_rate": 9.644670050761421e-06, + "loss": 5.925, + "step": 77 + }, + { + "epoch": 0.01983155887494041, + "grad_norm": 13370.5615234375, + "learning_rate": 9.771573604060914e-06, + "loss": 5.9001, + "step": 78 + }, + { + "epoch": 0.020085809629747337, + "grad_norm": 13176.5107421875, + "learning_rate": 9.898477157360408e-06, + "loss": 5.9185, + "step": 79 + }, + { + "epoch": 0.020340060384554267, + "grad_norm": 13421.5244140625, + "learning_rate": 1.0025380710659898e-05, + "loss": 5.9031, + "step": 80 + }, + { + "epoch": 0.020594311139361193, + "grad_norm": 13590.021484375, + "learning_rate": 1.015228426395939e-05, + "loss": 5.9013, + "step": 81 + }, + { + "epoch": 0.020848561894168123, + "grad_norm": 13200.2705078125, + "learning_rate": 1.0279187817258885e-05, + "loss": 5.897, + "step": 82 + }, + { + "epoch": 0.021102812648975053, + "grad_norm": 13252.2431640625, + "learning_rate": 1.0406091370558377e-05, + "loss": 5.8837, + "step": 83 + }, + { + "epoch": 0.02135706340378198, + "grad_norm": 13280.3037109375, + "learning_rate": 1.0532994923857867e-05, + "loss": 5.9044, + "step": 84 + }, + { + "epoch": 0.02161131415858891, + "grad_norm": 13695.7080078125, + "learning_rate": 1.0659898477157361e-05, + "loss": 5.9125, + "step": 85 + }, + { + "epoch": 0.021865564913395835, + "grad_norm": 13352.21484375, + "learning_rate": 1.0786802030456854e-05, + "loss": 5.8973, + "step": 86 + }, + { + "epoch": 0.022119815668202765, + "grad_norm": 13334.6044921875, + "learning_rate": 1.0913705583756344e-05, + "loss": 5.9082, + "step": 87 + }, + { + "epoch": 0.022374066423009695, + "grad_norm": 13765.154296875, + "learning_rate": 1.1040609137055838e-05, + "loss": 5.9042, + "step": 88 + }, + { + "epoch": 0.02262831717781662, + "grad_norm": 13421.01171875, + "learning_rate": 1.116751269035533e-05, + "loss": 5.8932, + "step": 89 + }, + { + "epoch": 0.02288256793262355, + "grad_norm": 13485.15234375, + "learning_rate": 1.1294416243654823e-05, + "loss": 5.9081, + "step": 90 + }, + { + "epoch": 0.023136818687430477, + "grad_norm": 13717.1845703125, + "learning_rate": 1.1421319796954315e-05, + "loss": 5.9179, + "step": 91 + }, + { + "epoch": 0.023391069442237407, + "grad_norm": 13565.94140625, + "learning_rate": 1.1548223350253808e-05, + "loss": 5.9076, + "step": 92 + }, + { + "epoch": 0.023645320197044337, + "grad_norm": 13472.171875, + "learning_rate": 1.16751269035533e-05, + "loss": 5.8887, + "step": 93 + }, + { + "epoch": 0.023899570951851263, + "grad_norm": 13683.9931640625, + "learning_rate": 1.1802030456852794e-05, + "loss": 5.9068, + "step": 94 + }, + { + "epoch": 0.024153821706658193, + "grad_norm": 13872.4140625, + "learning_rate": 1.1928934010152284e-05, + "loss": 5.8972, + "step": 95 + }, + { + "epoch": 0.02440807246146512, + "grad_norm": 13557.8515625, + "learning_rate": 1.2055837563451777e-05, + "loss": 5.8923, + "step": 96 + }, + { + "epoch": 0.02466232321627205, + "grad_norm": 13712.21484375, + "learning_rate": 1.218274111675127e-05, + "loss": 5.9164, + "step": 97 + }, + { + "epoch": 0.024916573971078975, + "grad_norm": 13596.0732421875, + "learning_rate": 1.2309644670050761e-05, + "loss": 5.8984, + "step": 98 + }, + { + "epoch": 0.025170824725885905, + "grad_norm": 13513.576171875, + "learning_rate": 1.2436548223350254e-05, + "loss": 5.887, + "step": 99 + }, + { + "epoch": 0.025425075480692835, + "grad_norm": 13328.6044921875, + "learning_rate": 1.2563451776649746e-05, + "loss": 5.9013, + "step": 100 + }, + { + "epoch": 0.025425075480692835, + "eval_loss": 11.887640953063965, + "eval_runtime": 697.8119, + "eval_samples_per_second": 151.883, + "eval_steps_per_second": 9.494, + "step": 100 + }, + { + "epoch": 0.02567932623549976, + "grad_norm": 13536.2607421875, + "learning_rate": 1.2690355329949238e-05, + "loss": 5.8976, + "step": 101 + }, + { + "epoch": 0.02593357699030669, + "grad_norm": 13625.7626953125, + "learning_rate": 1.281725888324873e-05, + "loss": 5.8834, + "step": 102 + }, + { + "epoch": 0.026187827745113617, + "grad_norm": 13483.0322265625, + "learning_rate": 1.2944162436548224e-05, + "loss": 5.8999, + "step": 103 + }, + { + "epoch": 0.026442078499920547, + "grad_norm": 13512.1435546875, + "learning_rate": 1.3071065989847717e-05, + "loss": 5.8884, + "step": 104 + }, + { + "epoch": 0.026696329254727477, + "grad_norm": 13751.4580078125, + "learning_rate": 1.3197969543147209e-05, + "loss": 5.8908, + "step": 105 + }, + { + "epoch": 0.026950580009534403, + "grad_norm": 13559.2783203125, + "learning_rate": 1.3324873096446703e-05, + "loss": 5.9082, + "step": 106 + }, + { + "epoch": 0.027204830764341333, + "grad_norm": 13575.0419921875, + "learning_rate": 1.3451776649746192e-05, + "loss": 5.8927, + "step": 107 + }, + { + "epoch": 0.02745908151914826, + "grad_norm": 13510.3466796875, + "learning_rate": 1.3578680203045684e-05, + "loss": 5.8836, + "step": 108 + }, + { + "epoch": 0.02771333227395519, + "grad_norm": 13477.1591796875, + "learning_rate": 1.3705583756345178e-05, + "loss": 5.8889, + "step": 109 + }, + { + "epoch": 0.027967583028762115, + "grad_norm": 13679.9140625, + "learning_rate": 1.383248730964467e-05, + "loss": 5.9003, + "step": 110 + }, + { + "epoch": 0.028221833783569045, + "grad_norm": 13792.7548828125, + "learning_rate": 1.3959390862944163e-05, + "loss": 5.8894, + "step": 111 + }, + { + "epoch": 0.028476084538375974, + "grad_norm": 13725.0146484375, + "learning_rate": 1.4086294416243657e-05, + "loss": 5.8858, + "step": 112 + }, + { + "epoch": 0.0287303352931829, + "grad_norm": 13686.3720703125, + "learning_rate": 1.421319796954315e-05, + "loss": 5.8788, + "step": 113 + }, + { + "epoch": 0.02898458604798983, + "grad_norm": 13671.60546875, + "learning_rate": 1.4340101522842641e-05, + "loss": 5.8878, + "step": 114 + }, + { + "epoch": 0.029238836802796757, + "grad_norm": 13452.1201171875, + "learning_rate": 1.4467005076142132e-05, + "loss": 5.8957, + "step": 115 + }, + { + "epoch": 0.029493087557603687, + "grad_norm": 13602.41015625, + "learning_rate": 1.4593908629441624e-05, + "loss": 5.8766, + "step": 116 + }, + { + "epoch": 0.029747338312410616, + "grad_norm": 13690.53125, + "learning_rate": 1.4720812182741117e-05, + "loss": 5.8951, + "step": 117 + }, + { + "epoch": 0.030001589067217543, + "grad_norm": 13700.568359375, + "learning_rate": 1.484771573604061e-05, + "loss": 5.8945, + "step": 118 + }, + { + "epoch": 0.030255839822024472, + "grad_norm": 13505.111328125, + "learning_rate": 1.4974619289340103e-05, + "loss": 5.8965, + "step": 119 + }, + { + "epoch": 0.0305100905768314, + "grad_norm": 13736.1865234375, + "learning_rate": 1.5101522842639595e-05, + "loss": 5.8742, + "step": 120 + }, + { + "epoch": 0.03076434133163833, + "grad_norm": 13643.3935546875, + "learning_rate": 1.5228426395939088e-05, + "loss": 5.9014, + "step": 121 + }, + { + "epoch": 0.03101859208644526, + "grad_norm": 13563.412109375, + "learning_rate": 1.535532994923858e-05, + "loss": 5.8933, + "step": 122 + }, + { + "epoch": 0.03127284284125219, + "grad_norm": 13571.185546875, + "learning_rate": 1.548223350253807e-05, + "loss": 5.8736, + "step": 123 + }, + { + "epoch": 0.03152709359605911, + "grad_norm": 13560.033203125, + "learning_rate": 1.5609137055837564e-05, + "loss": 5.9019, + "step": 124 + }, + { + "epoch": 0.03178134435086604, + "grad_norm": 13774.8857421875, + "learning_rate": 1.5736040609137055e-05, + "loss": 5.9036, + "step": 125 + }, + { + "epoch": 0.03203559510567297, + "grad_norm": 13625.09765625, + "learning_rate": 1.586294416243655e-05, + "loss": 5.8879, + "step": 126 + }, + { + "epoch": 0.0322898458604799, + "grad_norm": 13534.6162109375, + "learning_rate": 1.5989847715736043e-05, + "loss": 5.8661, + "step": 127 + }, + { + "epoch": 0.03254409661528683, + "grad_norm": 13809.35546875, + "learning_rate": 1.6116751269035534e-05, + "loss": 5.895, + "step": 128 + }, + { + "epoch": 0.03279834737009375, + "grad_norm": 13707.41796875, + "learning_rate": 1.6243654822335024e-05, + "loss": 5.8883, + "step": 129 + }, + { + "epoch": 0.03305259812490068, + "grad_norm": 13843.4951171875, + "learning_rate": 1.6370558375634518e-05, + "loss": 5.9049, + "step": 130 + }, + { + "epoch": 0.03330684887970761, + "grad_norm": 13867.34765625, + "learning_rate": 1.649746192893401e-05, + "loss": 5.8972, + "step": 131 + }, + { + "epoch": 0.03356109963451454, + "grad_norm": 13773.0732421875, + "learning_rate": 1.6624365482233503e-05, + "loss": 5.8697, + "step": 132 + }, + { + "epoch": 0.033815350389321465, + "grad_norm": 13810.62890625, + "learning_rate": 1.6751269035532997e-05, + "loss": 5.8848, + "step": 133 + }, + { + "epoch": 0.034069601144128395, + "grad_norm": 13859.7099609375, + "learning_rate": 1.6878172588832487e-05, + "loss": 5.8775, + "step": 134 + }, + { + "epoch": 0.034323851898935324, + "grad_norm": 13713.4169921875, + "learning_rate": 1.700507614213198e-05, + "loss": 5.8831, + "step": 135 + }, + { + "epoch": 0.034578102653742254, + "grad_norm": 13702.9951171875, + "learning_rate": 1.7131979695431472e-05, + "loss": 5.8658, + "step": 136 + }, + { + "epoch": 0.034832353408549184, + "grad_norm": 13757.0, + "learning_rate": 1.7258883248730966e-05, + "loss": 5.8831, + "step": 137 + }, + { + "epoch": 0.03508660416335611, + "grad_norm": 14029.185546875, + "learning_rate": 1.7385786802030457e-05, + "loss": 5.8856, + "step": 138 + }, + { + "epoch": 0.03534085491816304, + "grad_norm": 13854.5400390625, + "learning_rate": 1.751269035532995e-05, + "loss": 5.903, + "step": 139 + }, + { + "epoch": 0.035595105672969966, + "grad_norm": 13903.626953125, + "learning_rate": 1.763959390862944e-05, + "loss": 5.8816, + "step": 140 + }, + { + "epoch": 0.035849356427776896, + "grad_norm": 13783.7431640625, + "learning_rate": 1.7766497461928935e-05, + "loss": 5.8934, + "step": 141 + }, + { + "epoch": 0.036103607182583826, + "grad_norm": 13807.6279296875, + "learning_rate": 1.789340101522843e-05, + "loss": 5.8794, + "step": 142 + }, + { + "epoch": 0.03635785793739075, + "grad_norm": 13903.8896484375, + "learning_rate": 1.802030456852792e-05, + "loss": 5.8909, + "step": 143 + }, + { + "epoch": 0.03661210869219768, + "grad_norm": 13844.205078125, + "learning_rate": 1.814720812182741e-05, + "loss": 5.8787, + "step": 144 + }, + { + "epoch": 0.03686635944700461, + "grad_norm": 13829.861328125, + "learning_rate": 1.8274111675126904e-05, + "loss": 5.8848, + "step": 145 + }, + { + "epoch": 0.03712061020181154, + "grad_norm": 13927.4833984375, + "learning_rate": 1.8401015228426395e-05, + "loss": 5.8872, + "step": 146 + }, + { + "epoch": 0.03737486095661847, + "grad_norm": 14146.7177734375, + "learning_rate": 1.852791878172589e-05, + "loss": 5.9012, + "step": 147 + }, + { + "epoch": 0.03762911171142539, + "grad_norm": 13803.76171875, + "learning_rate": 1.8654822335025383e-05, + "loss": 5.8809, + "step": 148 + }, + { + "epoch": 0.03788336246623232, + "grad_norm": 13910.3681640625, + "learning_rate": 1.8781725888324874e-05, + "loss": 5.8893, + "step": 149 + }, + { + "epoch": 0.03813761322103925, + "grad_norm": 14064.740234375, + "learning_rate": 1.8908629441624368e-05, + "loss": 5.8773, + "step": 150 + }, + { + "epoch": 0.03839186397584618, + "grad_norm": 13910.2197265625, + "learning_rate": 1.9035532994923858e-05, + "loss": 5.8726, + "step": 151 + }, + { + "epoch": 0.03864611473065311, + "grad_norm": 13988.330078125, + "learning_rate": 1.916243654822335e-05, + "loss": 5.8794, + "step": 152 + }, + { + "epoch": 0.03890036548546003, + "grad_norm": 13807.6494140625, + "learning_rate": 1.9289340101522843e-05, + "loss": 5.8654, + "step": 153 + }, + { + "epoch": 0.03915461624026696, + "grad_norm": 14099.2060546875, + "learning_rate": 1.9416243654822337e-05, + "loss": 5.8787, + "step": 154 + }, + { + "epoch": 0.03940886699507389, + "grad_norm": 14038.322265625, + "learning_rate": 1.9543147208121827e-05, + "loss": 5.8783, + "step": 155 + }, + { + "epoch": 0.03966311774988082, + "grad_norm": 13967.1875, + "learning_rate": 1.967005076142132e-05, + "loss": 5.8966, + "step": 156 + }, + { + "epoch": 0.03991736850468775, + "grad_norm": 13959.806640625, + "learning_rate": 1.9796954314720815e-05, + "loss": 5.86, + "step": 157 + }, + { + "epoch": 0.040171619259494674, + "grad_norm": 14113.689453125, + "learning_rate": 1.9923857868020303e-05, + "loss": 5.8542, + "step": 158 + }, + { + "epoch": 0.040425870014301604, + "grad_norm": 14134.0732421875, + "learning_rate": 2.0050761421319797e-05, + "loss": 5.8784, + "step": 159 + }, + { + "epoch": 0.040680120769108534, + "grad_norm": 13855.0908203125, + "learning_rate": 2.017766497461929e-05, + "loss": 5.8766, + "step": 160 + }, + { + "epoch": 0.040934371523915464, + "grad_norm": 13932.7353515625, + "learning_rate": 2.030456852791878e-05, + "loss": 5.8655, + "step": 161 + }, + { + "epoch": 0.04118862227872239, + "grad_norm": 14085.6005859375, + "learning_rate": 2.0431472081218275e-05, + "loss": 5.8788, + "step": 162 + }, + { + "epoch": 0.041442873033529316, + "grad_norm": 13949.18359375, + "learning_rate": 2.055837563451777e-05, + "loss": 5.8905, + "step": 163 + }, + { + "epoch": 0.041697123788336246, + "grad_norm": 14003.30859375, + "learning_rate": 2.068527918781726e-05, + "loss": 5.8823, + "step": 164 + }, + { + "epoch": 0.041951374543143176, + "grad_norm": 14008.98046875, + "learning_rate": 2.0812182741116754e-05, + "loss": 5.8612, + "step": 165 + }, + { + "epoch": 0.042205625297950106, + "grad_norm": 14037.0654296875, + "learning_rate": 2.0939086294416244e-05, + "loss": 5.8743, + "step": 166 + }, + { + "epoch": 0.04245987605275703, + "grad_norm": 13932.53125, + "learning_rate": 2.1065989847715735e-05, + "loss": 5.8667, + "step": 167 + }, + { + "epoch": 0.04271412680756396, + "grad_norm": 13870.0458984375, + "learning_rate": 2.119289340101523e-05, + "loss": 5.8795, + "step": 168 + }, + { + "epoch": 0.04296837756237089, + "grad_norm": 13954.0673828125, + "learning_rate": 2.1319796954314723e-05, + "loss": 5.8477, + "step": 169 + }, + { + "epoch": 0.04322262831717782, + "grad_norm": 13942.2314453125, + "learning_rate": 2.1446700507614213e-05, + "loss": 5.881, + "step": 170 + }, + { + "epoch": 0.04347687907198475, + "grad_norm": 13986.8291015625, + "learning_rate": 2.1573604060913707e-05, + "loss": 5.8705, + "step": 171 + }, + { + "epoch": 0.04373112982679167, + "grad_norm": 14061.9111328125, + "learning_rate": 2.17005076142132e-05, + "loss": 5.865, + "step": 172 + }, + { + "epoch": 0.0439853805815986, + "grad_norm": 14076.083984375, + "learning_rate": 2.182741116751269e-05, + "loss": 5.8634, + "step": 173 + }, + { + "epoch": 0.04423963133640553, + "grad_norm": 14042.853515625, + "learning_rate": 2.1954314720812183e-05, + "loss": 5.8763, + "step": 174 + }, + { + "epoch": 0.04449388209121246, + "grad_norm": 13883.662109375, + "learning_rate": 2.2081218274111677e-05, + "loss": 5.8849, + "step": 175 + }, + { + "epoch": 0.04474813284601939, + "grad_norm": 13985.61328125, + "learning_rate": 2.2208121827411167e-05, + "loss": 5.8639, + "step": 176 + }, + { + "epoch": 0.04500238360082631, + "grad_norm": 14063.1689453125, + "learning_rate": 2.233502538071066e-05, + "loss": 5.8698, + "step": 177 + }, + { + "epoch": 0.04525663435563324, + "grad_norm": 14112.0966796875, + "learning_rate": 2.2461928934010155e-05, + "loss": 5.8915, + "step": 178 + }, + { + "epoch": 0.04551088511044017, + "grad_norm": 14043.98828125, + "learning_rate": 2.2588832487309646e-05, + "loss": 5.8587, + "step": 179 + }, + { + "epoch": 0.0457651358652471, + "grad_norm": 14180.72265625, + "learning_rate": 2.2715736040609136e-05, + "loss": 5.8631, + "step": 180 + }, + { + "epoch": 0.04601938662005403, + "grad_norm": 14142.736328125, + "learning_rate": 2.284263959390863e-05, + "loss": 5.8681, + "step": 181 + }, + { + "epoch": 0.046273637374860954, + "grad_norm": 14137.55859375, + "learning_rate": 2.296954314720812e-05, + "loss": 5.8552, + "step": 182 + }, + { + "epoch": 0.046527888129667884, + "grad_norm": 14159.0224609375, + "learning_rate": 2.3096446700507615e-05, + "loss": 5.8597, + "step": 183 + }, + { + "epoch": 0.046782138884474814, + "grad_norm": 14298.9453125, + "learning_rate": 2.322335025380711e-05, + "loss": 5.8619, + "step": 184 + }, + { + "epoch": 0.047036389639281743, + "grad_norm": 14007.9853515625, + "learning_rate": 2.33502538071066e-05, + "loss": 5.8654, + "step": 185 + }, + { + "epoch": 0.04729064039408867, + "grad_norm": 14221.35546875, + "learning_rate": 2.3477157360406094e-05, + "loss": 5.8506, + "step": 186 + }, + { + "epoch": 0.047544891148895596, + "grad_norm": 13967.9560546875, + "learning_rate": 2.3604060913705588e-05, + "loss": 5.8619, + "step": 187 + }, + { + "epoch": 0.047799141903702526, + "grad_norm": 14121.439453125, + "learning_rate": 2.3730964467005075e-05, + "loss": 5.8651, + "step": 188 + }, + { + "epoch": 0.048053392658509456, + "grad_norm": 13945.2744140625, + "learning_rate": 2.385786802030457e-05, + "loss": 5.874, + "step": 189 + }, + { + "epoch": 0.048307643413316385, + "grad_norm": 13990.255859375, + "learning_rate": 2.3984771573604063e-05, + "loss": 5.8624, + "step": 190 + }, + { + "epoch": 0.04856189416812331, + "grad_norm": 13951.9560546875, + "learning_rate": 2.4111675126903553e-05, + "loss": 5.8442, + "step": 191 + }, + { + "epoch": 0.04881614492293024, + "grad_norm": 14099.447265625, + "learning_rate": 2.4238578680203047e-05, + "loss": 5.8675, + "step": 192 + }, + { + "epoch": 0.04907039567773717, + "grad_norm": 13915.048828125, + "learning_rate": 2.436548223350254e-05, + "loss": 5.8706, + "step": 193 + }, + { + "epoch": 0.0493246464325441, + "grad_norm": 14257.0009765625, + "learning_rate": 2.4492385786802032e-05, + "loss": 5.8689, + "step": 194 + }, + { + "epoch": 0.04957889718735103, + "grad_norm": 13921.3193359375, + "learning_rate": 2.4619289340101523e-05, + "loss": 5.8521, + "step": 195 + }, + { + "epoch": 0.04983314794215795, + "grad_norm": 13949.6025390625, + "learning_rate": 2.4746192893401017e-05, + "loss": 5.8449, + "step": 196 + }, + { + "epoch": 0.05008739869696488, + "grad_norm": 14410.4150390625, + "learning_rate": 2.4873096446700507e-05, + "loss": 5.8856, + "step": 197 + }, + { + "epoch": 0.05034164945177181, + "grad_norm": 14101.67578125, + "learning_rate": 2.5e-05, + "loss": 5.8372, + "step": 198 + }, + { + "epoch": 0.05059590020657874, + "grad_norm": 14241.529296875, + "learning_rate": 2.5126903553299492e-05, + "loss": 5.8823, + "step": 199 + }, + { + "epoch": 0.05085015096138567, + "grad_norm": 14324.90234375, + "learning_rate": 2.5253807106598986e-05, + "loss": 5.8547, + "step": 200 + }, + { + "epoch": 0.05085015096138567, + "eval_loss": 11.810378074645996, + "eval_runtime": 695.8663, + "eval_samples_per_second": 152.308, + "eval_steps_per_second": 9.521, + "step": 200 + }, + { + "epoch": 0.05110440171619259, + "grad_norm": 13824.125, + "learning_rate": 2.5380710659898476e-05, + "loss": 5.8447, + "step": 201 + }, + { + "epoch": 0.05135865247099952, + "grad_norm": 14183.697265625, + "learning_rate": 2.5507614213197974e-05, + "loss": 5.8576, + "step": 202 + }, + { + "epoch": 0.05161290322580645, + "grad_norm": 13871.00390625, + "learning_rate": 2.563451776649746e-05, + "loss": 5.8533, + "step": 203 + }, + { + "epoch": 0.05186715398061338, + "grad_norm": 14473.0146484375, + "learning_rate": 2.576142131979696e-05, + "loss": 5.8639, + "step": 204 + }, + { + "epoch": 0.05212140473542031, + "grad_norm": 14182.2587890625, + "learning_rate": 2.588832487309645e-05, + "loss": 5.8501, + "step": 205 + }, + { + "epoch": 0.052375655490227234, + "grad_norm": 14335.306640625, + "learning_rate": 2.6015228426395936e-05, + "loss": 5.8581, + "step": 206 + }, + { + "epoch": 0.052629906245034164, + "grad_norm": 14553.783203125, + "learning_rate": 2.6142131979695434e-05, + "loss": 5.8572, + "step": 207 + }, + { + "epoch": 0.052884156999841093, + "grad_norm": 14136.1455078125, + "learning_rate": 2.6269035532994924e-05, + "loss": 5.8586, + "step": 208 + }, + { + "epoch": 0.05313840775464802, + "grad_norm": 14284.5458984375, + "learning_rate": 2.6395939086294418e-05, + "loss": 5.8548, + "step": 209 + }, + { + "epoch": 0.05339265850945495, + "grad_norm": 14215.7578125, + "learning_rate": 2.652284263959391e-05, + "loss": 5.8581, + "step": 210 + }, + { + "epoch": 0.053646909264261876, + "grad_norm": 14402.2216796875, + "learning_rate": 2.6649746192893406e-05, + "loss": 5.8437, + "step": 211 + }, + { + "epoch": 0.053901160019068806, + "grad_norm": 14041.2705078125, + "learning_rate": 2.6776649746192893e-05, + "loss": 5.8627, + "step": 212 + }, + { + "epoch": 0.054155410773875735, + "grad_norm": 14536.2783203125, + "learning_rate": 2.6903553299492384e-05, + "loss": 5.8507, + "step": 213 + }, + { + "epoch": 0.054409661528682665, + "grad_norm": 14204.775390625, + "learning_rate": 2.703045685279188e-05, + "loss": 5.8558, + "step": 214 + }, + { + "epoch": 0.054663912283489595, + "grad_norm": 14513.5751953125, + "learning_rate": 2.715736040609137e-05, + "loss": 5.8297, + "step": 215 + }, + { + "epoch": 0.05491816303829652, + "grad_norm": 14284.46875, + "learning_rate": 2.7284263959390866e-05, + "loss": 5.8549, + "step": 216 + }, + { + "epoch": 0.05517241379310345, + "grad_norm": 14067.05078125, + "learning_rate": 2.7411167512690357e-05, + "loss": 5.8432, + "step": 217 + }, + { + "epoch": 0.05542666454791038, + "grad_norm": 14304.111328125, + "learning_rate": 2.753807106598985e-05, + "loss": 5.8596, + "step": 218 + }, + { + "epoch": 0.05568091530271731, + "grad_norm": 14343.162109375, + "learning_rate": 2.766497461928934e-05, + "loss": 5.834, + "step": 219 + }, + { + "epoch": 0.05593516605752423, + "grad_norm": 14471.5126953125, + "learning_rate": 2.7791878172588832e-05, + "loss": 5.8478, + "step": 220 + }, + { + "epoch": 0.05618941681233116, + "grad_norm": 14145.4619140625, + "learning_rate": 2.7918781725888326e-05, + "loss": 5.8566, + "step": 221 + }, + { + "epoch": 0.05644366756713809, + "grad_norm": 14756.76171875, + "learning_rate": 2.8045685279187816e-05, + "loss": 5.847, + "step": 222 + }, + { + "epoch": 0.05669791832194502, + "grad_norm": 14184.3203125, + "learning_rate": 2.8172588832487314e-05, + "loss": 5.8627, + "step": 223 + }, + { + "epoch": 0.05695216907675195, + "grad_norm": 14302.7763671875, + "learning_rate": 2.82994923857868e-05, + "loss": 5.8573, + "step": 224 + }, + { + "epoch": 0.05720641983155887, + "grad_norm": 14140.478515625, + "learning_rate": 2.84263959390863e-05, + "loss": 5.8599, + "step": 225 + }, + { + "epoch": 0.0574606705863658, + "grad_norm": 14565.8662109375, + "learning_rate": 2.855329949238579e-05, + "loss": 5.8474, + "step": 226 + }, + { + "epoch": 0.05771492134117273, + "grad_norm": 14593.7666015625, + "learning_rate": 2.8680203045685283e-05, + "loss": 5.8386, + "step": 227 + }, + { + "epoch": 0.05796917209597966, + "grad_norm": 14264.8037109375, + "learning_rate": 2.8807106598984774e-05, + "loss": 5.8382, + "step": 228 + }, + { + "epoch": 0.05822342285078659, + "grad_norm": 14515.216796875, + "learning_rate": 2.8934010152284264e-05, + "loss": 5.8271, + "step": 229 + }, + { + "epoch": 0.058477673605593514, + "grad_norm": 14455.34375, + "learning_rate": 2.9060913705583758e-05, + "loss": 5.8454, + "step": 230 + }, + { + "epoch": 0.05873192436040044, + "grad_norm": 14383.3017578125, + "learning_rate": 2.918781725888325e-05, + "loss": 5.8468, + "step": 231 + }, + { + "epoch": 0.05898617511520737, + "grad_norm": 14204.083984375, + "learning_rate": 2.9314720812182743e-05, + "loss": 5.8546, + "step": 232 + }, + { + "epoch": 0.0592404258700143, + "grad_norm": 14394.791015625, + "learning_rate": 2.9441624365482233e-05, + "loss": 5.8401, + "step": 233 + }, + { + "epoch": 0.05949467662482123, + "grad_norm": 14483.0283203125, + "learning_rate": 2.956852791878173e-05, + "loss": 5.8406, + "step": 234 + }, + { + "epoch": 0.059748927379628156, + "grad_norm": 14330.568359375, + "learning_rate": 2.969543147208122e-05, + "loss": 5.843, + "step": 235 + }, + { + "epoch": 0.060003178134435085, + "grad_norm": 14366.0263671875, + "learning_rate": 2.982233502538071e-05, + "loss": 5.8554, + "step": 236 + }, + { + "epoch": 0.060257428889242015, + "grad_norm": 14203.2294921875, + "learning_rate": 2.9949238578680206e-05, + "loss": 5.829, + "step": 237 + }, + { + "epoch": 0.060511679644048945, + "grad_norm": 14442.908203125, + "learning_rate": 3.0076142131979696e-05, + "loss": 5.8308, + "step": 238 + }, + { + "epoch": 0.060765930398855875, + "grad_norm": 14376.8310546875, + "learning_rate": 3.020304568527919e-05, + "loss": 5.8311, + "step": 239 + }, + { + "epoch": 0.0610201811536628, + "grad_norm": 14524.6455078125, + "learning_rate": 3.032994923857868e-05, + "loss": 5.8133, + "step": 240 + }, + { + "epoch": 0.06127443190846973, + "grad_norm": 16003.4541015625, + "learning_rate": 3.0456852791878175e-05, + "loss": 5.8434, + "step": 241 + }, + { + "epoch": 0.06152868266327666, + "grad_norm": 16413.31640625, + "learning_rate": 3.0583756345177666e-05, + "loss": 5.8513, + "step": 242 + }, + { + "epoch": 0.06178293341808359, + "grad_norm": 15272.3359375, + "learning_rate": 3.071065989847716e-05, + "loss": 5.8448, + "step": 243 + }, + { + "epoch": 0.06203718417289052, + "grad_norm": 14435.873046875, + "learning_rate": 3.0837563451776654e-05, + "loss": 5.8461, + "step": 244 + }, + { + "epoch": 0.06229143492769744, + "grad_norm": 16821.875, + "learning_rate": 3.096446700507614e-05, + "loss": 5.8183, + "step": 245 + }, + { + "epoch": 0.06254568568250438, + "grad_norm": 17576.55078125, + "learning_rate": 3.1091370558375635e-05, + "loss": 5.8332, + "step": 246 + }, + { + "epoch": 0.0627999364373113, + "grad_norm": 14018.4365234375, + "learning_rate": 3.121827411167513e-05, + "loss": 5.8088, + "step": 247 + }, + { + "epoch": 0.06305418719211822, + "grad_norm": 21211.966796875, + "learning_rate": 3.134517766497462e-05, + "loss": 5.8292, + "step": 248 + }, + { + "epoch": 0.06330843794692516, + "grad_norm": 18285.978515625, + "learning_rate": 3.147208121827411e-05, + "loss": 5.829, + "step": 249 + }, + { + "epoch": 0.06356268870173208, + "grad_norm": 15779.1826171875, + "learning_rate": 3.1598984771573604e-05, + "loss": 5.8409, + "step": 250 + }, + { + "epoch": 0.06381693945653902, + "grad_norm": 29363.07421875, + "learning_rate": 3.17258883248731e-05, + "loss": 5.8376, + "step": 251 + }, + { + "epoch": 0.06407119021134594, + "grad_norm": 14751.14453125, + "learning_rate": 3.185279187817259e-05, + "loss": 5.8504, + "step": 252 + }, + { + "epoch": 0.06432544096615286, + "grad_norm": 22211.19140625, + "learning_rate": 3.1979695431472086e-05, + "loss": 5.8253, + "step": 253 + }, + { + "epoch": 0.0645796917209598, + "grad_norm": 16582.279296875, + "learning_rate": 3.210659898477157e-05, + "loss": 5.8189, + "step": 254 + }, + { + "epoch": 0.06483394247576672, + "grad_norm": 28961.861328125, + "learning_rate": 3.223350253807107e-05, + "loss": 5.8252, + "step": 255 + }, + { + "epoch": 0.06508819323057366, + "grad_norm": 22928.2421875, + "learning_rate": 3.236040609137056e-05, + "loss": 5.8319, + "step": 256 + }, + { + "epoch": 0.06534244398538058, + "grad_norm": 21412.84765625, + "learning_rate": 3.248730964467005e-05, + "loss": 5.8437, + "step": 257 + }, + { + "epoch": 0.0655966947401875, + "grad_norm": 21025.75, + "learning_rate": 3.261421319796954e-05, + "loss": 5.8195, + "step": 258 + }, + { + "epoch": 0.06585094549499444, + "grad_norm": 14630.908203125, + "learning_rate": 3.2741116751269036e-05, + "loss": 5.8212, + "step": 259 + }, + { + "epoch": 0.06610519624980137, + "grad_norm": 25747.43359375, + "learning_rate": 3.286802030456853e-05, + "loss": 5.8387, + "step": 260 + }, + { + "epoch": 0.0663594470046083, + "grad_norm": 19133.5546875, + "learning_rate": 3.299492385786802e-05, + "loss": 5.8295, + "step": 261 + }, + { + "epoch": 0.06661369775941522, + "grad_norm": 18813.26171875, + "learning_rate": 3.312182741116752e-05, + "loss": 5.8433, + "step": 262 + }, + { + "epoch": 0.06686794851422215, + "grad_norm": 19891.6015625, + "learning_rate": 3.3248730964467006e-05, + "loss": 5.8359, + "step": 263 + }, + { + "epoch": 0.06712219926902908, + "grad_norm": 14633.3525390625, + "learning_rate": 3.33756345177665e-05, + "loss": 5.8482, + "step": 264 + }, + { + "epoch": 0.06737645002383601, + "grad_norm": 22824.740234375, + "learning_rate": 3.3502538071065994e-05, + "loss": 5.8368, + "step": 265 + }, + { + "epoch": 0.06763070077864293, + "grad_norm": 17525.8125, + "learning_rate": 3.362944162436548e-05, + "loss": 5.8235, + "step": 266 + }, + { + "epoch": 0.06788495153344987, + "grad_norm": 16179.4912109375, + "learning_rate": 3.3756345177664975e-05, + "loss": 5.8129, + "step": 267 + }, + { + "epoch": 0.06813920228825679, + "grad_norm": 19221.17578125, + "learning_rate": 3.388324873096447e-05, + "loss": 5.8252, + "step": 268 + }, + { + "epoch": 0.06839345304306373, + "grad_norm": 14516.4716796875, + "learning_rate": 3.401015228426396e-05, + "loss": 5.8114, + "step": 269 + }, + { + "epoch": 0.06864770379787065, + "grad_norm": 18901.623046875, + "learning_rate": 3.413705583756345e-05, + "loss": 5.8357, + "step": 270 + }, + { + "epoch": 0.06890195455267757, + "grad_norm": 17530.751953125, + "learning_rate": 3.4263959390862944e-05, + "loss": 5.8415, + "step": 271 + }, + { + "epoch": 0.06915620530748451, + "grad_norm": 16299.2001953125, + "learning_rate": 3.439086294416244e-05, + "loss": 5.8054, + "step": 272 + }, + { + "epoch": 0.06941045606229143, + "grad_norm": 18370.28515625, + "learning_rate": 3.451776649746193e-05, + "loss": 5.8022, + "step": 273 + }, + { + "epoch": 0.06966470681709837, + "grad_norm": 14381.2001953125, + "learning_rate": 3.4644670050761426e-05, + "loss": 5.8151, + "step": 274 + }, + { + "epoch": 0.06991895757190529, + "grad_norm": 18153.146484375, + "learning_rate": 3.477157360406091e-05, + "loss": 5.8362, + "step": 275 + }, + { + "epoch": 0.07017320832671221, + "grad_norm": 15234.4326171875, + "learning_rate": 3.489847715736041e-05, + "loss": 5.8043, + "step": 276 + }, + { + "epoch": 0.07042745908151915, + "grad_norm": 17595.009765625, + "learning_rate": 3.50253807106599e-05, + "loss": 5.8226, + "step": 277 + }, + { + "epoch": 0.07068170983632607, + "grad_norm": 17185.26171875, + "learning_rate": 3.5152284263959395e-05, + "loss": 5.8155, + "step": 278 + }, + { + "epoch": 0.07093596059113301, + "grad_norm": 16285.2021484375, + "learning_rate": 3.527918781725888e-05, + "loss": 5.8099, + "step": 279 + }, + { + "epoch": 0.07119021134593993, + "grad_norm": 16419.501953125, + "learning_rate": 3.5406091370558376e-05, + "loss": 5.8287, + "step": 280 + }, + { + "epoch": 0.07144446210074686, + "grad_norm": 15122.501953125, + "learning_rate": 3.553299492385787e-05, + "loss": 5.8258, + "step": 281 + }, + { + "epoch": 0.07169871285555379, + "grad_norm": 16796.24609375, + "learning_rate": 3.565989847715736e-05, + "loss": 5.8221, + "step": 282 + }, + { + "epoch": 0.07195296361036072, + "grad_norm": 14491.029296875, + "learning_rate": 3.578680203045686e-05, + "loss": 5.815, + "step": 283 + }, + { + "epoch": 0.07220721436516765, + "grad_norm": 15623.6533203125, + "learning_rate": 3.5913705583756346e-05, + "loss": 5.8259, + "step": 284 + }, + { + "epoch": 0.07246146511997457, + "grad_norm": 14359.7822265625, + "learning_rate": 3.604060913705584e-05, + "loss": 5.8075, + "step": 285 + }, + { + "epoch": 0.0727157158747815, + "grad_norm": 15124.78515625, + "learning_rate": 3.6167512690355334e-05, + "loss": 5.8165, + "step": 286 + }, + { + "epoch": 0.07296996662958843, + "grad_norm": 14818.9287109375, + "learning_rate": 3.629441624365482e-05, + "loss": 5.8249, + "step": 287 + }, + { + "epoch": 0.07322421738439536, + "grad_norm": 16652.857421875, + "learning_rate": 3.6421319796954315e-05, + "loss": 5.818, + "step": 288 + }, + { + "epoch": 0.0734784681392023, + "grad_norm": 14468.9892578125, + "learning_rate": 3.654822335025381e-05, + "loss": 5.8158, + "step": 289 + }, + { + "epoch": 0.07373271889400922, + "grad_norm": 16769.716796875, + "learning_rate": 3.66751269035533e-05, + "loss": 5.8083, + "step": 290 + }, + { + "epoch": 0.07398696964881614, + "grad_norm": 14478.5517578125, + "learning_rate": 3.680203045685279e-05, + "loss": 5.8264, + "step": 291 + }, + { + "epoch": 0.07424122040362308, + "grad_norm": 16448.939453125, + "learning_rate": 3.692893401015229e-05, + "loss": 5.82, + "step": 292 + }, + { + "epoch": 0.07449547115843, + "grad_norm": 14361.9794921875, + "learning_rate": 3.705583756345178e-05, + "loss": 5.8203, + "step": 293 + }, + { + "epoch": 0.07474972191323694, + "grad_norm": 16790.470703125, + "learning_rate": 3.7182741116751265e-05, + "loss": 5.8109, + "step": 294 + }, + { + "epoch": 0.07500397266804386, + "grad_norm": 14581.0, + "learning_rate": 3.7309644670050766e-05, + "loss": 5.8106, + "step": 295 + }, + { + "epoch": 0.07525822342285078, + "grad_norm": 16166.630859375, + "learning_rate": 3.743654822335025e-05, + "loss": 5.822, + "step": 296 + }, + { + "epoch": 0.07551247417765772, + "grad_norm": 14668.458984375, + "learning_rate": 3.756345177664975e-05, + "loss": 5.8238, + "step": 297 + }, + { + "epoch": 0.07576672493246464, + "grad_norm": 15926.828125, + "learning_rate": 3.769035532994924e-05, + "loss": 5.8182, + "step": 298 + }, + { + "epoch": 0.07602097568727158, + "grad_norm": 16209.4697265625, + "learning_rate": 3.7817258883248735e-05, + "loss": 5.8131, + "step": 299 + }, + { + "epoch": 0.0762752264420785, + "grad_norm": 14617.1806640625, + "learning_rate": 3.794416243654822e-05, + "loss": 5.7984, + "step": 300 + }, + { + "epoch": 0.0762752264420785, + "eval_loss": 11.709574699401855, + "eval_runtime": 696.5225, + "eval_samples_per_second": 152.164, + "eval_steps_per_second": 9.512, + "step": 300 + }, + { + "epoch": 0.07652947719688542, + "grad_norm": 16631.537109375, + "learning_rate": 3.8071065989847716e-05, + "loss": 5.8042, + "step": 301 + }, + { + "epoch": 0.07678372795169236, + "grad_norm": 14410.109375, + "learning_rate": 3.819796954314721e-05, + "loss": 5.8009, + "step": 302 + }, + { + "epoch": 0.07703797870649928, + "grad_norm": 16206.8203125, + "learning_rate": 3.83248730964467e-05, + "loss": 5.7978, + "step": 303 + }, + { + "epoch": 0.07729222946130622, + "grad_norm": 15047.525390625, + "learning_rate": 3.84517766497462e-05, + "loss": 5.8107, + "step": 304 + }, + { + "epoch": 0.07754648021611314, + "grad_norm": 14869.10546875, + "learning_rate": 3.8578680203045685e-05, + "loss": 5.8066, + "step": 305 + }, + { + "epoch": 0.07780073097092007, + "grad_norm": 15287.2099609375, + "learning_rate": 3.870558375634518e-05, + "loss": 5.7939, + "step": 306 + }, + { + "epoch": 0.078054981725727, + "grad_norm": 14399.8154296875, + "learning_rate": 3.8832487309644673e-05, + "loss": 5.7954, + "step": 307 + }, + { + "epoch": 0.07830923248053392, + "grad_norm": 14655.740234375, + "learning_rate": 3.895939086294416e-05, + "loss": 5.7993, + "step": 308 + }, + { + "epoch": 0.07856348323534086, + "grad_norm": 15159.48828125, + "learning_rate": 3.9086294416243655e-05, + "loss": 5.8116, + "step": 309 + }, + { + "epoch": 0.07881773399014778, + "grad_norm": 15555.21875, + "learning_rate": 3.921319796954315e-05, + "loss": 5.8081, + "step": 310 + }, + { + "epoch": 0.07907198474495471, + "grad_norm": 15971.390625, + "learning_rate": 3.934010152284264e-05, + "loss": 5.8004, + "step": 311 + }, + { + "epoch": 0.07932623549976164, + "grad_norm": 14717.396484375, + "learning_rate": 3.946700507614213e-05, + "loss": 5.8227, + "step": 312 + }, + { + "epoch": 0.07958048625456857, + "grad_norm": 15724.9521484375, + "learning_rate": 3.959390862944163e-05, + "loss": 5.7976, + "step": 313 + }, + { + "epoch": 0.0798347370093755, + "grad_norm": 15999.9755859375, + "learning_rate": 3.972081218274112e-05, + "loss": 5.8175, + "step": 314 + }, + { + "epoch": 0.08008898776418243, + "grad_norm": 14571.6796875, + "learning_rate": 3.9847715736040605e-05, + "loss": 5.7995, + "step": 315 + }, + { + "epoch": 0.08034323851898935, + "grad_norm": 15078.56640625, + "learning_rate": 3.9974619289340106e-05, + "loss": 5.8054, + "step": 316 + }, + { + "epoch": 0.08059748927379629, + "grad_norm": 16242.9296875, + "learning_rate": 4.010152284263959e-05, + "loss": 5.8063, + "step": 317 + }, + { + "epoch": 0.08085174002860321, + "grad_norm": 15402.4443359375, + "learning_rate": 4.022842639593909e-05, + "loss": 5.8046, + "step": 318 + }, + { + "epoch": 0.08110599078341015, + "grad_norm": 14665.2529296875, + "learning_rate": 4.035532994923858e-05, + "loss": 5.8062, + "step": 319 + }, + { + "epoch": 0.08136024153821707, + "grad_norm": 15843.931640625, + "learning_rate": 4.0482233502538075e-05, + "loss": 5.8005, + "step": 320 + }, + { + "epoch": 0.08161449229302399, + "grad_norm": 16756.453125, + "learning_rate": 4.060913705583756e-05, + "loss": 5.7895, + "step": 321 + }, + { + "epoch": 0.08186874304783093, + "grad_norm": 16084.455078125, + "learning_rate": 4.073604060913706e-05, + "loss": 5.8098, + "step": 322 + }, + { + "epoch": 0.08212299380263785, + "grad_norm": 14420.24609375, + "learning_rate": 4.086294416243655e-05, + "loss": 5.7863, + "step": 323 + }, + { + "epoch": 0.08237724455744477, + "grad_norm": 15253.8271484375, + "learning_rate": 4.098984771573604e-05, + "loss": 5.7787, + "step": 324 + }, + { + "epoch": 0.08263149531225171, + "grad_norm": 18756.3671875, + "learning_rate": 4.111675126903554e-05, + "loss": 5.8293, + "step": 325 + }, + { + "epoch": 0.08288574606705863, + "grad_norm": 17335.005859375, + "learning_rate": 4.1243654822335025e-05, + "loss": 5.8052, + "step": 326 + }, + { + "epoch": 0.08313999682186557, + "grad_norm": 14691.3466796875, + "learning_rate": 4.137055837563452e-05, + "loss": 5.7827, + "step": 327 + }, + { + "epoch": 0.08339424757667249, + "grad_norm": 15806.7529296875, + "learning_rate": 4.1497461928934013e-05, + "loss": 5.7762, + "step": 328 + }, + { + "epoch": 0.08364849833147942, + "grad_norm": 22169.841796875, + "learning_rate": 4.162436548223351e-05, + "loss": 5.778, + "step": 329 + }, + { + "epoch": 0.08390274908628635, + "grad_norm": 21508.794921875, + "learning_rate": 4.1751269035532995e-05, + "loss": 5.8074, + "step": 330 + }, + { + "epoch": 0.08415699984109327, + "grad_norm": 15525.013671875, + "learning_rate": 4.187817258883249e-05, + "loss": 5.7905, + "step": 331 + }, + { + "epoch": 0.08441125059590021, + "grad_norm": 14844.3486328125, + "learning_rate": 4.200507614213198e-05, + "loss": 5.7763, + "step": 332 + }, + { + "epoch": 0.08466550135070713, + "grad_norm": 16572.13671875, + "learning_rate": 4.213197969543147e-05, + "loss": 5.781, + "step": 333 + }, + { + "epoch": 0.08491975210551406, + "grad_norm": 18925.337890625, + "learning_rate": 4.225888324873097e-05, + "loss": 5.7868, + "step": 334 + }, + { + "epoch": 0.085174002860321, + "grad_norm": 19097.6875, + "learning_rate": 4.238578680203046e-05, + "loss": 5.7849, + "step": 335 + }, + { + "epoch": 0.08542825361512792, + "grad_norm": 18242.955078125, + "learning_rate": 4.251269035532995e-05, + "loss": 5.7796, + "step": 336 + }, + { + "epoch": 0.08568250436993485, + "grad_norm": 19598.6796875, + "learning_rate": 4.2639593908629446e-05, + "loss": 5.7923, + "step": 337 + }, + { + "epoch": 0.08593675512474178, + "grad_norm": 19591.587890625, + "learning_rate": 4.276649746192893e-05, + "loss": 5.7912, + "step": 338 + }, + { + "epoch": 0.0861910058795487, + "grad_norm": 15452.560546875, + "learning_rate": 4.289340101522843e-05, + "loss": 5.7846, + "step": 339 + }, + { + "epoch": 0.08644525663435564, + "grad_norm": 15059.263671875, + "learning_rate": 4.302030456852792e-05, + "loss": 5.7758, + "step": 340 + }, + { + "epoch": 0.08669950738916256, + "grad_norm": 21027.1640625, + "learning_rate": 4.3147208121827415e-05, + "loss": 5.8121, + "step": 341 + }, + { + "epoch": 0.0869537581439695, + "grad_norm": 19786.572265625, + "learning_rate": 4.32741116751269e-05, + "loss": 5.7721, + "step": 342 + }, + { + "epoch": 0.08720800889877642, + "grad_norm": 14951.8037109375, + "learning_rate": 4.34010152284264e-05, + "loss": 5.782, + "step": 343 + }, + { + "epoch": 0.08746225965358334, + "grad_norm": 22819.787109375, + "learning_rate": 4.352791878172589e-05, + "loss": 5.7658, + "step": 344 + }, + { + "epoch": 0.08771651040839028, + "grad_norm": 21210.58984375, + "learning_rate": 4.365482233502538e-05, + "loss": 5.7992, + "step": 345 + }, + { + "epoch": 0.0879707611631972, + "grad_norm": 14568.482421875, + "learning_rate": 4.378172588832488e-05, + "loss": 5.7944, + "step": 346 + }, + { + "epoch": 0.08822501191800414, + "grad_norm": 21739.427734375, + "learning_rate": 4.3908629441624365e-05, + "loss": 5.7758, + "step": 347 + }, + { + "epoch": 0.08847926267281106, + "grad_norm": 20614.376953125, + "learning_rate": 4.403553299492386e-05, + "loss": 5.7728, + "step": 348 + }, + { + "epoch": 0.08873351342761798, + "grad_norm": 14787.8486328125, + "learning_rate": 4.416243654822335e-05, + "loss": 5.786, + "step": 349 + }, + { + "epoch": 0.08898776418242492, + "grad_norm": 17875.984375, + "learning_rate": 4.428934010152285e-05, + "loss": 5.7926, + "step": 350 + }, + { + "epoch": 0.08924201493723184, + "grad_norm": 22780.046875, + "learning_rate": 4.4416243654822335e-05, + "loss": 5.7632, + "step": 351 + }, + { + "epoch": 0.08949626569203878, + "grad_norm": 15378.287109375, + "learning_rate": 4.454314720812183e-05, + "loss": 5.7809, + "step": 352 + }, + { + "epoch": 0.0897505164468457, + "grad_norm": 18587.49609375, + "learning_rate": 4.467005076142132e-05, + "loss": 5.7907, + "step": 353 + }, + { + "epoch": 0.09000476720165262, + "grad_norm": 25359.814453125, + "learning_rate": 4.479695431472081e-05, + "loss": 5.7719, + "step": 354 + }, + { + "epoch": 0.09025901795645956, + "grad_norm": 15609.775390625, + "learning_rate": 4.492385786802031e-05, + "loss": 5.784, + "step": 355 + }, + { + "epoch": 0.09051326871126648, + "grad_norm": 16369.25, + "learning_rate": 4.50507614213198e-05, + "loss": 5.7869, + "step": 356 + }, + { + "epoch": 0.09076751946607342, + "grad_norm": 23553.107421875, + "learning_rate": 4.517766497461929e-05, + "loss": 5.76, + "step": 357 + }, + { + "epoch": 0.09102177022088034, + "grad_norm": 16909.330078125, + "learning_rate": 4.5304568527918786e-05, + "loss": 5.769, + "step": 358 + }, + { + "epoch": 0.09127602097568727, + "grad_norm": 14716.7490234375, + "learning_rate": 4.543147208121827e-05, + "loss": 5.7636, + "step": 359 + }, + { + "epoch": 0.0915302717304942, + "grad_norm": 15915.0244140625, + "learning_rate": 4.555837563451777e-05, + "loss": 5.7797, + "step": 360 + }, + { + "epoch": 0.09178452248530113, + "grad_norm": 20231.58203125, + "learning_rate": 4.568527918781726e-05, + "loss": 5.7885, + "step": 361 + }, + { + "epoch": 0.09203877324010806, + "grad_norm": 19425.763671875, + "learning_rate": 4.5812182741116755e-05, + "loss": 5.7585, + "step": 362 + }, + { + "epoch": 0.09229302399491499, + "grad_norm": 15632.22265625, + "learning_rate": 4.593908629441624e-05, + "loss": 5.7596, + "step": 363 + }, + { + "epoch": 0.09254727474972191, + "grad_norm": 14821.1826171875, + "learning_rate": 4.606598984771574e-05, + "loss": 5.7629, + "step": 364 + }, + { + "epoch": 0.09280152550452885, + "grad_norm": 14788.517578125, + "learning_rate": 4.619289340101523e-05, + "loss": 5.7517, + "step": 365 + }, + { + "epoch": 0.09305577625933577, + "grad_norm": 16733.14453125, + "learning_rate": 4.631979695431472e-05, + "loss": 5.7765, + "step": 366 + }, + { + "epoch": 0.0933100270141427, + "grad_norm": 17748.39453125, + "learning_rate": 4.644670050761422e-05, + "loss": 5.7574, + "step": 367 + }, + { + "epoch": 0.09356427776894963, + "grad_norm": 17039.28125, + "learning_rate": 4.6573604060913705e-05, + "loss": 5.7625, + "step": 368 + }, + { + "epoch": 0.09381852852375655, + "grad_norm": 17125.658203125, + "learning_rate": 4.67005076142132e-05, + "loss": 5.7632, + "step": 369 + }, + { + "epoch": 0.09407277927856349, + "grad_norm": 19509.609375, + "learning_rate": 4.682741116751269e-05, + "loss": 5.7818, + "step": 370 + }, + { + "epoch": 0.09432703003337041, + "grad_norm": 19239.0546875, + "learning_rate": 4.695431472081219e-05, + "loss": 5.7627, + "step": 371 + }, + { + "epoch": 0.09458128078817735, + "grad_norm": 17571.00390625, + "learning_rate": 4.7081218274111674e-05, + "loss": 5.7619, + "step": 372 + }, + { + "epoch": 0.09483553154298427, + "grad_norm": 16918.115234375, + "learning_rate": 4.7208121827411175e-05, + "loss": 5.7548, + "step": 373 + }, + { + "epoch": 0.09508978229779119, + "grad_norm": 19990.57421875, + "learning_rate": 4.733502538071066e-05, + "loss": 5.7612, + "step": 374 + }, + { + "epoch": 0.09534403305259813, + "grad_norm": 23999.90234375, + "learning_rate": 4.746192893401015e-05, + "loss": 5.7643, + "step": 375 + }, + { + "epoch": 0.09559828380740505, + "grad_norm": 19003.72265625, + "learning_rate": 4.758883248730965e-05, + "loss": 5.7416, + "step": 376 + }, + { + "epoch": 0.09585253456221199, + "grad_norm": 16454.0859375, + "learning_rate": 4.771573604060914e-05, + "loss": 5.7641, + "step": 377 + }, + { + "epoch": 0.09610678531701891, + "grad_norm": 17433.04296875, + "learning_rate": 4.784263959390863e-05, + "loss": 5.7706, + "step": 378 + }, + { + "epoch": 0.09636103607182583, + "grad_norm": 20504.951171875, + "learning_rate": 4.7969543147208126e-05, + "loss": 5.7641, + "step": 379 + }, + { + "epoch": 0.09661528682663277, + "grad_norm": 18765.4765625, + "learning_rate": 4.809644670050762e-05, + "loss": 5.7462, + "step": 380 + }, + { + "epoch": 0.0968695375814397, + "grad_norm": 16072.2783203125, + "learning_rate": 4.822335025380711e-05, + "loss": 5.7729, + "step": 381 + }, + { + "epoch": 0.09712378833624662, + "grad_norm": 15258.67578125, + "learning_rate": 4.83502538071066e-05, + "loss": 5.7606, + "step": 382 + }, + { + "epoch": 0.09737803909105355, + "grad_norm": 14945.3935546875, + "learning_rate": 4.8477157360406095e-05, + "loss": 5.7608, + "step": 383 + }, + { + "epoch": 0.09763228984586048, + "grad_norm": 15470.05859375, + "learning_rate": 4.860406091370558e-05, + "loss": 5.7531, + "step": 384 + }, + { + "epoch": 0.09788654060066741, + "grad_norm": 20388.533203125, + "learning_rate": 4.873096446700508e-05, + "loss": 5.7696, + "step": 385 + }, + { + "epoch": 0.09814079135547434, + "grad_norm": 28126.078125, + "learning_rate": 4.885786802030457e-05, + "loss": 5.7731, + "step": 386 + }, + { + "epoch": 0.09839504211028126, + "grad_norm": 15670.330078125, + "learning_rate": 4.8984771573604064e-05, + "loss": 5.7566, + "step": 387 + }, + { + "epoch": 0.0986492928650882, + "grad_norm": 17388.93359375, + "learning_rate": 4.911167512690356e-05, + "loss": 5.7731, + "step": 388 + }, + { + "epoch": 0.09890354361989512, + "grad_norm": 28166.693359375, + "learning_rate": 4.9238578680203045e-05, + "loss": 5.7375, + "step": 389 + }, + { + "epoch": 0.09915779437470205, + "grad_norm": 17543.197265625, + "learning_rate": 4.936548223350254e-05, + "loss": 5.7589, + "step": 390 + }, + { + "epoch": 0.09941204512950898, + "grad_norm": 15108.068359375, + "learning_rate": 4.949238578680203e-05, + "loss": 5.7438, + "step": 391 + }, + { + "epoch": 0.0996662958843159, + "grad_norm": 14917.2275390625, + "learning_rate": 4.961928934010153e-05, + "loss": 5.7403, + "step": 392 + }, + { + "epoch": 0.09992054663912284, + "grad_norm": 14791.08984375, + "learning_rate": 4.9746192893401014e-05, + "loss": 5.7605, + "step": 393 + }, + { + "epoch": 0.10017479739392976, + "grad_norm": 16332.1171875, + "learning_rate": 4.9873096446700515e-05, + "loss": 5.755, + "step": 394 + }, + { + "epoch": 0.1004290481487367, + "grad_norm": 24996.771484375, + "learning_rate": 5e-05, + "loss": 5.7636, + "step": 395 + }, + { + "epoch": 0.10068329890354362, + "grad_norm": 28145.7265625, + "learning_rate": 4.999999014971934e-05, + "loss": 5.7464, + "step": 396 + }, + { + "epoch": 0.10093754965835054, + "grad_norm": 18033.251953125, + "learning_rate": 4.999996059888515e-05, + "loss": 5.7638, + "step": 397 + }, + { + "epoch": 0.10119180041315748, + "grad_norm": 17646.732421875, + "learning_rate": 4.999991134752069e-05, + "loss": 5.7582, + "step": 398 + }, + { + "epoch": 0.1014460511679644, + "grad_norm": 24271.55078125, + "learning_rate": 4.9999842395664773e-05, + "loss": 5.7309, + "step": 399 + }, + { + "epoch": 0.10170030192277134, + "grad_norm": 28549.97265625, + "learning_rate": 4.999975374337176e-05, + "loss": 5.7474, + "step": 400 + }, + { + "epoch": 0.10170030192277134, + "eval_loss": 11.57848072052002, + "eval_runtime": 697.3055, + "eval_samples_per_second": 151.994, + "eval_steps_per_second": 9.501, + "step": 400 + }, + { + "epoch": 0.10195455267757826, + "grad_norm": 17738.744140625, + "learning_rate": 4.999964539071148e-05, + "loss": 5.7357, + "step": 401 + }, + { + "epoch": 0.10220880343238518, + "grad_norm": 15361.2373046875, + "learning_rate": 4.999951733776933e-05, + "loss": 5.7243, + "step": 402 + }, + { + "epoch": 0.10246305418719212, + "grad_norm": 15122.3681640625, + "learning_rate": 4.9999369584646226e-05, + "loss": 5.7351, + "step": 403 + }, + { + "epoch": 0.10271730494199904, + "grad_norm": 15341.8037109375, + "learning_rate": 4.99992021314586e-05, + "loss": 5.7437, + "step": 404 + }, + { + "epoch": 0.10297155569680598, + "grad_norm": 19395.509765625, + "learning_rate": 4.99990149783384e-05, + "loss": 5.7491, + "step": 405 + }, + { + "epoch": 0.1032258064516129, + "grad_norm": 26739.3359375, + "learning_rate": 4.9998808125433106e-05, + "loss": 5.7481, + "step": 406 + }, + { + "epoch": 0.10348005720641983, + "grad_norm": 17126.6171875, + "learning_rate": 4.9998581572905724e-05, + "loss": 5.7338, + "step": 407 + }, + { + "epoch": 0.10373430796122676, + "grad_norm": 15103.6669921875, + "learning_rate": 4.9998335320934795e-05, + "loss": 5.7518, + "step": 408 + }, + { + "epoch": 0.10398855871603369, + "grad_norm": 16252.2412109375, + "learning_rate": 4.999806936971435e-05, + "loss": 5.7407, + "step": 409 + }, + { + "epoch": 0.10424280947084062, + "grad_norm": 24540.556640625, + "learning_rate": 4.999778371945399e-05, + "loss": 5.7342, + "step": 410 + }, + { + "epoch": 0.10449706022564755, + "grad_norm": 24937.2578125, + "learning_rate": 4.9997478370378794e-05, + "loss": 5.7225, + "step": 411 + }, + { + "epoch": 0.10475131098045447, + "grad_norm": 17110.91796875, + "learning_rate": 4.9997153322729386e-05, + "loss": 5.7338, + "step": 412 + }, + { + "epoch": 0.1050055617352614, + "grad_norm": 16506.455078125, + "learning_rate": 4.999680857676192e-05, + "loss": 5.7277, + "step": 413 + }, + { + "epoch": 0.10525981249006833, + "grad_norm": 24584.044921875, + "learning_rate": 4.9996444132748055e-05, + "loss": 5.7373, + "step": 414 + }, + { + "epoch": 0.10551406324487526, + "grad_norm": 26646.958984375, + "learning_rate": 4.9996059990974984e-05, + "loss": 5.7243, + "step": 415 + }, + { + "epoch": 0.10576831399968219, + "grad_norm": 15127.9755859375, + "learning_rate": 4.999565615174542e-05, + "loss": 5.7263, + "step": 416 + }, + { + "epoch": 0.10602256475448911, + "grad_norm": 22097.779296875, + "learning_rate": 4.99952326153776e-05, + "loss": 5.7323, + "step": 417 + }, + { + "epoch": 0.10627681550929605, + "grad_norm": 32168.458984375, + "learning_rate": 4.9994789382205275e-05, + "loss": 5.7075, + "step": 418 + }, + { + "epoch": 0.10653106626410297, + "grad_norm": 15955.8134765625, + "learning_rate": 4.9994326452577735e-05, + "loss": 5.7404, + "step": 419 + }, + { + "epoch": 0.1067853170189099, + "grad_norm": 51767.4296875, + "learning_rate": 4.999384382685975e-05, + "loss": 5.7317, + "step": 420 + }, + { + "epoch": 0.10703956777371683, + "grad_norm": 18034.890625, + "learning_rate": 4.9993341505431675e-05, + "loss": 5.7169, + "step": 421 + }, + { + "epoch": 0.10729381852852375, + "grad_norm": 40395.14453125, + "learning_rate": 4.999281948868932e-05, + "loss": 5.7429, + "step": 422 + }, + { + "epoch": 0.10754806928333069, + "grad_norm": 38049.34765625, + "learning_rate": 4.9992277777044075e-05, + "loss": 5.7306, + "step": 423 + }, + { + "epoch": 0.10780232003813761, + "grad_norm": 24377.923828125, + "learning_rate": 4.9991716370922804e-05, + "loss": 5.7364, + "step": 424 + }, + { + "epoch": 0.10805657079294455, + "grad_norm": 31733.537109375, + "learning_rate": 4.9991135270767904e-05, + "loss": 5.729, + "step": 425 + }, + { + "epoch": 0.10831082154775147, + "grad_norm": 24497.560546875, + "learning_rate": 4.9990534477037296e-05, + "loss": 5.7321, + "step": 426 + }, + { + "epoch": 0.1085650723025584, + "grad_norm": 21727.189453125, + "learning_rate": 4.9989913990204436e-05, + "loss": 5.7215, + "step": 427 + }, + { + "epoch": 0.10881932305736533, + "grad_norm": 36596.0859375, + "learning_rate": 4.9989273810758265e-05, + "loss": 5.72, + "step": 428 + }, + { + "epoch": 0.10907357381217225, + "grad_norm": 26537.92578125, + "learning_rate": 4.998861393920326e-05, + "loss": 5.7302, + "step": 429 + }, + { + "epoch": 0.10932782456697919, + "grad_norm": 32205.70703125, + "learning_rate": 4.998793437605942e-05, + "loss": 5.7199, + "step": 430 + }, + { + "epoch": 0.10958207532178611, + "grad_norm": 28683.19140625, + "learning_rate": 4.9987235121862255e-05, + "loss": 5.7394, + "step": 431 + }, + { + "epoch": 0.10983632607659304, + "grad_norm": 17488.791015625, + "learning_rate": 4.998651617716279e-05, + "loss": 5.7219, + "step": 432 + }, + { + "epoch": 0.11009057683139997, + "grad_norm": 27071.734375, + "learning_rate": 4.9985777542527566e-05, + "loss": 5.7032, + "step": 433 + }, + { + "epoch": 0.1103448275862069, + "grad_norm": 16953.912109375, + "learning_rate": 4.9985019218538656e-05, + "loss": 5.7104, + "step": 434 + }, + { + "epoch": 0.11059907834101383, + "grad_norm": 18923.18359375, + "learning_rate": 4.998424120579363e-05, + "loss": 5.7327, + "step": 435 + }, + { + "epoch": 0.11085332909582075, + "grad_norm": 18261.357421875, + "learning_rate": 4.998344350490558e-05, + "loss": 5.7221, + "step": 436 + }, + { + "epoch": 0.11110757985062768, + "grad_norm": 15947.138671875, + "learning_rate": 4.99826261165031e-05, + "loss": 5.7246, + "step": 437 + }, + { + "epoch": 0.11136183060543461, + "grad_norm": 18423.921875, + "learning_rate": 4.998178904123033e-05, + "loss": 5.7102, + "step": 438 + }, + { + "epoch": 0.11161608136024154, + "grad_norm": 15425.244140625, + "learning_rate": 4.99809322797469e-05, + "loss": 5.7016, + "step": 439 + }, + { + "epoch": 0.11187033211504846, + "grad_norm": 22775.279296875, + "learning_rate": 4.9980055832727946e-05, + "loss": 5.7221, + "step": 440 + }, + { + "epoch": 0.1121245828698554, + "grad_norm": 15630.93359375, + "learning_rate": 4.997915970086413e-05, + "loss": 5.7345, + "step": 441 + }, + { + "epoch": 0.11237883362466232, + "grad_norm": 16381.5498046875, + "learning_rate": 4.9978243884861635e-05, + "loss": 5.7293, + "step": 442 + }, + { + "epoch": 0.11263308437946926, + "grad_norm": 16565.13671875, + "learning_rate": 4.997730838544214e-05, + "loss": 5.694, + "step": 443 + }, + { + "epoch": 0.11288733513427618, + "grad_norm": 15240.77734375, + "learning_rate": 4.997635320334283e-05, + "loss": 5.7241, + "step": 444 + }, + { + "epoch": 0.1131415858890831, + "grad_norm": 15775.3681640625, + "learning_rate": 4.9975378339316434e-05, + "loss": 5.7336, + "step": 445 + }, + { + "epoch": 0.11339583664389004, + "grad_norm": 15154.955078125, + "learning_rate": 4.997438379413114e-05, + "loss": 5.726, + "step": 446 + }, + { + "epoch": 0.11365008739869696, + "grad_norm": 15382.392578125, + "learning_rate": 4.997336956857068e-05, + "loss": 5.7209, + "step": 447 + }, + { + "epoch": 0.1139043381535039, + "grad_norm": 15969.6162109375, + "learning_rate": 4.997233566343429e-05, + "loss": 5.7175, + "step": 448 + }, + { + "epoch": 0.11415858890831082, + "grad_norm": 15737.732421875, + "learning_rate": 4.997128207953671e-05, + "loss": 5.6985, + "step": 449 + }, + { + "epoch": 0.11441283966311774, + "grad_norm": 15486.15234375, + "learning_rate": 4.99702088177082e-05, + "loss": 5.7084, + "step": 450 + }, + { + "epoch": 0.11466709041792468, + "grad_norm": 15467.587890625, + "learning_rate": 4.9969115878794484e-05, + "loss": 5.7265, + "step": 451 + }, + { + "epoch": 0.1149213411727316, + "grad_norm": 15145.166015625, + "learning_rate": 4.996800326365685e-05, + "loss": 5.7209, + "step": 452 + }, + { + "epoch": 0.11517559192753854, + "grad_norm": 15714.81640625, + "learning_rate": 4.9966870973172046e-05, + "loss": 5.7124, + "step": 453 + }, + { + "epoch": 0.11542984268234546, + "grad_norm": 18364.576171875, + "learning_rate": 4.996571900823236e-05, + "loss": 5.701, + "step": 454 + }, + { + "epoch": 0.11568409343715239, + "grad_norm": 16856.05859375, + "learning_rate": 4.996454736974555e-05, + "loss": 5.7009, + "step": 455 + }, + { + "epoch": 0.11593834419195932, + "grad_norm": 15237.99609375, + "learning_rate": 4.9963356058634903e-05, + "loss": 5.7102, + "step": 456 + }, + { + "epoch": 0.11619259494676625, + "grad_norm": 16812.8671875, + "learning_rate": 4.996214507583919e-05, + "loss": 5.6973, + "step": 457 + }, + { + "epoch": 0.11644684570157318, + "grad_norm": 17960.55078125, + "learning_rate": 4.99609144223127e-05, + "loss": 5.7145, + "step": 458 + }, + { + "epoch": 0.1167010964563801, + "grad_norm": 16421.55078125, + "learning_rate": 4.9959664099025216e-05, + "loss": 5.7148, + "step": 459 + }, + { + "epoch": 0.11695534721118703, + "grad_norm": 15507.2197265625, + "learning_rate": 4.995839410696202e-05, + "loss": 5.6977, + "step": 460 + }, + { + "epoch": 0.11720959796599396, + "grad_norm": 18020.625, + "learning_rate": 4.995710444712389e-05, + "loss": 5.7285, + "step": 461 + }, + { + "epoch": 0.11746384872080089, + "grad_norm": 18671.88671875, + "learning_rate": 4.995579512052712e-05, + "loss": 5.7145, + "step": 462 + }, + { + "epoch": 0.11771809947560782, + "grad_norm": 15958.7607421875, + "learning_rate": 4.995446612820346e-05, + "loss": 5.7144, + "step": 463 + }, + { + "epoch": 0.11797235023041475, + "grad_norm": 16075.9072265625, + "learning_rate": 4.9953117471200215e-05, + "loss": 5.7147, + "step": 464 + }, + { + "epoch": 0.11822660098522167, + "grad_norm": 21117.54296875, + "learning_rate": 4.995174915058015e-05, + "loss": 5.6987, + "step": 465 + }, + { + "epoch": 0.1184808517400286, + "grad_norm": 21167.033203125, + "learning_rate": 4.9950361167421526e-05, + "loss": 5.6985, + "step": 466 + }, + { + "epoch": 0.11873510249483553, + "grad_norm": 15609.7109375, + "learning_rate": 4.994895352281811e-05, + "loss": 5.7058, + "step": 467 + }, + { + "epoch": 0.11898935324964247, + "grad_norm": 19494.00390625, + "learning_rate": 4.994752621787915e-05, + "loss": 5.6978, + "step": 468 + }, + { + "epoch": 0.11924360400444939, + "grad_norm": 23788.21875, + "learning_rate": 4.9946079253729406e-05, + "loss": 5.6764, + "step": 469 + }, + { + "epoch": 0.11949785475925631, + "grad_norm": 15217.6826171875, + "learning_rate": 4.99446126315091e-05, + "loss": 5.6901, + "step": 470 + }, + { + "epoch": 0.11975210551406325, + "grad_norm": 34201.64453125, + "learning_rate": 4.9943126352373984e-05, + "loss": 5.6901, + "step": 471 + }, + { + "epoch": 0.12000635626887017, + "grad_norm": 16188.8125, + "learning_rate": 4.994162041749527e-05, + "loss": 5.7206, + "step": 472 + }, + { + "epoch": 0.12026060702367711, + "grad_norm": 19348.451171875, + "learning_rate": 4.994009482805967e-05, + "loss": 5.6751, + "step": 473 + }, + { + "epoch": 0.12051485777848403, + "grad_norm": 26459.375, + "learning_rate": 4.993854958526938e-05, + "loss": 5.6943, + "step": 474 + }, + { + "epoch": 0.12076910853329095, + "grad_norm": 15800.505859375, + "learning_rate": 4.9936984690342094e-05, + "loss": 5.6905, + "step": 475 + }, + { + "epoch": 0.12102335928809789, + "grad_norm": 17625.212890625, + "learning_rate": 4.9935400144510966e-05, + "loss": 5.6914, + "step": 476 + }, + { + "epoch": 0.12127761004290481, + "grad_norm": 27727.19140625, + "learning_rate": 4.993379594902468e-05, + "loss": 5.7044, + "step": 477 + }, + { + "epoch": 0.12153186079771175, + "grad_norm": 17069.87109375, + "learning_rate": 4.993217210514734e-05, + "loss": 5.6939, + "step": 478 + }, + { + "epoch": 0.12178611155251867, + "grad_norm": 15412.1865234375, + "learning_rate": 4.993052861415862e-05, + "loss": 5.7013, + "step": 479 + }, + { + "epoch": 0.1220403623073256, + "grad_norm": 20495.89453125, + "learning_rate": 4.992886547735359e-05, + "loss": 5.7095, + "step": 480 + }, + { + "epoch": 0.12229461306213253, + "grad_norm": 20597.22265625, + "learning_rate": 4.9927182696042856e-05, + "loss": 5.6795, + "step": 481 + }, + { + "epoch": 0.12254886381693945, + "grad_norm": 15255.236328125, + "learning_rate": 4.992548027155248e-05, + "loss": 5.6751, + "step": 482 + }, + { + "epoch": 0.12280311457174639, + "grad_norm": 25595.765625, + "learning_rate": 4.9923758205224025e-05, + "loss": 5.6862, + "step": 483 + }, + { + "epoch": 0.12305736532655331, + "grad_norm": 19672.923828125, + "learning_rate": 4.99220164984145e-05, + "loss": 5.6854, + "step": 484 + }, + { + "epoch": 0.12331161608136024, + "grad_norm": 15898.150390625, + "learning_rate": 4.992025515249642e-05, + "loss": 5.7032, + "step": 485 + }, + { + "epoch": 0.12356586683616717, + "grad_norm": 25745.66015625, + "learning_rate": 4.9918474168857755e-05, + "loss": 5.6798, + "step": 486 + }, + { + "epoch": 0.1238201175909741, + "grad_norm": 18262.912109375, + "learning_rate": 4.991667354890196e-05, + "loss": 5.6727, + "step": 487 + }, + { + "epoch": 0.12407436834578103, + "grad_norm": 15679.765625, + "learning_rate": 4.9914853294047986e-05, + "loss": 5.6857, + "step": 488 + }, + { + "epoch": 0.12432861910058796, + "grad_norm": 22240.96875, + "learning_rate": 4.9913013405730215e-05, + "loss": 5.6853, + "step": 489 + }, + { + "epoch": 0.12458286985539488, + "grad_norm": 19653.525390625, + "learning_rate": 4.991115388539852e-05, + "loss": 5.6615, + "step": 490 + }, + { + "epoch": 0.12483712061020182, + "grad_norm": 15283.314453125, + "learning_rate": 4.990927473451825e-05, + "loss": 5.6852, + "step": 491 + }, + { + "epoch": 0.12509137136500875, + "grad_norm": 22386.185546875, + "learning_rate": 4.9907375954570225e-05, + "loss": 5.6877, + "step": 492 + }, + { + "epoch": 0.12534562211981568, + "grad_norm": 20030.65625, + "learning_rate": 4.990545754705071e-05, + "loss": 5.6858, + "step": 493 + }, + { + "epoch": 0.1255998728746226, + "grad_norm": 15557.42578125, + "learning_rate": 4.990351951347147e-05, + "loss": 5.6791, + "step": 494 + }, + { + "epoch": 0.12585412362942952, + "grad_norm": 22766.5703125, + "learning_rate": 4.9901561855359705e-05, + "loss": 5.6886, + "step": 495 + }, + { + "epoch": 0.12610837438423644, + "grad_norm": 20646.341796875, + "learning_rate": 4.98995845742581e-05, + "loss": 5.6811, + "step": 496 + }, + { + "epoch": 0.1263626251390434, + "grad_norm": 16207.798828125, + "learning_rate": 4.989758767172479e-05, + "loss": 5.6696, + "step": 497 + }, + { + "epoch": 0.12661687589385032, + "grad_norm": 29511.078125, + "learning_rate": 4.989557114933339e-05, + "loss": 5.6652, + "step": 498 + }, + { + "epoch": 0.12687112664865724, + "grad_norm": 15871.7978515625, + "learning_rate": 4.989353500867296e-05, + "loss": 5.6692, + "step": 499 + }, + { + "epoch": 0.12712537740346416, + "grad_norm": 18247.537109375, + "learning_rate": 4.9891479251348026e-05, + "loss": 5.6786, + "step": 500 + }, + { + "epoch": 0.12712537740346416, + "eval_loss": 11.436968803405762, + "eval_runtime": 696.683, + "eval_samples_per_second": 152.129, + "eval_steps_per_second": 9.509, + "step": 500 + }, + { + "epoch": 0.12737962815827109, + "grad_norm": 21737.20703125, + "learning_rate": 4.988940387897857e-05, + "loss": 5.66, + "step": 501 + }, + { + "epoch": 0.12763387891307804, + "grad_norm": 16418.75, + "learning_rate": 4.988730889320004e-05, + "loss": 5.6799, + "step": 502 + }, + { + "epoch": 0.12788812966788496, + "grad_norm": 15836.2626953125, + "learning_rate": 4.9885194295663306e-05, + "loss": 5.6765, + "step": 503 + }, + { + "epoch": 0.12814238042269188, + "grad_norm": 21452.65234375, + "learning_rate": 4.988306008803475e-05, + "loss": 5.6771, + "step": 504 + }, + { + "epoch": 0.1283966311774988, + "grad_norm": 20613.23828125, + "learning_rate": 4.988090627199615e-05, + "loss": 5.6607, + "step": 505 + }, + { + "epoch": 0.12865088193230573, + "grad_norm": 15403.5283203125, + "learning_rate": 4.987873284924478e-05, + "loss": 5.6664, + "step": 506 + }, + { + "epoch": 0.12890513268711268, + "grad_norm": 18133.78515625, + "learning_rate": 4.987653982149334e-05, + "loss": 5.6795, + "step": 507 + }, + { + "epoch": 0.1291593834419196, + "grad_norm": 20240.896484375, + "learning_rate": 4.987432719046998e-05, + "loss": 5.6629, + "step": 508 + }, + { + "epoch": 0.12941363419672652, + "grad_norm": 15481.91015625, + "learning_rate": 4.987209495791831e-05, + "loss": 5.6612, + "step": 509 + }, + { + "epoch": 0.12966788495153345, + "grad_norm": 27840.04296875, + "learning_rate": 4.9869843125597374e-05, + "loss": 5.6792, + "step": 510 + }, + { + "epoch": 0.12992213570634037, + "grad_norm": 19821.810546875, + "learning_rate": 4.9867571695281666e-05, + "loss": 5.664, + "step": 511 + }, + { + "epoch": 0.13017638646114732, + "grad_norm": 16410.453125, + "learning_rate": 4.986528066876113e-05, + "loss": 5.6731, + "step": 512 + }, + { + "epoch": 0.13043063721595424, + "grad_norm": 29050.4296875, + "learning_rate": 4.9862970047841144e-05, + "loss": 5.6657, + "step": 513 + }, + { + "epoch": 0.13068488797076117, + "grad_norm": 16172.8349609375, + "learning_rate": 4.9860639834342525e-05, + "loss": 5.6749, + "step": 514 + }, + { + "epoch": 0.1309391387255681, + "grad_norm": 18856.3359375, + "learning_rate": 4.985829003010154e-05, + "loss": 5.6627, + "step": 515 + }, + { + "epoch": 0.131193389480375, + "grad_norm": 24609.775390625, + "learning_rate": 4.985592063696988e-05, + "loss": 5.6709, + "step": 516 + }, + { + "epoch": 0.13144764023518196, + "grad_norm": 16182.5205078125, + "learning_rate": 4.985353165681469e-05, + "loss": 5.6843, + "step": 517 + }, + { + "epoch": 0.13170189098998888, + "grad_norm": 15324.4169921875, + "learning_rate": 4.985112309151853e-05, + "loss": 5.6688, + "step": 518 + }, + { + "epoch": 0.1319561417447958, + "grad_norm": 15727.365234375, + "learning_rate": 4.984869494297941e-05, + "loss": 5.6662, + "step": 519 + }, + { + "epoch": 0.13221039249960273, + "grad_norm": 15892.57421875, + "learning_rate": 4.9846247213110765e-05, + "loss": 5.6601, + "step": 520 + }, + { + "epoch": 0.13246464325440965, + "grad_norm": 15805.169921875, + "learning_rate": 4.984377990384145e-05, + "loss": 5.6494, + "step": 521 + }, + { + "epoch": 0.1327188940092166, + "grad_norm": 15867.2119140625, + "learning_rate": 4.984129301711578e-05, + "loss": 5.6411, + "step": 522 + }, + { + "epoch": 0.13297314476402353, + "grad_norm": 17240.55078125, + "learning_rate": 4.9838786554893455e-05, + "loss": 5.6732, + "step": 523 + }, + { + "epoch": 0.13322739551883045, + "grad_norm": 18650.443359375, + "learning_rate": 4.9836260519149644e-05, + "loss": 5.6509, + "step": 524 + }, + { + "epoch": 0.13348164627363737, + "grad_norm": 16894.62890625, + "learning_rate": 4.983371491187492e-05, + "loss": 5.6669, + "step": 525 + }, + { + "epoch": 0.1337358970284443, + "grad_norm": 15476.4794921875, + "learning_rate": 4.9831149735075255e-05, + "loss": 5.6341, + "step": 526 + }, + { + "epoch": 0.13399014778325122, + "grad_norm": 15612.296875, + "learning_rate": 4.982856499077209e-05, + "loss": 5.6663, + "step": 527 + }, + { + "epoch": 0.13424439853805817, + "grad_norm": 16043.396484375, + "learning_rate": 4.982596068100225e-05, + "loss": 5.6415, + "step": 528 + }, + { + "epoch": 0.1344986492928651, + "grad_norm": 16745.125, + "learning_rate": 4.982333680781799e-05, + "loss": 5.6387, + "step": 529 + }, + { + "epoch": 0.13475290004767201, + "grad_norm": 18327.9921875, + "learning_rate": 4.982069337328698e-05, + "loss": 5.6466, + "step": 530 + }, + { + "epoch": 0.13500715080247894, + "grad_norm": 20842.2109375, + "learning_rate": 4.9818030379492314e-05, + "loss": 5.6564, + "step": 531 + }, + { + "epoch": 0.13526140155728586, + "grad_norm": 18361.029296875, + "learning_rate": 4.9815347828532486e-05, + "loss": 5.6416, + "step": 532 + }, + { + "epoch": 0.1355156523120928, + "grad_norm": 15737.955078125, + "learning_rate": 4.981264572252141e-05, + "loss": 5.6446, + "step": 533 + }, + { + "epoch": 0.13576990306689973, + "grad_norm": 15640.228515625, + "learning_rate": 4.9809924063588394e-05, + "loss": 5.6594, + "step": 534 + }, + { + "epoch": 0.13602415382170666, + "grad_norm": 19447.421875, + "learning_rate": 4.980718285387818e-05, + "loss": 5.6464, + "step": 535 + }, + { + "epoch": 0.13627840457651358, + "grad_norm": 23074.4921875, + "learning_rate": 4.9804422095550894e-05, + "loss": 5.6481, + "step": 536 + }, + { + "epoch": 0.1365326553313205, + "grad_norm": 16168.689453125, + "learning_rate": 4.9801641790782085e-05, + "loss": 5.6358, + "step": 537 + }, + { + "epoch": 0.13678690608612745, + "grad_norm": 17012.951171875, + "learning_rate": 4.979884194176268e-05, + "loss": 5.6407, + "step": 538 + }, + { + "epoch": 0.13704115684093437, + "grad_norm": 22454.2265625, + "learning_rate": 4.979602255069904e-05, + "loss": 5.6222, + "step": 539 + }, + { + "epoch": 0.1372954075957413, + "grad_norm": 17975.060546875, + "learning_rate": 4.97931836198129e-05, + "loss": 5.6552, + "step": 540 + }, + { + "epoch": 0.13754965835054822, + "grad_norm": 15872.36328125, + "learning_rate": 4.97903251513414e-05, + "loss": 5.6584, + "step": 541 + }, + { + "epoch": 0.13780390910535514, + "grad_norm": 18451.474609375, + "learning_rate": 4.978744714753708e-05, + "loss": 5.629, + "step": 542 + }, + { + "epoch": 0.1380581598601621, + "grad_norm": 23676.044921875, + "learning_rate": 4.978454961066787e-05, + "loss": 5.6582, + "step": 543 + }, + { + "epoch": 0.13831241061496902, + "grad_norm": 19392.4296875, + "learning_rate": 4.97816325430171e-05, + "loss": 5.6415, + "step": 544 + }, + { + "epoch": 0.13856666136977594, + "grad_norm": 16136.1005859375, + "learning_rate": 4.977869594688348e-05, + "loss": 5.6457, + "step": 545 + }, + { + "epoch": 0.13882091212458286, + "grad_norm": 16434.310546875, + "learning_rate": 4.977573982458111e-05, + "loss": 5.6549, + "step": 546 + }, + { + "epoch": 0.13907516287938979, + "grad_norm": 19780.55078125, + "learning_rate": 4.9772764178439485e-05, + "loss": 5.6132, + "step": 547 + }, + { + "epoch": 0.13932941363419674, + "grad_norm": 21970.705078125, + "learning_rate": 4.976976901080348e-05, + "loss": 5.6392, + "step": 548 + }, + { + "epoch": 0.13958366438900366, + "grad_norm": 18257.62109375, + "learning_rate": 4.976675432403336e-05, + "loss": 5.6445, + "step": 549 + }, + { + "epoch": 0.13983791514381058, + "grad_norm": 16311.8681640625, + "learning_rate": 4.9763720120504756e-05, + "loss": 5.6347, + "step": 550 + }, + { + "epoch": 0.1400921658986175, + "grad_norm": 16021.2177734375, + "learning_rate": 4.976066640260869e-05, + "loss": 5.6239, + "step": 551 + }, + { + "epoch": 0.14034641665342443, + "grad_norm": 16831.166015625, + "learning_rate": 4.975759317275157e-05, + "loss": 5.632, + "step": 552 + }, + { + "epoch": 0.14060066740823138, + "grad_norm": 18917.677734375, + "learning_rate": 4.975450043335517e-05, + "loss": 5.6209, + "step": 553 + }, + { + "epoch": 0.1408549181630383, + "grad_norm": 19774.138671875, + "learning_rate": 4.975138818685662e-05, + "loss": 5.6466, + "step": 554 + }, + { + "epoch": 0.14110916891784522, + "grad_norm": 18575.240234375, + "learning_rate": 4.974825643570845e-05, + "loss": 5.608, + "step": 555 + }, + { + "epoch": 0.14136341967265215, + "grad_norm": 17651.923828125, + "learning_rate": 4.974510518237856e-05, + "loss": 5.6164, + "step": 556 + }, + { + "epoch": 0.14161767042745907, + "grad_norm": 17001.119140625, + "learning_rate": 4.97419344293502e-05, + "loss": 5.6189, + "step": 557 + }, + { + "epoch": 0.14187192118226602, + "grad_norm": 17398.4296875, + "learning_rate": 4.973874417912199e-05, + "loss": 5.62, + "step": 558 + }, + { + "epoch": 0.14212617193707294, + "grad_norm": 18886.431640625, + "learning_rate": 4.9735534434207925e-05, + "loss": 5.6276, + "step": 559 + }, + { + "epoch": 0.14238042269187987, + "grad_norm": 20584.83203125, + "learning_rate": 4.9732305197137356e-05, + "loss": 5.6137, + "step": 560 + }, + { + "epoch": 0.1426346734466868, + "grad_norm": 19928.982421875, + "learning_rate": 4.972905647045499e-05, + "loss": 5.6381, + "step": 561 + }, + { + "epoch": 0.1428889242014937, + "grad_norm": 17919.140625, + "learning_rate": 4.9725788256720905e-05, + "loss": 5.6481, + "step": 562 + }, + { + "epoch": 0.14314317495630066, + "grad_norm": 18179.376953125, + "learning_rate": 4.9722500558510524e-05, + "loss": 5.6181, + "step": 563 + }, + { + "epoch": 0.14339742571110758, + "grad_norm": 17842.84375, + "learning_rate": 4.9719193378414616e-05, + "loss": 5.6353, + "step": 564 + }, + { + "epoch": 0.1436516764659145, + "grad_norm": 18230.892578125, + "learning_rate": 4.9715866719039326e-05, + "loss": 5.621, + "step": 565 + }, + { + "epoch": 0.14390592722072143, + "grad_norm": 18977.90625, + "learning_rate": 4.971252058300614e-05, + "loss": 5.6392, + "step": 566 + }, + { + "epoch": 0.14416017797552835, + "grad_norm": 19686.40234375, + "learning_rate": 4.970915497295187e-05, + "loss": 5.5983, + "step": 567 + }, + { + "epoch": 0.1444144287303353, + "grad_norm": 18930.306640625, + "learning_rate": 4.970576989152871e-05, + "loss": 5.6122, + "step": 568 + }, + { + "epoch": 0.14466867948514223, + "grad_norm": 18535.255859375, + "learning_rate": 4.970236534140417e-05, + "loss": 5.6109, + "step": 569 + }, + { + "epoch": 0.14492293023994915, + "grad_norm": 19047.4296875, + "learning_rate": 4.9698941325261104e-05, + "loss": 5.6177, + "step": 570 + }, + { + "epoch": 0.14517718099475607, + "grad_norm": 18814.443359375, + "learning_rate": 4.969549784579773e-05, + "loss": 5.6271, + "step": 571 + }, + { + "epoch": 0.145431431749563, + "grad_norm": 16480.15234375, + "learning_rate": 4.969203490572759e-05, + "loss": 5.6281, + "step": 572 + }, + { + "epoch": 0.14568568250436995, + "grad_norm": 15779.3984375, + "learning_rate": 4.9688552507779554e-05, + "loss": 5.6277, + "step": 573 + }, + { + "epoch": 0.14593993325917687, + "grad_norm": 16809.806640625, + "learning_rate": 4.9685050654697806e-05, + "loss": 5.6211, + "step": 574 + }, + { + "epoch": 0.1461941840139838, + "grad_norm": 20341.103515625, + "learning_rate": 4.968152934924192e-05, + "loss": 5.5933, + "step": 575 + }, + { + "epoch": 0.14644843476879071, + "grad_norm": 18689.234375, + "learning_rate": 4.967798859418674e-05, + "loss": 5.6168, + "step": 576 + }, + { + "epoch": 0.14670268552359764, + "grad_norm": 16318.267578125, + "learning_rate": 4.9674428392322476e-05, + "loss": 5.6036, + "step": 577 + }, + { + "epoch": 0.1469569362784046, + "grad_norm": 29769.20703125, + "learning_rate": 4.967084874645463e-05, + "loss": 5.6077, + "step": 578 + }, + { + "epoch": 0.1472111870332115, + "grad_norm": 20526.70703125, + "learning_rate": 4.966724965940407e-05, + "loss": 5.6155, + "step": 579 + }, + { + "epoch": 0.14746543778801843, + "grad_norm": 16122.1318359375, + "learning_rate": 4.966363113400693e-05, + "loss": 5.6045, + "step": 580 + }, + { + "epoch": 0.14771968854282536, + "grad_norm": 28428.771484375, + "learning_rate": 4.965999317311469e-05, + "loss": 5.6197, + "step": 581 + }, + { + "epoch": 0.14797393929763228, + "grad_norm": 19143.50390625, + "learning_rate": 4.965633577959417e-05, + "loss": 5.6081, + "step": 582 + }, + { + "epoch": 0.14822819005243923, + "grad_norm": 16851.91796875, + "learning_rate": 4.9652658956327457e-05, + "loss": 5.6062, + "step": 583 + }, + { + "epoch": 0.14848244080724615, + "grad_norm": 24752.48828125, + "learning_rate": 4.964896270621198e-05, + "loss": 5.6142, + "step": 584 + }, + { + "epoch": 0.14873669156205307, + "grad_norm": 19609.951171875, + "learning_rate": 4.964524703216046e-05, + "loss": 5.6065, + "step": 585 + }, + { + "epoch": 0.14899094231686, + "grad_norm": 16254.486328125, + "learning_rate": 4.9641511937100934e-05, + "loss": 5.5968, + "step": 586 + }, + { + "epoch": 0.14924519307166692, + "grad_norm": 36469.53125, + "learning_rate": 4.963775742397674e-05, + "loss": 5.6088, + "step": 587 + }, + { + "epoch": 0.14949944382647387, + "grad_norm": 17407.080078125, + "learning_rate": 4.963398349574653e-05, + "loss": 5.5943, + "step": 588 + }, + { + "epoch": 0.1497536945812808, + "grad_norm": 19350.60546875, + "learning_rate": 4.963019015538422e-05, + "loss": 5.6002, + "step": 589 + }, + { + "epoch": 0.15000794533608772, + "grad_norm": 31245.01171875, + "learning_rate": 4.962637740587907e-05, + "loss": 5.6126, + "step": 590 + }, + { + "epoch": 0.15026219609089464, + "grad_norm": 15731.158203125, + "learning_rate": 4.962254525023561e-05, + "loss": 5.5871, + "step": 591 + }, + { + "epoch": 0.15051644684570156, + "grad_norm": 22137.224609375, + "learning_rate": 4.961869369147365e-05, + "loss": 5.5978, + "step": 592 + }, + { + "epoch": 0.1507706976005085, + "grad_norm": 18919.173828125, + "learning_rate": 4.961482273262831e-05, + "loss": 5.6055, + "step": 593 + }, + { + "epoch": 0.15102494835531544, + "grad_norm": 15994.861328125, + "learning_rate": 4.9610932376750006e-05, + "loss": 5.5934, + "step": 594 + }, + { + "epoch": 0.15127919911012236, + "grad_norm": 19860.427734375, + "learning_rate": 4.960702262690441e-05, + "loss": 5.5961, + "step": 595 + }, + { + "epoch": 0.15153344986492928, + "grad_norm": 19515.59375, + "learning_rate": 4.9603093486172504e-05, + "loss": 5.5927, + "step": 596 + }, + { + "epoch": 0.1517877006197362, + "grad_norm": 16144.533203125, + "learning_rate": 4.959914495765052e-05, + "loss": 5.5984, + "step": 597 + }, + { + "epoch": 0.15204195137454316, + "grad_norm": 15955.1083984375, + "learning_rate": 4.959517704445001e-05, + "loss": 5.5973, + "step": 598 + }, + { + "epoch": 0.15229620212935008, + "grad_norm": 17576.66796875, + "learning_rate": 4.959118974969777e-05, + "loss": 5.5894, + "step": 599 + }, + { + "epoch": 0.152550452884157, + "grad_norm": 18451.59375, + "learning_rate": 4.958718307653588e-05, + "loss": 5.5885, + "step": 600 + }, + { + "epoch": 0.152550452884157, + "eval_loss": 11.28321647644043, + "eval_runtime": 699.4596, + "eval_samples_per_second": 151.526, + "eval_steps_per_second": 9.472, + "step": 600 + }, + { + "epoch": 0.15280470363896392, + "grad_norm": 15883.5673828125, + "learning_rate": 4.958315702812168e-05, + "loss": 5.6017, + "step": 601 + }, + { + "epoch": 0.15305895439377085, + "grad_norm": 16934.130859375, + "learning_rate": 4.957911160762779e-05, + "loss": 5.6034, + "step": 602 + }, + { + "epoch": 0.1533132051485778, + "grad_norm": 19183.017578125, + "learning_rate": 4.9575046818242106e-05, + "loss": 5.5867, + "step": 603 + }, + { + "epoch": 0.15356745590338472, + "grad_norm": 17251.52734375, + "learning_rate": 4.9570962663167756e-05, + "loss": 5.5952, + "step": 604 + }, + { + "epoch": 0.15382170665819164, + "grad_norm": 16172.880859375, + "learning_rate": 4.956685914562315e-05, + "loss": 5.6117, + "step": 605 + }, + { + "epoch": 0.15407595741299857, + "grad_norm": 16140.353515625, + "learning_rate": 4.9562736268841946e-05, + "loss": 5.5772, + "step": 606 + }, + { + "epoch": 0.1543302081678055, + "grad_norm": 19752.396484375, + "learning_rate": 4.955859403607308e-05, + "loss": 5.5945, + "step": 607 + }, + { + "epoch": 0.15458445892261244, + "grad_norm": 20558.962890625, + "learning_rate": 4.955443245058071e-05, + "loss": 5.5899, + "step": 608 + }, + { + "epoch": 0.15483870967741936, + "grad_norm": 15940.642578125, + "learning_rate": 4.9550251515644275e-05, + "loss": 5.5884, + "step": 609 + }, + { + "epoch": 0.15509296043222628, + "grad_norm": 23089.677734375, + "learning_rate": 4.954605123455842e-05, + "loss": 5.5997, + "step": 610 + }, + { + "epoch": 0.1553472111870332, + "grad_norm": 21694.041015625, + "learning_rate": 4.954183161063309e-05, + "loss": 5.5975, + "step": 611 + }, + { + "epoch": 0.15560146194184013, + "grad_norm": 16205.421875, + "learning_rate": 4.953759264719342e-05, + "loss": 5.5904, + "step": 612 + }, + { + "epoch": 0.15585571269664708, + "grad_norm": 30056.099609375, + "learning_rate": 4.9533334347579816e-05, + "loss": 5.5811, + "step": 613 + }, + { + "epoch": 0.156109963451454, + "grad_norm": 17744.61328125, + "learning_rate": 4.952905671514792e-05, + "loss": 5.586, + "step": 614 + }, + { + "epoch": 0.15636421420626093, + "grad_norm": 17406.412109375, + "learning_rate": 4.9524759753268594e-05, + "loss": 5.5827, + "step": 615 + }, + { + "epoch": 0.15661846496106785, + "grad_norm": 21392.123046875, + "learning_rate": 4.952044346532795e-05, + "loss": 5.5851, + "step": 616 + }, + { + "epoch": 0.15687271571587477, + "grad_norm": 16311.4326171875, + "learning_rate": 4.9516107854727304e-05, + "loss": 5.5871, + "step": 617 + }, + { + "epoch": 0.15712696647068172, + "grad_norm": 18318.0546875, + "learning_rate": 4.951175292488323e-05, + "loss": 5.5831, + "step": 618 + }, + { + "epoch": 0.15738121722548865, + "grad_norm": 19625.283203125, + "learning_rate": 4.950737867922751e-05, + "loss": 5.5946, + "step": 619 + }, + { + "epoch": 0.15763546798029557, + "grad_norm": 16345.9580078125, + "learning_rate": 4.950298512120714e-05, + "loss": 5.5791, + "step": 620 + }, + { + "epoch": 0.1578897187351025, + "grad_norm": 17236.556640625, + "learning_rate": 4.9498572254284336e-05, + "loss": 5.596, + "step": 621 + }, + { + "epoch": 0.15814396948990941, + "grad_norm": 21715.25390625, + "learning_rate": 4.949414008193655e-05, + "loss": 5.5954, + "step": 622 + }, + { + "epoch": 0.15839822024471636, + "grad_norm": 17622.6328125, + "learning_rate": 4.9489688607656424e-05, + "loss": 5.5868, + "step": 623 + }, + { + "epoch": 0.1586524709995233, + "grad_norm": 16723.8359375, + "learning_rate": 4.948521783495183e-05, + "loss": 5.5918, + "step": 624 + }, + { + "epoch": 0.1589067217543302, + "grad_norm": 24881.146484375, + "learning_rate": 4.948072776734583e-05, + "loss": 5.5659, + "step": 625 + }, + { + "epoch": 0.15916097250913713, + "grad_norm": 18340.728515625, + "learning_rate": 4.94762184083767e-05, + "loss": 5.581, + "step": 626 + }, + { + "epoch": 0.15941522326394406, + "grad_norm": 17637.662109375, + "learning_rate": 4.947168976159792e-05, + "loss": 5.5728, + "step": 627 + }, + { + "epoch": 0.159669474018751, + "grad_norm": 31284.01171875, + "learning_rate": 4.946714183057815e-05, + "loss": 5.5648, + "step": 628 + }, + { + "epoch": 0.15992372477355793, + "grad_norm": 16909.30078125, + "learning_rate": 4.946257461890128e-05, + "loss": 5.5673, + "step": 629 + }, + { + "epoch": 0.16017797552836485, + "grad_norm": 18615.958984375, + "learning_rate": 4.9457988130166365e-05, + "loss": 5.5708, + "step": 630 + }, + { + "epoch": 0.16043222628317177, + "grad_norm": 25193.091796875, + "learning_rate": 4.9453382367987664e-05, + "loss": 5.5711, + "step": 631 + }, + { + "epoch": 0.1606864770379787, + "grad_norm": 16814.205078125, + "learning_rate": 4.944875733599462e-05, + "loss": 5.574, + "step": 632 + }, + { + "epoch": 0.16094072779278565, + "grad_norm": 18124.3515625, + "learning_rate": 4.944411303783187e-05, + "loss": 5.5659, + "step": 633 + }, + { + "epoch": 0.16119497854759257, + "grad_norm": 25308.673828125, + "learning_rate": 4.943944947715922e-05, + "loss": 5.5784, + "step": 634 + }, + { + "epoch": 0.1614492293023995, + "grad_norm": 17068.517578125, + "learning_rate": 4.9434766657651644e-05, + "loss": 5.5731, + "step": 635 + }, + { + "epoch": 0.16170348005720642, + "grad_norm": 18465.509765625, + "learning_rate": 4.9430064582999335e-05, + "loss": 5.5724, + "step": 636 + }, + { + "epoch": 0.16195773081201334, + "grad_norm": 20113.205078125, + "learning_rate": 4.942534325690762e-05, + "loss": 5.5759, + "step": 637 + }, + { + "epoch": 0.1622119815668203, + "grad_norm": 16281.3671875, + "learning_rate": 4.942060268309701e-05, + "loss": 5.5619, + "step": 638 + }, + { + "epoch": 0.1624662323216272, + "grad_norm": 16662.591796875, + "learning_rate": 4.941584286530319e-05, + "loss": 5.5623, + "step": 639 + }, + { + "epoch": 0.16272048307643414, + "grad_norm": 17126.716796875, + "learning_rate": 4.941106380727699e-05, + "loss": 5.5664, + "step": 640 + }, + { + "epoch": 0.16297473383124106, + "grad_norm": 16530.728515625, + "learning_rate": 4.9406265512784435e-05, + "loss": 5.5784, + "step": 641 + }, + { + "epoch": 0.16322898458604798, + "grad_norm": 16175.728515625, + "learning_rate": 4.9401447985606676e-05, + "loss": 5.5666, + "step": 642 + }, + { + "epoch": 0.1634832353408549, + "grad_norm": 16602.26953125, + "learning_rate": 4.939661122954003e-05, + "loss": 5.571, + "step": 643 + }, + { + "epoch": 0.16373748609566185, + "grad_norm": 16899.26953125, + "learning_rate": 4.939175524839598e-05, + "loss": 5.5559, + "step": 644 + }, + { + "epoch": 0.16399173685046878, + "grad_norm": 16559.48046875, + "learning_rate": 4.938688004600113e-05, + "loss": 5.5655, + "step": 645 + }, + { + "epoch": 0.1642459876052757, + "grad_norm": 16872.763671875, + "learning_rate": 4.938198562619727e-05, + "loss": 5.5592, + "step": 646 + }, + { + "epoch": 0.16450023836008262, + "grad_norm": 17657.427734375, + "learning_rate": 4.93770719928413e-05, + "loss": 5.5664, + "step": 647 + }, + { + "epoch": 0.16475448911488955, + "grad_norm": 17201.142578125, + "learning_rate": 4.937213914980528e-05, + "loss": 5.5816, + "step": 648 + }, + { + "epoch": 0.1650087398696965, + "grad_norm": 16415.009765625, + "learning_rate": 4.93671871009764e-05, + "loss": 5.5573, + "step": 649 + }, + { + "epoch": 0.16526299062450342, + "grad_norm": 16179.7470703125, + "learning_rate": 4.936221585025698e-05, + "loss": 5.544, + "step": 650 + }, + { + "epoch": 0.16551724137931034, + "grad_norm": 17267.3203125, + "learning_rate": 4.935722540156448e-05, + "loss": 5.5482, + "step": 651 + }, + { + "epoch": 0.16577149213411727, + "grad_norm": 16624.052734375, + "learning_rate": 4.935221575883149e-05, + "loss": 5.5636, + "step": 652 + }, + { + "epoch": 0.1660257428889242, + "grad_norm": 16263.875, + "learning_rate": 4.9347186926005714e-05, + "loss": 5.5569, + "step": 653 + }, + { + "epoch": 0.16627999364373114, + "grad_norm": 19820.095703125, + "learning_rate": 4.934213890704999e-05, + "loss": 5.5588, + "step": 654 + }, + { + "epoch": 0.16653424439853806, + "grad_norm": 18276.126953125, + "learning_rate": 4.9337071705942276e-05, + "loss": 5.5559, + "step": 655 + }, + { + "epoch": 0.16678849515334498, + "grad_norm": 16186.74609375, + "learning_rate": 4.9331985326675624e-05, + "loss": 5.5613, + "step": 656 + }, + { + "epoch": 0.1670427459081519, + "grad_norm": 19869.337890625, + "learning_rate": 4.932687977325823e-05, + "loss": 5.5524, + "step": 657 + }, + { + "epoch": 0.16729699666295883, + "grad_norm": 19918.806640625, + "learning_rate": 4.932175504971337e-05, + "loss": 5.5572, + "step": 658 + }, + { + "epoch": 0.16755124741776578, + "grad_norm": 16860.58203125, + "learning_rate": 4.9316611160079454e-05, + "loss": 5.5482, + "step": 659 + }, + { + "epoch": 0.1678054981725727, + "grad_norm": 31688.140625, + "learning_rate": 4.931144810840999e-05, + "loss": 5.551, + "step": 660 + }, + { + "epoch": 0.16805974892737963, + "grad_norm": 16726.48828125, + "learning_rate": 4.930626589877355e-05, + "loss": 5.5557, + "step": 661 + }, + { + "epoch": 0.16831399968218655, + "grad_norm": 20100.140625, + "learning_rate": 4.930106453525386e-05, + "loss": 5.5503, + "step": 662 + }, + { + "epoch": 0.16856825043699347, + "grad_norm": 21077.978515625, + "learning_rate": 4.92958440219497e-05, + "loss": 5.5375, + "step": 663 + }, + { + "epoch": 0.16882250119180042, + "grad_norm": 16542.46484375, + "learning_rate": 4.9290604362974946e-05, + "loss": 5.5482, + "step": 664 + }, + { + "epoch": 0.16907675194660735, + "grad_norm": 17726.416015625, + "learning_rate": 4.928534556245857e-05, + "loss": 5.5473, + "step": 665 + }, + { + "epoch": 0.16933100270141427, + "grad_norm": 20504.6953125, + "learning_rate": 4.928006762454463e-05, + "loss": 5.5525, + "step": 666 + }, + { + "epoch": 0.1695852534562212, + "grad_norm": 17580.134765625, + "learning_rate": 4.927477055339227e-05, + "loss": 5.5115, + "step": 667 + }, + { + "epoch": 0.16983950421102811, + "grad_norm": 16410.330078125, + "learning_rate": 4.9269454353175674e-05, + "loss": 5.5393, + "step": 668 + }, + { + "epoch": 0.17009375496583506, + "grad_norm": 22899.498046875, + "learning_rate": 4.926411902808415e-05, + "loss": 5.5336, + "step": 669 + }, + { + "epoch": 0.170348005720642, + "grad_norm": 19626.82421875, + "learning_rate": 4.925876458232204e-05, + "loss": 5.551, + "step": 670 + }, + { + "epoch": 0.1706022564754489, + "grad_norm": 17094.5546875, + "learning_rate": 4.925339102010877e-05, + "loss": 5.5443, + "step": 671 + }, + { + "epoch": 0.17085650723025583, + "grad_norm": 29964.884765625, + "learning_rate": 4.9247998345678836e-05, + "loss": 5.5488, + "step": 672 + }, + { + "epoch": 0.17111075798506276, + "grad_norm": 17078.453125, + "learning_rate": 4.924258656328178e-05, + "loss": 5.53, + "step": 673 + }, + { + "epoch": 0.1713650087398697, + "grad_norm": 18604.427734375, + "learning_rate": 4.9237155677182215e-05, + "loss": 5.5466, + "step": 674 + }, + { + "epoch": 0.17161925949467663, + "grad_norm": 20238.46484375, + "learning_rate": 4.923170569165979e-05, + "loss": 5.5507, + "step": 675 + }, + { + "epoch": 0.17187351024948355, + "grad_norm": 16794.333984375, + "learning_rate": 4.9226236611009214e-05, + "loss": 5.5359, + "step": 676 + }, + { + "epoch": 0.17212776100429047, + "grad_norm": 17136.896484375, + "learning_rate": 4.922074843954026e-05, + "loss": 5.536, + "step": 677 + }, + { + "epoch": 0.1723820117590974, + "grad_norm": 17338.365234375, + "learning_rate": 4.921524118157772e-05, + "loss": 5.5381, + "step": 678 + }, + { + "epoch": 0.17263626251390435, + "grad_norm": 16591.2109375, + "learning_rate": 4.920971484146144e-05, + "loss": 5.5322, + "step": 679 + }, + { + "epoch": 0.17289051326871127, + "grad_norm": 16398.8046875, + "learning_rate": 4.9204169423546304e-05, + "loss": 5.549, + "step": 680 + }, + { + "epoch": 0.1731447640235182, + "grad_norm": 17364.615234375, + "learning_rate": 4.9198604932202216e-05, + "loss": 5.5261, + "step": 681 + }, + { + "epoch": 0.17339901477832512, + "grad_norm": 17418.7265625, + "learning_rate": 4.919302137181413e-05, + "loss": 5.547, + "step": 682 + }, + { + "epoch": 0.17365326553313204, + "grad_norm": 16392.10546875, + "learning_rate": 4.918741874678201e-05, + "loss": 5.5165, + "step": 683 + }, + { + "epoch": 0.173907516287939, + "grad_norm": 17120.955078125, + "learning_rate": 4.918179706152086e-05, + "loss": 5.5222, + "step": 684 + }, + { + "epoch": 0.1741617670427459, + "grad_norm": 17222.70703125, + "learning_rate": 4.917615632046068e-05, + "loss": 5.5374, + "step": 685 + }, + { + "epoch": 0.17441601779755284, + "grad_norm": 16485.86328125, + "learning_rate": 4.917049652804651e-05, + "loss": 5.5285, + "step": 686 + }, + { + "epoch": 0.17467026855235976, + "grad_norm": 19400.7421875, + "learning_rate": 4.916481768873839e-05, + "loss": 5.5386, + "step": 687 + }, + { + "epoch": 0.17492451930716668, + "grad_norm": 17421.396484375, + "learning_rate": 4.915911980701137e-05, + "loss": 5.516, + "step": 688 + }, + { + "epoch": 0.17517877006197363, + "grad_norm": 17308.685546875, + "learning_rate": 4.915340288735552e-05, + "loss": 5.5295, + "step": 689 + }, + { + "epoch": 0.17543302081678055, + "grad_norm": 22447.404296875, + "learning_rate": 4.9147666934275895e-05, + "loss": 5.5153, + "step": 690 + }, + { + "epoch": 0.17568727157158748, + "grad_norm": 17880.169921875, + "learning_rate": 4.9141911952292554e-05, + "loss": 5.5266, + "step": 691 + }, + { + "epoch": 0.1759415223263944, + "grad_norm": 16937.1796875, + "learning_rate": 4.9136137945940544e-05, + "loss": 5.5306, + "step": 692 + }, + { + "epoch": 0.17619577308120132, + "grad_norm": 22388.53125, + "learning_rate": 4.913034491976992e-05, + "loss": 5.5206, + "step": 693 + }, + { + "epoch": 0.17645002383600827, + "grad_norm": 18653.53125, + "learning_rate": 4.9124532878345724e-05, + "loss": 5.5387, + "step": 694 + }, + { + "epoch": 0.1767042745908152, + "grad_norm": 17489.583984375, + "learning_rate": 4.911870182624796e-05, + "loss": 5.5089, + "step": 695 + }, + { + "epoch": 0.17695852534562212, + "grad_norm": 32763.01171875, + "learning_rate": 4.911285176807164e-05, + "loss": 5.529, + "step": 696 + }, + { + "epoch": 0.17721277610042904, + "grad_norm": 17255.34765625, + "learning_rate": 4.910698270842674e-05, + "loss": 5.5427, + "step": 697 + }, + { + "epoch": 0.17746702685523597, + "grad_norm": 18857.74609375, + "learning_rate": 4.910109465193821e-05, + "loss": 5.5177, + "step": 698 + }, + { + "epoch": 0.17772127761004292, + "grad_norm": 22253.734375, + "learning_rate": 4.909518760324595e-05, + "loss": 5.5126, + "step": 699 + }, + { + "epoch": 0.17797552836484984, + "grad_norm": 16930.79296875, + "learning_rate": 4.908926156700488e-05, + "loss": 5.5265, + "step": 700 + }, + { + "epoch": 0.17797552836484984, + "eval_loss": 11.12784481048584, + "eval_runtime": 699.3397, + "eval_samples_per_second": 151.552, + "eval_steps_per_second": 9.473, + "step": 700 + }, + { + "epoch": 0.17822977911965676, + "grad_norm": 17669.5, + "learning_rate": 4.9083316547884826e-05, + "loss": 5.5158, + "step": 701 + }, + { + "epoch": 0.17848402987446368, + "grad_norm": 18947.248046875, + "learning_rate": 4.907735255057061e-05, + "loss": 5.5145, + "step": 702 + }, + { + "epoch": 0.1787382806292706, + "grad_norm": 16640.62109375, + "learning_rate": 4.9071369579761995e-05, + "loss": 5.5266, + "step": 703 + }, + { + "epoch": 0.17899253138407756, + "grad_norm": 16972.955078125, + "learning_rate": 4.906536764017369e-05, + "loss": 5.524, + "step": 704 + }, + { + "epoch": 0.17924678213888448, + "grad_norm": 19407.259765625, + "learning_rate": 4.905934673653536e-05, + "loss": 5.5118, + "step": 705 + }, + { + "epoch": 0.1795010328936914, + "grad_norm": 18232.404296875, + "learning_rate": 4.905330687359161e-05, + "loss": 5.5218, + "step": 706 + }, + { + "epoch": 0.17975528364849833, + "grad_norm": 17059.677734375, + "learning_rate": 4.904724805610199e-05, + "loss": 5.5465, + "step": 707 + }, + { + "epoch": 0.18000953440330525, + "grad_norm": 27981.82421875, + "learning_rate": 4.9041170288840985e-05, + "loss": 5.5318, + "step": 708 + }, + { + "epoch": 0.1802637851581122, + "grad_norm": 19111.18359375, + "learning_rate": 4.9035073576598014e-05, + "loss": 5.507, + "step": 709 + }, + { + "epoch": 0.18051803591291912, + "grad_norm": 17588.126953125, + "learning_rate": 4.902895792417742e-05, + "loss": 5.5193, + "step": 710 + }, + { + "epoch": 0.18077228666772605, + "grad_norm": 27783.1328125, + "learning_rate": 4.902282333639847e-05, + "loss": 5.5129, + "step": 711 + }, + { + "epoch": 0.18102653742253297, + "grad_norm": 17371.529296875, + "learning_rate": 4.901666981809537e-05, + "loss": 5.5128, + "step": 712 + }, + { + "epoch": 0.1812807881773399, + "grad_norm": 18173.57421875, + "learning_rate": 4.9010497374117214e-05, + "loss": 5.5234, + "step": 713 + }, + { + "epoch": 0.18153503893214684, + "grad_norm": 20874.427734375, + "learning_rate": 4.900430600932804e-05, + "loss": 5.5069, + "step": 714 + }, + { + "epoch": 0.18178928968695376, + "grad_norm": 17087.595703125, + "learning_rate": 4.899809572860677e-05, + "loss": 5.5097, + "step": 715 + }, + { + "epoch": 0.1820435404417607, + "grad_norm": 17009.755859375, + "learning_rate": 4.899186653684726e-05, + "loss": 5.5087, + "step": 716 + }, + { + "epoch": 0.1822977911965676, + "grad_norm": 19133.095703125, + "learning_rate": 4.8985618438958254e-05, + "loss": 5.5119, + "step": 717 + }, + { + "epoch": 0.18255204195137453, + "grad_norm": 16642.587890625, + "learning_rate": 4.8979351439863376e-05, + "loss": 5.5097, + "step": 718 + }, + { + "epoch": 0.18280629270618148, + "grad_norm": 16945.041015625, + "learning_rate": 4.897306554450117e-05, + "loss": 5.5119, + "step": 719 + }, + { + "epoch": 0.1830605434609884, + "grad_norm": 17844.716796875, + "learning_rate": 4.896676075782506e-05, + "loss": 5.5076, + "step": 720 + }, + { + "epoch": 0.18331479421579533, + "grad_norm": 17752.22265625, + "learning_rate": 4.896043708480337e-05, + "loss": 5.5019, + "step": 721 + }, + { + "epoch": 0.18356904497060225, + "grad_norm": 16694.708984375, + "learning_rate": 4.895409453041928e-05, + "loss": 5.5121, + "step": 722 + }, + { + "epoch": 0.18382329572540917, + "grad_norm": 16469.119140625, + "learning_rate": 4.894773309967088e-05, + "loss": 5.5073, + "step": 723 + }, + { + "epoch": 0.18407754648021613, + "grad_norm": 16523.3828125, + "learning_rate": 4.894135279757111e-05, + "loss": 5.5003, + "step": 724 + }, + { + "epoch": 0.18433179723502305, + "grad_norm": 16605.66015625, + "learning_rate": 4.89349536291478e-05, + "loss": 5.5058, + "step": 725 + }, + { + "epoch": 0.18458604798982997, + "grad_norm": 16735.14453125, + "learning_rate": 4.892853559944363e-05, + "loss": 5.5068, + "step": 726 + }, + { + "epoch": 0.1848402987446369, + "grad_norm": 16979.35546875, + "learning_rate": 4.8922098713516165e-05, + "loss": 5.5005, + "step": 727 + }, + { + "epoch": 0.18509454949944382, + "grad_norm": 17449.349609375, + "learning_rate": 4.89156429764378e-05, + "loss": 5.499, + "step": 728 + }, + { + "epoch": 0.18534880025425077, + "grad_norm": 17190.453125, + "learning_rate": 4.8909168393295803e-05, + "loss": 5.4871, + "step": 729 + }, + { + "epoch": 0.1856030510090577, + "grad_norm": 16658.19921875, + "learning_rate": 4.8902674969192294e-05, + "loss": 5.5104, + "step": 730 + }, + { + "epoch": 0.1858573017638646, + "grad_norm": 16575.8046875, + "learning_rate": 4.889616270924425e-05, + "loss": 5.5139, + "step": 731 + }, + { + "epoch": 0.18611155251867154, + "grad_norm": 17014.2421875, + "learning_rate": 4.888963161858346e-05, + "loss": 5.4892, + "step": 732 + }, + { + "epoch": 0.18636580327347846, + "grad_norm": 17797.419921875, + "learning_rate": 4.888308170235657e-05, + "loss": 5.4889, + "step": 733 + }, + { + "epoch": 0.1866200540282854, + "grad_norm": 18417.068359375, + "learning_rate": 4.887651296572508e-05, + "loss": 5.4938, + "step": 734 + }, + { + "epoch": 0.18687430478309233, + "grad_norm": 17704.82421875, + "learning_rate": 4.886992541386528e-05, + "loss": 5.4811, + "step": 735 + }, + { + "epoch": 0.18712855553789925, + "grad_norm": 16696.72265625, + "learning_rate": 4.886331905196831e-05, + "loss": 5.4995, + "step": 736 + }, + { + "epoch": 0.18738280629270618, + "grad_norm": 18708.583984375, + "learning_rate": 4.8856693885240154e-05, + "loss": 5.4957, + "step": 737 + }, + { + "epoch": 0.1876370570475131, + "grad_norm": 22231.39453125, + "learning_rate": 4.8850049918901574e-05, + "loss": 5.5008, + "step": 738 + }, + { + "epoch": 0.18789130780232005, + "grad_norm": 16901.51171875, + "learning_rate": 4.884338715818817e-05, + "loss": 5.4764, + "step": 739 + }, + { + "epoch": 0.18814555855712697, + "grad_norm": 20277.150390625, + "learning_rate": 4.883670560835034e-05, + "loss": 5.4936, + "step": 740 + }, + { + "epoch": 0.1883998093119339, + "grad_norm": 22048.296875, + "learning_rate": 4.88300052746533e-05, + "loss": 5.5044, + "step": 741 + }, + { + "epoch": 0.18865406006674082, + "grad_norm": 17260.599609375, + "learning_rate": 4.882328616237707e-05, + "loss": 5.4832, + "step": 742 + }, + { + "epoch": 0.18890831082154774, + "grad_norm": 39408.87890625, + "learning_rate": 4.8816548276816446e-05, + "loss": 5.5015, + "step": 743 + }, + { + "epoch": 0.1891625615763547, + "grad_norm": 16681.390625, + "learning_rate": 4.880979162328105e-05, + "loss": 5.4792, + "step": 744 + }, + { + "epoch": 0.18941681233116162, + "grad_norm": 23736.94921875, + "learning_rate": 4.8803016207095263e-05, + "loss": 5.4793, + "step": 745 + }, + { + "epoch": 0.18967106308596854, + "grad_norm": 17365.373046875, + "learning_rate": 4.879622203359828e-05, + "loss": 5.4833, + "step": 746 + }, + { + "epoch": 0.18992531384077546, + "grad_norm": 38777.96875, + "learning_rate": 4.8789409108144046e-05, + "loss": 5.4903, + "step": 747 + }, + { + "epoch": 0.19017956459558238, + "grad_norm": 17310.361328125, + "learning_rate": 4.878257743610131e-05, + "loss": 5.4888, + "step": 748 + }, + { + "epoch": 0.19043381535038933, + "grad_norm": 24678.96484375, + "learning_rate": 4.877572702285358e-05, + "loss": 5.4846, + "step": 749 + }, + { + "epoch": 0.19068806610519626, + "grad_norm": 20161.830078125, + "learning_rate": 4.8768857873799136e-05, + "loss": 5.4817, + "step": 750 + }, + { + "epoch": 0.19094231686000318, + "grad_norm": 36119.22265625, + "learning_rate": 4.876196999435101e-05, + "loss": 5.4972, + "step": 751 + }, + { + "epoch": 0.1911965676148101, + "grad_norm": 21399.68359375, + "learning_rate": 4.875506338993703e-05, + "loss": 5.4785, + "step": 752 + }, + { + "epoch": 0.19145081836961703, + "grad_norm": 22758.49609375, + "learning_rate": 4.8748138065999736e-05, + "loss": 5.4763, + "step": 753 + }, + { + "epoch": 0.19170506912442398, + "grad_norm": 23260.380859375, + "learning_rate": 4.874119402799644e-05, + "loss": 5.475, + "step": 754 + }, + { + "epoch": 0.1919593198792309, + "grad_norm": 17307.568359375, + "learning_rate": 4.873423128139921e-05, + "loss": 5.4839, + "step": 755 + }, + { + "epoch": 0.19221357063403782, + "grad_norm": 29400.138671875, + "learning_rate": 4.8727249831694845e-05, + "loss": 5.462, + "step": 756 + }, + { + "epoch": 0.19246782138884475, + "grad_norm": 16735.525390625, + "learning_rate": 4.872024968438487e-05, + "loss": 5.4587, + "step": 757 + }, + { + "epoch": 0.19272207214365167, + "grad_norm": 20396.298828125, + "learning_rate": 4.871323084498557e-05, + "loss": 5.481, + "step": 758 + }, + { + "epoch": 0.1929763228984586, + "grad_norm": 18180.109375, + "learning_rate": 4.870619331902795e-05, + "loss": 5.4751, + "step": 759 + }, + { + "epoch": 0.19323057365326554, + "grad_norm": 25730.244140625, + "learning_rate": 4.869913711205773e-05, + "loss": 5.4655, + "step": 760 + }, + { + "epoch": 0.19348482440807246, + "grad_norm": 19287.75390625, + "learning_rate": 4.869206222963537e-05, + "loss": 5.497, + "step": 761 + }, + { + "epoch": 0.1937390751628794, + "grad_norm": 20065.8671875, + "learning_rate": 4.868496867733603e-05, + "loss": 5.4752, + "step": 762 + }, + { + "epoch": 0.1939933259176863, + "grad_norm": 20584.716796875, + "learning_rate": 4.867785646074959e-05, + "loss": 5.4681, + "step": 763 + }, + { + "epoch": 0.19424757667249323, + "grad_norm": 17356.533203125, + "learning_rate": 4.8670725585480635e-05, + "loss": 5.4707, + "step": 764 + }, + { + "epoch": 0.19450182742730018, + "grad_norm": 22042.802734375, + "learning_rate": 4.866357605714845e-05, + "loss": 5.4789, + "step": 765 + }, + { + "epoch": 0.1947560781821071, + "grad_norm": 17293.966796875, + "learning_rate": 4.8656407881387035e-05, + "loss": 5.4921, + "step": 766 + }, + { + "epoch": 0.19501032893691403, + "grad_norm": 18169.630859375, + "learning_rate": 4.864922106384506e-05, + "loss": 5.4772, + "step": 767 + }, + { + "epoch": 0.19526457969172095, + "grad_norm": 16894.482421875, + "learning_rate": 4.8642015610185914e-05, + "loss": 5.4545, + "step": 768 + }, + { + "epoch": 0.19551883044652787, + "grad_norm": 19278.734375, + "learning_rate": 4.863479152608764e-05, + "loss": 5.4621, + "step": 769 + }, + { + "epoch": 0.19577308120133483, + "grad_norm": 16807.013671875, + "learning_rate": 4.8627548817242995e-05, + "loss": 5.4468, + "step": 770 + }, + { + "epoch": 0.19602733195614175, + "grad_norm": 17927.306640625, + "learning_rate": 4.8620287489359384e-05, + "loss": 5.4683, + "step": 771 + }, + { + "epoch": 0.19628158271094867, + "grad_norm": 17142.498046875, + "learning_rate": 4.8613007548158906e-05, + "loss": 5.4564, + "step": 772 + }, + { + "epoch": 0.1965358334657556, + "grad_norm": 17235.14453125, + "learning_rate": 4.860570899937831e-05, + "loss": 5.4768, + "step": 773 + }, + { + "epoch": 0.19679008422056252, + "grad_norm": 17203.421875, + "learning_rate": 4.8598391848769006e-05, + "loss": 5.494, + "step": 774 + }, + { + "epoch": 0.19704433497536947, + "grad_norm": 17139.166015625, + "learning_rate": 4.85910561020971e-05, + "loss": 5.4797, + "step": 775 + }, + { + "epoch": 0.1972985857301764, + "grad_norm": 17136.216796875, + "learning_rate": 4.858370176514331e-05, + "loss": 5.4656, + "step": 776 + }, + { + "epoch": 0.1975528364849833, + "grad_norm": 17079.115234375, + "learning_rate": 4.857632884370301e-05, + "loss": 5.4574, + "step": 777 + }, + { + "epoch": 0.19780708723979024, + "grad_norm": 17595.861328125, + "learning_rate": 4.856893734358625e-05, + "loss": 5.4625, + "step": 778 + }, + { + "epoch": 0.19806133799459716, + "grad_norm": 16878.646484375, + "learning_rate": 4.856152727061768e-05, + "loss": 5.4655, + "step": 779 + }, + { + "epoch": 0.1983155887494041, + "grad_norm": 17753.927734375, + "learning_rate": 4.85540986306366e-05, + "loss": 5.4699, + "step": 780 + }, + { + "epoch": 0.19856983950421103, + "grad_norm": 17242.048828125, + "learning_rate": 4.8546651429496967e-05, + "loss": 5.4609, + "step": 781 + }, + { + "epoch": 0.19882409025901795, + "grad_norm": 17300.095703125, + "learning_rate": 4.8539185673067325e-05, + "loss": 5.4773, + "step": 782 + }, + { + "epoch": 0.19907834101382488, + "grad_norm": 17454.78515625, + "learning_rate": 4.853170136723086e-05, + "loss": 5.4481, + "step": 783 + }, + { + "epoch": 0.1993325917686318, + "grad_norm": 17033.931640625, + "learning_rate": 4.8524198517885376e-05, + "loss": 5.4591, + "step": 784 + }, + { + "epoch": 0.19958684252343875, + "grad_norm": 17601.669921875, + "learning_rate": 4.851667713094329e-05, + "loss": 5.4621, + "step": 785 + }, + { + "epoch": 0.19984109327824567, + "grad_norm": 16838.68359375, + "learning_rate": 4.850913721233162e-05, + "loss": 5.4574, + "step": 786 + }, + { + "epoch": 0.2000953440330526, + "grad_norm": 16886.314453125, + "learning_rate": 4.850157876799198e-05, + "loss": 5.4516, + "step": 787 + }, + { + "epoch": 0.20034959478785952, + "grad_norm": 16944.869140625, + "learning_rate": 4.849400180388062e-05, + "loss": 5.4362, + "step": 788 + }, + { + "epoch": 0.20060384554266644, + "grad_norm": 17570.51171875, + "learning_rate": 4.848640632596834e-05, + "loss": 5.4597, + "step": 789 + }, + { + "epoch": 0.2008580962974734, + "grad_norm": 16958.255859375, + "learning_rate": 4.8478792340240543e-05, + "loss": 5.4461, + "step": 790 + }, + { + "epoch": 0.20111234705228032, + "grad_norm": 17096.994140625, + "learning_rate": 4.847115985269723e-05, + "loss": 5.4676, + "step": 791 + }, + { + "epoch": 0.20136659780708724, + "grad_norm": 16994.615234375, + "learning_rate": 4.846350886935298e-05, + "loss": 5.4394, + "step": 792 + }, + { + "epoch": 0.20162084856189416, + "grad_norm": 17317.64453125, + "learning_rate": 4.845583939623692e-05, + "loss": 5.4537, + "step": 793 + }, + { + "epoch": 0.20187509931670108, + "grad_norm": 17152.9921875, + "learning_rate": 4.844815143939277e-05, + "loss": 5.4515, + "step": 794 + }, + { + "epoch": 0.20212935007150803, + "grad_norm": 17149.2421875, + "learning_rate": 4.8440445004878836e-05, + "loss": 5.4507, + "step": 795 + }, + { + "epoch": 0.20238360082631496, + "grad_norm": 17196.46484375, + "learning_rate": 4.8432720098767934e-05, + "loss": 5.4573, + "step": 796 + }, + { + "epoch": 0.20263785158112188, + "grad_norm": 17182.818359375, + "learning_rate": 4.842497672714749e-05, + "loss": 5.4417, + "step": 797 + }, + { + "epoch": 0.2028921023359288, + "grad_norm": 17190.388671875, + "learning_rate": 4.841721489611942e-05, + "loss": 5.4481, + "step": 798 + }, + { + "epoch": 0.20314635309073573, + "grad_norm": 17173.533203125, + "learning_rate": 4.8409434611800254e-05, + "loss": 5.434, + "step": 799 + }, + { + "epoch": 0.20340060384554268, + "grad_norm": 17155.787109375, + "learning_rate": 4.8401635880321007e-05, + "loss": 5.4492, + "step": 800 + }, + { + "epoch": 0.20340060384554268, + "eval_loss": 10.975968360900879, + "eval_runtime": 698.7587, + "eval_samples_per_second": 151.678, + "eval_steps_per_second": 9.481, + "step": 800 + }, + { + "epoch": 0.2036548546003496, + "grad_norm": 17709.8515625, + "learning_rate": 4.839381870782726e-05, + "loss": 5.4467, + "step": 801 + }, + { + "epoch": 0.20390910535515652, + "grad_norm": 18139.349609375, + "learning_rate": 4.8385983100479135e-05, + "loss": 5.4396, + "step": 802 + }, + { + "epoch": 0.20416335610996345, + "grad_norm": 19168.87890625, + "learning_rate": 4.837812906445126e-05, + "loss": 5.4391, + "step": 803 + }, + { + "epoch": 0.20441760686477037, + "grad_norm": 18207.087890625, + "learning_rate": 4.8370256605932784e-05, + "loss": 5.4577, + "step": 804 + }, + { + "epoch": 0.20467185761957732, + "grad_norm": 17704.251953125, + "learning_rate": 4.8362365731127385e-05, + "loss": 5.4406, + "step": 805 + }, + { + "epoch": 0.20492610837438424, + "grad_norm": 18517.12890625, + "learning_rate": 4.835445644625325e-05, + "loss": 5.4563, + "step": 806 + }, + { + "epoch": 0.20518035912919116, + "grad_norm": 16896.431640625, + "learning_rate": 4.834652875754307e-05, + "loss": 5.438, + "step": 807 + }, + { + "epoch": 0.2054346098839981, + "grad_norm": 17488.44140625, + "learning_rate": 4.833858267124405e-05, + "loss": 5.4463, + "step": 808 + }, + { + "epoch": 0.205688860638805, + "grad_norm": 17255.595703125, + "learning_rate": 4.833061819361787e-05, + "loss": 5.4575, + "step": 809 + }, + { + "epoch": 0.20594311139361196, + "grad_norm": 17746.630859375, + "learning_rate": 4.832263533094073e-05, + "loss": 5.4447, + "step": 810 + }, + { + "epoch": 0.20619736214841888, + "grad_norm": 18005.53125, + "learning_rate": 4.831463408950331e-05, + "loss": 5.442, + "step": 811 + }, + { + "epoch": 0.2064516129032258, + "grad_norm": 17274.60546875, + "learning_rate": 4.830661447561074e-05, + "loss": 5.4466, + "step": 812 + }, + { + "epoch": 0.20670586365803273, + "grad_norm": 19063.30078125, + "learning_rate": 4.8298576495582694e-05, + "loss": 5.4185, + "step": 813 + }, + { + "epoch": 0.20696011441283965, + "grad_norm": 17205.919921875, + "learning_rate": 4.8290520155753254e-05, + "loss": 5.4361, + "step": 814 + }, + { + "epoch": 0.2072143651676466, + "grad_norm": 18312.072265625, + "learning_rate": 4.8282445462471004e-05, + "loss": 5.424, + "step": 815 + }, + { + "epoch": 0.20746861592245353, + "grad_norm": 18321.888671875, + "learning_rate": 4.827435242209898e-05, + "loss": 5.4184, + "step": 816 + }, + { + "epoch": 0.20772286667726045, + "grad_norm": 17325.552734375, + "learning_rate": 4.826624104101469e-05, + "loss": 5.4168, + "step": 817 + }, + { + "epoch": 0.20797711743206737, + "grad_norm": 18686.46484375, + "learning_rate": 4.825811132561008e-05, + "loss": 5.4188, + "step": 818 + }, + { + "epoch": 0.2082313681868743, + "grad_norm": 17174.45703125, + "learning_rate": 4.8249963282291544e-05, + "loss": 5.459, + "step": 819 + }, + { + "epoch": 0.20848561894168124, + "grad_norm": 18713.06640625, + "learning_rate": 4.824179691747992e-05, + "loss": 5.4098, + "step": 820 + }, + { + "epoch": 0.20873986969648817, + "grad_norm": 17252.634765625, + "learning_rate": 4.8233612237610493e-05, + "loss": 5.4444, + "step": 821 + }, + { + "epoch": 0.2089941204512951, + "grad_norm": 17764.0, + "learning_rate": 4.822540924913298e-05, + "loss": 5.4287, + "step": 822 + }, + { + "epoch": 0.209248371206102, + "grad_norm": 17277.396484375, + "learning_rate": 4.82171879585115e-05, + "loss": 5.4396, + "step": 823 + }, + { + "epoch": 0.20950262196090894, + "grad_norm": 17452.599609375, + "learning_rate": 4.820894837222464e-05, + "loss": 5.4151, + "step": 824 + }, + { + "epoch": 0.2097568727157159, + "grad_norm": 17822.01953125, + "learning_rate": 4.820069049676537e-05, + "loss": 5.4145, + "step": 825 + }, + { + "epoch": 0.2100111234705228, + "grad_norm": 17212.953125, + "learning_rate": 4.819241433864107e-05, + "loss": 5.422, + "step": 826 + }, + { + "epoch": 0.21026537422532973, + "grad_norm": 18727.654296875, + "learning_rate": 4.818411990437355e-05, + "loss": 5.4338, + "step": 827 + }, + { + "epoch": 0.21051962498013665, + "grad_norm": 17617.97265625, + "learning_rate": 4.817580720049901e-05, + "loss": 5.4423, + "step": 828 + }, + { + "epoch": 0.21077387573494358, + "grad_norm": 17747.548828125, + "learning_rate": 4.8167476233568045e-05, + "loss": 5.4114, + "step": 829 + }, + { + "epoch": 0.21102812648975053, + "grad_norm": 18010.123046875, + "learning_rate": 4.815912701014563e-05, + "loss": 5.4305, + "step": 830 + }, + { + "epoch": 0.21128237724455745, + "grad_norm": 17344.083984375, + "learning_rate": 4.815075953681117e-05, + "loss": 5.4259, + "step": 831 + }, + { + "epoch": 0.21153662799936437, + "grad_norm": 18157.705078125, + "learning_rate": 4.8142373820158396e-05, + "loss": 5.4213, + "step": 832 + }, + { + "epoch": 0.2117908787541713, + "grad_norm": 17346.078125, + "learning_rate": 4.813396986679546e-05, + "loss": 5.441, + "step": 833 + }, + { + "epoch": 0.21204512950897822, + "grad_norm": 18123.140625, + "learning_rate": 4.8125547683344854e-05, + "loss": 5.4167, + "step": 834 + }, + { + "epoch": 0.21229938026378517, + "grad_norm": 17636.365234375, + "learning_rate": 4.8117107276443446e-05, + "loss": 5.4146, + "step": 835 + }, + { + "epoch": 0.2125536310185921, + "grad_norm": 17534.755859375, + "learning_rate": 4.8108648652742475e-05, + "loss": 5.4249, + "step": 836 + }, + { + "epoch": 0.21280788177339902, + "grad_norm": 18610.494140625, + "learning_rate": 4.810017181890752e-05, + "loss": 5.4308, + "step": 837 + }, + { + "epoch": 0.21306213252820594, + "grad_norm": 17490.263671875, + "learning_rate": 4.809167678161852e-05, + "loss": 5.4097, + "step": 838 + }, + { + "epoch": 0.21331638328301286, + "grad_norm": 17355.6171875, + "learning_rate": 4.8083163547569754e-05, + "loss": 5.4133, + "step": 839 + }, + { + "epoch": 0.2135706340378198, + "grad_norm": 18585.685546875, + "learning_rate": 4.8074632123469834e-05, + "loss": 5.4228, + "step": 840 + }, + { + "epoch": 0.21382488479262673, + "grad_norm": 18207.43359375, + "learning_rate": 4.806608251604173e-05, + "loss": 5.4165, + "step": 841 + }, + { + "epoch": 0.21407913554743366, + "grad_norm": 17172.05859375, + "learning_rate": 4.8057514732022716e-05, + "loss": 5.4087, + "step": 842 + }, + { + "epoch": 0.21433338630224058, + "grad_norm": 17985.322265625, + "learning_rate": 4.8048928778164395e-05, + "loss": 5.4057, + "step": 843 + }, + { + "epoch": 0.2145876370570475, + "grad_norm": 17661.4453125, + "learning_rate": 4.8040324661232686e-05, + "loss": 5.4098, + "step": 844 + }, + { + "epoch": 0.21484188781185445, + "grad_norm": 17584.203125, + "learning_rate": 4.8031702388007845e-05, + "loss": 5.411, + "step": 845 + }, + { + "epoch": 0.21509613856666138, + "grad_norm": 19419.599609375, + "learning_rate": 4.80230619652844e-05, + "loss": 5.4085, + "step": 846 + }, + { + "epoch": 0.2153503893214683, + "grad_norm": 18347.896484375, + "learning_rate": 4.801440339987121e-05, + "loss": 5.4191, + "step": 847 + }, + { + "epoch": 0.21560464007627522, + "grad_norm": 17328.365234375, + "learning_rate": 4.800572669859141e-05, + "loss": 5.3976, + "step": 848 + }, + { + "epoch": 0.21585889083108215, + "grad_norm": 19423.048828125, + "learning_rate": 4.7997031868282435e-05, + "loss": 5.4107, + "step": 849 + }, + { + "epoch": 0.2161131415858891, + "grad_norm": 18428.134765625, + "learning_rate": 4.7988318915796016e-05, + "loss": 5.4035, + "step": 850 + }, + { + "epoch": 0.21636739234069602, + "grad_norm": 17473.404296875, + "learning_rate": 4.797958784799815e-05, + "loss": 5.3878, + "step": 851 + }, + { + "epoch": 0.21662164309550294, + "grad_norm": 18494.755859375, + "learning_rate": 4.7970838671769114e-05, + "loss": 5.3993, + "step": 852 + }, + { + "epoch": 0.21687589385030986, + "grad_norm": 17283.00390625, + "learning_rate": 4.796207139400345e-05, + "loss": 5.3892, + "step": 853 + }, + { + "epoch": 0.2171301446051168, + "grad_norm": 18038.091796875, + "learning_rate": 4.795328602160998e-05, + "loss": 5.4199, + "step": 854 + }, + { + "epoch": 0.21738439535992374, + "grad_norm": 18823.369140625, + "learning_rate": 4.7944482561511773e-05, + "loss": 5.3995, + "step": 855 + }, + { + "epoch": 0.21763864611473066, + "grad_norm": 17727.009765625, + "learning_rate": 4.793566102064614e-05, + "loss": 5.4035, + "step": 856 + }, + { + "epoch": 0.21789289686953758, + "grad_norm": 17558.423828125, + "learning_rate": 4.792682140596467e-05, + "loss": 5.4095, + "step": 857 + }, + { + "epoch": 0.2181471476243445, + "grad_norm": 17932.189453125, + "learning_rate": 4.791796372443317e-05, + "loss": 5.4084, + "step": 858 + }, + { + "epoch": 0.21840139837915143, + "grad_norm": 17523.052734375, + "learning_rate": 4.790908798303169e-05, + "loss": 5.3872, + "step": 859 + }, + { + "epoch": 0.21865564913395838, + "grad_norm": 18066.94921875, + "learning_rate": 4.790019418875452e-05, + "loss": 5.3969, + "step": 860 + }, + { + "epoch": 0.2189098998887653, + "grad_norm": 17449.490234375, + "learning_rate": 4.789128234861017e-05, + "loss": 5.3993, + "step": 861 + }, + { + "epoch": 0.21916415064357223, + "grad_norm": 17495.359375, + "learning_rate": 4.7882352469621354e-05, + "loss": 5.3987, + "step": 862 + }, + { + "epoch": 0.21941840139837915, + "grad_norm": 17641.064453125, + "learning_rate": 4.787340455882504e-05, + "loss": 5.3996, + "step": 863 + }, + { + "epoch": 0.21967265215318607, + "grad_norm": 17604.541015625, + "learning_rate": 4.786443862327237e-05, + "loss": 5.4015, + "step": 864 + }, + { + "epoch": 0.21992690290799302, + "grad_norm": 17249.251953125, + "learning_rate": 4.78554546700287e-05, + "loss": 5.3957, + "step": 865 + }, + { + "epoch": 0.22018115366279994, + "grad_norm": 17524.5703125, + "learning_rate": 4.7846452706173596e-05, + "loss": 5.3888, + "step": 866 + }, + { + "epoch": 0.22043540441760687, + "grad_norm": 17364.328125, + "learning_rate": 4.7837432738800796e-05, + "loss": 5.3971, + "step": 867 + }, + { + "epoch": 0.2206896551724138, + "grad_norm": 17521.37109375, + "learning_rate": 4.782839477501825e-05, + "loss": 5.3782, + "step": 868 + }, + { + "epoch": 0.2209439059272207, + "grad_norm": 18890.900390625, + "learning_rate": 4.781933882194807e-05, + "loss": 5.384, + "step": 869 + }, + { + "epoch": 0.22119815668202766, + "grad_norm": 17461.412109375, + "learning_rate": 4.781026488672655e-05, + "loss": 5.3917, + "step": 870 + }, + { + "epoch": 0.2214524074368346, + "grad_norm": 17433.080078125, + "learning_rate": 4.780117297650415e-05, + "loss": 5.3781, + "step": 871 + }, + { + "epoch": 0.2217066581916415, + "grad_norm": 17750.408203125, + "learning_rate": 4.779206309844551e-05, + "loss": 5.3757, + "step": 872 + }, + { + "epoch": 0.22196090894644843, + "grad_norm": 17979.791015625, + "learning_rate": 4.7782935259729414e-05, + "loss": 5.3908, + "step": 873 + }, + { + "epoch": 0.22221515970125535, + "grad_norm": 18263.474609375, + "learning_rate": 4.77737894675488e-05, + "loss": 5.3898, + "step": 874 + }, + { + "epoch": 0.22246941045606228, + "grad_norm": 18163.5, + "learning_rate": 4.776462572911076e-05, + "loss": 5.4258, + "step": 875 + }, + { + "epoch": 0.22272366121086923, + "grad_norm": 17572.2890625, + "learning_rate": 4.7755444051636525e-05, + "loss": 5.3994, + "step": 876 + }, + { + "epoch": 0.22297791196567615, + "grad_norm": 17499.98046875, + "learning_rate": 4.774624444236147e-05, + "loss": 5.3757, + "step": 877 + }, + { + "epoch": 0.22323216272048307, + "grad_norm": 17581.466796875, + "learning_rate": 4.773702690853508e-05, + "loss": 5.3822, + "step": 878 + }, + { + "epoch": 0.22348641347529, + "grad_norm": 17665.419921875, + "learning_rate": 4.772779145742099e-05, + "loss": 5.3816, + "step": 879 + }, + { + "epoch": 0.22374066423009692, + "grad_norm": 17547.91015625, + "learning_rate": 4.771853809629694e-05, + "loss": 5.3914, + "step": 880 + }, + { + "epoch": 0.22399491498490387, + "grad_norm": 17665.396484375, + "learning_rate": 4.7709266832454786e-05, + "loss": 5.3713, + "step": 881 + }, + { + "epoch": 0.2242491657397108, + "grad_norm": 17600.853515625, + "learning_rate": 4.769997767320049e-05, + "loss": 5.3654, + "step": 882 + }, + { + "epoch": 0.22450341649451772, + "grad_norm": 17626.947265625, + "learning_rate": 4.769067062585412e-05, + "loss": 5.3778, + "step": 883 + }, + { + "epoch": 0.22475766724932464, + "grad_norm": 18446.50390625, + "learning_rate": 4.768134569774984e-05, + "loss": 5.3867, + "step": 884 + }, + { + "epoch": 0.22501191800413156, + "grad_norm": 18473.2109375, + "learning_rate": 4.76720028962359e-05, + "loss": 5.367, + "step": 885 + }, + { + "epoch": 0.2252661687589385, + "grad_norm": 17785.484375, + "learning_rate": 4.766264222867463e-05, + "loss": 5.3794, + "step": 886 + }, + { + "epoch": 0.22552041951374543, + "grad_norm": 17738.54296875, + "learning_rate": 4.7653263702442464e-05, + "loss": 5.3989, + "step": 887 + }, + { + "epoch": 0.22577467026855236, + "grad_norm": 19106.841796875, + "learning_rate": 4.764386732492988e-05, + "loss": 5.3801, + "step": 888 + }, + { + "epoch": 0.22602892102335928, + "grad_norm": 18218.19140625, + "learning_rate": 4.7634453103541434e-05, + "loss": 5.3671, + "step": 889 + }, + { + "epoch": 0.2262831717781662, + "grad_norm": 17934.93359375, + "learning_rate": 4.7625021045695736e-05, + "loss": 5.3913, + "step": 890 + }, + { + "epoch": 0.22653742253297315, + "grad_norm": 17768.75390625, + "learning_rate": 4.761557115882549e-05, + "loss": 5.3628, + "step": 891 + }, + { + "epoch": 0.22679167328778008, + "grad_norm": 17855.341796875, + "learning_rate": 4.760610345037738e-05, + "loss": 5.3907, + "step": 892 + }, + { + "epoch": 0.227045924042587, + "grad_norm": 17917.2421875, + "learning_rate": 4.759661792781219e-05, + "loss": 5.3769, + "step": 893 + }, + { + "epoch": 0.22730017479739392, + "grad_norm": 17846.27734375, + "learning_rate": 4.7587114598604745e-05, + "loss": 5.3572, + "step": 894 + }, + { + "epoch": 0.22755442555220085, + "grad_norm": 17893.140625, + "learning_rate": 4.757759347024384e-05, + "loss": 5.3806, + "step": 895 + }, + { + "epoch": 0.2278086763070078, + "grad_norm": 18742.2890625, + "learning_rate": 4.7568054550232376e-05, + "loss": 5.3574, + "step": 896 + }, + { + "epoch": 0.22806292706181472, + "grad_norm": 17868.220703125, + "learning_rate": 4.755849784608721e-05, + "loss": 5.3726, + "step": 897 + }, + { + "epoch": 0.22831717781662164, + "grad_norm": 17623.63671875, + "learning_rate": 4.754892336533926e-05, + "loss": 5.3723, + "step": 898 + }, + { + "epoch": 0.22857142857142856, + "grad_norm": 18265.275390625, + "learning_rate": 4.7539331115533416e-05, + "loss": 5.367, + "step": 899 + }, + { + "epoch": 0.2288256793262355, + "grad_norm": 18189.677734375, + "learning_rate": 4.7529721104228594e-05, + "loss": 5.3674, + "step": 900 + }, + { + "epoch": 0.2288256793262355, + "eval_loss": 10.820060729980469, + "eval_runtime": 700.6181, + "eval_samples_per_second": 151.275, + "eval_steps_per_second": 9.456, + "step": 900 + }, + { + "epoch": 0.22907993008104244, + "grad_norm": 17588.166015625, + "learning_rate": 4.75200933389977e-05, + "loss": 5.3686, + "step": 901 + }, + { + "epoch": 0.22933418083584936, + "grad_norm": 17979.3203125, + "learning_rate": 4.751044782742762e-05, + "loss": 5.3782, + "step": 902 + }, + { + "epoch": 0.22958843159065628, + "grad_norm": 17750.35546875, + "learning_rate": 4.750078457711924e-05, + "loss": 5.3605, + "step": 903 + }, + { + "epoch": 0.2298426823454632, + "grad_norm": 18194.66796875, + "learning_rate": 4.749110359568741e-05, + "loss": 5.3513, + "step": 904 + }, + { + "epoch": 0.23009693310027013, + "grad_norm": 18587.244140625, + "learning_rate": 4.748140489076098e-05, + "loss": 5.372, + "step": 905 + }, + { + "epoch": 0.23035118385507708, + "grad_norm": 18668.67578125, + "learning_rate": 4.747168846998273e-05, + "loss": 5.3629, + "step": 906 + }, + { + "epoch": 0.230605434609884, + "grad_norm": 18011.33203125, + "learning_rate": 4.746195434100943e-05, + "loss": 5.3521, + "step": 907 + }, + { + "epoch": 0.23085968536469093, + "grad_norm": 17774.375, + "learning_rate": 4.745220251151178e-05, + "loss": 5.3644, + "step": 908 + }, + { + "epoch": 0.23111393611949785, + "grad_norm": 18431.01171875, + "learning_rate": 4.7442432989174447e-05, + "loss": 5.3618, + "step": 909 + }, + { + "epoch": 0.23136818687430477, + "grad_norm": 21850.416015625, + "learning_rate": 4.743264578169603e-05, + "loss": 5.3504, + "step": 910 + }, + { + "epoch": 0.23162243762911172, + "grad_norm": 22610.974609375, + "learning_rate": 4.742284089678908e-05, + "loss": 5.3578, + "step": 911 + }, + { + "epoch": 0.23187668838391864, + "grad_norm": 17902.734375, + "learning_rate": 4.741301834218006e-05, + "loss": 5.3586, + "step": 912 + }, + { + "epoch": 0.23213093913872557, + "grad_norm": 32920.03515625, + "learning_rate": 4.740317812560935e-05, + "loss": 5.3584, + "step": 913 + }, + { + "epoch": 0.2323851898935325, + "grad_norm": 20057.27734375, + "learning_rate": 4.739332025483127e-05, + "loss": 5.3523, + "step": 914 + }, + { + "epoch": 0.2326394406483394, + "grad_norm": 19211.392578125, + "learning_rate": 4.7383444737614056e-05, + "loss": 5.3464, + "step": 915 + }, + { + "epoch": 0.23289369140314636, + "grad_norm": 29184.7421875, + "learning_rate": 4.7373551581739825e-05, + "loss": 5.3691, + "step": 916 + }, + { + "epoch": 0.2331479421579533, + "grad_norm": 19907.173828125, + "learning_rate": 4.736364079500461e-05, + "loss": 5.3562, + "step": 917 + }, + { + "epoch": 0.2334021929127602, + "grad_norm": 18084.181640625, + "learning_rate": 4.735371238521833e-05, + "loss": 5.3514, + "step": 918 + }, + { + "epoch": 0.23365644366756713, + "grad_norm": 23599.41796875, + "learning_rate": 4.73437663602048e-05, + "loss": 5.3381, + "step": 919 + }, + { + "epoch": 0.23391069442237405, + "grad_norm": 20313.943359375, + "learning_rate": 4.7333802727801706e-05, + "loss": 5.3468, + "step": 920 + }, + { + "epoch": 0.234164945177181, + "grad_norm": 18963.916015625, + "learning_rate": 4.7323821495860616e-05, + "loss": 5.3427, + "step": 921 + }, + { + "epoch": 0.23441919593198793, + "grad_norm": 29278.365234375, + "learning_rate": 4.731382267224697e-05, + "loss": 5.3646, + "step": 922 + }, + { + "epoch": 0.23467344668679485, + "grad_norm": 18982.904296875, + "learning_rate": 4.730380626484005e-05, + "loss": 5.3521, + "step": 923 + }, + { + "epoch": 0.23492769744160177, + "grad_norm": 19954.646484375, + "learning_rate": 4.7293772281533024e-05, + "loss": 5.3511, + "step": 924 + }, + { + "epoch": 0.2351819481964087, + "grad_norm": 20905.38671875, + "learning_rate": 4.7283720730232895e-05, + "loss": 5.3481, + "step": 925 + }, + { + "epoch": 0.23543619895121565, + "grad_norm": 17828.63671875, + "learning_rate": 4.727365161886051e-05, + "loss": 5.3532, + "step": 926 + }, + { + "epoch": 0.23569044970602257, + "grad_norm": 18717.716796875, + "learning_rate": 4.7263564955350546e-05, + "loss": 5.3525, + "step": 927 + }, + { + "epoch": 0.2359447004608295, + "grad_norm": 18535.908203125, + "learning_rate": 4.725346074765154e-05, + "loss": 5.3403, + "step": 928 + }, + { + "epoch": 0.23619895121563642, + "grad_norm": 17812.9609375, + "learning_rate": 4.7243339003725816e-05, + "loss": 5.3547, + "step": 929 + }, + { + "epoch": 0.23645320197044334, + "grad_norm": 18247.92578125, + "learning_rate": 4.723319973154954e-05, + "loss": 5.3259, + "step": 930 + }, + { + "epoch": 0.2367074527252503, + "grad_norm": 18077.51953125, + "learning_rate": 4.7223042939112686e-05, + "loss": 5.3402, + "step": 931 + }, + { + "epoch": 0.2369617034800572, + "grad_norm": 18649.927734375, + "learning_rate": 4.721286863441905e-05, + "loss": 5.3482, + "step": 932 + }, + { + "epoch": 0.23721595423486413, + "grad_norm": 17952.345703125, + "learning_rate": 4.720267682548618e-05, + "loss": 5.3352, + "step": 933 + }, + { + "epoch": 0.23747020498967106, + "grad_norm": 18010.802734375, + "learning_rate": 4.719246752034548e-05, + "loss": 5.3444, + "step": 934 + }, + { + "epoch": 0.23772445574447798, + "grad_norm": 17835.041015625, + "learning_rate": 4.71822407270421e-05, + "loss": 5.3614, + "step": 935 + }, + { + "epoch": 0.23797870649928493, + "grad_norm": 17767.880859375, + "learning_rate": 4.7171996453634984e-05, + "loss": 5.3437, + "step": 936 + }, + { + "epoch": 0.23823295725409185, + "grad_norm": 17838.28515625, + "learning_rate": 4.716173470819684e-05, + "loss": 5.3455, + "step": 937 + }, + { + "epoch": 0.23848720800889878, + "grad_norm": 17781.654296875, + "learning_rate": 4.715145549881417e-05, + "loss": 5.3349, + "step": 938 + }, + { + "epoch": 0.2387414587637057, + "grad_norm": 17865.822265625, + "learning_rate": 4.714115883358722e-05, + "loss": 5.3388, + "step": 939 + }, + { + "epoch": 0.23899570951851262, + "grad_norm": 18114.009765625, + "learning_rate": 4.713084472062998e-05, + "loss": 5.3403, + "step": 940 + }, + { + "epoch": 0.23924996027331957, + "grad_norm": 18377.095703125, + "learning_rate": 4.7120513168070215e-05, + "loss": 5.3401, + "step": 941 + }, + { + "epoch": 0.2395042110281265, + "grad_norm": 18223.400390625, + "learning_rate": 4.711016418404941e-05, + "loss": 5.3463, + "step": 942 + }, + { + "epoch": 0.23975846178293342, + "grad_norm": 17863.875, + "learning_rate": 4.709979777672281e-05, + "loss": 5.3475, + "step": 943 + }, + { + "epoch": 0.24001271253774034, + "grad_norm": 18238.734375, + "learning_rate": 4.708941395425936e-05, + "loss": 5.331, + "step": 944 + }, + { + "epoch": 0.24026696329254726, + "grad_norm": 19287.94921875, + "learning_rate": 4.707901272484177e-05, + "loss": 5.3254, + "step": 945 + }, + { + "epoch": 0.24052121404735421, + "grad_norm": 18871.19921875, + "learning_rate": 4.706859409666642e-05, + "loss": 5.3527, + "step": 946 + }, + { + "epoch": 0.24077546480216114, + "grad_norm": 17860.0859375, + "learning_rate": 4.7058158077943424e-05, + "loss": 5.348, + "step": 947 + }, + { + "epoch": 0.24102971555696806, + "grad_norm": 20285.310546875, + "learning_rate": 4.7047704676896606e-05, + "loss": 5.3342, + "step": 948 + }, + { + "epoch": 0.24128396631177498, + "grad_norm": 19997.982421875, + "learning_rate": 4.703723390176349e-05, + "loss": 5.3364, + "step": 949 + }, + { + "epoch": 0.2415382170665819, + "grad_norm": 18071.662109375, + "learning_rate": 4.702674576079527e-05, + "loss": 5.3293, + "step": 950 + }, + { + "epoch": 0.24179246782138886, + "grad_norm": 23144.951171875, + "learning_rate": 4.7016240262256825e-05, + "loss": 5.332, + "step": 951 + }, + { + "epoch": 0.24204671857619578, + "grad_norm": 19710.435546875, + "learning_rate": 4.700571741442674e-05, + "loss": 5.3289, + "step": 952 + }, + { + "epoch": 0.2423009693310027, + "grad_norm": 18603.517578125, + "learning_rate": 4.699517722559726e-05, + "loss": 5.3384, + "step": 953 + }, + { + "epoch": 0.24255522008580963, + "grad_norm": 22146.8046875, + "learning_rate": 4.698461970407429e-05, + "loss": 5.3405, + "step": 954 + }, + { + "epoch": 0.24280947084061655, + "grad_norm": 19674.232421875, + "learning_rate": 4.697404485817737e-05, + "loss": 5.3255, + "step": 955 + }, + { + "epoch": 0.2430637215954235, + "grad_norm": 18092.431640625, + "learning_rate": 4.696345269623974e-05, + "loss": 5.3357, + "step": 956 + }, + { + "epoch": 0.24331797235023042, + "grad_norm": 21211.541015625, + "learning_rate": 4.695284322660825e-05, + "loss": 5.3314, + "step": 957 + }, + { + "epoch": 0.24357222310503734, + "grad_norm": 20317.283203125, + "learning_rate": 4.694221645764341e-05, + "loss": 5.3308, + "step": 958 + }, + { + "epoch": 0.24382647385984427, + "grad_norm": 17944.79296875, + "learning_rate": 4.6931572397719346e-05, + "loss": 5.3376, + "step": 959 + }, + { + "epoch": 0.2440807246146512, + "grad_norm": 19708.265625, + "learning_rate": 4.6920911055223814e-05, + "loss": 5.3377, + "step": 960 + }, + { + "epoch": 0.24433497536945814, + "grad_norm": 18920.71875, + "learning_rate": 4.69102324385582e-05, + "loss": 5.3346, + "step": 961 + }, + { + "epoch": 0.24458922612426506, + "grad_norm": 18248.060546875, + "learning_rate": 4.689953655613748e-05, + "loss": 5.3216, + "step": 962 + }, + { + "epoch": 0.244843476879072, + "grad_norm": 20516.634765625, + "learning_rate": 4.6888823416390264e-05, + "loss": 5.3206, + "step": 963 + }, + { + "epoch": 0.2450977276338789, + "grad_norm": 18636.779296875, + "learning_rate": 4.687809302775874e-05, + "loss": 5.3257, + "step": 964 + }, + { + "epoch": 0.24535197838868583, + "grad_norm": 18972.439453125, + "learning_rate": 4.6867345398698694e-05, + "loss": 5.3068, + "step": 965 + }, + { + "epoch": 0.24560622914349278, + "grad_norm": 20794.03515625, + "learning_rate": 4.68565805376795e-05, + "loss": 5.316, + "step": 966 + }, + { + "epoch": 0.2458604798982997, + "grad_norm": 18309.11328125, + "learning_rate": 4.684579845318411e-05, + "loss": 5.3131, + "step": 967 + }, + { + "epoch": 0.24611473065310663, + "grad_norm": 18749.728515625, + "learning_rate": 4.6834999153709055e-05, + "loss": 5.3225, + "step": 968 + }, + { + "epoch": 0.24636898140791355, + "grad_norm": 20605.408203125, + "learning_rate": 4.682418264776442e-05, + "loss": 5.3263, + "step": 969 + }, + { + "epoch": 0.24662323216272047, + "grad_norm": 18425.98046875, + "learning_rate": 4.6813348943873844e-05, + "loss": 5.3088, + "step": 970 + }, + { + "epoch": 0.24687748291752742, + "grad_norm": 18449.224609375, + "learning_rate": 4.680249805057455e-05, + "loss": 5.3222, + "step": 971 + }, + { + "epoch": 0.24713173367233435, + "grad_norm": 19419.236328125, + "learning_rate": 4.6791629976417264e-05, + "loss": 5.3248, + "step": 972 + }, + { + "epoch": 0.24738598442714127, + "grad_norm": 18711.662109375, + "learning_rate": 4.678074472996628e-05, + "loss": 5.3164, + "step": 973 + }, + { + "epoch": 0.2476402351819482, + "grad_norm": 18136.14453125, + "learning_rate": 4.676984231979944e-05, + "loss": 5.3279, + "step": 974 + }, + { + "epoch": 0.24789448593675512, + "grad_norm": 18540.59765625, + "learning_rate": 4.675892275450805e-05, + "loss": 5.3159, + "step": 975 + }, + { + "epoch": 0.24814873669156207, + "grad_norm": 18179.595703125, + "learning_rate": 4.6747986042697e-05, + "loss": 5.3181, + "step": 976 + }, + { + "epoch": 0.248402987446369, + "grad_norm": 17983.767578125, + "learning_rate": 4.673703219298465e-05, + "loss": 5.2956, + "step": 977 + }, + { + "epoch": 0.2486572382011759, + "grad_norm": 18342.99609375, + "learning_rate": 4.6726061214002894e-05, + "loss": 5.3249, + "step": 978 + }, + { + "epoch": 0.24891148895598283, + "grad_norm": 18580.19140625, + "learning_rate": 4.671507311439709e-05, + "loss": 5.3274, + "step": 979 + }, + { + "epoch": 0.24916573971078976, + "grad_norm": 18269.54296875, + "learning_rate": 4.670406790282612e-05, + "loss": 5.3286, + "step": 980 + }, + { + "epoch": 0.2494199904655967, + "grad_norm": 18964.98828125, + "learning_rate": 4.669304558796233e-05, + "loss": 5.3143, + "step": 981 + }, + { + "epoch": 0.24967424122040363, + "grad_norm": 18295.220703125, + "learning_rate": 4.668200617849157e-05, + "loss": 5.3232, + "step": 982 + }, + { + "epoch": 0.24992849197521055, + "grad_norm": 18181.0546875, + "learning_rate": 4.667094968311311e-05, + "loss": 5.3061, + "step": 983 + }, + { + "epoch": 0.2501827427300175, + "grad_norm": 18935.896484375, + "learning_rate": 4.665987611053975e-05, + "loss": 5.3209, + "step": 984 + }, + { + "epoch": 0.2504369934848244, + "grad_norm": 18704.439453125, + "learning_rate": 4.6648785469497696e-05, + "loss": 5.3037, + "step": 985 + }, + { + "epoch": 0.25069124423963135, + "grad_norm": 18550.984375, + "learning_rate": 4.663767776872663e-05, + "loss": 5.329, + "step": 986 + }, + { + "epoch": 0.25094549499443825, + "grad_norm": 20367.5078125, + "learning_rate": 4.662655301697966e-05, + "loss": 5.3121, + "step": 987 + }, + { + "epoch": 0.2511997457492452, + "grad_norm": 20519.23046875, + "learning_rate": 4.6615411223023346e-05, + "loss": 5.3071, + "step": 988 + }, + { + "epoch": 0.25145399650405215, + "grad_norm": 18400.6875, + "learning_rate": 4.660425239563767e-05, + "loss": 5.3133, + "step": 989 + }, + { + "epoch": 0.25170824725885904, + "grad_norm": 24820.783203125, + "learning_rate": 4.659307654361605e-05, + "loss": 5.2959, + "step": 990 + }, + { + "epoch": 0.251962498013666, + "grad_norm": 20000.869140625, + "learning_rate": 4.658188367576529e-05, + "loss": 5.3138, + "step": 991 + }, + { + "epoch": 0.2522167487684729, + "grad_norm": 20044.357421875, + "learning_rate": 4.657067380090563e-05, + "loss": 5.3042, + "step": 992 + }, + { + "epoch": 0.25247099952327984, + "grad_norm": 24468.15625, + "learning_rate": 4.65594469278707e-05, + "loss": 5.2929, + "step": 993 + }, + { + "epoch": 0.2527252502780868, + "grad_norm": 18265.525390625, + "learning_rate": 4.6548203065507533e-05, + "loss": 5.2895, + "step": 994 + }, + { + "epoch": 0.2529795010328937, + "grad_norm": 20103.119140625, + "learning_rate": 4.653694222267655e-05, + "loss": 5.319, + "step": 995 + }, + { + "epoch": 0.25323375178770063, + "grad_norm": 18747.2890625, + "learning_rate": 4.6525664408251526e-05, + "loss": 5.2967, + "step": 996 + }, + { + "epoch": 0.25348800254250753, + "grad_norm": 18531.0703125, + "learning_rate": 4.651436963111966e-05, + "loss": 5.2981, + "step": 997 + }, + { + "epoch": 0.2537422532973145, + "grad_norm": 18618.912109375, + "learning_rate": 4.650305790018147e-05, + "loss": 5.3137, + "step": 998 + }, + { + "epoch": 0.25399650405212143, + "grad_norm": 18419.814453125, + "learning_rate": 4.649172922435086e-05, + "loss": 5.2909, + "step": 999 + }, + { + "epoch": 0.2542507548069283, + "grad_norm": 20947.517578125, + "learning_rate": 4.648038361255508e-05, + "loss": 5.297, + "step": 1000 + }, + { + "epoch": 0.2542507548069283, + "eval_loss": 10.676921844482422, + "eval_runtime": 698.7233, + "eval_samples_per_second": 151.685, + "eval_steps_per_second": 9.482, + "step": 1000 + }, + { + "epoch": 0.2545050055617353, + "grad_norm": 18704.73828125, + "learning_rate": 4.646902107373473e-05, + "loss": 5.2888, + "step": 1001 + }, + { + "epoch": 0.25475925631654217, + "grad_norm": 19256.822265625, + "learning_rate": 4.645764161684375e-05, + "loss": 5.2966, + "step": 1002 + }, + { + "epoch": 0.2550135070713491, + "grad_norm": 20010.93359375, + "learning_rate": 4.6446245250849396e-05, + "loss": 5.3064, + "step": 1003 + }, + { + "epoch": 0.25526775782615607, + "grad_norm": 18103.59375, + "learning_rate": 4.6434831984732264e-05, + "loss": 5.2922, + "step": 1004 + }, + { + "epoch": 0.25552200858096297, + "grad_norm": 19176.099609375, + "learning_rate": 4.642340182748627e-05, + "loss": 5.2599, + "step": 1005 + }, + { + "epoch": 0.2557762593357699, + "grad_norm": 18458.234375, + "learning_rate": 4.6411954788118624e-05, + "loss": 5.2888, + "step": 1006 + }, + { + "epoch": 0.2560305100905768, + "grad_norm": 20568.380859375, + "learning_rate": 4.640049087564986e-05, + "loss": 5.2903, + "step": 1007 + }, + { + "epoch": 0.25628476084538376, + "grad_norm": 18988.369140625, + "learning_rate": 4.638901009911379e-05, + "loss": 5.2875, + "step": 1008 + }, + { + "epoch": 0.2565390116001907, + "grad_norm": 18686.9375, + "learning_rate": 4.637751246755753e-05, + "loss": 5.3128, + "step": 1009 + }, + { + "epoch": 0.2567932623549976, + "grad_norm": 18832.705078125, + "learning_rate": 4.636599799004148e-05, + "loss": 5.2974, + "step": 1010 + }, + { + "epoch": 0.25704751310980456, + "grad_norm": 18556.197265625, + "learning_rate": 4.6354466675639285e-05, + "loss": 5.2934, + "step": 1011 + }, + { + "epoch": 0.25730176386461145, + "grad_norm": 19354.244140625, + "learning_rate": 4.634291853343789e-05, + "loss": 5.2936, + "step": 1012 + }, + { + "epoch": 0.2575560146194184, + "grad_norm": 18408.103515625, + "learning_rate": 4.633135357253751e-05, + "loss": 5.2797, + "step": 1013 + }, + { + "epoch": 0.25781026537422536, + "grad_norm": 19626.26953125, + "learning_rate": 4.631977180205156e-05, + "loss": 5.2847, + "step": 1014 + }, + { + "epoch": 0.25806451612903225, + "grad_norm": 18935.166015625, + "learning_rate": 4.630817323110676e-05, + "loss": 5.2829, + "step": 1015 + }, + { + "epoch": 0.2583187668838392, + "grad_norm": 18956.142578125, + "learning_rate": 4.629655786884302e-05, + "loss": 5.2931, + "step": 1016 + }, + { + "epoch": 0.2585730176386461, + "grad_norm": 22038.21875, + "learning_rate": 4.6284925724413534e-05, + "loss": 5.2939, + "step": 1017 + }, + { + "epoch": 0.25882726839345305, + "grad_norm": 18735.51171875, + "learning_rate": 4.627327680698468e-05, + "loss": 5.2861, + "step": 1018 + }, + { + "epoch": 0.25908151914826, + "grad_norm": 18768.82421875, + "learning_rate": 4.626161112573606e-05, + "loss": 5.2874, + "step": 1019 + }, + { + "epoch": 0.2593357699030669, + "grad_norm": 19256.115234375, + "learning_rate": 4.6249928689860504e-05, + "loss": 5.2698, + "step": 1020 + }, + { + "epoch": 0.25959002065787384, + "grad_norm": 18492.08984375, + "learning_rate": 4.6238229508564036e-05, + "loss": 5.2819, + "step": 1021 + }, + { + "epoch": 0.25984427141268074, + "grad_norm": 20447.416015625, + "learning_rate": 4.6226513591065856e-05, + "loss": 5.2845, + "step": 1022 + }, + { + "epoch": 0.2600985221674877, + "grad_norm": 18602.169921875, + "learning_rate": 4.6214780946598386e-05, + "loss": 5.2835, + "step": 1023 + }, + { + "epoch": 0.26035277292229464, + "grad_norm": 20133.9140625, + "learning_rate": 4.620303158440721e-05, + "loss": 5.2857, + "step": 1024 + }, + { + "epoch": 0.26060702367710153, + "grad_norm": 18633.75390625, + "learning_rate": 4.619126551375109e-05, + "loss": 5.2786, + "step": 1025 + }, + { + "epoch": 0.2608612744319085, + "grad_norm": 18813.966796875, + "learning_rate": 4.617948274390194e-05, + "loss": 5.2519, + "step": 1026 + }, + { + "epoch": 0.2611155251867154, + "grad_norm": 18847.447265625, + "learning_rate": 4.616768328414487e-05, + "loss": 5.2768, + "step": 1027 + }, + { + "epoch": 0.26136977594152233, + "grad_norm": 18288.798828125, + "learning_rate": 4.6155867143778096e-05, + "loss": 5.2885, + "step": 1028 + }, + { + "epoch": 0.2616240266963293, + "grad_norm": 18775.12890625, + "learning_rate": 4.614403433211303e-05, + "loss": 5.2616, + "step": 1029 + }, + { + "epoch": 0.2618782774511362, + "grad_norm": 18532.947265625, + "learning_rate": 4.613218485847416e-05, + "loss": 5.2927, + "step": 1030 + }, + { + "epoch": 0.2621325282059431, + "grad_norm": 19250.541015625, + "learning_rate": 4.612031873219916e-05, + "loss": 5.2784, + "step": 1031 + }, + { + "epoch": 0.26238677896075, + "grad_norm": 19046.58984375, + "learning_rate": 4.6108435962638805e-05, + "loss": 5.2806, + "step": 1032 + }, + { + "epoch": 0.262641029715557, + "grad_norm": 18557.748046875, + "learning_rate": 4.6096536559156976e-05, + "loss": 5.2698, + "step": 1033 + }, + { + "epoch": 0.2628952804703639, + "grad_norm": 18484.72265625, + "learning_rate": 4.6084620531130665e-05, + "loss": 5.2833, + "step": 1034 + }, + { + "epoch": 0.2631495312251708, + "grad_norm": 18384.3515625, + "learning_rate": 4.6072687887949986e-05, + "loss": 5.2771, + "step": 1035 + }, + { + "epoch": 0.26340378197997777, + "grad_norm": 18433.171875, + "learning_rate": 4.606073863901811e-05, + "loss": 5.2523, + "step": 1036 + }, + { + "epoch": 0.26365803273478466, + "grad_norm": 18639.447265625, + "learning_rate": 4.6048772793751324e-05, + "loss": 5.2709, + "step": 1037 + }, + { + "epoch": 0.2639122834895916, + "grad_norm": 18573.83984375, + "learning_rate": 4.603679036157899e-05, + "loss": 5.2708, + "step": 1038 + }, + { + "epoch": 0.26416653424439857, + "grad_norm": 18574.572265625, + "learning_rate": 4.602479135194352e-05, + "loss": 5.2711, + "step": 1039 + }, + { + "epoch": 0.26442078499920546, + "grad_norm": 18720.8046875, + "learning_rate": 4.601277577430041e-05, + "loss": 5.2689, + "step": 1040 + }, + { + "epoch": 0.2646750357540124, + "grad_norm": 19043.833984375, + "learning_rate": 4.6000743638118206e-05, + "loss": 5.2603, + "step": 1041 + }, + { + "epoch": 0.2649292865088193, + "grad_norm": 18659.544921875, + "learning_rate": 4.598869495287849e-05, + "loss": 5.2842, + "step": 1042 + }, + { + "epoch": 0.26518353726362626, + "grad_norm": 19066.189453125, + "learning_rate": 4.5976629728075913e-05, + "loss": 5.2879, + "step": 1043 + }, + { + "epoch": 0.2654377880184332, + "grad_norm": 18618.068359375, + "learning_rate": 4.596454797321813e-05, + "loss": 5.2625, + "step": 1044 + }, + { + "epoch": 0.2656920387732401, + "grad_norm": 18596.009765625, + "learning_rate": 4.595244969782585e-05, + "loss": 5.2554, + "step": 1045 + }, + { + "epoch": 0.26594628952804705, + "grad_norm": 19555.931640625, + "learning_rate": 4.594033491143277e-05, + "loss": 5.2776, + "step": 1046 + }, + { + "epoch": 0.26620054028285395, + "grad_norm": 18995.42578125, + "learning_rate": 4.592820362358562e-05, + "loss": 5.2615, + "step": 1047 + }, + { + "epoch": 0.2664547910376609, + "grad_norm": 18991.349609375, + "learning_rate": 4.591605584384413e-05, + "loss": 5.2549, + "step": 1048 + }, + { + "epoch": 0.2667090417924678, + "grad_norm": 18617.767578125, + "learning_rate": 4.590389158178102e-05, + "loss": 5.2746, + "step": 1049 + }, + { + "epoch": 0.26696329254727474, + "grad_norm": 18747.291015625, + "learning_rate": 4.5891710846982e-05, + "loss": 5.263, + "step": 1050 + }, + { + "epoch": 0.2672175433020817, + "grad_norm": 18721.935546875, + "learning_rate": 4.587951364904576e-05, + "loss": 5.262, + "step": 1051 + }, + { + "epoch": 0.2674717940568886, + "grad_norm": 18589.1875, + "learning_rate": 4.586729999758398e-05, + "loss": 5.2491, + "step": 1052 + }, + { + "epoch": 0.26772604481169554, + "grad_norm": 18682.291015625, + "learning_rate": 4.585506990222127e-05, + "loss": 5.2676, + "step": 1053 + }, + { + "epoch": 0.26798029556650244, + "grad_norm": 18647.712890625, + "learning_rate": 4.584282337259524e-05, + "loss": 5.27, + "step": 1054 + }, + { + "epoch": 0.2682345463213094, + "grad_norm": 18895.015625, + "learning_rate": 4.583056041835643e-05, + "loss": 5.2683, + "step": 1055 + }, + { + "epoch": 0.26848879707611634, + "grad_norm": 18877.19921875, + "learning_rate": 4.58182810491683e-05, + "loss": 5.2757, + "step": 1056 + }, + { + "epoch": 0.26874304783092323, + "grad_norm": 18682.20703125, + "learning_rate": 4.580598527470729e-05, + "loss": 5.2627, + "step": 1057 + }, + { + "epoch": 0.2689972985857302, + "grad_norm": 18800.166015625, + "learning_rate": 4.5793673104662746e-05, + "loss": 5.2508, + "step": 1058 + }, + { + "epoch": 0.2692515493405371, + "grad_norm": 18938.908203125, + "learning_rate": 4.578134454873692e-05, + "loss": 5.2507, + "step": 1059 + }, + { + "epoch": 0.26950580009534403, + "grad_norm": 23650.009765625, + "learning_rate": 4.5768999616645006e-05, + "loss": 5.2504, + "step": 1060 + }, + { + "epoch": 0.269760050850151, + "grad_norm": 19527.74609375, + "learning_rate": 4.5756638318115074e-05, + "loss": 5.2633, + "step": 1061 + }, + { + "epoch": 0.2700143016049579, + "grad_norm": 19006.771484375, + "learning_rate": 4.574426066288812e-05, + "loss": 5.2599, + "step": 1062 + }, + { + "epoch": 0.2702685523597648, + "grad_norm": 19969.28125, + "learning_rate": 4.5731866660717997e-05, + "loss": 5.2504, + "step": 1063 + }, + { + "epoch": 0.2705228031145717, + "grad_norm": 20093.740234375, + "learning_rate": 4.571945632137147e-05, + "loss": 5.2544, + "step": 1064 + }, + { + "epoch": 0.27077705386937867, + "grad_norm": 18873.5078125, + "learning_rate": 4.570702965462817e-05, + "loss": 5.2385, + "step": 1065 + }, + { + "epoch": 0.2710313046241856, + "grad_norm": 25937.205078125, + "learning_rate": 4.5694586670280566e-05, + "loss": 5.264, + "step": 1066 + }, + { + "epoch": 0.2712855553789925, + "grad_norm": 20998.25390625, + "learning_rate": 4.568212737813403e-05, + "loss": 5.2587, + "step": 1067 + }, + { + "epoch": 0.27153980613379947, + "grad_norm": 20501.58203125, + "learning_rate": 4.566965178800676e-05, + "loss": 5.2149, + "step": 1068 + }, + { + "epoch": 0.27179405688860636, + "grad_norm": 28938.4921875, + "learning_rate": 4.56571599097298e-05, + "loss": 5.2487, + "step": 1069 + }, + { + "epoch": 0.2720483076434133, + "grad_norm": 19291.228515625, + "learning_rate": 4.5644651753147015e-05, + "loss": 5.2612, + "step": 1070 + }, + { + "epoch": 0.27230255839822026, + "grad_norm": 21341.572265625, + "learning_rate": 4.5632127328115146e-05, + "loss": 5.2482, + "step": 1071 + }, + { + "epoch": 0.27255680915302716, + "grad_norm": 20495.69140625, + "learning_rate": 4.561958664450369e-05, + "loss": 5.2533, + "step": 1072 + }, + { + "epoch": 0.2728110599078341, + "grad_norm": 19533.490234375, + "learning_rate": 4.5607029712195004e-05, + "loss": 5.2511, + "step": 1073 + }, + { + "epoch": 0.273065310662641, + "grad_norm": 19501.73046875, + "learning_rate": 4.559445654108424e-05, + "loss": 5.2442, + "step": 1074 + }, + { + "epoch": 0.27331956141744795, + "grad_norm": 18751.208984375, + "learning_rate": 4.5581867141079315e-05, + "loss": 5.2471, + "step": 1075 + }, + { + "epoch": 0.2735738121722549, + "grad_norm": 23359.640625, + "learning_rate": 4.556926152210097e-05, + "loss": 5.2689, + "step": 1076 + }, + { + "epoch": 0.2738280629270618, + "grad_norm": 20858.423828125, + "learning_rate": 4.555663969408273e-05, + "loss": 5.2448, + "step": 1077 + }, + { + "epoch": 0.27408231368186875, + "grad_norm": 20229.2734375, + "learning_rate": 4.5544001666970845e-05, + "loss": 5.2411, + "step": 1078 + }, + { + "epoch": 0.27433656443667565, + "grad_norm": 22775.314453125, + "learning_rate": 4.5531347450724396e-05, + "loss": 5.232, + "step": 1079 + }, + { + "epoch": 0.2745908151914826, + "grad_norm": 18757.3671875, + "learning_rate": 4.551867705531519e-05, + "loss": 5.246, + "step": 1080 + }, + { + "epoch": 0.27484506594628955, + "grad_norm": 20306.38671875, + "learning_rate": 4.550599049072776e-05, + "loss": 5.2362, + "step": 1081 + }, + { + "epoch": 0.27509931670109644, + "grad_norm": 18759.568359375, + "learning_rate": 4.549328776695941e-05, + "loss": 5.227, + "step": 1082 + }, + { + "epoch": 0.2753535674559034, + "grad_norm": 22638.13671875, + "learning_rate": 4.548056889402019e-05, + "loss": 5.2403, + "step": 1083 + }, + { + "epoch": 0.2756078182107103, + "grad_norm": 19456.306640625, + "learning_rate": 4.5467833881932835e-05, + "loss": 5.2447, + "step": 1084 + }, + { + "epoch": 0.27586206896551724, + "grad_norm": 20195.033203125, + "learning_rate": 4.5455082740732835e-05, + "loss": 5.2475, + "step": 1085 + }, + { + "epoch": 0.2761163197203242, + "grad_norm": 19552.236328125, + "learning_rate": 4.5442315480468365e-05, + "loss": 5.241, + "step": 1086 + }, + { + "epoch": 0.2763705704751311, + "grad_norm": 19486.474609375, + "learning_rate": 4.542953211120033e-05, + "loss": 5.2518, + "step": 1087 + }, + { + "epoch": 0.27662482122993803, + "grad_norm": 19451.001953125, + "learning_rate": 4.541673264300229e-05, + "loss": 5.2324, + "step": 1088 + }, + { + "epoch": 0.27687907198474493, + "grad_norm": 19479.142578125, + "learning_rate": 4.540391708596053e-05, + "loss": 5.2207, + "step": 1089 + }, + { + "epoch": 0.2771333227395519, + "grad_norm": 21534.453125, + "learning_rate": 4.539108545017399e-05, + "loss": 5.2264, + "step": 1090 + }, + { + "epoch": 0.27738757349435883, + "grad_norm": 18740.244140625, + "learning_rate": 4.537823774575428e-05, + "loss": 5.2392, + "step": 1091 + }, + { + "epoch": 0.2776418242491657, + "grad_norm": 19559.61328125, + "learning_rate": 4.5365373982825695e-05, + "loss": 5.2292, + "step": 1092 + }, + { + "epoch": 0.2778960750039727, + "grad_norm": 19390.04296875, + "learning_rate": 4.5352494171525155e-05, + "loss": 5.2284, + "step": 1093 + }, + { + "epoch": 0.27815032575877957, + "grad_norm": 22003.71875, + "learning_rate": 4.5339598322002255e-05, + "loss": 5.2337, + "step": 1094 + }, + { + "epoch": 0.2784045765135865, + "grad_norm": 18859.396484375, + "learning_rate": 4.532668644441919e-05, + "loss": 5.2227, + "step": 1095 + }, + { + "epoch": 0.27865882726839347, + "grad_norm": 20484.791015625, + "learning_rate": 4.5313758548950837e-05, + "loss": 5.2304, + "step": 1096 + }, + { + "epoch": 0.27891307802320037, + "grad_norm": 18884.150390625, + "learning_rate": 4.530081464578465e-05, + "loss": 5.2221, + "step": 1097 + }, + { + "epoch": 0.2791673287780073, + "grad_norm": 22696.0078125, + "learning_rate": 4.5287854745120726e-05, + "loss": 5.2395, + "step": 1098 + }, + { + "epoch": 0.2794215795328142, + "grad_norm": 19195.8359375, + "learning_rate": 4.527487885717175e-05, + "loss": 5.2301, + "step": 1099 + }, + { + "epoch": 0.27967583028762116, + "grad_norm": 20194.6171875, + "learning_rate": 4.526188699216301e-05, + "loss": 5.2476, + "step": 1100 + }, + { + "epoch": 0.27967583028762116, + "eval_loss": 10.538366317749023, + "eval_runtime": 698.9295, + "eval_samples_per_second": 151.64, + "eval_steps_per_second": 9.479, + "step": 1100 + }, + { + "epoch": 0.2799300810424281, + "grad_norm": 18768.498046875, + "learning_rate": 4.524887916033241e-05, + "loss": 5.2272, + "step": 1101 + }, + { + "epoch": 0.280184331797235, + "grad_norm": 20945.138671875, + "learning_rate": 4.523585537193039e-05, + "loss": 5.2288, + "step": 1102 + }, + { + "epoch": 0.28043858255204196, + "grad_norm": 19218.41796875, + "learning_rate": 4.5222815637219984e-05, + "loss": 5.2257, + "step": 1103 + }, + { + "epoch": 0.28069283330684885, + "grad_norm": 19959.6171875, + "learning_rate": 4.5209759966476814e-05, + "loss": 5.2119, + "step": 1104 + }, + { + "epoch": 0.2809470840616558, + "grad_norm": 18951.61328125, + "learning_rate": 4.519668836998904e-05, + "loss": 5.2283, + "step": 1105 + }, + { + "epoch": 0.28120133481646276, + "grad_norm": 20073.236328125, + "learning_rate": 4.518360085805735e-05, + "loss": 5.2236, + "step": 1106 + }, + { + "epoch": 0.28145558557126965, + "grad_norm": 19036.421875, + "learning_rate": 4.517049744099503e-05, + "loss": 5.2235, + "step": 1107 + }, + { + "epoch": 0.2817098363260766, + "grad_norm": 19566.7265625, + "learning_rate": 4.515737812912785e-05, + "loss": 5.2267, + "step": 1108 + }, + { + "epoch": 0.2819640870808835, + "grad_norm": 19226.5078125, + "learning_rate": 4.5144242932794114e-05, + "loss": 5.227, + "step": 1109 + }, + { + "epoch": 0.28221833783569045, + "grad_norm": 19637.94140625, + "learning_rate": 4.513109186234467e-05, + "loss": 5.2195, + "step": 1110 + }, + { + "epoch": 0.2824725885904974, + "grad_norm": 19243.708984375, + "learning_rate": 4.511792492814284e-05, + "loss": 5.2085, + "step": 1111 + }, + { + "epoch": 0.2827268393453043, + "grad_norm": 18864.64453125, + "learning_rate": 4.5104742140564484e-05, + "loss": 5.2228, + "step": 1112 + }, + { + "epoch": 0.28298109010011124, + "grad_norm": 19017.24609375, + "learning_rate": 4.509154350999791e-05, + "loss": 5.2061, + "step": 1113 + }, + { + "epoch": 0.28323534085491814, + "grad_norm": 19696.044921875, + "learning_rate": 4.507832904684395e-05, + "loss": 5.2219, + "step": 1114 + }, + { + "epoch": 0.2834895916097251, + "grad_norm": 18992.564453125, + "learning_rate": 4.50650987615159e-05, + "loss": 5.2164, + "step": 1115 + }, + { + "epoch": 0.28374384236453204, + "grad_norm": 19503.03125, + "learning_rate": 4.505185266443952e-05, + "loss": 5.2318, + "step": 1116 + }, + { + "epoch": 0.28399809311933893, + "grad_norm": 19167.876953125, + "learning_rate": 4.5038590766053025e-05, + "loss": 5.2245, + "step": 1117 + }, + { + "epoch": 0.2842523438741459, + "grad_norm": 19578.37890625, + "learning_rate": 4.5025313076807084e-05, + "loss": 5.2269, + "step": 1118 + }, + { + "epoch": 0.2845065946289528, + "grad_norm": 19599.34375, + "learning_rate": 4.501201960716483e-05, + "loss": 5.2073, + "step": 1119 + }, + { + "epoch": 0.28476084538375973, + "grad_norm": 19364.05859375, + "learning_rate": 4.499871036760182e-05, + "loss": 5.2067, + "step": 1120 + }, + { + "epoch": 0.2850150961385667, + "grad_norm": 19509.134765625, + "learning_rate": 4.498538536860601e-05, + "loss": 5.215, + "step": 1121 + }, + { + "epoch": 0.2852693468933736, + "grad_norm": 18876.78515625, + "learning_rate": 4.497204462067781e-05, + "loss": 5.2048, + "step": 1122 + }, + { + "epoch": 0.2855235976481805, + "grad_norm": 19477.646484375, + "learning_rate": 4.4958688134330034e-05, + "loss": 5.2243, + "step": 1123 + }, + { + "epoch": 0.2857778484029874, + "grad_norm": 19461.203125, + "learning_rate": 4.494531592008789e-05, + "loss": 5.2169, + "step": 1124 + }, + { + "epoch": 0.2860320991577944, + "grad_norm": 19409.619140625, + "learning_rate": 4.493192798848898e-05, + "loss": 5.213, + "step": 1125 + }, + { + "epoch": 0.2862863499126013, + "grad_norm": 19104.021484375, + "learning_rate": 4.491852435008329e-05, + "loss": 5.2026, + "step": 1126 + }, + { + "epoch": 0.2865406006674082, + "grad_norm": 19291.71484375, + "learning_rate": 4.49051050154332e-05, + "loss": 5.2102, + "step": 1127 + }, + { + "epoch": 0.28679485142221517, + "grad_norm": 19089.091796875, + "learning_rate": 4.489166999511344e-05, + "loss": 5.199, + "step": 1128 + }, + { + "epoch": 0.28704910217702206, + "grad_norm": 19295.65234375, + "learning_rate": 4.487821929971111e-05, + "loss": 5.1955, + "step": 1129 + }, + { + "epoch": 0.287303352931829, + "grad_norm": 19036.705078125, + "learning_rate": 4.486475293982566e-05, + "loss": 5.1987, + "step": 1130 + }, + { + "epoch": 0.28755760368663597, + "grad_norm": 19167.36328125, + "learning_rate": 4.485127092606889e-05, + "loss": 5.2207, + "step": 1131 + }, + { + "epoch": 0.28781185444144286, + "grad_norm": 19268.767578125, + "learning_rate": 4.483777326906491e-05, + "loss": 5.2053, + "step": 1132 + }, + { + "epoch": 0.2880661051962498, + "grad_norm": 19050.767578125, + "learning_rate": 4.482425997945019e-05, + "loss": 5.2184, + "step": 1133 + }, + { + "epoch": 0.2883203559510567, + "grad_norm": 19141.060546875, + "learning_rate": 4.4810731067873515e-05, + "loss": 5.1966, + "step": 1134 + }, + { + "epoch": 0.28857460670586366, + "grad_norm": 18897.35546875, + "learning_rate": 4.4797186544995954e-05, + "loss": 5.2057, + "step": 1135 + }, + { + "epoch": 0.2888288574606706, + "grad_norm": 19116.806640625, + "learning_rate": 4.47836264214909e-05, + "loss": 5.2033, + "step": 1136 + }, + { + "epoch": 0.2890831082154775, + "grad_norm": 19259.021484375, + "learning_rate": 4.477005070804404e-05, + "loss": 5.2097, + "step": 1137 + }, + { + "epoch": 0.28933735897028445, + "grad_norm": 19018.482421875, + "learning_rate": 4.475645941535334e-05, + "loss": 5.2091, + "step": 1138 + }, + { + "epoch": 0.28959160972509135, + "grad_norm": 19302.724609375, + "learning_rate": 4.474285255412904e-05, + "loss": 5.2009, + "step": 1139 + }, + { + "epoch": 0.2898458604798983, + "grad_norm": 19230.236328125, + "learning_rate": 4.4729230135093645e-05, + "loss": 5.1969, + "step": 1140 + }, + { + "epoch": 0.29010011123470525, + "grad_norm": 19891.123046875, + "learning_rate": 4.471559216898195e-05, + "loss": 5.1995, + "step": 1141 + }, + { + "epoch": 0.29035436198951214, + "grad_norm": 19324.708984375, + "learning_rate": 4.470193866654096e-05, + "loss": 5.2058, + "step": 1142 + }, + { + "epoch": 0.2906086127443191, + "grad_norm": 19443.1171875, + "learning_rate": 4.4688269638529945e-05, + "loss": 5.2078, + "step": 1143 + }, + { + "epoch": 0.290862863499126, + "grad_norm": 19438.568359375, + "learning_rate": 4.46745850957204e-05, + "loss": 5.1818, + "step": 1144 + }, + { + "epoch": 0.29111711425393294, + "grad_norm": 19019.572265625, + "learning_rate": 4.466088504889607e-05, + "loss": 5.2027, + "step": 1145 + }, + { + "epoch": 0.2913713650087399, + "grad_norm": 20348.5703125, + "learning_rate": 4.4647169508852885e-05, + "loss": 5.2083, + "step": 1146 + }, + { + "epoch": 0.2916256157635468, + "grad_norm": 19141.072265625, + "learning_rate": 4.4633438486398996e-05, + "loss": 5.2125, + "step": 1147 + }, + { + "epoch": 0.29187986651835374, + "grad_norm": 19181.828125, + "learning_rate": 4.461969199235477e-05, + "loss": 5.21, + "step": 1148 + }, + { + "epoch": 0.29213411727316063, + "grad_norm": 19225.615234375, + "learning_rate": 4.4605930037552746e-05, + "loss": 5.2035, + "step": 1149 + }, + { + "epoch": 0.2923883680279676, + "grad_norm": 19277.08203125, + "learning_rate": 4.4592152632837646e-05, + "loss": 5.2011, + "step": 1150 + }, + { + "epoch": 0.29264261878277453, + "grad_norm": 19206.990234375, + "learning_rate": 4.4578359789066384e-05, + "loss": 5.1859, + "step": 1151 + }, + { + "epoch": 0.29289686953758143, + "grad_norm": 19416.390625, + "learning_rate": 4.4564551517108034e-05, + "loss": 5.1956, + "step": 1152 + }, + { + "epoch": 0.2931511202923884, + "grad_norm": 19266.19140625, + "learning_rate": 4.455072782784381e-05, + "loss": 5.1942, + "step": 1153 + }, + { + "epoch": 0.2934053710471953, + "grad_norm": 19319.84765625, + "learning_rate": 4.4536888732167105e-05, + "loss": 5.1801, + "step": 1154 + }, + { + "epoch": 0.2936596218020022, + "grad_norm": 19243.646484375, + "learning_rate": 4.452303424098342e-05, + "loss": 5.2009, + "step": 1155 + }, + { + "epoch": 0.2939138725568092, + "grad_norm": 19192.30078125, + "learning_rate": 4.4509164365210424e-05, + "loss": 5.195, + "step": 1156 + }, + { + "epoch": 0.29416812331161607, + "grad_norm": 19331.48828125, + "learning_rate": 4.4495279115777874e-05, + "loss": 5.1922, + "step": 1157 + }, + { + "epoch": 0.294422374066423, + "grad_norm": 19333.892578125, + "learning_rate": 4.448137850362768e-05, + "loss": 5.1939, + "step": 1158 + }, + { + "epoch": 0.2946766248212299, + "grad_norm": 20555.83984375, + "learning_rate": 4.446746253971381e-05, + "loss": 5.1916, + "step": 1159 + }, + { + "epoch": 0.29493087557603687, + "grad_norm": 19440.171875, + "learning_rate": 4.4453531235002375e-05, + "loss": 5.1783, + "step": 1160 + }, + { + "epoch": 0.2951851263308438, + "grad_norm": 19383.220703125, + "learning_rate": 4.4439584600471546e-05, + "loss": 5.1831, + "step": 1161 + }, + { + "epoch": 0.2954393770856507, + "grad_norm": 19301.896484375, + "learning_rate": 4.4425622647111586e-05, + "loss": 5.192, + "step": 1162 + }, + { + "epoch": 0.29569362784045766, + "grad_norm": 19287.884765625, + "learning_rate": 4.441164538592483e-05, + "loss": 5.1934, + "step": 1163 + }, + { + "epoch": 0.29594787859526456, + "grad_norm": 19611.091796875, + "learning_rate": 4.439765282792567e-05, + "loss": 5.1989, + "step": 1164 + }, + { + "epoch": 0.2962021293500715, + "grad_norm": 19401.994140625, + "learning_rate": 4.4383644984140565e-05, + "loss": 5.1804, + "step": 1165 + }, + { + "epoch": 0.29645638010487846, + "grad_norm": 20315.923828125, + "learning_rate": 4.4369621865608e-05, + "loss": 5.1868, + "step": 1166 + }, + { + "epoch": 0.29671063085968535, + "grad_norm": 19917.0078125, + "learning_rate": 4.4355583483378514e-05, + "loss": 5.1973, + "step": 1167 + }, + { + "epoch": 0.2969648816144923, + "grad_norm": 19760.30859375, + "learning_rate": 4.434152984851466e-05, + "loss": 5.1819, + "step": 1168 + }, + { + "epoch": 0.2972191323692992, + "grad_norm": 20807.515625, + "learning_rate": 4.432746097209103e-05, + "loss": 5.1769, + "step": 1169 + }, + { + "epoch": 0.29747338312410615, + "grad_norm": 19250.61328125, + "learning_rate": 4.4313376865194204e-05, + "loss": 5.1855, + "step": 1170 + }, + { + "epoch": 0.2977276338789131, + "grad_norm": 19682.90625, + "learning_rate": 4.4299277538922776e-05, + "loss": 5.167, + "step": 1171 + }, + { + "epoch": 0.29798188463372, + "grad_norm": 19363.853515625, + "learning_rate": 4.428516300438733e-05, + "loss": 5.19, + "step": 1172 + }, + { + "epoch": 0.29823613538852695, + "grad_norm": 19313.884765625, + "learning_rate": 4.4271033272710444e-05, + "loss": 5.1887, + "step": 1173 + }, + { + "epoch": 0.29849038614333384, + "grad_norm": 19418.591796875, + "learning_rate": 4.425688835502666e-05, + "loss": 5.1811, + "step": 1174 + }, + { + "epoch": 0.2987446368981408, + "grad_norm": 19325.841796875, + "learning_rate": 4.424272826248248e-05, + "loss": 5.1706, + "step": 1175 + }, + { + "epoch": 0.29899888765294774, + "grad_norm": 19616.07421875, + "learning_rate": 4.4228553006236395e-05, + "loss": 5.18, + "step": 1176 + }, + { + "epoch": 0.29925313840775464, + "grad_norm": 19263.18359375, + "learning_rate": 4.4214362597458813e-05, + "loss": 5.1958, + "step": 1177 + }, + { + "epoch": 0.2995073891625616, + "grad_norm": 19283.693359375, + "learning_rate": 4.420015704733209e-05, + "loss": 5.185, + "step": 1178 + }, + { + "epoch": 0.2997616399173685, + "grad_norm": 19800.904296875, + "learning_rate": 4.418593636705054e-05, + "loss": 5.175, + "step": 1179 + }, + { + "epoch": 0.30001589067217543, + "grad_norm": 19762.6953125, + "learning_rate": 4.417170056782035e-05, + "loss": 5.1804, + "step": 1180 + }, + { + "epoch": 0.3002701414269824, + "grad_norm": 24224.61328125, + "learning_rate": 4.4157449660859665e-05, + "loss": 5.1558, + "step": 1181 + }, + { + "epoch": 0.3005243921817893, + "grad_norm": 19985.32421875, + "learning_rate": 4.414318365739852e-05, + "loss": 5.1691, + "step": 1182 + }, + { + "epoch": 0.30077864293659623, + "grad_norm": 19891.27734375, + "learning_rate": 4.412890256867884e-05, + "loss": 5.1742, + "step": 1183 + }, + { + "epoch": 0.3010328936914031, + "grad_norm": 19686.4296875, + "learning_rate": 4.411460640595445e-05, + "loss": 5.1642, + "step": 1184 + }, + { + "epoch": 0.3012871444462101, + "grad_norm": 19818.65234375, + "learning_rate": 4.410029518049105e-05, + "loss": 5.1734, + "step": 1185 + }, + { + "epoch": 0.301541395201017, + "grad_norm": 19511.919921875, + "learning_rate": 4.4085968903566186e-05, + "loss": 5.1815, + "step": 1186 + }, + { + "epoch": 0.3017956459558239, + "grad_norm": 19616.92578125, + "learning_rate": 4.407162758646931e-05, + "loss": 5.1736, + "step": 1187 + }, + { + "epoch": 0.30204989671063087, + "grad_norm": 19353.744140625, + "learning_rate": 4.405727124050169e-05, + "loss": 5.1773, + "step": 1188 + }, + { + "epoch": 0.30230414746543777, + "grad_norm": 20175.39453125, + "learning_rate": 4.4042899876976465e-05, + "loss": 5.1626, + "step": 1189 + }, + { + "epoch": 0.3025583982202447, + "grad_norm": 20587.6328125, + "learning_rate": 4.402851350721856e-05, + "loss": 5.1821, + "step": 1190 + }, + { + "epoch": 0.30281264897505167, + "grad_norm": 19521.3203125, + "learning_rate": 4.401411214256479e-05, + "loss": 5.1803, + "step": 1191 + }, + { + "epoch": 0.30306689972985856, + "grad_norm": 22237.19921875, + "learning_rate": 4.399969579436374e-05, + "loss": 5.1714, + "step": 1192 + }, + { + "epoch": 0.3033211504846655, + "grad_norm": 20076.607421875, + "learning_rate": 4.398526447397581e-05, + "loss": 5.187, + "step": 1193 + }, + { + "epoch": 0.3035754012394724, + "grad_norm": 20269.62890625, + "learning_rate": 4.397081819277321e-05, + "loss": 5.1684, + "step": 1194 + }, + { + "epoch": 0.30382965199427936, + "grad_norm": 19471.693359375, + "learning_rate": 4.395635696213993e-05, + "loss": 5.1591, + "step": 1195 + }, + { + "epoch": 0.3040839027490863, + "grad_norm": 21970.646484375, + "learning_rate": 4.394188079347176e-05, + "loss": 5.1679, + "step": 1196 + }, + { + "epoch": 0.3043381535038932, + "grad_norm": 20995.275390625, + "learning_rate": 4.3927389698176237e-05, + "loss": 5.1672, + "step": 1197 + }, + { + "epoch": 0.30459240425870016, + "grad_norm": 21099.685546875, + "learning_rate": 4.3912883687672654e-05, + "loss": 5.1701, + "step": 1198 + }, + { + "epoch": 0.30484665501350705, + "grad_norm": 23570.384765625, + "learning_rate": 4.3898362773392095e-05, + "loss": 5.1486, + "step": 1199 + }, + { + "epoch": 0.305100905768314, + "grad_norm": 24977.611328125, + "learning_rate": 4.388382696677735e-05, + "loss": 5.1557, + "step": 1200 + }, + { + "epoch": 0.305100905768314, + "eval_loss": 10.403265953063965, + "eval_runtime": 699.978, + "eval_samples_per_second": 151.413, + "eval_steps_per_second": 9.465, + "step": 1200 + }, + { + "epoch": 0.30535515652312095, + "grad_norm": 20541.41015625, + "learning_rate": 4.3869276279282976e-05, + "loss": 5.1478, + "step": 1201 + }, + { + "epoch": 0.30560940727792785, + "grad_norm": 19590.240234375, + "learning_rate": 4.3854710722375237e-05, + "loss": 5.1466, + "step": 1202 + }, + { + "epoch": 0.3058636580327348, + "grad_norm": 21604.7265625, + "learning_rate": 4.384013030753211e-05, + "loss": 5.1583, + "step": 1203 + }, + { + "epoch": 0.3061179087875417, + "grad_norm": 20013.283203125, + "learning_rate": 4.382553504624331e-05, + "loss": 5.1588, + "step": 1204 + }, + { + "epoch": 0.30637215954234864, + "grad_norm": 20195.771484375, + "learning_rate": 4.3810924950010195e-05, + "loss": 5.1697, + "step": 1205 + }, + { + "epoch": 0.3066264102971556, + "grad_norm": 19981.65625, + "learning_rate": 4.3796300030345876e-05, + "loss": 5.1526, + "step": 1206 + }, + { + "epoch": 0.3068806610519625, + "grad_norm": 20020.1875, + "learning_rate": 4.3781660298775116e-05, + "loss": 5.1384, + "step": 1207 + }, + { + "epoch": 0.30713491180676944, + "grad_norm": 20344.783203125, + "learning_rate": 4.3767005766834346e-05, + "loss": 5.1641, + "step": 1208 + }, + { + "epoch": 0.30738916256157633, + "grad_norm": 19750.4453125, + "learning_rate": 4.3752336446071677e-05, + "loss": 5.1328, + "step": 1209 + }, + { + "epoch": 0.3076434133163833, + "grad_norm": 22305.890625, + "learning_rate": 4.373765234804684e-05, + "loss": 5.1588, + "step": 1210 + }, + { + "epoch": 0.30789766407119024, + "grad_norm": 19771.02734375, + "learning_rate": 4.372295348433125e-05, + "loss": 5.142, + "step": 1211 + }, + { + "epoch": 0.30815191482599713, + "grad_norm": 20072.134765625, + "learning_rate": 4.370823986650795e-05, + "loss": 5.1655, + "step": 1212 + }, + { + "epoch": 0.3084061655808041, + "grad_norm": 19482.8203125, + "learning_rate": 4.369351150617158e-05, + "loss": 5.1608, + "step": 1213 + }, + { + "epoch": 0.308660416335611, + "grad_norm": 20071.876953125, + "learning_rate": 4.367876841492844e-05, + "loss": 5.1387, + "step": 1214 + }, + { + "epoch": 0.3089146670904179, + "grad_norm": 19533.78125, + "learning_rate": 4.3664010604396404e-05, + "loss": 5.1422, + "step": 1215 + }, + { + "epoch": 0.3091689178452249, + "grad_norm": 20021.111328125, + "learning_rate": 4.3649238086204955e-05, + "loss": 5.1504, + "step": 1216 + }, + { + "epoch": 0.3094231686000318, + "grad_norm": 19802.34375, + "learning_rate": 4.363445087199518e-05, + "loss": 5.1623, + "step": 1217 + }, + { + "epoch": 0.3096774193548387, + "grad_norm": 21246.015625, + "learning_rate": 4.361964897341973e-05, + "loss": 5.1468, + "step": 1218 + }, + { + "epoch": 0.3099316701096456, + "grad_norm": 19995.50390625, + "learning_rate": 4.360483240214284e-05, + "loss": 5.164, + "step": 1219 + }, + { + "epoch": 0.31018592086445257, + "grad_norm": 22513.32421875, + "learning_rate": 4.359000116984029e-05, + "loss": 5.1565, + "step": 1220 + }, + { + "epoch": 0.3104401716192595, + "grad_norm": 19838.423828125, + "learning_rate": 4.357515528819942e-05, + "loss": 5.1462, + "step": 1221 + }, + { + "epoch": 0.3106944223740664, + "grad_norm": 20792.4765625, + "learning_rate": 4.356029476891914e-05, + "loss": 5.13, + "step": 1222 + }, + { + "epoch": 0.31094867312887337, + "grad_norm": 19451.251953125, + "learning_rate": 4.354541962370985e-05, + "loss": 5.1464, + "step": 1223 + }, + { + "epoch": 0.31120292388368026, + "grad_norm": 20367.1484375, + "learning_rate": 4.353052986429351e-05, + "loss": 5.1608, + "step": 1224 + }, + { + "epoch": 0.3114571746384872, + "grad_norm": 19801.705078125, + "learning_rate": 4.351562550240359e-05, + "loss": 5.1471, + "step": 1225 + }, + { + "epoch": 0.31171142539329416, + "grad_norm": 21052.32421875, + "learning_rate": 4.3500706549785056e-05, + "loss": 5.1335, + "step": 1226 + }, + { + "epoch": 0.31196567614810106, + "grad_norm": 19839.130859375, + "learning_rate": 4.3485773018194365e-05, + "loss": 5.1526, + "step": 1227 + }, + { + "epoch": 0.312219926902908, + "grad_norm": 19741.162109375, + "learning_rate": 4.347082491939949e-05, + "loss": 5.1347, + "step": 1228 + }, + { + "epoch": 0.3124741776577149, + "grad_norm": 19714.0703125, + "learning_rate": 4.345586226517987e-05, + "loss": 5.1496, + "step": 1229 + }, + { + "epoch": 0.31272842841252185, + "grad_norm": 19635.5390625, + "learning_rate": 4.3440885067326405e-05, + "loss": 5.1477, + "step": 1230 + }, + { + "epoch": 0.3129826791673288, + "grad_norm": 19685.90234375, + "learning_rate": 4.342589333764146e-05, + "loss": 5.1358, + "step": 1231 + }, + { + "epoch": 0.3132369299221357, + "grad_norm": 20508.802734375, + "learning_rate": 4.3410887087938865e-05, + "loss": 5.1542, + "step": 1232 + }, + { + "epoch": 0.31349118067694265, + "grad_norm": 19723.970703125, + "learning_rate": 4.339586633004388e-05, + "loss": 5.1351, + "step": 1233 + }, + { + "epoch": 0.31374543143174954, + "grad_norm": 19800.984375, + "learning_rate": 4.3380831075793194e-05, + "loss": 5.1356, + "step": 1234 + }, + { + "epoch": 0.3139996821865565, + "grad_norm": 19727.21484375, + "learning_rate": 4.336578133703493e-05, + "loss": 5.1421, + "step": 1235 + }, + { + "epoch": 0.31425393294136345, + "grad_norm": 19545.599609375, + "learning_rate": 4.335071712562862e-05, + "loss": 5.1404, + "step": 1236 + }, + { + "epoch": 0.31450818369617034, + "grad_norm": 19616.6796875, + "learning_rate": 4.333563845344518e-05, + "loss": 5.1315, + "step": 1237 + }, + { + "epoch": 0.3147624344509773, + "grad_norm": 19754.3984375, + "learning_rate": 4.3320545332366976e-05, + "loss": 5.1533, + "step": 1238 + }, + { + "epoch": 0.3150166852057842, + "grad_norm": 19678.0078125, + "learning_rate": 4.330543777428771e-05, + "loss": 5.1401, + "step": 1239 + }, + { + "epoch": 0.31527093596059114, + "grad_norm": 19605.453125, + "learning_rate": 4.329031579111248e-05, + "loss": 5.1304, + "step": 1240 + }, + { + "epoch": 0.3155251867153981, + "grad_norm": 19767.70703125, + "learning_rate": 4.327517939475774e-05, + "loss": 5.1347, + "step": 1241 + }, + { + "epoch": 0.315779437470205, + "grad_norm": 20732.2265625, + "learning_rate": 4.3260028597151315e-05, + "loss": 5.1308, + "step": 1242 + }, + { + "epoch": 0.31603368822501193, + "grad_norm": 19815.3515625, + "learning_rate": 4.3244863410232383e-05, + "loss": 5.1272, + "step": 1243 + }, + { + "epoch": 0.31628793897981883, + "grad_norm": 19953.953125, + "learning_rate": 4.322968384595143e-05, + "loss": 5.1406, + "step": 1244 + }, + { + "epoch": 0.3165421897346258, + "grad_norm": 20573.7421875, + "learning_rate": 4.3214489916270316e-05, + "loss": 5.1237, + "step": 1245 + }, + { + "epoch": 0.31679644048943273, + "grad_norm": 19799.109375, + "learning_rate": 4.3199281633162196e-05, + "loss": 5.1371, + "step": 1246 + }, + { + "epoch": 0.3170506912442396, + "grad_norm": 19982.7734375, + "learning_rate": 4.318405900861152e-05, + "loss": 5.136, + "step": 1247 + }, + { + "epoch": 0.3173049419990466, + "grad_norm": 20054.2734375, + "learning_rate": 4.3168822054614075e-05, + "loss": 5.1295, + "step": 1248 + }, + { + "epoch": 0.31755919275385347, + "grad_norm": 19859.669921875, + "learning_rate": 4.315357078317692e-05, + "loss": 5.1281, + "step": 1249 + }, + { + "epoch": 0.3178134435086604, + "grad_norm": 19729.4375, + "learning_rate": 4.3138305206318395e-05, + "loss": 5.1198, + "step": 1250 + }, + { + "epoch": 0.31806769426346737, + "grad_norm": 20013.509765625, + "learning_rate": 4.312302533606813e-05, + "loss": 5.1245, + "step": 1251 + }, + { + "epoch": 0.31832194501827427, + "grad_norm": 19770.373046875, + "learning_rate": 4.310773118446699e-05, + "loss": 5.1169, + "step": 1252 + }, + { + "epoch": 0.3185761957730812, + "grad_norm": 19846.98046875, + "learning_rate": 4.309242276356711e-05, + "loss": 5.1464, + "step": 1253 + }, + { + "epoch": 0.3188304465278881, + "grad_norm": 19756.466796875, + "learning_rate": 4.307710008543187e-05, + "loss": 5.137, + "step": 1254 + }, + { + "epoch": 0.31908469728269506, + "grad_norm": 19736.759765625, + "learning_rate": 4.30617631621359e-05, + "loss": 5.1307, + "step": 1255 + }, + { + "epoch": 0.319338948037502, + "grad_norm": 19902.673828125, + "learning_rate": 4.304641200576502e-05, + "loss": 5.1296, + "step": 1256 + }, + { + "epoch": 0.3195931987923089, + "grad_norm": 19743.802734375, + "learning_rate": 4.3031046628416306e-05, + "loss": 5.1295, + "step": 1257 + }, + { + "epoch": 0.31984744954711586, + "grad_norm": 20099.115234375, + "learning_rate": 4.301566704219801e-05, + "loss": 5.1267, + "step": 1258 + }, + { + "epoch": 0.32010170030192275, + "grad_norm": 19752.916015625, + "learning_rate": 4.3000273259229583e-05, + "loss": 5.1342, + "step": 1259 + }, + { + "epoch": 0.3203559510567297, + "grad_norm": 20736.10546875, + "learning_rate": 4.298486529164168e-05, + "loss": 5.1347, + "step": 1260 + }, + { + "epoch": 0.32061020181153665, + "grad_norm": 19977.865234375, + "learning_rate": 4.2969443151576126e-05, + "loss": 5.113, + "step": 1261 + }, + { + "epoch": 0.32086445256634355, + "grad_norm": 19813.3125, + "learning_rate": 4.2954006851185915e-05, + "loss": 5.1096, + "step": 1262 + }, + { + "epoch": 0.3211187033211505, + "grad_norm": 19996.478515625, + "learning_rate": 4.293855640263519e-05, + "loss": 5.1153, + "step": 1263 + }, + { + "epoch": 0.3213729540759574, + "grad_norm": 19746.96484375, + "learning_rate": 4.292309181809926e-05, + "loss": 5.1153, + "step": 1264 + }, + { + "epoch": 0.32162720483076435, + "grad_norm": 19771.24609375, + "learning_rate": 4.290761310976456e-05, + "loss": 5.1201, + "step": 1265 + }, + { + "epoch": 0.3218814555855713, + "grad_norm": 19841.5625, + "learning_rate": 4.2892120289828664e-05, + "loss": 5.1215, + "step": 1266 + }, + { + "epoch": 0.3221357063403782, + "grad_norm": 19981.927734375, + "learning_rate": 4.287661337050026e-05, + "loss": 5.1351, + "step": 1267 + }, + { + "epoch": 0.32238995709518514, + "grad_norm": 19845.1484375, + "learning_rate": 4.286109236399914e-05, + "loss": 5.1024, + "step": 1268 + }, + { + "epoch": 0.32264420784999204, + "grad_norm": 19998.423828125, + "learning_rate": 4.284555728255622e-05, + "loss": 5.1124, + "step": 1269 + }, + { + "epoch": 0.322898458604799, + "grad_norm": 19892.732421875, + "learning_rate": 4.283000813841349e-05, + "loss": 5.1113, + "step": 1270 + }, + { + "epoch": 0.32315270935960594, + "grad_norm": 21815.525390625, + "learning_rate": 4.2814444943824014e-05, + "loss": 5.1093, + "step": 1271 + }, + { + "epoch": 0.32340696011441283, + "grad_norm": 20048.7109375, + "learning_rate": 4.279886771105195e-05, + "loss": 5.1214, + "step": 1272 + }, + { + "epoch": 0.3236612108692198, + "grad_norm": 20031.232421875, + "learning_rate": 4.27832764523725e-05, + "loss": 5.1167, + "step": 1273 + }, + { + "epoch": 0.3239154616240267, + "grad_norm": 20546.447265625, + "learning_rate": 4.2767671180071935e-05, + "loss": 5.105, + "step": 1274 + }, + { + "epoch": 0.32416971237883363, + "grad_norm": 20309.35546875, + "learning_rate": 4.275205190644756e-05, + "loss": 5.1325, + "step": 1275 + }, + { + "epoch": 0.3244239631336406, + "grad_norm": 20087.349609375, + "learning_rate": 4.273641864380769e-05, + "loss": 5.1169, + "step": 1276 + }, + { + "epoch": 0.3246782138884475, + "grad_norm": 19972.22265625, + "learning_rate": 4.272077140447172e-05, + "loss": 5.1187, + "step": 1277 + }, + { + "epoch": 0.3249324646432544, + "grad_norm": 20051.26171875, + "learning_rate": 4.2705110200769996e-05, + "loss": 5.1063, + "step": 1278 + }, + { + "epoch": 0.3251867153980613, + "grad_norm": 20097.4765625, + "learning_rate": 4.2689435045043925e-05, + "loss": 5.1235, + "step": 1279 + }, + { + "epoch": 0.32544096615286827, + "grad_norm": 20417.92578125, + "learning_rate": 4.267374594964586e-05, + "loss": 5.107, + "step": 1280 + }, + { + "epoch": 0.3256952169076752, + "grad_norm": 21291.435546875, + "learning_rate": 4.2658042926939175e-05, + "loss": 5.1075, + "step": 1281 + }, + { + "epoch": 0.3259494676624821, + "grad_norm": 19855.05078125, + "learning_rate": 4.2642325989298194e-05, + "loss": 5.1059, + "step": 1282 + }, + { + "epoch": 0.32620371841728907, + "grad_norm": 20626.611328125, + "learning_rate": 4.262659514910823e-05, + "loss": 5.1122, + "step": 1283 + }, + { + "epoch": 0.32645796917209596, + "grad_norm": 19825.59765625, + "learning_rate": 4.261085041876552e-05, + "loss": 5.1066, + "step": 1284 + }, + { + "epoch": 0.3267122199269029, + "grad_norm": 20555.48046875, + "learning_rate": 4.259509181067728e-05, + "loss": 5.0968, + "step": 1285 + }, + { + "epoch": 0.3269664706817098, + "grad_norm": 19933.44921875, + "learning_rate": 4.2579319337261644e-05, + "loss": 5.1054, + "step": 1286 + }, + { + "epoch": 0.32722072143651676, + "grad_norm": 20873.533203125, + "learning_rate": 4.256353301094767e-05, + "loss": 5.0964, + "step": 1287 + }, + { + "epoch": 0.3274749721913237, + "grad_norm": 19889.3125, + "learning_rate": 4.254773284417534e-05, + "loss": 5.1, + "step": 1288 + }, + { + "epoch": 0.3277292229461306, + "grad_norm": 21461.48828125, + "learning_rate": 4.253191884939554e-05, + "loss": 5.1067, + "step": 1289 + }, + { + "epoch": 0.32798347370093756, + "grad_norm": 20288.03515625, + "learning_rate": 4.251609103907006e-05, + "loss": 5.0986, + "step": 1290 + }, + { + "epoch": 0.32823772445574445, + "grad_norm": 20197.896484375, + "learning_rate": 4.250024942567156e-05, + "loss": 5.1063, + "step": 1291 + }, + { + "epoch": 0.3284919752105514, + "grad_norm": 20193.87890625, + "learning_rate": 4.2484394021683596e-05, + "loss": 5.1088, + "step": 1292 + }, + { + "epoch": 0.32874622596535835, + "grad_norm": 20334.603515625, + "learning_rate": 4.2468524839600566e-05, + "loss": 5.0992, + "step": 1293 + }, + { + "epoch": 0.32900047672016525, + "grad_norm": 20315.34375, + "learning_rate": 4.245264189192776e-05, + "loss": 5.1087, + "step": 1294 + }, + { + "epoch": 0.3292547274749722, + "grad_norm": 20287.775390625, + "learning_rate": 4.243674519118129e-05, + "loss": 5.0836, + "step": 1295 + }, + { + "epoch": 0.3295089782297791, + "grad_norm": 20205.9453125, + "learning_rate": 4.242083474988812e-05, + "loss": 5.0961, + "step": 1296 + }, + { + "epoch": 0.32976322898458604, + "grad_norm": 20001.400390625, + "learning_rate": 4.240491058058601e-05, + "loss": 5.0987, + "step": 1297 + }, + { + "epoch": 0.330017479739393, + "grad_norm": 20167.29296875, + "learning_rate": 4.2388972695823594e-05, + "loss": 5.0903, + "step": 1298 + }, + { + "epoch": 0.3302717304941999, + "grad_norm": 20238.060546875, + "learning_rate": 4.237302110816027e-05, + "loss": 5.1022, + "step": 1299 + }, + { + "epoch": 0.33052598124900684, + "grad_norm": 20412.294921875, + "learning_rate": 4.235705583016625e-05, + "loss": 5.0886, + "step": 1300 + }, + { + "epoch": 0.33052598124900684, + "eval_loss": 10.275012016296387, + "eval_runtime": 699.146, + "eval_samples_per_second": 151.594, + "eval_steps_per_second": 9.476, + "step": 1300 + }, + { + "epoch": 0.33078023200381373, + "grad_norm": 20100.3828125, + "learning_rate": 4.234107687442252e-05, + "loss": 5.0906, + "step": 1301 + }, + { + "epoch": 0.3310344827586207, + "grad_norm": 20233.59765625, + "learning_rate": 4.232508425352087e-05, + "loss": 5.1064, + "step": 1302 + }, + { + "epoch": 0.33128873351342764, + "grad_norm": 20024.859375, + "learning_rate": 4.230907798006384e-05, + "loss": 5.1014, + "step": 1303 + }, + { + "epoch": 0.33154298426823453, + "grad_norm": 20425.4375, + "learning_rate": 4.2293058066664734e-05, + "loss": 5.0751, + "step": 1304 + }, + { + "epoch": 0.3317972350230415, + "grad_norm": 20071.107421875, + "learning_rate": 4.227702452594759e-05, + "loss": 5.0973, + "step": 1305 + }, + { + "epoch": 0.3320514857778484, + "grad_norm": 20287.640625, + "learning_rate": 4.2260977370547225e-05, + "loss": 5.094, + "step": 1306 + }, + { + "epoch": 0.3323057365326553, + "grad_norm": 20121.212890625, + "learning_rate": 4.2244916613109135e-05, + "loss": 5.0985, + "step": 1307 + }, + { + "epoch": 0.3325599872874623, + "grad_norm": 20336.966796875, + "learning_rate": 4.222884226628957e-05, + "loss": 5.0827, + "step": 1308 + }, + { + "epoch": 0.3328142380422692, + "grad_norm": 20287.17578125, + "learning_rate": 4.2212754342755464e-05, + "loss": 5.1025, + "step": 1309 + }, + { + "epoch": 0.3330684887970761, + "grad_norm": 20628.662109375, + "learning_rate": 4.219665285518447e-05, + "loss": 5.0942, + "step": 1310 + }, + { + "epoch": 0.333322739551883, + "grad_norm": 20041.59375, + "learning_rate": 4.218053781626493e-05, + "loss": 5.0862, + "step": 1311 + }, + { + "epoch": 0.33357699030668997, + "grad_norm": 20977.61328125, + "learning_rate": 4.216440923869584e-05, + "loss": 5.0848, + "step": 1312 + }, + { + "epoch": 0.3338312410614969, + "grad_norm": 20465.666015625, + "learning_rate": 4.214826713518689e-05, + "loss": 5.0858, + "step": 1313 + }, + { + "epoch": 0.3340854918163038, + "grad_norm": 20362.083984375, + "learning_rate": 4.213211151845842e-05, + "loss": 5.1007, + "step": 1314 + }, + { + "epoch": 0.33433974257111077, + "grad_norm": 20163.548828125, + "learning_rate": 4.211594240124141e-05, + "loss": 5.1008, + "step": 1315 + }, + { + "epoch": 0.33459399332591766, + "grad_norm": 20307.2890625, + "learning_rate": 4.209975979627751e-05, + "loss": 5.0741, + "step": 1316 + }, + { + "epoch": 0.3348482440807246, + "grad_norm": 20222.36328125, + "learning_rate": 4.208356371631894e-05, + "loss": 5.0628, + "step": 1317 + }, + { + "epoch": 0.33510249483553156, + "grad_norm": 20340.34375, + "learning_rate": 4.2067354174128606e-05, + "loss": 5.0871, + "step": 1318 + }, + { + "epoch": 0.33535674559033846, + "grad_norm": 20454.966796875, + "learning_rate": 4.205113118247999e-05, + "loss": 5.0944, + "step": 1319 + }, + { + "epoch": 0.3356109963451454, + "grad_norm": 20080.46484375, + "learning_rate": 4.203489475415714e-05, + "loss": 5.1005, + "step": 1320 + }, + { + "epoch": 0.3358652470999523, + "grad_norm": 20567.080078125, + "learning_rate": 4.2018644901954765e-05, + "loss": 5.0768, + "step": 1321 + }, + { + "epoch": 0.33611949785475925, + "grad_norm": 20374.1875, + "learning_rate": 4.20023816386781e-05, + "loss": 5.1077, + "step": 1322 + }, + { + "epoch": 0.3363737486095662, + "grad_norm": 20328.609375, + "learning_rate": 4.198610497714296e-05, + "loss": 5.0923, + "step": 1323 + }, + { + "epoch": 0.3366279993643731, + "grad_norm": 20210.048828125, + "learning_rate": 4.196981493017572e-05, + "loss": 5.0756, + "step": 1324 + }, + { + "epoch": 0.33688225011918005, + "grad_norm": 20224.494140625, + "learning_rate": 4.19535115106133e-05, + "loss": 5.0841, + "step": 1325 + }, + { + "epoch": 0.33713650087398694, + "grad_norm": 20655.853515625, + "learning_rate": 4.193719473130317e-05, + "loss": 5.07, + "step": 1326 + }, + { + "epoch": 0.3373907516287939, + "grad_norm": 20672.8046875, + "learning_rate": 4.1920864605103304e-05, + "loss": 5.075, + "step": 1327 + }, + { + "epoch": 0.33764500238360085, + "grad_norm": 20118.31640625, + "learning_rate": 4.190452114488222e-05, + "loss": 5.0704, + "step": 1328 + }, + { + "epoch": 0.33789925313840774, + "grad_norm": 20429.97265625, + "learning_rate": 4.1888164363518926e-05, + "loss": 5.0864, + "step": 1329 + }, + { + "epoch": 0.3381535038932147, + "grad_norm": 19991.333984375, + "learning_rate": 4.187179427390293e-05, + "loss": 5.0901, + "step": 1330 + }, + { + "epoch": 0.3384077546480216, + "grad_norm": 20147.642578125, + "learning_rate": 4.1855410888934244e-05, + "loss": 5.0817, + "step": 1331 + }, + { + "epoch": 0.33866200540282854, + "grad_norm": 20130.478515625, + "learning_rate": 4.183901422152332e-05, + "loss": 5.071, + "step": 1332 + }, + { + "epoch": 0.3389162561576355, + "grad_norm": 20706.8828125, + "learning_rate": 4.182260428459113e-05, + "loss": 5.0844, + "step": 1333 + }, + { + "epoch": 0.3391705069124424, + "grad_norm": 20779.1875, + "learning_rate": 4.1806181091069046e-05, + "loss": 5.066, + "step": 1334 + }, + { + "epoch": 0.33942475766724933, + "grad_norm": 20123.12109375, + "learning_rate": 4.178974465389893e-05, + "loss": 5.0754, + "step": 1335 + }, + { + "epoch": 0.33967900842205623, + "grad_norm": 20410.642578125, + "learning_rate": 4.177329498603305e-05, + "loss": 5.0684, + "step": 1336 + }, + { + "epoch": 0.3399332591768632, + "grad_norm": 20151.8515625, + "learning_rate": 4.175683210043413e-05, + "loss": 5.0753, + "step": 1337 + }, + { + "epoch": 0.34018750993167013, + "grad_norm": 20529.083984375, + "learning_rate": 4.174035601007528e-05, + "loss": 5.0828, + "step": 1338 + }, + { + "epoch": 0.340441760686477, + "grad_norm": 20459.279296875, + "learning_rate": 4.1723866727940036e-05, + "loss": 5.0828, + "step": 1339 + }, + { + "epoch": 0.340696011441284, + "grad_norm": 21032.62890625, + "learning_rate": 4.170736426702232e-05, + "loss": 5.0667, + "step": 1340 + }, + { + "epoch": 0.34095026219609087, + "grad_norm": 20187.595703125, + "learning_rate": 4.1690848640326444e-05, + "loss": 5.0701, + "step": 1341 + }, + { + "epoch": 0.3412045129508978, + "grad_norm": 20655.5234375, + "learning_rate": 4.167431986086708e-05, + "loss": 5.0878, + "step": 1342 + }, + { + "epoch": 0.34145876370570477, + "grad_norm": 20282.087890625, + "learning_rate": 4.16577779416693e-05, + "loss": 5.0669, + "step": 1343 + }, + { + "epoch": 0.34171301446051167, + "grad_norm": 20813.08984375, + "learning_rate": 4.164122289576849e-05, + "loss": 5.0834, + "step": 1344 + }, + { + "epoch": 0.3419672652153186, + "grad_norm": 20602.091796875, + "learning_rate": 4.16246547362104e-05, + "loss": 5.0712, + "step": 1345 + }, + { + "epoch": 0.3422215159701255, + "grad_norm": 20427.025390625, + "learning_rate": 4.160807347605112e-05, + "loss": 5.0769, + "step": 1346 + }, + { + "epoch": 0.34247576672493246, + "grad_norm": 21232.017578125, + "learning_rate": 4.1591479128357054e-05, + "loss": 5.0635, + "step": 1347 + }, + { + "epoch": 0.3427300174797394, + "grad_norm": 20302.38671875, + "learning_rate": 4.157487170620491e-05, + "loss": 5.0631, + "step": 1348 + }, + { + "epoch": 0.3429842682345463, + "grad_norm": 20684.921875, + "learning_rate": 4.155825122268172e-05, + "loss": 5.0664, + "step": 1349 + }, + { + "epoch": 0.34323851898935326, + "grad_norm": 20399.96484375, + "learning_rate": 4.154161769088479e-05, + "loss": 5.058, + "step": 1350 + }, + { + "epoch": 0.34349276974416015, + "grad_norm": 20703.484375, + "learning_rate": 4.152497112392173e-05, + "loss": 5.0605, + "step": 1351 + }, + { + "epoch": 0.3437470204989671, + "grad_norm": 20663.806640625, + "learning_rate": 4.1508311534910394e-05, + "loss": 5.0588, + "step": 1352 + }, + { + "epoch": 0.34400127125377405, + "grad_norm": 20704.51953125, + "learning_rate": 4.149163893697893e-05, + "loss": 5.0786, + "step": 1353 + }, + { + "epoch": 0.34425552200858095, + "grad_norm": 21822.69140625, + "learning_rate": 4.147495334326569e-05, + "loss": 5.0637, + "step": 1354 + }, + { + "epoch": 0.3445097727633879, + "grad_norm": 21407.75390625, + "learning_rate": 4.145825476691932e-05, + "loss": 5.0656, + "step": 1355 + }, + { + "epoch": 0.3447640235181948, + "grad_norm": 21587.5703125, + "learning_rate": 4.144154322109867e-05, + "loss": 5.0581, + "step": 1356 + }, + { + "epoch": 0.34501827427300175, + "grad_norm": 20367.529296875, + "learning_rate": 4.142481871897281e-05, + "loss": 5.0524, + "step": 1357 + }, + { + "epoch": 0.3452725250278087, + "grad_norm": 25677.013671875, + "learning_rate": 4.1408081273721023e-05, + "loss": 5.0546, + "step": 1358 + }, + { + "epoch": 0.3455267757826156, + "grad_norm": 22312.373046875, + "learning_rate": 4.1391330898532794e-05, + "loss": 5.07, + "step": 1359 + }, + { + "epoch": 0.34578102653742254, + "grad_norm": 22724.1171875, + "learning_rate": 4.137456760660779e-05, + "loss": 5.0625, + "step": 1360 + }, + { + "epoch": 0.34603527729222944, + "grad_norm": 20273.291015625, + "learning_rate": 4.1357791411155865e-05, + "loss": 5.0526, + "step": 1361 + }, + { + "epoch": 0.3462895280470364, + "grad_norm": 27532.322265625, + "learning_rate": 4.134100232539704e-05, + "loss": 5.072, + "step": 1362 + }, + { + "epoch": 0.34654377880184334, + "grad_norm": 23138.115234375, + "learning_rate": 4.132420036256148e-05, + "loss": 5.0542, + "step": 1363 + }, + { + "epoch": 0.34679802955665023, + "grad_norm": 21453.220703125, + "learning_rate": 4.130738553588953e-05, + "loss": 5.061, + "step": 1364 + }, + { + "epoch": 0.3470522803114572, + "grad_norm": 20350.22265625, + "learning_rate": 4.129055785863163e-05, + "loss": 5.0696, + "step": 1365 + }, + { + "epoch": 0.3473065310662641, + "grad_norm": 22019.0078125, + "learning_rate": 4.1273717344048375e-05, + "loss": 5.0538, + "step": 1366 + }, + { + "epoch": 0.34756078182107103, + "grad_norm": 21566.451171875, + "learning_rate": 4.125686400541047e-05, + "loss": 5.0669, + "step": 1367 + }, + { + "epoch": 0.347815032575878, + "grad_norm": 20886.859375, + "learning_rate": 4.123999785599873e-05, + "loss": 5.0543, + "step": 1368 + }, + { + "epoch": 0.3480692833306849, + "grad_norm": 20517.46484375, + "learning_rate": 4.1223118909104055e-05, + "loss": 5.0622, + "step": 1369 + }, + { + "epoch": 0.3483235340854918, + "grad_norm": 21522.505859375, + "learning_rate": 4.1206227178027426e-05, + "loss": 5.048, + "step": 1370 + }, + { + "epoch": 0.3485777848402987, + "grad_norm": 20795.326171875, + "learning_rate": 4.118932267607991e-05, + "loss": 5.0577, + "step": 1371 + }, + { + "epoch": 0.34883203559510567, + "grad_norm": 21447.560546875, + "learning_rate": 4.117240541658264e-05, + "loss": 5.0348, + "step": 1372 + }, + { + "epoch": 0.3490862863499126, + "grad_norm": 20870.556640625, + "learning_rate": 4.11554754128668e-05, + "loss": 5.0614, + "step": 1373 + }, + { + "epoch": 0.3493405371047195, + "grad_norm": 24859.7109375, + "learning_rate": 4.11385326782736e-05, + "loss": 5.0334, + "step": 1374 + }, + { + "epoch": 0.34959478785952647, + "grad_norm": 21025.8984375, + "learning_rate": 4.11215772261543e-05, + "loss": 5.0341, + "step": 1375 + }, + { + "epoch": 0.34984903861433336, + "grad_norm": 20546.03125, + "learning_rate": 4.110460906987018e-05, + "loss": 5.0444, + "step": 1376 + }, + { + "epoch": 0.3501032893691403, + "grad_norm": 20573.177734375, + "learning_rate": 4.108762822279253e-05, + "loss": 5.0456, + "step": 1377 + }, + { + "epoch": 0.35035754012394726, + "grad_norm": 21535.056640625, + "learning_rate": 4.107063469830263e-05, + "loss": 5.0614, + "step": 1378 + }, + { + "epoch": 0.35061179087875416, + "grad_norm": 20411.982421875, + "learning_rate": 4.1053628509791766e-05, + "loss": 5.0659, + "step": 1379 + }, + { + "epoch": 0.3508660416335611, + "grad_norm": 21210.41015625, + "learning_rate": 4.1036609670661196e-05, + "loss": 5.0465, + "step": 1380 + }, + { + "epoch": 0.351120292388368, + "grad_norm": 20453.76171875, + "learning_rate": 4.101957819432215e-05, + "loss": 5.0596, + "step": 1381 + }, + { + "epoch": 0.35137454314317496, + "grad_norm": 21562.908203125, + "learning_rate": 4.10025340941958e-05, + "loss": 5.0398, + "step": 1382 + }, + { + "epoch": 0.3516287938979819, + "grad_norm": 20612.826171875, + "learning_rate": 4.098547738371329e-05, + "loss": 5.0448, + "step": 1383 + }, + { + "epoch": 0.3518830446527888, + "grad_norm": 20957.23828125, + "learning_rate": 4.09684080763157e-05, + "loss": 5.0404, + "step": 1384 + }, + { + "epoch": 0.35213729540759575, + "grad_norm": 21259.423828125, + "learning_rate": 4.095132618545401e-05, + "loss": 5.0433, + "step": 1385 + }, + { + "epoch": 0.35239154616240265, + "grad_norm": 21018.7890625, + "learning_rate": 4.093423172458914e-05, + "loss": 5.0539, + "step": 1386 + }, + { + "epoch": 0.3526457969172096, + "grad_norm": 20516.033203125, + "learning_rate": 4.0917124707191915e-05, + "loss": 5.0491, + "step": 1387 + }, + { + "epoch": 0.35290004767201655, + "grad_norm": 20672.13671875, + "learning_rate": 4.0900005146743035e-05, + "loss": 5.0401, + "step": 1388 + }, + { + "epoch": 0.35315429842682344, + "grad_norm": 20577.83203125, + "learning_rate": 4.0882873056733116e-05, + "loss": 5.0303, + "step": 1389 + }, + { + "epoch": 0.3534085491816304, + "grad_norm": 20534.044921875, + "learning_rate": 4.086572845066262e-05, + "loss": 5.0558, + "step": 1390 + }, + { + "epoch": 0.3536627999364373, + "grad_norm": 20514.53125, + "learning_rate": 4.084857134204187e-05, + "loss": 5.0317, + "step": 1391 + }, + { + "epoch": 0.35391705069124424, + "grad_norm": 20635.29296875, + "learning_rate": 4.0831401744391087e-05, + "loss": 5.0513, + "step": 1392 + }, + { + "epoch": 0.3541713014460512, + "grad_norm": 20619.875, + "learning_rate": 4.081421967124026e-05, + "loss": 5.0604, + "step": 1393 + }, + { + "epoch": 0.3544255522008581, + "grad_norm": 20880.6640625, + "learning_rate": 4.079702513612927e-05, + "loss": 5.0263, + "step": 1394 + }, + { + "epoch": 0.35467980295566504, + "grad_norm": 20556.16015625, + "learning_rate": 4.077981815260779e-05, + "loss": 5.0342, + "step": 1395 + }, + { + "epoch": 0.35493405371047193, + "grad_norm": 20635.166015625, + "learning_rate": 4.0762598734235314e-05, + "loss": 5.0356, + "step": 1396 + }, + { + "epoch": 0.3551883044652789, + "grad_norm": 20488.4921875, + "learning_rate": 4.0745366894581126e-05, + "loss": 5.0288, + "step": 1397 + }, + { + "epoch": 0.35544255522008583, + "grad_norm": 20495.84765625, + "learning_rate": 4.072812264722431e-05, + "loss": 5.0337, + "step": 1398 + }, + { + "epoch": 0.3556968059748927, + "grad_norm": 20507.818359375, + "learning_rate": 4.071086600575371e-05, + "loss": 5.0293, + "step": 1399 + }, + { + "epoch": 0.3559510567296997, + "grad_norm": 20516.603515625, + "learning_rate": 4.069359698376795e-05, + "loss": 5.0313, + "step": 1400 + }, + { + "epoch": 0.3559510567296997, + "eval_loss": 10.153711318969727, + "eval_runtime": 696.649, + "eval_samples_per_second": 152.137, + "eval_steps_per_second": 9.51, + "step": 1400 + }, + { + "epoch": 0.3562053074845066, + "grad_norm": 21380.99609375, + "learning_rate": 4.0676315594875416e-05, + "loss": 5.0426, + "step": 1401 + }, + { + "epoch": 0.3564595582393135, + "grad_norm": 24253.44140625, + "learning_rate": 4.0659021852694226e-05, + "loss": 5.0363, + "step": 1402 + }, + { + "epoch": 0.3567138089941205, + "grad_norm": 20590.5078125, + "learning_rate": 4.0641715770852215e-05, + "loss": 5.0254, + "step": 1403 + }, + { + "epoch": 0.35696805974892737, + "grad_norm": 21081.021484375, + "learning_rate": 4.0624397362987e-05, + "loss": 5.0305, + "step": 1404 + }, + { + "epoch": 0.3572223105037343, + "grad_norm": 20629.21484375, + "learning_rate": 4.060706664274585e-05, + "loss": 5.0423, + "step": 1405 + }, + { + "epoch": 0.3574765612585412, + "grad_norm": 21007.060546875, + "learning_rate": 4.058972362378578e-05, + "loss": 5.0327, + "step": 1406 + }, + { + "epoch": 0.35773081201334817, + "grad_norm": 20600.265625, + "learning_rate": 4.057236831977346e-05, + "loss": 5.0392, + "step": 1407 + }, + { + "epoch": 0.3579850627681551, + "grad_norm": 20932.49609375, + "learning_rate": 4.0555000744385274e-05, + "loss": 5.0392, + "step": 1408 + }, + { + "epoch": 0.358239313522962, + "grad_norm": 20723.2265625, + "learning_rate": 4.053762091130725e-05, + "loss": 5.037, + "step": 1409 + }, + { + "epoch": 0.35849356427776896, + "grad_norm": 20771.119140625, + "learning_rate": 4.052022883423509e-05, + "loss": 5.0302, + "step": 1410 + }, + { + "epoch": 0.35874781503257586, + "grad_norm": 20619.994140625, + "learning_rate": 4.050282452687415e-05, + "loss": 5.0469, + "step": 1411 + }, + { + "epoch": 0.3590020657873828, + "grad_norm": 20759.640625, + "learning_rate": 4.04854080029394e-05, + "loss": 5.0293, + "step": 1412 + }, + { + "epoch": 0.35925631654218976, + "grad_norm": 20662.76953125, + "learning_rate": 4.0467979276155464e-05, + "loss": 5.0319, + "step": 1413 + }, + { + "epoch": 0.35951056729699665, + "grad_norm": 20976.1875, + "learning_rate": 4.045053836025656e-05, + "loss": 5.0432, + "step": 1414 + }, + { + "epoch": 0.3597648180518036, + "grad_norm": 20431.185546875, + "learning_rate": 4.043308526898654e-05, + "loss": 5.0419, + "step": 1415 + }, + { + "epoch": 0.3600190688066105, + "grad_norm": 20750.216796875, + "learning_rate": 4.041562001609881e-05, + "loss": 5.0167, + "step": 1416 + }, + { + "epoch": 0.36027331956141745, + "grad_norm": 20800.796875, + "learning_rate": 4.0398142615356396e-05, + "loss": 5.0224, + "step": 1417 + }, + { + "epoch": 0.3605275703162244, + "grad_norm": 20754.34765625, + "learning_rate": 4.038065308053187e-05, + "loss": 5.0426, + "step": 1418 + }, + { + "epoch": 0.3607818210710313, + "grad_norm": 20627.396484375, + "learning_rate": 4.036315142540739e-05, + "loss": 5.0126, + "step": 1419 + }, + { + "epoch": 0.36103607182583825, + "grad_norm": 24564.541015625, + "learning_rate": 4.0345637663774635e-05, + "loss": 5.0212, + "step": 1420 + }, + { + "epoch": 0.36129032258064514, + "grad_norm": 37871.4765625, + "learning_rate": 4.032811180943487e-05, + "loss": 5.0426, + "step": 1421 + }, + { + "epoch": 0.3615445733354521, + "grad_norm": 20797.484375, + "learning_rate": 4.0310573876198846e-05, + "loss": 5.0191, + "step": 1422 + }, + { + "epoch": 0.36179882409025904, + "grad_norm": 21103.12109375, + "learning_rate": 4.0293023877886846e-05, + "loss": 5.0256, + "step": 1423 + }, + { + "epoch": 0.36205307484506594, + "grad_norm": 20829.919921875, + "learning_rate": 4.027546182832866e-05, + "loss": 5.0255, + "step": 1424 + }, + { + "epoch": 0.3623073255998729, + "grad_norm": 21079.755859375, + "learning_rate": 4.0257887741363585e-05, + "loss": 5.0259, + "step": 1425 + }, + { + "epoch": 0.3625615763546798, + "grad_norm": 20816.08984375, + "learning_rate": 4.02403016308404e-05, + "loss": 5.034, + "step": 1426 + }, + { + "epoch": 0.36281582710948673, + "grad_norm": 21079.033203125, + "learning_rate": 4.022270351061735e-05, + "loss": 5.0219, + "step": 1427 + }, + { + "epoch": 0.3630700778642937, + "grad_norm": 21122.498046875, + "learning_rate": 4.020509339456214e-05, + "loss": 5.0042, + "step": 1428 + }, + { + "epoch": 0.3633243286191006, + "grad_norm": 20782.990234375, + "learning_rate": 4.0187471296551956e-05, + "loss": 5.0208, + "step": 1429 + }, + { + "epoch": 0.36357857937390753, + "grad_norm": 20848.810546875, + "learning_rate": 4.0169837230473386e-05, + "loss": 5.0285, + "step": 1430 + }, + { + "epoch": 0.3638328301287144, + "grad_norm": 20720.689453125, + "learning_rate": 4.0152191210222485e-05, + "loss": 5.0277, + "step": 1431 + }, + { + "epoch": 0.3640870808835214, + "grad_norm": 20801.99609375, + "learning_rate": 4.013453324970471e-05, + "loss": 5.0248, + "step": 1432 + }, + { + "epoch": 0.3643413316383283, + "grad_norm": 20609.95703125, + "learning_rate": 4.011686336283492e-05, + "loss": 5.0061, + "step": 1433 + }, + { + "epoch": 0.3645955823931352, + "grad_norm": 21219.537109375, + "learning_rate": 4.0099181563537395e-05, + "loss": 5.0149, + "step": 1434 + }, + { + "epoch": 0.36484983314794217, + "grad_norm": 20856.98828125, + "learning_rate": 4.008148786574579e-05, + "loss": 5.0149, + "step": 1435 + }, + { + "epoch": 0.36510408390274907, + "grad_norm": 20848.314453125, + "learning_rate": 4.006378228340313e-05, + "loss": 5.0069, + "step": 1436 + }, + { + "epoch": 0.365358334657556, + "grad_norm": 21045.689453125, + "learning_rate": 4.0046064830461816e-05, + "loss": 5.0127, + "step": 1437 + }, + { + "epoch": 0.36561258541236297, + "grad_norm": 20940.07421875, + "learning_rate": 4.002833552088359e-05, + "loss": 5.0187, + "step": 1438 + }, + { + "epoch": 0.36586683616716986, + "grad_norm": 20717.06640625, + "learning_rate": 4.001059436863955e-05, + "loss": 5.0132, + "step": 1439 + }, + { + "epoch": 0.3661210869219768, + "grad_norm": 21017.23828125, + "learning_rate": 3.999284138771013e-05, + "loss": 4.994, + "step": 1440 + }, + { + "epoch": 0.3663753376767837, + "grad_norm": 20876.576171875, + "learning_rate": 3.997507659208507e-05, + "loss": 5.017, + "step": 1441 + }, + { + "epoch": 0.36662958843159066, + "grad_norm": 21044.259765625, + "learning_rate": 3.995729999576343e-05, + "loss": 5.0022, + "step": 1442 + }, + { + "epoch": 0.3668838391863976, + "grad_norm": 20672.845703125, + "learning_rate": 3.9939511612753564e-05, + "loss": 5.0136, + "step": 1443 + }, + { + "epoch": 0.3671380899412045, + "grad_norm": 20896.17578125, + "learning_rate": 3.9921711457073125e-05, + "loss": 5.0031, + "step": 1444 + }, + { + "epoch": 0.36739234069601145, + "grad_norm": 21340.353515625, + "learning_rate": 3.9903899542749026e-05, + "loss": 5.0142, + "step": 1445 + }, + { + "epoch": 0.36764659145081835, + "grad_norm": 20945.185546875, + "learning_rate": 3.988607588381746e-05, + "loss": 5.0071, + "step": 1446 + }, + { + "epoch": 0.3679008422056253, + "grad_norm": 21055.23828125, + "learning_rate": 3.986824049432387e-05, + "loss": 4.9991, + "step": 1447 + }, + { + "epoch": 0.36815509296043225, + "grad_norm": 20799.30859375, + "learning_rate": 3.985039338832295e-05, + "loss": 5.0117, + "step": 1448 + }, + { + "epoch": 0.36840934371523915, + "grad_norm": 20687.353515625, + "learning_rate": 3.983253457987861e-05, + "loss": 4.9938, + "step": 1449 + }, + { + "epoch": 0.3686635944700461, + "grad_norm": 20659.6875, + "learning_rate": 3.981466408306399e-05, + "loss": 4.9841, + "step": 1450 + }, + { + "epoch": 0.368917845224853, + "grad_norm": 20908.5078125, + "learning_rate": 3.979678191196146e-05, + "loss": 5.0204, + "step": 1451 + }, + { + "epoch": 0.36917209597965994, + "grad_norm": 20579.8984375, + "learning_rate": 3.9778888080662555e-05, + "loss": 4.9961, + "step": 1452 + }, + { + "epoch": 0.3694263467344669, + "grad_norm": 20834.25390625, + "learning_rate": 3.976098260326802e-05, + "loss": 5.0062, + "step": 1453 + }, + { + "epoch": 0.3696805974892738, + "grad_norm": 20722.501953125, + "learning_rate": 3.9743065493887774e-05, + "loss": 5.0081, + "step": 1454 + }, + { + "epoch": 0.36993484824408074, + "grad_norm": 20788.0703125, + "learning_rate": 3.97251367666409e-05, + "loss": 5.0124, + "step": 1455 + }, + { + "epoch": 0.37018909899888763, + "grad_norm": 20767.16015625, + "learning_rate": 3.970719643565565e-05, + "loss": 5.0078, + "step": 1456 + }, + { + "epoch": 0.3704433497536946, + "grad_norm": 20994.658203125, + "learning_rate": 3.968924451506939e-05, + "loss": 5.0139, + "step": 1457 + }, + { + "epoch": 0.37069760050850153, + "grad_norm": 20918.130859375, + "learning_rate": 3.9671281019028645e-05, + "loss": 5.0012, + "step": 1458 + }, + { + "epoch": 0.37095185126330843, + "grad_norm": 20856.056640625, + "learning_rate": 3.9653305961689044e-05, + "loss": 5.0008, + "step": 1459 + }, + { + "epoch": 0.3712061020181154, + "grad_norm": 21184.412109375, + "learning_rate": 3.9635319357215365e-05, + "loss": 5.0242, + "step": 1460 + }, + { + "epoch": 0.3714603527729223, + "grad_norm": 20952.85546875, + "learning_rate": 3.961732121978142e-05, + "loss": 5.0063, + "step": 1461 + }, + { + "epoch": 0.3717146035277292, + "grad_norm": 20994.27734375, + "learning_rate": 3.959931156357016e-05, + "loss": 4.9989, + "step": 1462 + }, + { + "epoch": 0.3719688542825362, + "grad_norm": 20795.455078125, + "learning_rate": 3.9581290402773605e-05, + "loss": 4.9806, + "step": 1463 + }, + { + "epoch": 0.37222310503734307, + "grad_norm": 21110.5234375, + "learning_rate": 3.956325775159282e-05, + "loss": 4.9924, + "step": 1464 + }, + { + "epoch": 0.37247735579215, + "grad_norm": 21154.40625, + "learning_rate": 3.954521362423795e-05, + "loss": 4.9912, + "step": 1465 + }, + { + "epoch": 0.3727316065469569, + "grad_norm": 21014.50390625, + "learning_rate": 3.952715803492818e-05, + "loss": 5.0042, + "step": 1466 + }, + { + "epoch": 0.37298585730176387, + "grad_norm": 21070.060546875, + "learning_rate": 3.95090909978917e-05, + "loss": 5.0039, + "step": 1467 + }, + { + "epoch": 0.3732401080565708, + "grad_norm": 21246.404296875, + "learning_rate": 3.9491012527365753e-05, + "loss": 5.0018, + "step": 1468 + }, + { + "epoch": 0.3734943588113777, + "grad_norm": 21604.51953125, + "learning_rate": 3.9472922637596576e-05, + "loss": 5.0051, + "step": 1469 + }, + { + "epoch": 0.37374860956618466, + "grad_norm": 20936.728515625, + "learning_rate": 3.945482134283941e-05, + "loss": 4.9825, + "step": 1470 + }, + { + "epoch": 0.37400286032099156, + "grad_norm": 20951.236328125, + "learning_rate": 3.943670865735849e-05, + "loss": 4.9922, + "step": 1471 + }, + { + "epoch": 0.3742571110757985, + "grad_norm": 20979.51171875, + "learning_rate": 3.9418584595427e-05, + "loss": 4.997, + "step": 1472 + }, + { + "epoch": 0.37451136183060546, + "grad_norm": 21166.302734375, + "learning_rate": 3.9400449171327115e-05, + "loss": 4.9898, + "step": 1473 + }, + { + "epoch": 0.37476561258541236, + "grad_norm": 20928.44140625, + "learning_rate": 3.938230239934997e-05, + "loss": 4.9839, + "step": 1474 + }, + { + "epoch": 0.3750198633402193, + "grad_norm": 21149.673828125, + "learning_rate": 3.93641442937956e-05, + "loss": 4.9893, + "step": 1475 + }, + { + "epoch": 0.3752741140950262, + "grad_norm": 20992.947265625, + "learning_rate": 3.934597486897303e-05, + "loss": 4.9882, + "step": 1476 + }, + { + "epoch": 0.37552836484983315, + "grad_norm": 21158.515625, + "learning_rate": 3.932779413920017e-05, + "loss": 5.0042, + "step": 1477 + }, + { + "epoch": 0.3757826156046401, + "grad_norm": 20933.39453125, + "learning_rate": 3.9309602118803824e-05, + "loss": 4.9999, + "step": 1478 + }, + { + "epoch": 0.376036866359447, + "grad_norm": 21171.5234375, + "learning_rate": 3.9291398822119725e-05, + "loss": 5.0124, + "step": 1479 + }, + { + "epoch": 0.37629111711425395, + "grad_norm": 21032.017578125, + "learning_rate": 3.927318426349248e-05, + "loss": 5.0032, + "step": 1480 + }, + { + "epoch": 0.37654536786906084, + "grad_norm": 20867.19140625, + "learning_rate": 3.925495845727557e-05, + "loss": 4.9924, + "step": 1481 + }, + { + "epoch": 0.3767996186238678, + "grad_norm": 21033.208984375, + "learning_rate": 3.923672141783133e-05, + "loss": 4.9901, + "step": 1482 + }, + { + "epoch": 0.37705386937867474, + "grad_norm": 21061.41015625, + "learning_rate": 3.9218473159530975e-05, + "loss": 4.9939, + "step": 1483 + }, + { + "epoch": 0.37730812013348164, + "grad_norm": 21026.09765625, + "learning_rate": 3.920021369675453e-05, + "loss": 4.9824, + "step": 1484 + }, + { + "epoch": 0.3775623708882886, + "grad_norm": 21149.673828125, + "learning_rate": 3.9181943043890865e-05, + "loss": 4.982, + "step": 1485 + }, + { + "epoch": 0.3778166216430955, + "grad_norm": 20940.236328125, + "learning_rate": 3.916366121533767e-05, + "loss": 4.9815, + "step": 1486 + }, + { + "epoch": 0.37807087239790244, + "grad_norm": 21251.8515625, + "learning_rate": 3.914536822550141e-05, + "loss": 4.9913, + "step": 1487 + }, + { + "epoch": 0.3783251231527094, + "grad_norm": 20905.994140625, + "learning_rate": 3.9127064088797406e-05, + "loss": 4.9715, + "step": 1488 + }, + { + "epoch": 0.3785793739075163, + "grad_norm": 21418.8125, + "learning_rate": 3.910874881964971e-05, + "loss": 4.9821, + "step": 1489 + }, + { + "epoch": 0.37883362466232323, + "grad_norm": 21253.865234375, + "learning_rate": 3.909042243249117e-05, + "loss": 4.984, + "step": 1490 + }, + { + "epoch": 0.3790878754171301, + "grad_norm": 20961.84375, + "learning_rate": 3.9072084941763395e-05, + "loss": 4.9671, + "step": 1491 + }, + { + "epoch": 0.3793421261719371, + "grad_norm": 21087.693359375, + "learning_rate": 3.905373636191673e-05, + "loss": 4.9792, + "step": 1492 + }, + { + "epoch": 0.37959637692674403, + "grad_norm": 20970.341796875, + "learning_rate": 3.903537670741026e-05, + "loss": 4.9965, + "step": 1493 + }, + { + "epoch": 0.3798506276815509, + "grad_norm": 21103.857421875, + "learning_rate": 3.901700599271184e-05, + "loss": 4.9863, + "step": 1494 + }, + { + "epoch": 0.3801048784363579, + "grad_norm": 20976.533203125, + "learning_rate": 3.8998624232297975e-05, + "loss": 4.9838, + "step": 1495 + }, + { + "epoch": 0.38035912919116477, + "grad_norm": 20968.71875, + "learning_rate": 3.89802314406539e-05, + "loss": 4.978, + "step": 1496 + }, + { + "epoch": 0.3806133799459717, + "grad_norm": 21097.630859375, + "learning_rate": 3.896182763227358e-05, + "loss": 4.9771, + "step": 1497 + }, + { + "epoch": 0.38086763070077867, + "grad_norm": 21137.41796875, + "learning_rate": 3.89434128216596e-05, + "loss": 4.9686, + "step": 1498 + }, + { + "epoch": 0.38112188145558556, + "grad_norm": 20979.57421875, + "learning_rate": 3.892498702332326e-05, + "loss": 4.97, + "step": 1499 + }, + { + "epoch": 0.3813761322103925, + "grad_norm": 21282.443359375, + "learning_rate": 3.890655025178449e-05, + "loss": 4.9794, + "step": 1500 + }, + { + "epoch": 0.3813761322103925, + "eval_loss": 10.037830352783203, + "eval_runtime": 699.4502, + "eval_samples_per_second": 151.528, + "eval_steps_per_second": 9.472, + "step": 1500 + }, + { + "epoch": 0.3816303829651994, + "grad_norm": 21108.099609375, + "learning_rate": 3.888810252157189e-05, + "loss": 4.9901, + "step": 1501 + }, + { + "epoch": 0.38188463372000636, + "grad_norm": 21029.072265625, + "learning_rate": 3.886964384722268e-05, + "loss": 4.9658, + "step": 1502 + }, + { + "epoch": 0.3821388844748133, + "grad_norm": 20969.4140625, + "learning_rate": 3.885117424328271e-05, + "loss": 4.9668, + "step": 1503 + }, + { + "epoch": 0.3823931352296202, + "grad_norm": 21315.923828125, + "learning_rate": 3.8832693724306444e-05, + "loss": 4.9728, + "step": 1504 + }, + { + "epoch": 0.38264738598442716, + "grad_norm": 21029.814453125, + "learning_rate": 3.881420230485696e-05, + "loss": 4.9772, + "step": 1505 + }, + { + "epoch": 0.38290163673923405, + "grad_norm": 20990.447265625, + "learning_rate": 3.879569999950589e-05, + "loss": 4.9651, + "step": 1506 + }, + { + "epoch": 0.383155887494041, + "grad_norm": 21186.33984375, + "learning_rate": 3.877718682283347e-05, + "loss": 4.9908, + "step": 1507 + }, + { + "epoch": 0.38341013824884795, + "grad_norm": 21121.689453125, + "learning_rate": 3.8758662789428515e-05, + "loss": 4.9748, + "step": 1508 + }, + { + "epoch": 0.38366438900365485, + "grad_norm": 21179.07421875, + "learning_rate": 3.8740127913888356e-05, + "loss": 4.9754, + "step": 1509 + }, + { + "epoch": 0.3839186397584618, + "grad_norm": 21273.123046875, + "learning_rate": 3.872158221081891e-05, + "loss": 4.9709, + "step": 1510 + }, + { + "epoch": 0.3841728905132687, + "grad_norm": 21022.14453125, + "learning_rate": 3.870302569483459e-05, + "loss": 4.9545, + "step": 1511 + }, + { + "epoch": 0.38442714126807565, + "grad_norm": 21070.720703125, + "learning_rate": 3.868445838055836e-05, + "loss": 4.9764, + "step": 1512 + }, + { + "epoch": 0.3846813920228826, + "grad_norm": 20866.03125, + "learning_rate": 3.866588028262169e-05, + "loss": 4.9572, + "step": 1513 + }, + { + "epoch": 0.3849356427776895, + "grad_norm": 21212.82421875, + "learning_rate": 3.864729141566452e-05, + "loss": 4.9774, + "step": 1514 + }, + { + "epoch": 0.38518989353249644, + "grad_norm": 21061.98828125, + "learning_rate": 3.8628691794335294e-05, + "loss": 4.9789, + "step": 1515 + }, + { + "epoch": 0.38544414428730334, + "grad_norm": 21465.978515625, + "learning_rate": 3.861008143329095e-05, + "loss": 4.9567, + "step": 1516 + }, + { + "epoch": 0.3856983950421103, + "grad_norm": 21075.642578125, + "learning_rate": 3.859146034719684e-05, + "loss": 4.9722, + "step": 1517 + }, + { + "epoch": 0.3859526457969172, + "grad_norm": 21462.427734375, + "learning_rate": 3.857282855072683e-05, + "loss": 4.9852, + "step": 1518 + }, + { + "epoch": 0.38620689655172413, + "grad_norm": 21176.634765625, + "learning_rate": 3.855418605856318e-05, + "loss": 4.9805, + "step": 1519 + }, + { + "epoch": 0.3864611473065311, + "grad_norm": 21378.16015625, + "learning_rate": 3.8535532885396585e-05, + "loss": 4.9789, + "step": 1520 + }, + { + "epoch": 0.386715398061338, + "grad_norm": 21267.939453125, + "learning_rate": 3.851686904592617e-05, + "loss": 4.9743, + "step": 1521 + }, + { + "epoch": 0.38696964881614493, + "grad_norm": 21190.212890625, + "learning_rate": 3.8498194554859476e-05, + "loss": 4.9752, + "step": 1522 + }, + { + "epoch": 0.3872238995709518, + "grad_norm": 21358.046875, + "learning_rate": 3.84795094269124e-05, + "loss": 4.9709, + "step": 1523 + }, + { + "epoch": 0.3874781503257588, + "grad_norm": 21233.3203125, + "learning_rate": 3.846081367680924e-05, + "loss": 4.9649, + "step": 1524 + }, + { + "epoch": 0.3877324010805657, + "grad_norm": 21131.3046875, + "learning_rate": 3.844210731928268e-05, + "loss": 4.9719, + "step": 1525 + }, + { + "epoch": 0.3879866518353726, + "grad_norm": 21464.201171875, + "learning_rate": 3.842339036907375e-05, + "loss": 4.9638, + "step": 1526 + }, + { + "epoch": 0.38824090259017957, + "grad_norm": 21253.546875, + "learning_rate": 3.840466284093183e-05, + "loss": 4.9535, + "step": 1527 + }, + { + "epoch": 0.38849515334498647, + "grad_norm": 21201.64453125, + "learning_rate": 3.838592474961461e-05, + "loss": 4.9583, + "step": 1528 + }, + { + "epoch": 0.3887494040997934, + "grad_norm": 21423.244140625, + "learning_rate": 3.836717610988815e-05, + "loss": 4.9622, + "step": 1529 + }, + { + "epoch": 0.38900365485460037, + "grad_norm": 21263.2265625, + "learning_rate": 3.834841693652679e-05, + "loss": 4.9578, + "step": 1530 + }, + { + "epoch": 0.38925790560940726, + "grad_norm": 21437.376953125, + "learning_rate": 3.832964724431318e-05, + "loss": 4.9623, + "step": 1531 + }, + { + "epoch": 0.3895121563642142, + "grad_norm": 21214.513671875, + "learning_rate": 3.8310867048038256e-05, + "loss": 4.9606, + "step": 1532 + }, + { + "epoch": 0.3897664071190211, + "grad_norm": 21181.826171875, + "learning_rate": 3.829207636250124e-05, + "loss": 4.962, + "step": 1533 + }, + { + "epoch": 0.39002065787382806, + "grad_norm": 21346.517578125, + "learning_rate": 3.8273275202509616e-05, + "loss": 4.9658, + "step": 1534 + }, + { + "epoch": 0.390274908628635, + "grad_norm": 21517.15234375, + "learning_rate": 3.8254463582879105e-05, + "loss": 4.9629, + "step": 1535 + }, + { + "epoch": 0.3905291593834419, + "grad_norm": 21421.583984375, + "learning_rate": 3.823564151843371e-05, + "loss": 4.9644, + "step": 1536 + }, + { + "epoch": 0.39078341013824885, + "grad_norm": 21565.421875, + "learning_rate": 3.821680902400562e-05, + "loss": 4.9637, + "step": 1537 + }, + { + "epoch": 0.39103766089305575, + "grad_norm": 21515.404296875, + "learning_rate": 3.8197966114435265e-05, + "loss": 4.9657, + "step": 1538 + }, + { + "epoch": 0.3912919116478627, + "grad_norm": 21260.091796875, + "learning_rate": 3.81791128045713e-05, + "loss": 4.9557, + "step": 1539 + }, + { + "epoch": 0.39154616240266965, + "grad_norm": 21503.732421875, + "learning_rate": 3.816024910927054e-05, + "loss": 4.9738, + "step": 1540 + }, + { + "epoch": 0.39180041315747655, + "grad_norm": 21256.716796875, + "learning_rate": 3.8141375043397996e-05, + "loss": 4.9709, + "step": 1541 + }, + { + "epoch": 0.3920546639122835, + "grad_norm": 21197.525390625, + "learning_rate": 3.812249062182687e-05, + "loss": 4.9463, + "step": 1542 + }, + { + "epoch": 0.3923089146670904, + "grad_norm": 21073.4140625, + "learning_rate": 3.8103595859438495e-05, + "loss": 4.9453, + "step": 1543 + }, + { + "epoch": 0.39256316542189734, + "grad_norm": 21267.0703125, + "learning_rate": 3.808469077112238e-05, + "loss": 4.9545, + "step": 1544 + }, + { + "epoch": 0.3928174161767043, + "grad_norm": 21215.130859375, + "learning_rate": 3.8065775371776166e-05, + "loss": 4.9625, + "step": 1545 + }, + { + "epoch": 0.3930716669315112, + "grad_norm": 21569.37890625, + "learning_rate": 3.8046849676305587e-05, + "loss": 4.9701, + "step": 1546 + }, + { + "epoch": 0.39332591768631814, + "grad_norm": 21321.40625, + "learning_rate": 3.802791369962453e-05, + "loss": 4.9533, + "step": 1547 + }, + { + "epoch": 0.39358016844112503, + "grad_norm": 21263.0625, + "learning_rate": 3.800896745665498e-05, + "loss": 4.9449, + "step": 1548 + }, + { + "epoch": 0.393834419195932, + "grad_norm": 21279.13671875, + "learning_rate": 3.799001096232699e-05, + "loss": 4.9602, + "step": 1549 + }, + { + "epoch": 0.39408866995073893, + "grad_norm": 21376.54296875, + "learning_rate": 3.7971044231578706e-05, + "loss": 4.947, + "step": 1550 + }, + { + "epoch": 0.39434292070554583, + "grad_norm": 21252.875, + "learning_rate": 3.795206727935633e-05, + "loss": 4.947, + "step": 1551 + }, + { + "epoch": 0.3945971714603528, + "grad_norm": 21414.9765625, + "learning_rate": 3.793308012061414e-05, + "loss": 4.954, + "step": 1552 + }, + { + "epoch": 0.3948514222151597, + "grad_norm": 21248.7109375, + "learning_rate": 3.7914082770314436e-05, + "loss": 4.9502, + "step": 1553 + }, + { + "epoch": 0.3951056729699666, + "grad_norm": 21366.3125, + "learning_rate": 3.789507524342756e-05, + "loss": 4.9418, + "step": 1554 + }, + { + "epoch": 0.3953599237247736, + "grad_norm": 21357.92578125, + "learning_rate": 3.787605755493186e-05, + "loss": 4.9515, + "step": 1555 + }, + { + "epoch": 0.39561417447958047, + "grad_norm": 21427.646484375, + "learning_rate": 3.785702971981372e-05, + "loss": 4.956, + "step": 1556 + }, + { + "epoch": 0.3958684252343874, + "grad_norm": 21325.81640625, + "learning_rate": 3.783799175306747e-05, + "loss": 4.9588, + "step": 1557 + }, + { + "epoch": 0.3961226759891943, + "grad_norm": 21325.96875, + "learning_rate": 3.7818943669695496e-05, + "loss": 4.9478, + "step": 1558 + }, + { + "epoch": 0.39637692674400127, + "grad_norm": 21331.484375, + "learning_rate": 3.779988548470809e-05, + "loss": 4.9434, + "step": 1559 + }, + { + "epoch": 0.3966311774988082, + "grad_norm": 21457.83203125, + "learning_rate": 3.7780817213123534e-05, + "loss": 4.9562, + "step": 1560 + }, + { + "epoch": 0.3968854282536151, + "grad_norm": 21139.501953125, + "learning_rate": 3.7761738869968043e-05, + "loss": 4.9551, + "step": 1561 + }, + { + "epoch": 0.39713967900842206, + "grad_norm": 21467.375, + "learning_rate": 3.7742650470275806e-05, + "loss": 4.942, + "step": 1562 + }, + { + "epoch": 0.39739392976322896, + "grad_norm": 21406.478515625, + "learning_rate": 3.772355202908889e-05, + "loss": 4.9522, + "step": 1563 + }, + { + "epoch": 0.3976481805180359, + "grad_norm": 21372.96875, + "learning_rate": 3.77044435614573e-05, + "loss": 4.9374, + "step": 1564 + }, + { + "epoch": 0.39790243127284286, + "grad_norm": 21493.8125, + "learning_rate": 3.7685325082438943e-05, + "loss": 4.9512, + "step": 1565 + }, + { + "epoch": 0.39815668202764976, + "grad_norm": 21326.302734375, + "learning_rate": 3.76661966070996e-05, + "loss": 4.9539, + "step": 1566 + }, + { + "epoch": 0.3984109327824567, + "grad_norm": 21405.16796875, + "learning_rate": 3.764705815051295e-05, + "loss": 4.9479, + "step": 1567 + }, + { + "epoch": 0.3986651835372636, + "grad_norm": 21500.7734375, + "learning_rate": 3.762790972776052e-05, + "loss": 4.9305, + "step": 1568 + }, + { + "epoch": 0.39891943429207055, + "grad_norm": 21364.056640625, + "learning_rate": 3.76087513539317e-05, + "loss": 4.9309, + "step": 1569 + }, + { + "epoch": 0.3991736850468775, + "grad_norm": 21395.208984375, + "learning_rate": 3.758958304412372e-05, + "loss": 4.9331, + "step": 1570 + }, + { + "epoch": 0.3994279358016844, + "grad_norm": 21496.416015625, + "learning_rate": 3.7570404813441626e-05, + "loss": 4.942, + "step": 1571 + }, + { + "epoch": 0.39968218655649135, + "grad_norm": 21445.732421875, + "learning_rate": 3.7551216676998304e-05, + "loss": 4.9369, + "step": 1572 + }, + { + "epoch": 0.39993643731129824, + "grad_norm": 21446.796875, + "learning_rate": 3.753201864991444e-05, + "loss": 4.9327, + "step": 1573 + }, + { + "epoch": 0.4001906880661052, + "grad_norm": 21526.1015625, + "learning_rate": 3.7512810747318506e-05, + "loss": 4.9376, + "step": 1574 + }, + { + "epoch": 0.40044493882091214, + "grad_norm": 21728.33203125, + "learning_rate": 3.749359298434677e-05, + "loss": 4.9299, + "step": 1575 + }, + { + "epoch": 0.40069918957571904, + "grad_norm": 21494.74609375, + "learning_rate": 3.747436537614324e-05, + "loss": 4.9459, + "step": 1576 + }, + { + "epoch": 0.400953440330526, + "grad_norm": 21572.361328125, + "learning_rate": 3.745512793785972e-05, + "loss": 4.9343, + "step": 1577 + }, + { + "epoch": 0.4012076910853329, + "grad_norm": 21482.47265625, + "learning_rate": 3.743588068465573e-05, + "loss": 4.9482, + "step": 1578 + }, + { + "epoch": 0.40146194184013984, + "grad_norm": 21443.158203125, + "learning_rate": 3.741662363169856e-05, + "loss": 4.9344, + "step": 1579 + }, + { + "epoch": 0.4017161925949468, + "grad_norm": 21497.869140625, + "learning_rate": 3.739735679416317e-05, + "loss": 4.9174, + "step": 1580 + }, + { + "epoch": 0.4019704433497537, + "grad_norm": 21446.015625, + "learning_rate": 3.737808018723229e-05, + "loss": 4.9439, + "step": 1581 + }, + { + "epoch": 0.40222469410456063, + "grad_norm": 21972.1796875, + "learning_rate": 3.73587938260963e-05, + "loss": 4.9502, + "step": 1582 + }, + { + "epoch": 0.4024789448593675, + "grad_norm": 21463.888671875, + "learning_rate": 3.733949772595329e-05, + "loss": 4.9278, + "step": 1583 + }, + { + "epoch": 0.4027331956141745, + "grad_norm": 21688.326171875, + "learning_rate": 3.732019190200902e-05, + "loss": 4.9353, + "step": 1584 + }, + { + "epoch": 0.40298744636898143, + "grad_norm": 21537.6015625, + "learning_rate": 3.730087636947692e-05, + "loss": 4.9182, + "step": 1585 + }, + { + "epoch": 0.4032416971237883, + "grad_norm": 21491.390625, + "learning_rate": 3.728155114357805e-05, + "loss": 4.9369, + "step": 1586 + }, + { + "epoch": 0.4034959478785953, + "grad_norm": 23047.10546875, + "learning_rate": 3.7262216239541135e-05, + "loss": 4.9517, + "step": 1587 + }, + { + "epoch": 0.40375019863340217, + "grad_norm": 21860.529296875, + "learning_rate": 3.72428716726025e-05, + "loss": 4.9283, + "step": 1588 + }, + { + "epoch": 0.4040044493882091, + "grad_norm": 21623.978515625, + "learning_rate": 3.722351745800611e-05, + "loss": 4.9307, + "step": 1589 + }, + { + "epoch": 0.40425870014301607, + "grad_norm": 21743.81640625, + "learning_rate": 3.720415361100352e-05, + "loss": 4.9369, + "step": 1590 + }, + { + "epoch": 0.40451295089782296, + "grad_norm": 21573.21484375, + "learning_rate": 3.718478014685387e-05, + "loss": 4.9247, + "step": 1591 + }, + { + "epoch": 0.4047672016526299, + "grad_norm": 21771.232421875, + "learning_rate": 3.716539708082389e-05, + "loss": 4.9362, + "step": 1592 + }, + { + "epoch": 0.4050214524074368, + "grad_norm": 21642.990234375, + "learning_rate": 3.7146004428187864e-05, + "loss": 4.9422, + "step": 1593 + }, + { + "epoch": 0.40527570316224376, + "grad_norm": 21645.24609375, + "learning_rate": 3.7126602204227636e-05, + "loss": 4.9285, + "step": 1594 + }, + { + "epoch": 0.4055299539170507, + "grad_norm": 21890.04296875, + "learning_rate": 3.710719042423262e-05, + "loss": 4.9309, + "step": 1595 + }, + { + "epoch": 0.4057842046718576, + "grad_norm": 21528.333984375, + "learning_rate": 3.70877691034997e-05, + "loss": 4.9295, + "step": 1596 + }, + { + "epoch": 0.40603845542666456, + "grad_norm": 21905.609375, + "learning_rate": 3.706833825733333e-05, + "loss": 4.9203, + "step": 1597 + }, + { + "epoch": 0.40629270618147145, + "grad_norm": 22114.712890625, + "learning_rate": 3.704889790104545e-05, + "loss": 4.9165, + "step": 1598 + }, + { + "epoch": 0.4065469569362784, + "grad_norm": 21702.681640625, + "learning_rate": 3.7029448049955496e-05, + "loss": 4.9239, + "step": 1599 + }, + { + "epoch": 0.40680120769108535, + "grad_norm": 21679.171875, + "learning_rate": 3.7009988719390395e-05, + "loss": 4.9188, + "step": 1600 + }, + { + "epoch": 0.40680120769108535, + "eval_loss": 9.927664756774902, + "eval_runtime": 699.0872, + "eval_samples_per_second": 151.606, + "eval_steps_per_second": 9.477, + "step": 1600 + }, + { + "epoch": 0.40705545844589225, + "grad_norm": 21939.634765625, + "learning_rate": 3.699051992468453e-05, + "loss": 4.9212, + "step": 1601 + }, + { + "epoch": 0.4073097092006992, + "grad_norm": 21534.181640625, + "learning_rate": 3.6971041681179743e-05, + "loss": 4.9232, + "step": 1602 + }, + { + "epoch": 0.4075639599555061, + "grad_norm": 21565.546875, + "learning_rate": 3.695155400422534e-05, + "loss": 4.9285, + "step": 1603 + }, + { + "epoch": 0.40781821071031304, + "grad_norm": 21511.466796875, + "learning_rate": 3.693205690917804e-05, + "loss": 4.9153, + "step": 1604 + }, + { + "epoch": 0.40807246146512, + "grad_norm": 21691.6640625, + "learning_rate": 3.6912550411401984e-05, + "loss": 4.9164, + "step": 1605 + }, + { + "epoch": 0.4083267122199269, + "grad_norm": 21525.65234375, + "learning_rate": 3.6893034526268747e-05, + "loss": 4.9253, + "step": 1606 + }, + { + "epoch": 0.40858096297473384, + "grad_norm": 21488.17578125, + "learning_rate": 3.687350926915728e-05, + "loss": 4.9109, + "step": 1607 + }, + { + "epoch": 0.40883521372954074, + "grad_norm": 21611.3828125, + "learning_rate": 3.6853974655453914e-05, + "loss": 4.9272, + "step": 1608 + }, + { + "epoch": 0.4090894644843477, + "grad_norm": 21736.541015625, + "learning_rate": 3.683443070055237e-05, + "loss": 4.9366, + "step": 1609 + }, + { + "epoch": 0.40934371523915464, + "grad_norm": 21618.369140625, + "learning_rate": 3.681487741985373e-05, + "loss": 4.9196, + "step": 1610 + }, + { + "epoch": 0.40959796599396153, + "grad_norm": 21595.90625, + "learning_rate": 3.6795314828766405e-05, + "loss": 4.9213, + "step": 1611 + }, + { + "epoch": 0.4098522167487685, + "grad_norm": 21640.361328125, + "learning_rate": 3.677574294270617e-05, + "loss": 4.9271, + "step": 1612 + }, + { + "epoch": 0.4101064675035754, + "grad_norm": 21779.6015625, + "learning_rate": 3.675616177709609e-05, + "loss": 4.9261, + "step": 1613 + }, + { + "epoch": 0.41036071825838233, + "grad_norm": 21612.134765625, + "learning_rate": 3.673657134736658e-05, + "loss": 4.9255, + "step": 1614 + }, + { + "epoch": 0.4106149690131893, + "grad_norm": 21874.701171875, + "learning_rate": 3.6716971668955344e-05, + "loss": 4.9146, + "step": 1615 + }, + { + "epoch": 0.4108692197679962, + "grad_norm": 21585.548828125, + "learning_rate": 3.669736275730735e-05, + "loss": 4.9197, + "step": 1616 + }, + { + "epoch": 0.4111234705228031, + "grad_norm": 21718.55078125, + "learning_rate": 3.667774462787487e-05, + "loss": 4.9247, + "step": 1617 + }, + { + "epoch": 0.41137772127761, + "grad_norm": 21887.751953125, + "learning_rate": 3.665811729611744e-05, + "loss": 4.922, + "step": 1618 + }, + { + "epoch": 0.41163197203241697, + "grad_norm": 21929.798828125, + "learning_rate": 3.663848077750182e-05, + "loss": 4.9263, + "step": 1619 + }, + { + "epoch": 0.4118862227872239, + "grad_norm": 21495.984375, + "learning_rate": 3.661883508750203e-05, + "loss": 4.9005, + "step": 1620 + }, + { + "epoch": 0.4121404735420308, + "grad_norm": 21489.89453125, + "learning_rate": 3.6599180241599335e-05, + "loss": 4.9052, + "step": 1621 + }, + { + "epoch": 0.41239472429683777, + "grad_norm": 21656.25390625, + "learning_rate": 3.657951625528217e-05, + "loss": 4.8971, + "step": 1622 + }, + { + "epoch": 0.41264897505164466, + "grad_norm": 21682.71875, + "learning_rate": 3.655984314404621e-05, + "loss": 4.9104, + "step": 1623 + }, + { + "epoch": 0.4129032258064516, + "grad_norm": 21776.6171875, + "learning_rate": 3.654016092339432e-05, + "loss": 4.9059, + "step": 1624 + }, + { + "epoch": 0.41315747656125856, + "grad_norm": 21720.146484375, + "learning_rate": 3.652046960883651e-05, + "loss": 4.9184, + "step": 1625 + }, + { + "epoch": 0.41341172731606546, + "grad_norm": 21610.6875, + "learning_rate": 3.650076921588998e-05, + "loss": 4.915, + "step": 1626 + }, + { + "epoch": 0.4136659780708724, + "grad_norm": 21891.4453125, + "learning_rate": 3.648105976007909e-05, + "loss": 4.9165, + "step": 1627 + }, + { + "epoch": 0.4139202288256793, + "grad_norm": 21603.302734375, + "learning_rate": 3.646134125693534e-05, + "loss": 4.9212, + "step": 1628 + }, + { + "epoch": 0.41417447958048625, + "grad_norm": 21650.87109375, + "learning_rate": 3.644161372199735e-05, + "loss": 4.9054, + "step": 1629 + }, + { + "epoch": 0.4144287303352932, + "grad_norm": 21877.341796875, + "learning_rate": 3.6421877170810836e-05, + "loss": 4.913, + "step": 1630 + }, + { + "epoch": 0.4146829810901001, + "grad_norm": 21647.607421875, + "learning_rate": 3.6402131618928675e-05, + "loss": 4.903, + "step": 1631 + }, + { + "epoch": 0.41493723184490705, + "grad_norm": 22283.400390625, + "learning_rate": 3.638237708191079e-05, + "loss": 4.9145, + "step": 1632 + }, + { + "epoch": 0.41519148259971395, + "grad_norm": 21666.384765625, + "learning_rate": 3.636261357532421e-05, + "loss": 4.9037, + "step": 1633 + }, + { + "epoch": 0.4154457333545209, + "grad_norm": 21531.67578125, + "learning_rate": 3.634284111474301e-05, + "loss": 4.9037, + "step": 1634 + }, + { + "epoch": 0.41569998410932785, + "grad_norm": 21632.896484375, + "learning_rate": 3.632305971574834e-05, + "loss": 4.9046, + "step": 1635 + }, + { + "epoch": 0.41595423486413474, + "grad_norm": 21658.61328125, + "learning_rate": 3.630326939392838e-05, + "loss": 4.9074, + "step": 1636 + }, + { + "epoch": 0.4162084856189417, + "grad_norm": 21766.416015625, + "learning_rate": 3.628347016487836e-05, + "loss": 4.8985, + "step": 1637 + }, + { + "epoch": 0.4164627363737486, + "grad_norm": 21578.001953125, + "learning_rate": 3.626366204420051e-05, + "loss": 4.9071, + "step": 1638 + }, + { + "epoch": 0.41671698712855554, + "grad_norm": 21805.52734375, + "learning_rate": 3.624384504750407e-05, + "loss": 4.8926, + "step": 1639 + }, + { + "epoch": 0.4169712378833625, + "grad_norm": 21873.283203125, + "learning_rate": 3.622401919040528e-05, + "loss": 4.9103, + "step": 1640 + }, + { + "epoch": 0.4172254886381694, + "grad_norm": 21696.416015625, + "learning_rate": 3.620418448852737e-05, + "loss": 4.9028, + "step": 1641 + }, + { + "epoch": 0.41747973939297633, + "grad_norm": 21697.595703125, + "learning_rate": 3.618434095750051e-05, + "loss": 4.8983, + "step": 1642 + }, + { + "epoch": 0.41773399014778323, + "grad_norm": 21964.34765625, + "learning_rate": 3.616448861296187e-05, + "loss": 4.9248, + "step": 1643 + }, + { + "epoch": 0.4179882409025902, + "grad_norm": 21953.5625, + "learning_rate": 3.6144627470555534e-05, + "loss": 4.9121, + "step": 1644 + }, + { + "epoch": 0.41824249165739713, + "grad_norm": 21809.197265625, + "learning_rate": 3.612475754593253e-05, + "loss": 4.9091, + "step": 1645 + }, + { + "epoch": 0.418496742412204, + "grad_norm": 21916.5078125, + "learning_rate": 3.6104878854750787e-05, + "loss": 4.9011, + "step": 1646 + }, + { + "epoch": 0.418750993167011, + "grad_norm": 21816.2734375, + "learning_rate": 3.608499141267519e-05, + "loss": 4.8906, + "step": 1647 + }, + { + "epoch": 0.41900524392181787, + "grad_norm": 21631.5546875, + "learning_rate": 3.606509523537748e-05, + "loss": 4.9051, + "step": 1648 + }, + { + "epoch": 0.4192594946766248, + "grad_norm": 21687.5234375, + "learning_rate": 3.604519033853628e-05, + "loss": 4.8971, + "step": 1649 + }, + { + "epoch": 0.4195137454314318, + "grad_norm": 21873.890625, + "learning_rate": 3.60252767378371e-05, + "loss": 4.9144, + "step": 1650 + }, + { + "epoch": 0.41976799618623867, + "grad_norm": 21899.76953125, + "learning_rate": 3.600535444897231e-05, + "loss": 4.8783, + "step": 1651 + }, + { + "epoch": 0.4200222469410456, + "grad_norm": 21633.9375, + "learning_rate": 3.5985423487641115e-05, + "loss": 4.8773, + "step": 1652 + }, + { + "epoch": 0.4202764976958525, + "grad_norm": 32092.7109375, + "learning_rate": 3.596548386954956e-05, + "loss": 4.9003, + "step": 1653 + }, + { + "epoch": 0.42053074845065946, + "grad_norm": 21750.923828125, + "learning_rate": 3.594553561041053e-05, + "loss": 4.9002, + "step": 1654 + }, + { + "epoch": 0.4207849992054664, + "grad_norm": 21614.08203125, + "learning_rate": 3.592557872594368e-05, + "loss": 4.881, + "step": 1655 + }, + { + "epoch": 0.4210392499602733, + "grad_norm": 21781.92578125, + "learning_rate": 3.590561323187548e-05, + "loss": 4.8867, + "step": 1656 + }, + { + "epoch": 0.42129350071508026, + "grad_norm": 21969.76953125, + "learning_rate": 3.58856391439392e-05, + "loss": 4.8942, + "step": 1657 + }, + { + "epoch": 0.42154775146988716, + "grad_norm": 22522.05859375, + "learning_rate": 3.586565647787488e-05, + "loss": 4.8803, + "step": 1658 + }, + { + "epoch": 0.4218020022246941, + "grad_norm": 22621.322265625, + "learning_rate": 3.584566524942928e-05, + "loss": 4.9009, + "step": 1659 + }, + { + "epoch": 0.42205625297950106, + "grad_norm": 21819.041015625, + "learning_rate": 3.582566547435596e-05, + "loss": 4.8935, + "step": 1660 + }, + { + "epoch": 0.42231050373430795, + "grad_norm": 23410.6796875, + "learning_rate": 3.580565716841517e-05, + "loss": 4.8864, + "step": 1661 + }, + { + "epoch": 0.4225647544891149, + "grad_norm": 22864.201171875, + "learning_rate": 3.578564034737394e-05, + "loss": 4.9081, + "step": 1662 + }, + { + "epoch": 0.4228190052439218, + "grad_norm": 22266.919921875, + "learning_rate": 3.576561502700594e-05, + "loss": 4.8892, + "step": 1663 + }, + { + "epoch": 0.42307325599872875, + "grad_norm": 26095.134765625, + "learning_rate": 3.57455812230916e-05, + "loss": 4.8954, + "step": 1664 + }, + { + "epoch": 0.4233275067535357, + "grad_norm": 22740.720703125, + "learning_rate": 3.5725538951417973e-05, + "loss": 4.9033, + "step": 1665 + }, + { + "epoch": 0.4235817575083426, + "grad_norm": 22555.861328125, + "learning_rate": 3.570548822777885e-05, + "loss": 4.8892, + "step": 1666 + }, + { + "epoch": 0.42383600826314954, + "grad_norm": 22874.27734375, + "learning_rate": 3.568542906797463e-05, + "loss": 4.8997, + "step": 1667 + }, + { + "epoch": 0.42409025901795644, + "grad_norm": 21495.142578125, + "learning_rate": 3.5665361487812406e-05, + "loss": 4.8847, + "step": 1668 + }, + { + "epoch": 0.4243445097727634, + "grad_norm": 22216.201171875, + "learning_rate": 3.5645285503105866e-05, + "loss": 4.8894, + "step": 1669 + }, + { + "epoch": 0.42459876052757034, + "grad_norm": 21716.841796875, + "learning_rate": 3.562520112967533e-05, + "loss": 4.8733, + "step": 1670 + }, + { + "epoch": 0.42485301128237724, + "grad_norm": 22870.220703125, + "learning_rate": 3.560510838334774e-05, + "loss": 4.8936, + "step": 1671 + }, + { + "epoch": 0.4251072620371842, + "grad_norm": 22410.798828125, + "learning_rate": 3.5585007279956645e-05, + "loss": 4.8818, + "step": 1672 + }, + { + "epoch": 0.4253615127919911, + "grad_norm": 21816.15234375, + "learning_rate": 3.5564897835342145e-05, + "loss": 4.8735, + "step": 1673 + }, + { + "epoch": 0.42561576354679803, + "grad_norm": 22911.541015625, + "learning_rate": 3.554478006535096e-05, + "loss": 4.8915, + "step": 1674 + }, + { + "epoch": 0.425870014301605, + "grad_norm": 21875.9296875, + "learning_rate": 3.552465398583631e-05, + "loss": 4.9038, + "step": 1675 + }, + { + "epoch": 0.4261242650564119, + "grad_norm": 22147.525390625, + "learning_rate": 3.550451961265803e-05, + "loss": 4.8788, + "step": 1676 + }, + { + "epoch": 0.42637851581121883, + "grad_norm": 22352.337890625, + "learning_rate": 3.548437696168243e-05, + "loss": 4.9178, + "step": 1677 + }, + { + "epoch": 0.4266327665660257, + "grad_norm": 21874.51953125, + "learning_rate": 3.546422604878239e-05, + "loss": 4.8719, + "step": 1678 + }, + { + "epoch": 0.4268870173208327, + "grad_norm": 24230.541015625, + "learning_rate": 3.544406688983728e-05, + "loss": 4.8812, + "step": 1679 + }, + { + "epoch": 0.4271412680756396, + "grad_norm": 22601.33984375, + "learning_rate": 3.542389950073297e-05, + "loss": 4.8804, + "step": 1680 + }, + { + "epoch": 0.4273955188304465, + "grad_norm": 21862.162109375, + "learning_rate": 3.5403723897361806e-05, + "loss": 4.8903, + "step": 1681 + }, + { + "epoch": 0.42764976958525347, + "grad_norm": 23466.4921875, + "learning_rate": 3.538354009562263e-05, + "loss": 4.8762, + "step": 1682 + }, + { + "epoch": 0.42790402034006036, + "grad_norm": 22503.498046875, + "learning_rate": 3.536334811142071e-05, + "loss": 4.895, + "step": 1683 + }, + { + "epoch": 0.4281582710948673, + "grad_norm": 22191.65625, + "learning_rate": 3.534314796066781e-05, + "loss": 4.8867, + "step": 1684 + }, + { + "epoch": 0.42841252184967427, + "grad_norm": 22442.6640625, + "learning_rate": 3.5322939659282084e-05, + "loss": 4.8732, + "step": 1685 + }, + { + "epoch": 0.42866677260448116, + "grad_norm": 22191.251953125, + "learning_rate": 3.530272322318814e-05, + "loss": 4.8897, + "step": 1686 + }, + { + "epoch": 0.4289210233592881, + "grad_norm": 22047.107421875, + "learning_rate": 3.5282498668316965e-05, + "loss": 4.8758, + "step": 1687 + }, + { + "epoch": 0.429175274114095, + "grad_norm": 22093.28125, + "learning_rate": 3.5262266010605974e-05, + "loss": 4.8955, + "step": 1688 + }, + { + "epoch": 0.42942952486890196, + "grad_norm": 22349.4921875, + "learning_rate": 3.5242025265998955e-05, + "loss": 4.8688, + "step": 1689 + }, + { + "epoch": 0.4296837756237089, + "grad_norm": 21869.244140625, + "learning_rate": 3.522177645044607e-05, + "loss": 4.8715, + "step": 1690 + }, + { + "epoch": 0.4299380263785158, + "grad_norm": 21924.68359375, + "learning_rate": 3.520151957990385e-05, + "loss": 4.8713, + "step": 1691 + }, + { + "epoch": 0.43019227713332275, + "grad_norm": 21897.029296875, + "learning_rate": 3.518125467033515e-05, + "loss": 4.871, + "step": 1692 + }, + { + "epoch": 0.43044652788812965, + "grad_norm": 22098.861328125, + "learning_rate": 3.516098173770916e-05, + "loss": 4.8773, + "step": 1693 + }, + { + "epoch": 0.4307007786429366, + "grad_norm": 33320.83203125, + "learning_rate": 3.5140700798001436e-05, + "loss": 4.8591, + "step": 1694 + }, + { + "epoch": 0.43095502939774355, + "grad_norm": 22015.482421875, + "learning_rate": 3.5120411867193795e-05, + "loss": 4.8804, + "step": 1695 + }, + { + "epoch": 0.43120928015255044, + "grad_norm": 22019.46484375, + "learning_rate": 3.510011496127438e-05, + "loss": 4.8766, + "step": 1696 + }, + { + "epoch": 0.4314635309073574, + "grad_norm": 21862.5546875, + "learning_rate": 3.50798100962376e-05, + "loss": 4.8634, + "step": 1697 + }, + { + "epoch": 0.4317177816621643, + "grad_norm": 22089.81640625, + "learning_rate": 3.505949728808415e-05, + "loss": 4.8711, + "step": 1698 + }, + { + "epoch": 0.43197203241697124, + "grad_norm": 22051.53515625, + "learning_rate": 3.5039176552820975e-05, + "loss": 4.8763, + "step": 1699 + }, + { + "epoch": 0.4322262831717782, + "grad_norm": 22092.03515625, + "learning_rate": 3.501884790646128e-05, + "loss": 4.8664, + "step": 1700 + }, + { + "epoch": 0.4322262831717782, + "eval_loss": 9.825671195983887, + "eval_runtime": 699.7335, + "eval_samples_per_second": 151.466, + "eval_steps_per_second": 9.468, + "step": 1700 + }, + { + "epoch": 0.4324805339265851, + "grad_norm": 21989.029296875, + "learning_rate": 3.499851136502449e-05, + "loss": 4.8864, + "step": 1701 + }, + { + "epoch": 0.43273478468139204, + "grad_norm": 22010.939453125, + "learning_rate": 3.497816694453624e-05, + "loss": 4.8646, + "step": 1702 + }, + { + "epoch": 0.43298903543619893, + "grad_norm": 22087.806640625, + "learning_rate": 3.495781466102841e-05, + "loss": 4.86, + "step": 1703 + }, + { + "epoch": 0.4332432861910059, + "grad_norm": 22119.8359375, + "learning_rate": 3.493745453053906e-05, + "loss": 4.8739, + "step": 1704 + }, + { + "epoch": 0.43349753694581283, + "grad_norm": 22071.79296875, + "learning_rate": 3.491708656911242e-05, + "loss": 4.8769, + "step": 1705 + }, + { + "epoch": 0.43375178770061973, + "grad_norm": 21819.615234375, + "learning_rate": 3.489671079279889e-05, + "loss": 4.864, + "step": 1706 + }, + { + "epoch": 0.4340060384554267, + "grad_norm": 22032.8984375, + "learning_rate": 3.487632721765506e-05, + "loss": 4.8787, + "step": 1707 + }, + { + "epoch": 0.4342602892102336, + "grad_norm": 21978.28125, + "learning_rate": 3.4855935859743634e-05, + "loss": 4.8666, + "step": 1708 + }, + { + "epoch": 0.4345145399650405, + "grad_norm": 21969.259765625, + "learning_rate": 3.483553673513346e-05, + "loss": 4.8635, + "step": 1709 + }, + { + "epoch": 0.4347687907198475, + "grad_norm": 22161.021484375, + "learning_rate": 3.481512985989951e-05, + "loss": 4.8755, + "step": 1710 + }, + { + "epoch": 0.43502304147465437, + "grad_norm": 21971.896484375, + "learning_rate": 3.4794715250122854e-05, + "loss": 4.8616, + "step": 1711 + }, + { + "epoch": 0.4352772922294613, + "grad_norm": 22054.76171875, + "learning_rate": 3.477429292189067e-05, + "loss": 4.8706, + "step": 1712 + }, + { + "epoch": 0.4355315429842682, + "grad_norm": 22108.994140625, + "learning_rate": 3.475386289129621e-05, + "loss": 4.8798, + "step": 1713 + }, + { + "epoch": 0.43578579373907517, + "grad_norm": 22375.142578125, + "learning_rate": 3.473342517443878e-05, + "loss": 4.8703, + "step": 1714 + }, + { + "epoch": 0.4360400444938821, + "grad_norm": 21990.71484375, + "learning_rate": 3.471297978742379e-05, + "loss": 4.8656, + "step": 1715 + }, + { + "epoch": 0.436294295248689, + "grad_norm": 22038.552734375, + "learning_rate": 3.469252674636264e-05, + "loss": 4.8728, + "step": 1716 + }, + { + "epoch": 0.43654854600349596, + "grad_norm": 21873.314453125, + "learning_rate": 3.46720660673728e-05, + "loss": 4.8584, + "step": 1717 + }, + { + "epoch": 0.43680279675830286, + "grad_norm": 21927.455078125, + "learning_rate": 3.465159776657774e-05, + "loss": 4.8366, + "step": 1718 + }, + { + "epoch": 0.4370570475131098, + "grad_norm": 22044.982421875, + "learning_rate": 3.4631121860106926e-05, + "loss": 4.8651, + "step": 1719 + }, + { + "epoch": 0.43731129826791676, + "grad_norm": 21996.814453125, + "learning_rate": 3.461063836409585e-05, + "loss": 4.852, + "step": 1720 + }, + { + "epoch": 0.43756554902272365, + "grad_norm": 22560.287109375, + "learning_rate": 3.459014729468597e-05, + "loss": 4.8768, + "step": 1721 + }, + { + "epoch": 0.4378197997775306, + "grad_norm": 22825.607421875, + "learning_rate": 3.4569648668024704e-05, + "loss": 4.8617, + "step": 1722 + }, + { + "epoch": 0.4380740505323375, + "grad_norm": 22108.912109375, + "learning_rate": 3.454914250026542e-05, + "loss": 4.8654, + "step": 1723 + }, + { + "epoch": 0.43832830128714445, + "grad_norm": 22084.595703125, + "learning_rate": 3.452862880756743e-05, + "loss": 4.8596, + "step": 1724 + }, + { + "epoch": 0.4385825520419514, + "grad_norm": 22076.978515625, + "learning_rate": 3.450810760609601e-05, + "loss": 4.8691, + "step": 1725 + }, + { + "epoch": 0.4388368027967583, + "grad_norm": 22177.564453125, + "learning_rate": 3.448757891202232e-05, + "loss": 4.8576, + "step": 1726 + }, + { + "epoch": 0.43909105355156525, + "grad_norm": 22192.54296875, + "learning_rate": 3.446704274152343e-05, + "loss": 4.8428, + "step": 1727 + }, + { + "epoch": 0.43934530430637214, + "grad_norm": 22073.0859375, + "learning_rate": 3.44464991107823e-05, + "loss": 4.8525, + "step": 1728 + }, + { + "epoch": 0.4395995550611791, + "grad_norm": 21942.03515625, + "learning_rate": 3.442594803598778e-05, + "loss": 4.8546, + "step": 1729 + }, + { + "epoch": 0.43985380581598604, + "grad_norm": 21952.177734375, + "learning_rate": 3.440538953333456e-05, + "loss": 4.8412, + "step": 1730 + }, + { + "epoch": 0.44010805657079294, + "grad_norm": 22270.20703125, + "learning_rate": 3.4384823619023224e-05, + "loss": 4.8562, + "step": 1731 + }, + { + "epoch": 0.4403623073255999, + "grad_norm": 22065.0703125, + "learning_rate": 3.436425030926017e-05, + "loss": 4.8524, + "step": 1732 + }, + { + "epoch": 0.4406165580804068, + "grad_norm": 22071.935546875, + "learning_rate": 3.434366962025761e-05, + "loss": 4.852, + "step": 1733 + }, + { + "epoch": 0.44087080883521373, + "grad_norm": 22116.904296875, + "learning_rate": 3.432308156823361e-05, + "loss": 4.8633, + "step": 1734 + }, + { + "epoch": 0.4411250595900207, + "grad_norm": 22021.4140625, + "learning_rate": 3.4302486169412004e-05, + "loss": 4.854, + "step": 1735 + }, + { + "epoch": 0.4413793103448276, + "grad_norm": 22173.978515625, + "learning_rate": 3.428188344002244e-05, + "loss": 4.8527, + "step": 1736 + }, + { + "epoch": 0.44163356109963453, + "grad_norm": 22006.560546875, + "learning_rate": 3.426127339630032e-05, + "loss": 4.862, + "step": 1737 + }, + { + "epoch": 0.4418878118544414, + "grad_norm": 22062.224609375, + "learning_rate": 3.424065605448683e-05, + "loss": 4.8532, + "step": 1738 + }, + { + "epoch": 0.4421420626092484, + "grad_norm": 22106.29296875, + "learning_rate": 3.4220031430828895e-05, + "loss": 4.8619, + "step": 1739 + }, + { + "epoch": 0.4423963133640553, + "grad_norm": 21951.20703125, + "learning_rate": 3.419939954157917e-05, + "loss": 4.8424, + "step": 1740 + }, + { + "epoch": 0.4426505641188622, + "grad_norm": 22118.111328125, + "learning_rate": 3.4178760402996066e-05, + "loss": 4.8613, + "step": 1741 + }, + { + "epoch": 0.4429048148736692, + "grad_norm": 22128.494140625, + "learning_rate": 3.415811403134369e-05, + "loss": 4.8609, + "step": 1742 + }, + { + "epoch": 0.44315906562847607, + "grad_norm": 22074.3671875, + "learning_rate": 3.4137460442891824e-05, + "loss": 4.856, + "step": 1743 + }, + { + "epoch": 0.443413316383283, + "grad_norm": 22072.138671875, + "learning_rate": 3.411679965391598e-05, + "loss": 4.8403, + "step": 1744 + }, + { + "epoch": 0.44366756713808997, + "grad_norm": 22199.03125, + "learning_rate": 3.4096131680697305e-05, + "loss": 4.8395, + "step": 1745 + }, + { + "epoch": 0.44392181789289686, + "grad_norm": 22152.58984375, + "learning_rate": 3.407545653952264e-05, + "loss": 4.8591, + "step": 1746 + }, + { + "epoch": 0.4441760686477038, + "grad_norm": 22074.8359375, + "learning_rate": 3.405477424668445e-05, + "loss": 4.8495, + "step": 1747 + }, + { + "epoch": 0.4444303194025107, + "grad_norm": 22346.208984375, + "learning_rate": 3.4034084818480865e-05, + "loss": 4.8546, + "step": 1748 + }, + { + "epoch": 0.44468457015731766, + "grad_norm": 22437.583984375, + "learning_rate": 3.40133882712156e-05, + "loss": 4.8456, + "step": 1749 + }, + { + "epoch": 0.44493882091212456, + "grad_norm": 22261.560546875, + "learning_rate": 3.3992684621198006e-05, + "loss": 4.8663, + "step": 1750 + }, + { + "epoch": 0.4451930716669315, + "grad_norm": 22399.09765625, + "learning_rate": 3.397197388474302e-05, + "loss": 4.8698, + "step": 1751 + }, + { + "epoch": 0.44544732242173846, + "grad_norm": 22262.453125, + "learning_rate": 3.395125607817118e-05, + "loss": 4.854, + "step": 1752 + }, + { + "epoch": 0.44570157317654535, + "grad_norm": 22260.6484375, + "learning_rate": 3.393053121780857e-05, + "loss": 4.8502, + "step": 1753 + }, + { + "epoch": 0.4459558239313523, + "grad_norm": 22395.189453125, + "learning_rate": 3.390979931998685e-05, + "loss": 4.8319, + "step": 1754 + }, + { + "epoch": 0.4462100746861592, + "grad_norm": 22180.734375, + "learning_rate": 3.388906040104322e-05, + "loss": 4.8537, + "step": 1755 + }, + { + "epoch": 0.44646432544096615, + "grad_norm": 22464.212890625, + "learning_rate": 3.386831447732041e-05, + "loss": 4.8603, + "step": 1756 + }, + { + "epoch": 0.4467185761957731, + "grad_norm": 22336.3828125, + "learning_rate": 3.384756156516667e-05, + "loss": 4.846, + "step": 1757 + }, + { + "epoch": 0.44697282695058, + "grad_norm": 22041.79296875, + "learning_rate": 3.3826801680935785e-05, + "loss": 4.8461, + "step": 1758 + }, + { + "epoch": 0.44722707770538694, + "grad_norm": 22224.439453125, + "learning_rate": 3.380603484098698e-05, + "loss": 4.8454, + "step": 1759 + }, + { + "epoch": 0.44748132846019384, + "grad_norm": 23013.74609375, + "learning_rate": 3.3785261061685015e-05, + "loss": 4.8579, + "step": 1760 + }, + { + "epoch": 0.4477355792150008, + "grad_norm": 22225.5390625, + "learning_rate": 3.376448035940007e-05, + "loss": 4.8504, + "step": 1761 + }, + { + "epoch": 0.44798982996980774, + "grad_norm": 22198.61328125, + "learning_rate": 3.374369275050783e-05, + "loss": 4.8444, + "step": 1762 + }, + { + "epoch": 0.44824408072461464, + "grad_norm": 22202.001953125, + "learning_rate": 3.372289825138938e-05, + "loss": 4.8501, + "step": 1763 + }, + { + "epoch": 0.4484983314794216, + "grad_norm": 22426.322265625, + "learning_rate": 3.370209687843126e-05, + "loss": 4.8384, + "step": 1764 + }, + { + "epoch": 0.4487525822342285, + "grad_norm": 22443.2109375, + "learning_rate": 3.3681288648025415e-05, + "loss": 4.8439, + "step": 1765 + }, + { + "epoch": 0.44900683298903543, + "grad_norm": 22218.0078125, + "learning_rate": 3.36604735765692e-05, + "loss": 4.8364, + "step": 1766 + }, + { + "epoch": 0.4492610837438424, + "grad_norm": 52952.296875, + "learning_rate": 3.363965168046537e-05, + "loss": 4.8266, + "step": 1767 + }, + { + "epoch": 0.4495153344986493, + "grad_norm": 22943.0078125, + "learning_rate": 3.361882297612202e-05, + "loss": 4.8386, + "step": 1768 + }, + { + "epoch": 0.4497695852534562, + "grad_norm": 22271.9609375, + "learning_rate": 3.359798747995266e-05, + "loss": 4.8391, + "step": 1769 + }, + { + "epoch": 0.4500238360082631, + "grad_norm": 25515.71875, + "learning_rate": 3.357714520837612e-05, + "loss": 4.85, + "step": 1770 + }, + { + "epoch": 0.4502780867630701, + "grad_norm": 22855.2578125, + "learning_rate": 3.3556296177816575e-05, + "loss": 4.8389, + "step": 1771 + }, + { + "epoch": 0.450532337517877, + "grad_norm": 23352.083984375, + "learning_rate": 3.353544040470353e-05, + "loss": 4.8464, + "step": 1772 + }, + { + "epoch": 0.4507865882726839, + "grad_norm": 22517.65625, + "learning_rate": 3.351457790547182e-05, + "loss": 4.8351, + "step": 1773 + }, + { + "epoch": 0.45104083902749087, + "grad_norm": 24770.787109375, + "learning_rate": 3.3493708696561545e-05, + "loss": 4.8365, + "step": 1774 + }, + { + "epoch": 0.45129508978229776, + "grad_norm": 23343.32421875, + "learning_rate": 3.34728327944181e-05, + "loss": 4.8194, + "step": 1775 + }, + { + "epoch": 0.4515493405371047, + "grad_norm": 36990.63671875, + "learning_rate": 3.345195021549219e-05, + "loss": 4.8467, + "step": 1776 + }, + { + "epoch": 0.45180359129191167, + "grad_norm": 22373.421875, + "learning_rate": 3.3431060976239736e-05, + "loss": 4.8285, + "step": 1777 + }, + { + "epoch": 0.45205784204671856, + "grad_norm": 25395.486328125, + "learning_rate": 3.341016509312194e-05, + "loss": 4.8273, + "step": 1778 + }, + { + "epoch": 0.4523120928015255, + "grad_norm": 22469.193359375, + "learning_rate": 3.3389262582605224e-05, + "loss": 4.8463, + "step": 1779 + }, + { + "epoch": 0.4525663435563324, + "grad_norm": 23005.87890625, + "learning_rate": 3.3368353461161235e-05, + "loss": 4.8415, + "step": 1780 + }, + { + "epoch": 0.45282059431113936, + "grad_norm": 22361.6015625, + "learning_rate": 3.3347437745266824e-05, + "loss": 4.8372, + "step": 1781 + }, + { + "epoch": 0.4530748450659463, + "grad_norm": 23105.833984375, + "learning_rate": 3.332651545140405e-05, + "loss": 4.8492, + "step": 1782 + }, + { + "epoch": 0.4533290958207532, + "grad_norm": 22482.94921875, + "learning_rate": 3.330558659606015e-05, + "loss": 4.8373, + "step": 1783 + }, + { + "epoch": 0.45358334657556015, + "grad_norm": 22494.169921875, + "learning_rate": 3.3284651195727545e-05, + "loss": 4.8322, + "step": 1784 + }, + { + "epoch": 0.45383759733036705, + "grad_norm": 22108.89453125, + "learning_rate": 3.326370926690378e-05, + "loss": 4.8374, + "step": 1785 + }, + { + "epoch": 0.454091848085174, + "grad_norm": 22578.302734375, + "learning_rate": 3.3242760826091566e-05, + "loss": 4.8284, + "step": 1786 + }, + { + "epoch": 0.45434609883998095, + "grad_norm": 22233.822265625, + "learning_rate": 3.3221805889798754e-05, + "loss": 4.8246, + "step": 1787 + }, + { + "epoch": 0.45460034959478784, + "grad_norm": 22614.87109375, + "learning_rate": 3.3200844474538306e-05, + "loss": 4.8344, + "step": 1788 + }, + { + "epoch": 0.4548546003495948, + "grad_norm": 22526.244140625, + "learning_rate": 3.317987659682828e-05, + "loss": 4.8355, + "step": 1789 + }, + { + "epoch": 0.4551088511044017, + "grad_norm": 22276.392578125, + "learning_rate": 3.3158902273191847e-05, + "loss": 4.8219, + "step": 1790 + }, + { + "epoch": 0.45536310185920864, + "grad_norm": 22414.005859375, + "learning_rate": 3.3137921520157225e-05, + "loss": 4.8356, + "step": 1791 + }, + { + "epoch": 0.4556173526140156, + "grad_norm": 22496.921875, + "learning_rate": 3.3116934354257735e-05, + "loss": 4.8294, + "step": 1792 + }, + { + "epoch": 0.4558716033688225, + "grad_norm": 22519.23046875, + "learning_rate": 3.309594079203173e-05, + "loss": 4.8446, + "step": 1793 + }, + { + "epoch": 0.45612585412362944, + "grad_norm": 22497.74609375, + "learning_rate": 3.307494085002261e-05, + "loss": 4.8341, + "step": 1794 + }, + { + "epoch": 0.45638010487843633, + "grad_norm": 22519.412109375, + "learning_rate": 3.305393454477879e-05, + "loss": 4.8129, + "step": 1795 + }, + { + "epoch": 0.4566343556332433, + "grad_norm": 22552.716796875, + "learning_rate": 3.303292189285373e-05, + "loss": 4.8511, + "step": 1796 + }, + { + "epoch": 0.45688860638805023, + "grad_norm": 22368.73828125, + "learning_rate": 3.301190291080585e-05, + "loss": 4.8399, + "step": 1797 + }, + { + "epoch": 0.45714285714285713, + "grad_norm": 22402.310546875, + "learning_rate": 3.29908776151986e-05, + "loss": 4.8448, + "step": 1798 + }, + { + "epoch": 0.4573971078976641, + "grad_norm": 22370.611328125, + "learning_rate": 3.296984602260037e-05, + "loss": 4.8192, + "step": 1799 + }, + { + "epoch": 0.457651358652471, + "grad_norm": 22556.884765625, + "learning_rate": 3.294880814958453e-05, + "loss": 4.8415, + "step": 1800 + }, + { + "epoch": 0.457651358652471, + "eval_loss": 9.730688095092773, + "eval_runtime": 697.963, + "eval_samples_per_second": 151.85, + "eval_steps_per_second": 9.492, + "step": 1800 + }, + { + "epoch": 0.4579056094072779, + "grad_norm": 22225.671875, + "learning_rate": 3.292776401272941e-05, + "loss": 4.8105, + "step": 1801 + }, + { + "epoch": 0.4581598601620849, + "grad_norm": 22374.408203125, + "learning_rate": 3.2906713628618234e-05, + "loss": 4.8204, + "step": 1802 + }, + { + "epoch": 0.45841411091689177, + "grad_norm": 22469.703125, + "learning_rate": 3.28856570138392e-05, + "loss": 4.8198, + "step": 1803 + }, + { + "epoch": 0.4586683616716987, + "grad_norm": 22390.865234375, + "learning_rate": 3.2864594184985396e-05, + "loss": 4.8329, + "step": 1804 + }, + { + "epoch": 0.4589226124265056, + "grad_norm": 22345.4609375, + "learning_rate": 3.2843525158654794e-05, + "loss": 4.8311, + "step": 1805 + }, + { + "epoch": 0.45917686318131257, + "grad_norm": 22406.984375, + "learning_rate": 3.282244995145025e-05, + "loss": 4.8197, + "step": 1806 + }, + { + "epoch": 0.4594311139361195, + "grad_norm": 22343.69140625, + "learning_rate": 3.2801368579979525e-05, + "loss": 4.8246, + "step": 1807 + }, + { + "epoch": 0.4596853646909264, + "grad_norm": 22558.763671875, + "learning_rate": 3.278028106085519e-05, + "loss": 4.8176, + "step": 1808 + }, + { + "epoch": 0.45993961544573336, + "grad_norm": 48236.984375, + "learning_rate": 3.27591874106947e-05, + "loss": 4.8308, + "step": 1809 + }, + { + "epoch": 0.46019386620054026, + "grad_norm": 22446.171875, + "learning_rate": 3.273808764612032e-05, + "loss": 4.8223, + "step": 1810 + }, + { + "epoch": 0.4604481169553472, + "grad_norm": 22481.224609375, + "learning_rate": 3.271698178375913e-05, + "loss": 4.8325, + "step": 1811 + }, + { + "epoch": 0.46070236771015416, + "grad_norm": 22303.732421875, + "learning_rate": 3.269586984024303e-05, + "loss": 4.8268, + "step": 1812 + }, + { + "epoch": 0.46095661846496105, + "grad_norm": 22489.181640625, + "learning_rate": 3.267475183220871e-05, + "loss": 4.8312, + "step": 1813 + }, + { + "epoch": 0.461210869219768, + "grad_norm": 22360.875, + "learning_rate": 3.265362777629763e-05, + "loss": 4.81, + "step": 1814 + }, + { + "epoch": 0.4614651199745749, + "grad_norm": 22496.4765625, + "learning_rate": 3.263249768915602e-05, + "loss": 4.8245, + "step": 1815 + }, + { + "epoch": 0.46171937072938185, + "grad_norm": 22410.193359375, + "learning_rate": 3.261136158743486e-05, + "loss": 4.8223, + "step": 1816 + }, + { + "epoch": 0.4619736214841888, + "grad_norm": 22436.291015625, + "learning_rate": 3.259021948778988e-05, + "loss": 4.8258, + "step": 1817 + }, + { + "epoch": 0.4622278722389957, + "grad_norm": 22534.212890625, + "learning_rate": 3.2569071406881526e-05, + "loss": 4.8136, + "step": 1818 + }, + { + "epoch": 0.46248212299380265, + "grad_norm": 23084.279296875, + "learning_rate": 3.254791736137495e-05, + "loss": 4.8089, + "step": 1819 + }, + { + "epoch": 0.46273637374860954, + "grad_norm": 22466.341796875, + "learning_rate": 3.252675736794003e-05, + "loss": 4.8326, + "step": 1820 + }, + { + "epoch": 0.4629906245034165, + "grad_norm": 22379.46875, + "learning_rate": 3.250559144325132e-05, + "loss": 4.8125, + "step": 1821 + }, + { + "epoch": 0.46324487525822344, + "grad_norm": 22613.451171875, + "learning_rate": 3.2484419603988026e-05, + "loss": 4.8062, + "step": 1822 + }, + { + "epoch": 0.46349912601303034, + "grad_norm": 22390.404296875, + "learning_rate": 3.2463241866834047e-05, + "loss": 4.8103, + "step": 1823 + }, + { + "epoch": 0.4637533767678373, + "grad_norm": 22634.107421875, + "learning_rate": 3.2442058248477905e-05, + "loss": 4.8151, + "step": 1824 + }, + { + "epoch": 0.4640076275226442, + "grad_norm": 22431.23046875, + "learning_rate": 3.2420868765612765e-05, + "loss": 4.8303, + "step": 1825 + }, + { + "epoch": 0.46426187827745113, + "grad_norm": 22459.298828125, + "learning_rate": 3.239967343493643e-05, + "loss": 4.8005, + "step": 1826 + }, + { + "epoch": 0.4645161290322581, + "grad_norm": 22400.962890625, + "learning_rate": 3.237847227315129e-05, + "loss": 4.8165, + "step": 1827 + }, + { + "epoch": 0.464770379787065, + "grad_norm": 22365.119140625, + "learning_rate": 3.235726529696433e-05, + "loss": 4.8205, + "step": 1828 + }, + { + "epoch": 0.46502463054187193, + "grad_norm": 22584.58984375, + "learning_rate": 3.233605252308713e-05, + "loss": 4.827, + "step": 1829 + }, + { + "epoch": 0.4652788812966788, + "grad_norm": 22684.552734375, + "learning_rate": 3.231483396823583e-05, + "loss": 4.8157, + "step": 1830 + }, + { + "epoch": 0.4655331320514858, + "grad_norm": 22396.828125, + "learning_rate": 3.229360964913112e-05, + "loss": 4.8146, + "step": 1831 + }, + { + "epoch": 0.4657873828062927, + "grad_norm": 22500.2578125, + "learning_rate": 3.2272379582498265e-05, + "loss": 4.8154, + "step": 1832 + }, + { + "epoch": 0.4660416335610996, + "grad_norm": 22640.51171875, + "learning_rate": 3.2251143785066996e-05, + "loss": 4.8118, + "step": 1833 + }, + { + "epoch": 0.4662958843159066, + "grad_norm": 22688.171875, + "learning_rate": 3.222990227357163e-05, + "loss": 4.8047, + "step": 1834 + }, + { + "epoch": 0.46655013507071347, + "grad_norm": 22385.5625, + "learning_rate": 3.2208655064750945e-05, + "loss": 4.8106, + "step": 1835 + }, + { + "epoch": 0.4668043858255204, + "grad_norm": 22749.375, + "learning_rate": 3.218740217534822e-05, + "loss": 4.8108, + "step": 1836 + }, + { + "epoch": 0.46705863658032737, + "grad_norm": 22512.611328125, + "learning_rate": 3.21661436221112e-05, + "loss": 4.8068, + "step": 1837 + }, + { + "epoch": 0.46731288733513426, + "grad_norm": 22616.384765625, + "learning_rate": 3.214487942179212e-05, + "loss": 4.7913, + "step": 1838 + }, + { + "epoch": 0.4675671380899412, + "grad_norm": 22521.5, + "learning_rate": 3.2123609591147624e-05, + "loss": 4.7978, + "step": 1839 + }, + { + "epoch": 0.4678213888447481, + "grad_norm": 22541.130859375, + "learning_rate": 3.2102334146938836e-05, + "loss": 4.8018, + "step": 1840 + }, + { + "epoch": 0.46807563959955506, + "grad_norm": 22782.498046875, + "learning_rate": 3.2081053105931274e-05, + "loss": 4.8019, + "step": 1841 + }, + { + "epoch": 0.468329890354362, + "grad_norm": 22556.59765625, + "learning_rate": 3.2059766484894874e-05, + "loss": 4.8125, + "step": 1842 + }, + { + "epoch": 0.4685841411091689, + "grad_norm": 22595.67578125, + "learning_rate": 3.203847430060398e-05, + "loss": 4.8114, + "step": 1843 + }, + { + "epoch": 0.46883839186397586, + "grad_norm": 22675.302734375, + "learning_rate": 3.2017176569837305e-05, + "loss": 4.8037, + "step": 1844 + }, + { + "epoch": 0.46909264261878275, + "grad_norm": 22500.16796875, + "learning_rate": 3.1995873309377946e-05, + "loss": 4.8086, + "step": 1845 + }, + { + "epoch": 0.4693468933735897, + "grad_norm": 22399.734375, + "learning_rate": 3.1974564536013344e-05, + "loss": 4.8002, + "step": 1846 + }, + { + "epoch": 0.46960114412839665, + "grad_norm": 22821.880859375, + "learning_rate": 3.195325026653528e-05, + "loss": 4.8165, + "step": 1847 + }, + { + "epoch": 0.46985539488320355, + "grad_norm": 22599.626953125, + "learning_rate": 3.1931930517739904e-05, + "loss": 4.7915, + "step": 1848 + }, + { + "epoch": 0.4701096456380105, + "grad_norm": 22832.888671875, + "learning_rate": 3.191060530642763e-05, + "loss": 4.8171, + "step": 1849 + }, + { + "epoch": 0.4703638963928174, + "grad_norm": 22452.3046875, + "learning_rate": 3.188927464940323e-05, + "loss": 4.8034, + "step": 1850 + }, + { + "epoch": 0.47061814714762434, + "grad_norm": 22611.71875, + "learning_rate": 3.1867938563475716e-05, + "loss": 4.8099, + "step": 1851 + }, + { + "epoch": 0.4708723979024313, + "grad_norm": 22378.44921875, + "learning_rate": 3.1846597065458414e-05, + "loss": 4.795, + "step": 1852 + }, + { + "epoch": 0.4711266486572382, + "grad_norm": 22750.693359375, + "learning_rate": 3.1825250172168904e-05, + "loss": 4.809, + "step": 1853 + }, + { + "epoch": 0.47138089941204514, + "grad_norm": 22607.263671875, + "learning_rate": 3.180389790042902e-05, + "loss": 4.8168, + "step": 1854 + }, + { + "epoch": 0.47163515016685204, + "grad_norm": 22431.84765625, + "learning_rate": 3.178254026706481e-05, + "loss": 4.8026, + "step": 1855 + }, + { + "epoch": 0.471889400921659, + "grad_norm": 22443.013671875, + "learning_rate": 3.1761177288906594e-05, + "loss": 4.7883, + "step": 1856 + }, + { + "epoch": 0.47214365167646594, + "grad_norm": 22635.654296875, + "learning_rate": 3.1739808982788875e-05, + "loss": 4.8036, + "step": 1857 + }, + { + "epoch": 0.47239790243127283, + "grad_norm": 22704.373046875, + "learning_rate": 3.171843536555035e-05, + "loss": 4.8044, + "step": 1858 + }, + { + "epoch": 0.4726521531860798, + "grad_norm": 22658.59375, + "learning_rate": 3.169705645403391e-05, + "loss": 4.791, + "step": 1859 + }, + { + "epoch": 0.4729064039408867, + "grad_norm": 22853.2890625, + "learning_rate": 3.167567226508663e-05, + "loss": 4.8033, + "step": 1860 + }, + { + "epoch": 0.4731606546956936, + "grad_norm": 22567.544921875, + "learning_rate": 3.1654282815559714e-05, + "loss": 4.8012, + "step": 1861 + }, + { + "epoch": 0.4734149054505006, + "grad_norm": 22865.716796875, + "learning_rate": 3.163288812230852e-05, + "loss": 4.8013, + "step": 1862 + }, + { + "epoch": 0.4736691562053075, + "grad_norm": 22588.98046875, + "learning_rate": 3.1611488202192586e-05, + "loss": 4.7938, + "step": 1863 + }, + { + "epoch": 0.4739234069601144, + "grad_norm": 22473.080078125, + "learning_rate": 3.159008307207549e-05, + "loss": 4.8004, + "step": 1864 + }, + { + "epoch": 0.4741776577149213, + "grad_norm": 22534.927734375, + "learning_rate": 3.156867274882497e-05, + "loss": 4.7811, + "step": 1865 + }, + { + "epoch": 0.47443190846972827, + "grad_norm": 22676.81640625, + "learning_rate": 3.1547257249312856e-05, + "loss": 4.7965, + "step": 1866 + }, + { + "epoch": 0.4746861592245352, + "grad_norm": 22663.265625, + "learning_rate": 3.152583659041501e-05, + "loss": 4.8092, + "step": 1867 + }, + { + "epoch": 0.4749404099793421, + "grad_norm": 22629.4296875, + "learning_rate": 3.1504410789011424e-05, + "loss": 4.8125, + "step": 1868 + }, + { + "epoch": 0.47519466073414907, + "grad_norm": 22685.26953125, + "learning_rate": 3.148297986198609e-05, + "loss": 4.8051, + "step": 1869 + }, + { + "epoch": 0.47544891148895596, + "grad_norm": 22487.66796875, + "learning_rate": 3.146154382622707e-05, + "loss": 4.7956, + "step": 1870 + }, + { + "epoch": 0.4757031622437629, + "grad_norm": 22590.541015625, + "learning_rate": 3.1440102698626435e-05, + "loss": 4.7848, + "step": 1871 + }, + { + "epoch": 0.47595741299856986, + "grad_norm": 22767.001953125, + "learning_rate": 3.1418656496080286e-05, + "loss": 4.8052, + "step": 1872 + }, + { + "epoch": 0.47621166375337676, + "grad_norm": 22659.193359375, + "learning_rate": 3.139720523548869e-05, + "loss": 4.7967, + "step": 1873 + }, + { + "epoch": 0.4764659145081837, + "grad_norm": 22626.56640625, + "learning_rate": 3.137574893375575e-05, + "loss": 4.7849, + "step": 1874 + }, + { + "epoch": 0.4767201652629906, + "grad_norm": 22797.703125, + "learning_rate": 3.135428760778949e-05, + "loss": 4.7939, + "step": 1875 + }, + { + "epoch": 0.47697441601779755, + "grad_norm": 22589.9765625, + "learning_rate": 3.133282127450193e-05, + "loss": 4.7823, + "step": 1876 + }, + { + "epoch": 0.4772286667726045, + "grad_norm": 22667.11328125, + "learning_rate": 3.131134995080902e-05, + "loss": 4.7886, + "step": 1877 + }, + { + "epoch": 0.4774829175274114, + "grad_norm": 22582.736328125, + "learning_rate": 3.1289873653630646e-05, + "loss": 4.773, + "step": 1878 + }, + { + "epoch": 0.47773716828221835, + "grad_norm": 22789.642578125, + "learning_rate": 3.126839239989061e-05, + "loss": 4.7998, + "step": 1879 + }, + { + "epoch": 0.47799141903702524, + "grad_norm": 22740.98828125, + "learning_rate": 3.124690620651661e-05, + "loss": 4.7894, + "step": 1880 + }, + { + "epoch": 0.4782456697918322, + "grad_norm": 22670.673828125, + "learning_rate": 3.122541509044027e-05, + "loss": 4.7907, + "step": 1881 + }, + { + "epoch": 0.47849992054663915, + "grad_norm": 22658.84765625, + "learning_rate": 3.120391906859707e-05, + "loss": 4.8005, + "step": 1882 + }, + { + "epoch": 0.47875417130144604, + "grad_norm": 22749.806640625, + "learning_rate": 3.118241815792635e-05, + "loss": 4.7983, + "step": 1883 + }, + { + "epoch": 0.479008422056253, + "grad_norm": 22623.4375, + "learning_rate": 3.116091237537131e-05, + "loss": 4.7853, + "step": 1884 + }, + { + "epoch": 0.4792626728110599, + "grad_norm": 22501.96484375, + "learning_rate": 3.113940173787899e-05, + "loss": 4.7945, + "step": 1885 + }, + { + "epoch": 0.47951692356586684, + "grad_norm": 22562.8203125, + "learning_rate": 3.1117886262400254e-05, + "loss": 4.7908, + "step": 1886 + }, + { + "epoch": 0.4797711743206738, + "grad_norm": 22628.876953125, + "learning_rate": 3.109636596588978e-05, + "loss": 4.7882, + "step": 1887 + }, + { + "epoch": 0.4800254250754807, + "grad_norm": 22786.875, + "learning_rate": 3.1074840865306056e-05, + "loss": 4.7961, + "step": 1888 + }, + { + "epoch": 0.48027967583028763, + "grad_norm": 22741.66796875, + "learning_rate": 3.105331097761133e-05, + "loss": 4.7988, + "step": 1889 + }, + { + "epoch": 0.48053392658509453, + "grad_norm": 22783.935546875, + "learning_rate": 3.1031776319771645e-05, + "loss": 4.7738, + "step": 1890 + }, + { + "epoch": 0.4807881773399015, + "grad_norm": 22623.73828125, + "learning_rate": 3.101023690875679e-05, + "loss": 4.7721, + "step": 1891 + }, + { + "epoch": 0.48104242809470843, + "grad_norm": 22722.255859375, + "learning_rate": 3.0988692761540314e-05, + "loss": 4.7854, + "step": 1892 + }, + { + "epoch": 0.4812966788495153, + "grad_norm": 22793.50390625, + "learning_rate": 3.096714389509947e-05, + "loss": 4.7948, + "step": 1893 + }, + { + "epoch": 0.4815509296043223, + "grad_norm": 22652.376953125, + "learning_rate": 3.094559032641527e-05, + "loss": 4.7883, + "step": 1894 + }, + { + "epoch": 0.48180518035912917, + "grad_norm": 22778.07421875, + "learning_rate": 3.0924032072472395e-05, + "loss": 4.7888, + "step": 1895 + }, + { + "epoch": 0.4820594311139361, + "grad_norm": 22720.45703125, + "learning_rate": 3.090246915025924e-05, + "loss": 4.7835, + "step": 1896 + }, + { + "epoch": 0.48231368186874307, + "grad_norm": 22859.427734375, + "learning_rate": 3.088090157676787e-05, + "loss": 4.7792, + "step": 1897 + }, + { + "epoch": 0.48256793262354997, + "grad_norm": 22847.28125, + "learning_rate": 3.085932936899402e-05, + "loss": 4.7792, + "step": 1898 + }, + { + "epoch": 0.4828221833783569, + "grad_norm": 22616.79296875, + "learning_rate": 3.083775254393707e-05, + "loss": 4.7692, + "step": 1899 + }, + { + "epoch": 0.4830764341331638, + "grad_norm": 22816.8984375, + "learning_rate": 3.081617111860004e-05, + "loss": 4.7693, + "step": 1900 + }, + { + "epoch": 0.4830764341331638, + "eval_loss": 9.641205787658691, + "eval_runtime": 695.8747, + "eval_samples_per_second": 152.306, + "eval_steps_per_second": 9.52, + "step": 1900 + }, + { + "epoch": 0.48333068488797076, + "grad_norm": 22595.158203125, + "learning_rate": 3.0794585109989583e-05, + "loss": 4.7814, + "step": 1901 + }, + { + "epoch": 0.4835849356427777, + "grad_norm": 22686.86328125, + "learning_rate": 3.077299453511596e-05, + "loss": 4.7804, + "step": 1902 + }, + { + "epoch": 0.4838391863975846, + "grad_norm": 22691.53125, + "learning_rate": 3.0751399410993026e-05, + "loss": 4.785, + "step": 1903 + }, + { + "epoch": 0.48409343715239156, + "grad_norm": 22758.580078125, + "learning_rate": 3.072979975463822e-05, + "loss": 4.7882, + "step": 1904 + }, + { + "epoch": 0.48434768790719845, + "grad_norm": 22638.390625, + "learning_rate": 3.070819558307256e-05, + "loss": 4.7726, + "step": 1905 + }, + { + "epoch": 0.4846019386620054, + "grad_norm": 22842.73046875, + "learning_rate": 3.068658691332063e-05, + "loss": 4.7815, + "step": 1906 + }, + { + "epoch": 0.48485618941681236, + "grad_norm": 22637.6171875, + "learning_rate": 3.066497376241052e-05, + "loss": 4.7837, + "step": 1907 + }, + { + "epoch": 0.48511044017161925, + "grad_norm": 22845.419921875, + "learning_rate": 3.0643356147373906e-05, + "loss": 4.7967, + "step": 1908 + }, + { + "epoch": 0.4853646909264262, + "grad_norm": 22819.27734375, + "learning_rate": 3.062173408524593e-05, + "loss": 4.7798, + "step": 1909 + }, + { + "epoch": 0.4856189416812331, + "grad_norm": 22656.298828125, + "learning_rate": 3.0600107593065274e-05, + "loss": 4.7841, + "step": 1910 + }, + { + "epoch": 0.48587319243604005, + "grad_norm": 22886.1484375, + "learning_rate": 3.05784766878741e-05, + "loss": 4.7769, + "step": 1911 + }, + { + "epoch": 0.486127443190847, + "grad_norm": 22839.2421875, + "learning_rate": 3.055684138671805e-05, + "loss": 4.7807, + "step": 1912 + }, + { + "epoch": 0.4863816939456539, + "grad_norm": 22790.19921875, + "learning_rate": 3.053520170664623e-05, + "loss": 4.7665, + "step": 1913 + }, + { + "epoch": 0.48663594470046084, + "grad_norm": 22900.73046875, + "learning_rate": 3.051355766471118e-05, + "loss": 4.7709, + "step": 1914 + }, + { + "epoch": 0.48689019545526774, + "grad_norm": 22733.4375, + "learning_rate": 3.0491909277968895e-05, + "loss": 4.7778, + "step": 1915 + }, + { + "epoch": 0.4871444462100747, + "grad_norm": 22799.046875, + "learning_rate": 3.0470256563478793e-05, + "loss": 4.7734, + "step": 1916 + }, + { + "epoch": 0.48739869696488164, + "grad_norm": 22931.326171875, + "learning_rate": 3.04485995383037e-05, + "loss": 4.775, + "step": 1917 + }, + { + "epoch": 0.48765294771968853, + "grad_norm": 22791.875, + "learning_rate": 3.0426938219509837e-05, + "loss": 4.7782, + "step": 1918 + }, + { + "epoch": 0.4879071984744955, + "grad_norm": 22681.470703125, + "learning_rate": 3.0405272624166807e-05, + "loss": 4.7911, + "step": 1919 + }, + { + "epoch": 0.4881614492293024, + "grad_norm": 22781.81640625, + "learning_rate": 3.0383602769347595e-05, + "loss": 4.7706, + "step": 1920 + }, + { + "epoch": 0.48841569998410933, + "grad_norm": 22732.724609375, + "learning_rate": 3.0361928672128526e-05, + "loss": 4.7766, + "step": 1921 + }, + { + "epoch": 0.4886699507389163, + "grad_norm": 22938.74609375, + "learning_rate": 3.0340250349589266e-05, + "loss": 4.7704, + "step": 1922 + }, + { + "epoch": 0.4889242014937232, + "grad_norm": 22578.44140625, + "learning_rate": 3.0318567818812836e-05, + "loss": 4.7681, + "step": 1923 + }, + { + "epoch": 0.4891784522485301, + "grad_norm": 22930.767578125, + "learning_rate": 3.029688109688555e-05, + "loss": 4.7608, + "step": 1924 + }, + { + "epoch": 0.489432703003337, + "grad_norm": 22787.478515625, + "learning_rate": 3.0275190200897035e-05, + "loss": 4.7816, + "step": 1925 + }, + { + "epoch": 0.489686953758144, + "grad_norm": 23046.08984375, + "learning_rate": 3.0253495147940197e-05, + "loss": 4.7814, + "step": 1926 + }, + { + "epoch": 0.4899412045129509, + "grad_norm": 22750.03515625, + "learning_rate": 3.023179595511123e-05, + "loss": 4.776, + "step": 1927 + }, + { + "epoch": 0.4901954552677578, + "grad_norm": 22935.765625, + "learning_rate": 3.0210092639509586e-05, + "loss": 4.7705, + "step": 1928 + }, + { + "epoch": 0.49044970602256477, + "grad_norm": 23020.4921875, + "learning_rate": 3.0188385218237957e-05, + "loss": 4.786, + "step": 1929 + }, + { + "epoch": 0.49070395677737166, + "grad_norm": 23021.29296875, + "learning_rate": 3.0166673708402287e-05, + "loss": 4.7778, + "step": 1930 + }, + { + "epoch": 0.4909582075321786, + "grad_norm": 23370.517578125, + "learning_rate": 3.014495812711174e-05, + "loss": 4.76, + "step": 1931 + }, + { + "epoch": 0.49121245828698556, + "grad_norm": 22602.65234375, + "learning_rate": 3.012323849147866e-05, + "loss": 4.7669, + "step": 1932 + }, + { + "epoch": 0.49146670904179246, + "grad_norm": 23087.5390625, + "learning_rate": 3.010151481861862e-05, + "loss": 4.7733, + "step": 1933 + }, + { + "epoch": 0.4917209597965994, + "grad_norm": 22796.1328125, + "learning_rate": 3.0079787125650372e-05, + "loss": 4.7806, + "step": 1934 + }, + { + "epoch": 0.4919752105514063, + "grad_norm": 23238.685546875, + "learning_rate": 3.0058055429695812e-05, + "loss": 4.7664, + "step": 1935 + }, + { + "epoch": 0.49222946130621326, + "grad_norm": 22716.732421875, + "learning_rate": 3.0036319747880003e-05, + "loss": 4.7426, + "step": 1936 + }, + { + "epoch": 0.4924837120610202, + "grad_norm": 23319.64453125, + "learning_rate": 3.0014580097331168e-05, + "loss": 4.7681, + "step": 1937 + }, + { + "epoch": 0.4927379628158271, + "grad_norm": 23150.38671875, + "learning_rate": 2.9992836495180608e-05, + "loss": 4.7724, + "step": 1938 + }, + { + "epoch": 0.49299221357063405, + "grad_norm": 22830.548828125, + "learning_rate": 2.997108895856281e-05, + "loss": 4.7625, + "step": 1939 + }, + { + "epoch": 0.49324646432544095, + "grad_norm": 22915.86328125, + "learning_rate": 2.9949337504615287e-05, + "loss": 4.7656, + "step": 1940 + }, + { + "epoch": 0.4935007150802479, + "grad_norm": 22902.11328125, + "learning_rate": 2.9927582150478688e-05, + "loss": 4.7688, + "step": 1941 + }, + { + "epoch": 0.49375496583505485, + "grad_norm": 22863.533203125, + "learning_rate": 2.9905822913296722e-05, + "loss": 4.7676, + "step": 1942 + }, + { + "epoch": 0.49400921658986174, + "grad_norm": 22825.634765625, + "learning_rate": 2.9884059810216147e-05, + "loss": 4.7644, + "step": 1943 + }, + { + "epoch": 0.4942634673446687, + "grad_norm": 22819.275390625, + "learning_rate": 2.9862292858386782e-05, + "loss": 4.7773, + "step": 1944 + }, + { + "epoch": 0.4945177180994756, + "grad_norm": 23065.78125, + "learning_rate": 2.9840522074961484e-05, + "loss": 4.7668, + "step": 1945 + }, + { + "epoch": 0.49477196885428254, + "grad_norm": 23035.5078125, + "learning_rate": 2.9818747477096103e-05, + "loss": 4.77, + "step": 1946 + }, + { + "epoch": 0.4950262196090895, + "grad_norm": 23054.912109375, + "learning_rate": 2.979696908194952e-05, + "loss": 4.7739, + "step": 1947 + }, + { + "epoch": 0.4952804703638964, + "grad_norm": 22977.8828125, + "learning_rate": 2.9775186906683593e-05, + "loss": 4.7662, + "step": 1948 + }, + { + "epoch": 0.49553472111870334, + "grad_norm": 22949.826171875, + "learning_rate": 2.9753400968463173e-05, + "loss": 4.7534, + "step": 1949 + }, + { + "epoch": 0.49578897187351023, + "grad_norm": 23086.3046875, + "learning_rate": 2.9731611284456068e-05, + "loss": 4.7669, + "step": 1950 + }, + { + "epoch": 0.4960432226283172, + "grad_norm": 22937.015625, + "learning_rate": 2.9709817871833033e-05, + "loss": 4.7744, + "step": 1951 + }, + { + "epoch": 0.49629747338312413, + "grad_norm": 22889.5703125, + "learning_rate": 2.968802074776777e-05, + "loss": 4.7539, + "step": 1952 + }, + { + "epoch": 0.496551724137931, + "grad_norm": 22840.66796875, + "learning_rate": 2.9666219929436896e-05, + "loss": 4.7608, + "step": 1953 + }, + { + "epoch": 0.496805974892738, + "grad_norm": 22922.9453125, + "learning_rate": 2.964441543401995e-05, + "loss": 4.7586, + "step": 1954 + }, + { + "epoch": 0.4970602256475449, + "grad_norm": 22957.939453125, + "learning_rate": 2.9622607278699365e-05, + "loss": 4.7624, + "step": 1955 + }, + { + "epoch": 0.4973144764023518, + "grad_norm": 23087.447265625, + "learning_rate": 2.9600795480660466e-05, + "loss": 4.7625, + "step": 1956 + }, + { + "epoch": 0.4975687271571588, + "grad_norm": 22927.603515625, + "learning_rate": 2.9578980057091414e-05, + "loss": 4.7593, + "step": 1957 + }, + { + "epoch": 0.49782297791196567, + "grad_norm": 22810.44921875, + "learning_rate": 2.9557161025183278e-05, + "loss": 4.7634, + "step": 1958 + }, + { + "epoch": 0.4980772286667726, + "grad_norm": 23032.84375, + "learning_rate": 2.953533840212993e-05, + "loss": 4.7505, + "step": 1959 + }, + { + "epoch": 0.4983314794215795, + "grad_norm": 22964.78125, + "learning_rate": 2.951351220512809e-05, + "loss": 4.7563, + "step": 1960 + }, + { + "epoch": 0.49858573017638647, + "grad_norm": 23036.1796875, + "learning_rate": 2.94916824513773e-05, + "loss": 4.7644, + "step": 1961 + }, + { + "epoch": 0.4988399809311934, + "grad_norm": 22814.412109375, + "learning_rate": 2.9469849158079886e-05, + "loss": 4.7651, + "step": 1962 + }, + { + "epoch": 0.4990942316860003, + "grad_norm": 23342.337890625, + "learning_rate": 2.944801234244098e-05, + "loss": 4.7539, + "step": 1963 + }, + { + "epoch": 0.49934848244080726, + "grad_norm": 23009.666015625, + "learning_rate": 2.9426172021668475e-05, + "loss": 4.7464, + "step": 1964 + }, + { + "epoch": 0.49960273319561416, + "grad_norm": 22917.50390625, + "learning_rate": 2.9404328212973044e-05, + "loss": 4.7566, + "step": 1965 + }, + { + "epoch": 0.4998569839504211, + "grad_norm": 23165.1015625, + "learning_rate": 2.938248093356809e-05, + "loss": 4.7487, + "step": 1966 + }, + { + "epoch": 0.500111234705228, + "grad_norm": 22979.5546875, + "learning_rate": 2.9360630200669766e-05, + "loss": 4.7718, + "step": 1967 + }, + { + "epoch": 0.500365485460035, + "grad_norm": 22906.5234375, + "learning_rate": 2.933877603149694e-05, + "loss": 4.7536, + "step": 1968 + }, + { + "epoch": 0.5006197362148419, + "grad_norm": 22827.6875, + "learning_rate": 2.9316918443271176e-05, + "loss": 4.7588, + "step": 1969 + }, + { + "epoch": 0.5008739869696488, + "grad_norm": 23026.64453125, + "learning_rate": 2.9295057453216758e-05, + "loss": 4.7543, + "step": 1970 + }, + { + "epoch": 0.5011282377244557, + "grad_norm": 23014.537109375, + "learning_rate": 2.9273193078560636e-05, + "loss": 4.7461, + "step": 1971 + }, + { + "epoch": 0.5013824884792627, + "grad_norm": 22945.01953125, + "learning_rate": 2.925132533653242e-05, + "loss": 4.7447, + "step": 1972 + }, + { + "epoch": 0.5016367392340696, + "grad_norm": 23150.68359375, + "learning_rate": 2.9229454244364397e-05, + "loss": 4.7381, + "step": 1973 + }, + { + "epoch": 0.5018909899888765, + "grad_norm": 23043.513671875, + "learning_rate": 2.9207579819291453e-05, + "loss": 4.7532, + "step": 1974 + }, + { + "epoch": 0.5021452407436835, + "grad_norm": 22943.0625, + "learning_rate": 2.9185702078551148e-05, + "loss": 4.7308, + "step": 1975 + }, + { + "epoch": 0.5023994914984904, + "grad_norm": 22949.34375, + "learning_rate": 2.916382103938363e-05, + "loss": 4.7531, + "step": 1976 + }, + { + "epoch": 0.5026537422532973, + "grad_norm": 22955.57421875, + "learning_rate": 2.9141936719031644e-05, + "loss": 4.7441, + "step": 1977 + }, + { + "epoch": 0.5029079930081043, + "grad_norm": 23065.19140625, + "learning_rate": 2.9120049134740522e-05, + "loss": 4.7714, + "step": 1978 + }, + { + "epoch": 0.5031622437629112, + "grad_norm": 23003.017578125, + "learning_rate": 2.909815830375818e-05, + "loss": 4.7425, + "step": 1979 + }, + { + "epoch": 0.5034164945177181, + "grad_norm": 23466.689453125, + "learning_rate": 2.9076264243335083e-05, + "loss": 4.7448, + "step": 1980 + }, + { + "epoch": 0.503670745272525, + "grad_norm": 23075.076171875, + "learning_rate": 2.9054366970724235e-05, + "loss": 4.7547, + "step": 1981 + }, + { + "epoch": 0.503924996027332, + "grad_norm": 23075.654296875, + "learning_rate": 2.9032466503181193e-05, + "loss": 4.755, + "step": 1982 + }, + { + "epoch": 0.5041792467821389, + "grad_norm": 23305.966796875, + "learning_rate": 2.9010562857964e-05, + "loss": 4.7668, + "step": 1983 + }, + { + "epoch": 0.5044334975369458, + "grad_norm": 22911.794921875, + "learning_rate": 2.8988656052333235e-05, + "loss": 4.7482, + "step": 1984 + }, + { + "epoch": 0.5046877482917528, + "grad_norm": 23034.783203125, + "learning_rate": 2.896674610355194e-05, + "loss": 4.7502, + "step": 1985 + }, + { + "epoch": 0.5049419990465597, + "grad_norm": 22959.732421875, + "learning_rate": 2.8944833028885654e-05, + "loss": 4.7473, + "step": 1986 + }, + { + "epoch": 0.5051962498013666, + "grad_norm": 23156.033203125, + "learning_rate": 2.8922916845602377e-05, + "loss": 4.7306, + "step": 1987 + }, + { + "epoch": 0.5054505005561736, + "grad_norm": 23070.759765625, + "learning_rate": 2.8900997570972545e-05, + "loss": 4.7554, + "step": 1988 + }, + { + "epoch": 0.5057047513109805, + "grad_norm": 23160.287109375, + "learning_rate": 2.8879075222269036e-05, + "loss": 4.7437, + "step": 1989 + }, + { + "epoch": 0.5059590020657874, + "grad_norm": 22863.69140625, + "learning_rate": 2.8857149816767158e-05, + "loss": 4.7479, + "step": 1990 + }, + { + "epoch": 0.5062132528205943, + "grad_norm": 23054.64453125, + "learning_rate": 2.883522137174462e-05, + "loss": 4.7566, + "step": 1991 + }, + { + "epoch": 0.5064675035754013, + "grad_norm": 22964.81640625, + "learning_rate": 2.8813289904481535e-05, + "loss": 4.7401, + "step": 1992 + }, + { + "epoch": 0.5067217543302082, + "grad_norm": 23167.4296875, + "learning_rate": 2.8791355432260392e-05, + "loss": 4.7464, + "step": 1993 + }, + { + "epoch": 0.5069760050850151, + "grad_norm": 22849.697265625, + "learning_rate": 2.8769417972366037e-05, + "loss": 4.7356, + "step": 1994 + }, + { + "epoch": 0.5072302558398221, + "grad_norm": 23022.30078125, + "learning_rate": 2.8747477542085686e-05, + "loss": 4.7413, + "step": 1995 + }, + { + "epoch": 0.507484506594629, + "grad_norm": 22905.880859375, + "learning_rate": 2.87255341587089e-05, + "loss": 4.7432, + "step": 1996 + }, + { + "epoch": 0.5077387573494359, + "grad_norm": 23229.41796875, + "learning_rate": 2.8703587839527546e-05, + "loss": 4.7723, + "step": 1997 + }, + { + "epoch": 0.5079930081042429, + "grad_norm": 22982.7421875, + "learning_rate": 2.8681638601835815e-05, + "loss": 4.7399, + "step": 1998 + }, + { + "epoch": 0.5082472588590498, + "grad_norm": 23071.857421875, + "learning_rate": 2.8659686462930212e-05, + "loss": 4.733, + "step": 1999 + }, + { + "epoch": 0.5085015096138567, + "grad_norm": 23090.21484375, + "learning_rate": 2.8637731440109507e-05, + "loss": 4.7424, + "step": 2000 + }, + { + "epoch": 0.5085015096138567, + "eval_loss": 9.557955741882324, + "eval_runtime": 699.1214, + "eval_samples_per_second": 151.599, + "eval_steps_per_second": 9.476, + "step": 2000 + }, + { + "epoch": 0.5087557603686635, + "grad_norm": 23007.51953125, + "learning_rate": 2.8615773550674744e-05, + "loss": 4.7398, + "step": 2001 + }, + { + "epoch": 0.5090100111234706, + "grad_norm": 23056.48046875, + "learning_rate": 2.859381281192925e-05, + "loss": 4.7519, + "step": 2002 + }, + { + "epoch": 0.5092642618782774, + "grad_norm": 23053.72265625, + "learning_rate": 2.857184924117856e-05, + "loss": 4.7312, + "step": 2003 + }, + { + "epoch": 0.5095185126330843, + "grad_norm": 23077.6171875, + "learning_rate": 2.8549882855730485e-05, + "loss": 4.7256, + "step": 2004 + }, + { + "epoch": 0.5097727633878913, + "grad_norm": 23031.88671875, + "learning_rate": 2.8527913672895012e-05, + "loss": 4.7345, + "step": 2005 + }, + { + "epoch": 0.5100270141426982, + "grad_norm": 23169.986328125, + "learning_rate": 2.8505941709984348e-05, + "loss": 4.7439, + "step": 2006 + }, + { + "epoch": 0.5102812648975051, + "grad_norm": 23364.912109375, + "learning_rate": 2.8483966984312906e-05, + "loss": 4.7317, + "step": 2007 + }, + { + "epoch": 0.5105355156523121, + "grad_norm": 23049.017578125, + "learning_rate": 2.8461989513197252e-05, + "loss": 4.7378, + "step": 2008 + }, + { + "epoch": 0.510789766407119, + "grad_norm": 22993.38671875, + "learning_rate": 2.8440009313956133e-05, + "loss": 4.7347, + "step": 2009 + }, + { + "epoch": 0.5110440171619259, + "grad_norm": 23220.9453125, + "learning_rate": 2.841802640391045e-05, + "loss": 4.7469, + "step": 2010 + }, + { + "epoch": 0.5112982679167328, + "grad_norm": 23500.806640625, + "learning_rate": 2.83960408003832e-05, + "loss": 4.7359, + "step": 2011 + }, + { + "epoch": 0.5115525186715398, + "grad_norm": 23078.642578125, + "learning_rate": 2.8374052520699557e-05, + "loss": 4.7269, + "step": 2012 + }, + { + "epoch": 0.5118067694263467, + "grad_norm": 23080.47265625, + "learning_rate": 2.8352061582186777e-05, + "loss": 4.7359, + "step": 2013 + }, + { + "epoch": 0.5120610201811536, + "grad_norm": 23217.416015625, + "learning_rate": 2.8330068002174202e-05, + "loss": 4.723, + "step": 2014 + }, + { + "epoch": 0.5123152709359606, + "grad_norm": 23020.74609375, + "learning_rate": 2.830807179799328e-05, + "loss": 4.7409, + "step": 2015 + }, + { + "epoch": 0.5125695216907675, + "grad_norm": 23209.0859375, + "learning_rate": 2.82860729869775e-05, + "loss": 4.7317, + "step": 2016 + }, + { + "epoch": 0.5128237724455744, + "grad_norm": 23137.6796875, + "learning_rate": 2.826407158646243e-05, + "loss": 4.746, + "step": 2017 + }, + { + "epoch": 0.5130780232003814, + "grad_norm": 22964.87109375, + "learning_rate": 2.8242067613785673e-05, + "loss": 4.7285, + "step": 2018 + }, + { + "epoch": 0.5133322739551883, + "grad_norm": 23120.8515625, + "learning_rate": 2.822006108628683e-05, + "loss": 4.7229, + "step": 2019 + }, + { + "epoch": 0.5135865247099952, + "grad_norm": 23373.91015625, + "learning_rate": 2.819805202130756e-05, + "loss": 4.7279, + "step": 2020 + }, + { + "epoch": 0.5138407754648021, + "grad_norm": 23226.46875, + "learning_rate": 2.8176040436191487e-05, + "loss": 4.7329, + "step": 2021 + }, + { + "epoch": 0.5140950262196091, + "grad_norm": 23186.08203125, + "learning_rate": 2.8154026348284247e-05, + "loss": 4.7457, + "step": 2022 + }, + { + "epoch": 0.514349276974416, + "grad_norm": 23213.958984375, + "learning_rate": 2.8132009774933427e-05, + "loss": 4.7295, + "step": 2023 + }, + { + "epoch": 0.5146035277292229, + "grad_norm": 23086.5078125, + "learning_rate": 2.810999073348859e-05, + "loss": 4.7355, + "step": 2024 + }, + { + "epoch": 0.5148577784840299, + "grad_norm": 23512.626953125, + "learning_rate": 2.808796924130122e-05, + "loss": 4.7262, + "step": 2025 + }, + { + "epoch": 0.5151120292388368, + "grad_norm": 23135.099609375, + "learning_rate": 2.8065945315724756e-05, + "loss": 4.7418, + "step": 2026 + }, + { + "epoch": 0.5153662799936437, + "grad_norm": 23408.845703125, + "learning_rate": 2.8043918974114547e-05, + "loss": 4.7315, + "step": 2027 + }, + { + "epoch": 0.5156205307484507, + "grad_norm": 23213.99609375, + "learning_rate": 2.8021890233827842e-05, + "loss": 4.7351, + "step": 2028 + }, + { + "epoch": 0.5158747815032576, + "grad_norm": 35557.0625, + "learning_rate": 2.7999859112223785e-05, + "loss": 4.7523, + "step": 2029 + }, + { + "epoch": 0.5161290322580645, + "grad_norm": 23306.736328125, + "learning_rate": 2.79778256266634e-05, + "loss": 4.725, + "step": 2030 + }, + { + "epoch": 0.5163832830128714, + "grad_norm": 23075.65625, + "learning_rate": 2.7955789794509556e-05, + "loss": 4.7246, + "step": 2031 + }, + { + "epoch": 0.5166375337676784, + "grad_norm": 23116.28125, + "learning_rate": 2.7933751633126987e-05, + "loss": 4.7287, + "step": 2032 + }, + { + "epoch": 0.5168917845224853, + "grad_norm": 23078.94921875, + "learning_rate": 2.7911711159882266e-05, + "loss": 4.729, + "step": 2033 + }, + { + "epoch": 0.5171460352772922, + "grad_norm": 23201.748046875, + "learning_rate": 2.7889668392143777e-05, + "loss": 4.7299, + "step": 2034 + }, + { + "epoch": 0.5174002860320992, + "grad_norm": 23212.55078125, + "learning_rate": 2.7867623347281713e-05, + "loss": 4.7207, + "step": 2035 + }, + { + "epoch": 0.5176545367869061, + "grad_norm": 23330.412109375, + "learning_rate": 2.7845576042668066e-05, + "loss": 4.7213, + "step": 2036 + }, + { + "epoch": 0.517908787541713, + "grad_norm": 23131.947265625, + "learning_rate": 2.78235264956766e-05, + "loss": 4.7056, + "step": 2037 + }, + { + "epoch": 0.51816303829652, + "grad_norm": 24849.046875, + "learning_rate": 2.7801474723682873e-05, + "loss": 4.721, + "step": 2038 + }, + { + "epoch": 0.5184172890513269, + "grad_norm": 23961.619140625, + "learning_rate": 2.777942074406416e-05, + "loss": 4.716, + "step": 2039 + }, + { + "epoch": 0.5186715398061338, + "grad_norm": 28028.6328125, + "learning_rate": 2.7757364574199496e-05, + "loss": 4.7289, + "step": 2040 + }, + { + "epoch": 0.5189257905609407, + "grad_norm": 26758.71875, + "learning_rate": 2.7735306231469644e-05, + "loss": 4.7226, + "step": 2041 + }, + { + "epoch": 0.5191800413157477, + "grad_norm": 23318.28125, + "learning_rate": 2.7713245733257053e-05, + "loss": 4.7258, + "step": 2042 + }, + { + "epoch": 0.5194342920705546, + "grad_norm": 24078.271484375, + "learning_rate": 2.769118309694591e-05, + "loss": 4.7304, + "step": 2043 + }, + { + "epoch": 0.5196885428253615, + "grad_norm": 23060.48046875, + "learning_rate": 2.7669118339922072e-05, + "loss": 4.7105, + "step": 2044 + }, + { + "epoch": 0.5199427935801685, + "grad_norm": 25068.29296875, + "learning_rate": 2.764705147957305e-05, + "loss": 4.7283, + "step": 2045 + }, + { + "epoch": 0.5201970443349754, + "grad_norm": 24059.552734375, + "learning_rate": 2.762498253328803e-05, + "loss": 4.7194, + "step": 2046 + }, + { + "epoch": 0.5204512950897823, + "grad_norm": 23689.7578125, + "learning_rate": 2.7602911518457835e-05, + "loss": 4.7216, + "step": 2047 + }, + { + "epoch": 0.5207055458445893, + "grad_norm": 24054.283203125, + "learning_rate": 2.7580838452474923e-05, + "loss": 4.7143, + "step": 2048 + }, + { + "epoch": 0.5209597965993962, + "grad_norm": 23281.78125, + "learning_rate": 2.7558763352733362e-05, + "loss": 4.7101, + "step": 2049 + }, + { + "epoch": 0.5212140473542031, + "grad_norm": 24276.9296875, + "learning_rate": 2.7536686236628834e-05, + "loss": 4.7236, + "step": 2050 + }, + { + "epoch": 0.52146829810901, + "grad_norm": 23157.599609375, + "learning_rate": 2.7514607121558594e-05, + "loss": 4.729, + "step": 2051 + }, + { + "epoch": 0.521722548863817, + "grad_norm": 25382.3515625, + "learning_rate": 2.7492526024921484e-05, + "loss": 4.7223, + "step": 2052 + }, + { + "epoch": 0.5219767996186239, + "grad_norm": 23496.029296875, + "learning_rate": 2.7470442964117897e-05, + "loss": 4.7195, + "step": 2053 + }, + { + "epoch": 0.5222310503734308, + "grad_norm": 23880.427734375, + "learning_rate": 2.7448357956549793e-05, + "loss": 4.7315, + "step": 2054 + }, + { + "epoch": 0.5224853011282378, + "grad_norm": 23273.0703125, + "learning_rate": 2.7426271019620654e-05, + "loss": 4.7207, + "step": 2055 + }, + { + "epoch": 0.5227395518830447, + "grad_norm": 24034.41015625, + "learning_rate": 2.7404182170735464e-05, + "loss": 4.7312, + "step": 2056 + }, + { + "epoch": 0.5229938026378516, + "grad_norm": 23682.8515625, + "learning_rate": 2.7382091427300748e-05, + "loss": 4.7211, + "step": 2057 + }, + { + "epoch": 0.5232480533926586, + "grad_norm": 23399.884765625, + "learning_rate": 2.7359998806724506e-05, + "loss": 4.7079, + "step": 2058 + }, + { + "epoch": 0.5235023041474655, + "grad_norm": 23222.86328125, + "learning_rate": 2.7337904326416214e-05, + "loss": 4.7023, + "step": 2059 + }, + { + "epoch": 0.5237565549022724, + "grad_norm": 23315.37890625, + "learning_rate": 2.7315808003786826e-05, + "loss": 4.728, + "step": 2060 + }, + { + "epoch": 0.5240108056570792, + "grad_norm": 23392.298828125, + "learning_rate": 2.7293709856248734e-05, + "loss": 4.7225, + "step": 2061 + }, + { + "epoch": 0.5242650564118863, + "grad_norm": 23439.419921875, + "learning_rate": 2.7271609901215778e-05, + "loss": 4.7041, + "step": 2062 + }, + { + "epoch": 0.5245193071666931, + "grad_norm": 24017.865234375, + "learning_rate": 2.724950815610321e-05, + "loss": 4.7206, + "step": 2063 + }, + { + "epoch": 0.5247735579215, + "grad_norm": 23258.296875, + "learning_rate": 2.7227404638327712e-05, + "loss": 4.7215, + "step": 2064 + }, + { + "epoch": 0.525027808676307, + "grad_norm": 23284.140625, + "learning_rate": 2.7205299365307345e-05, + "loss": 4.7116, + "step": 2065 + }, + { + "epoch": 0.525282059431114, + "grad_norm": 23141.248046875, + "learning_rate": 2.7183192354461573e-05, + "loss": 4.7155, + "step": 2066 + }, + { + "epoch": 0.5255363101859208, + "grad_norm": 23496.93359375, + "learning_rate": 2.7161083623211203e-05, + "loss": 4.7094, + "step": 2067 + }, + { + "epoch": 0.5257905609407278, + "grad_norm": 23329.158203125, + "learning_rate": 2.7138973188978416e-05, + "loss": 4.7132, + "step": 2068 + }, + { + "epoch": 0.5260448116955347, + "grad_norm": 23593.390625, + "learning_rate": 2.7116861069186726e-05, + "loss": 4.7269, + "step": 2069 + }, + { + "epoch": 0.5262990624503416, + "grad_norm": 23326.470703125, + "learning_rate": 2.7094747281260992e-05, + "loss": 4.7281, + "step": 2070 + }, + { + "epoch": 0.5265533132051485, + "grad_norm": 23733.70703125, + "learning_rate": 2.7072631842627367e-05, + "loss": 4.7078, + "step": 2071 + }, + { + "epoch": 0.5268075639599555, + "grad_norm": 23178.4921875, + "learning_rate": 2.705051477071332e-05, + "loss": 4.7005, + "step": 2072 + }, + { + "epoch": 0.5270618147147624, + "grad_norm": 23485.16015625, + "learning_rate": 2.702839608294758e-05, + "loss": 4.7227, + "step": 2073 + }, + { + "epoch": 0.5273160654695693, + "grad_norm": 23290.314453125, + "learning_rate": 2.7006275796760194e-05, + "loss": 4.719, + "step": 2074 + }, + { + "epoch": 0.5275703162243763, + "grad_norm": 23742.23828125, + "learning_rate": 2.6984153929582433e-05, + "loss": 4.7034, + "step": 2075 + }, + { + "epoch": 0.5278245669791832, + "grad_norm": 23395.416015625, + "learning_rate": 2.696203049884683e-05, + "loss": 4.7251, + "step": 2076 + }, + { + "epoch": 0.5280788177339901, + "grad_norm": 23555.201171875, + "learning_rate": 2.6939905521987137e-05, + "loss": 4.7158, + "step": 2077 + }, + { + "epoch": 0.5283330684887971, + "grad_norm": 23185.91796875, + "learning_rate": 2.6917779016438342e-05, + "loss": 4.7085, + "step": 2078 + }, + { + "epoch": 0.528587319243604, + "grad_norm": 23298.71484375, + "learning_rate": 2.689565099963662e-05, + "loss": 4.7195, + "step": 2079 + }, + { + "epoch": 0.5288415699984109, + "grad_norm": 23305.232421875, + "learning_rate": 2.6873521489019348e-05, + "loss": 4.7213, + "step": 2080 + }, + { + "epoch": 0.5290958207532178, + "grad_norm": 23419.66015625, + "learning_rate": 2.6851390502025082e-05, + "loss": 4.7204, + "step": 2081 + }, + { + "epoch": 0.5293500715080248, + "grad_norm": 23082.498046875, + "learning_rate": 2.6829258056093526e-05, + "loss": 4.7096, + "step": 2082 + }, + { + "epoch": 0.5296043222628317, + "grad_norm": 23548.970703125, + "learning_rate": 2.680712416866556e-05, + "loss": 4.7176, + "step": 2083 + }, + { + "epoch": 0.5298585730176386, + "grad_norm": 23188.61328125, + "learning_rate": 2.678498885718316e-05, + "loss": 4.7045, + "step": 2084 + }, + { + "epoch": 0.5301128237724456, + "grad_norm": 23513.1328125, + "learning_rate": 2.6762852139089467e-05, + "loss": 4.7036, + "step": 2085 + }, + { + "epoch": 0.5303670745272525, + "grad_norm": 23257.73828125, + "learning_rate": 2.6740714031828725e-05, + "loss": 4.7096, + "step": 2086 + }, + { + "epoch": 0.5306213252820594, + "grad_norm": 23426.67578125, + "learning_rate": 2.6718574552846225e-05, + "loss": 4.7137, + "step": 2087 + }, + { + "epoch": 0.5308755760368664, + "grad_norm": 23352.6015625, + "learning_rate": 2.6696433719588398e-05, + "loss": 4.7228, + "step": 2088 + }, + { + "epoch": 0.5311298267916733, + "grad_norm": 23303.849609375, + "learning_rate": 2.6674291549502706e-05, + "loss": 4.7145, + "step": 2089 + }, + { + "epoch": 0.5313840775464802, + "grad_norm": 23343.2109375, + "learning_rate": 2.6652148060037685e-05, + "loss": 4.6971, + "step": 2090 + }, + { + "epoch": 0.5316383283012871, + "grad_norm": 23663.53515625, + "learning_rate": 2.6630003268642902e-05, + "loss": 4.7119, + "step": 2091 + }, + { + "epoch": 0.5318925790560941, + "grad_norm": 23375.068359375, + "learning_rate": 2.6607857192768943e-05, + "loss": 4.7125, + "step": 2092 + }, + { + "epoch": 0.532146829810901, + "grad_norm": 23589.341796875, + "learning_rate": 2.6585709849867414e-05, + "loss": 4.7132, + "step": 2093 + }, + { + "epoch": 0.5324010805657079, + "grad_norm": 23355.55078125, + "learning_rate": 2.6563561257390925e-05, + "loss": 4.706, + "step": 2094 + }, + { + "epoch": 0.5326553313205149, + "grad_norm": 23303.205078125, + "learning_rate": 2.654141143279305e-05, + "loss": 4.7029, + "step": 2095 + }, + { + "epoch": 0.5329095820753218, + "grad_norm": 23292.84765625, + "learning_rate": 2.6519260393528366e-05, + "loss": 4.7012, + "step": 2096 + }, + { + "epoch": 0.5331638328301287, + "grad_norm": 23259.251953125, + "learning_rate": 2.6497108157052386e-05, + "loss": 4.6946, + "step": 2097 + }, + { + "epoch": 0.5334180835849356, + "grad_norm": 23252.619140625, + "learning_rate": 2.6474954740821555e-05, + "loss": 4.6993, + "step": 2098 + }, + { + "epoch": 0.5336723343397426, + "grad_norm": 23247.013671875, + "learning_rate": 2.6452800162293273e-05, + "loss": 4.6996, + "step": 2099 + }, + { + "epoch": 0.5339265850945495, + "grad_norm": 33386.3359375, + "learning_rate": 2.6430644438925844e-05, + "loss": 4.7113, + "step": 2100 + }, + { + "epoch": 0.5339265850945495, + "eval_loss": 9.482353210449219, + "eval_runtime": 696.41, + "eval_samples_per_second": 152.189, + "eval_steps_per_second": 9.513, + "step": 2100 + }, + { + "epoch": 0.5341808358493564, + "grad_norm": 24149.486328125, + "learning_rate": 2.6408487588178477e-05, + "loss": 4.7019, + "step": 2101 + }, + { + "epoch": 0.5344350866041634, + "grad_norm": 23557.75, + "learning_rate": 2.6386329627511265e-05, + "loss": 4.7102, + "step": 2102 + }, + { + "epoch": 0.5346893373589703, + "grad_norm": 23583.0234375, + "learning_rate": 2.636417057438519e-05, + "loss": 4.6826, + "step": 2103 + }, + { + "epoch": 0.5349435881137772, + "grad_norm": 24047.4375, + "learning_rate": 2.634201044626206e-05, + "loss": 4.7044, + "step": 2104 + }, + { + "epoch": 0.5351978388685842, + "grad_norm": 25081.236328125, + "learning_rate": 2.631984926060457e-05, + "loss": 4.6972, + "step": 2105 + }, + { + "epoch": 0.5354520896233911, + "grad_norm": 23678.958984375, + "learning_rate": 2.6297687034876238e-05, + "loss": 4.7079, + "step": 2106 + }, + { + "epoch": 0.535706340378198, + "grad_norm": 23832.02734375, + "learning_rate": 2.6275523786541377e-05, + "loss": 4.7141, + "step": 2107 + }, + { + "epoch": 0.5359605911330049, + "grad_norm": 23217.287109375, + "learning_rate": 2.6253359533065135e-05, + "loss": 4.6954, + "step": 2108 + }, + { + "epoch": 0.5362148418878119, + "grad_norm": 24535.060546875, + "learning_rate": 2.6231194291913447e-05, + "loss": 4.7029, + "step": 2109 + }, + { + "epoch": 0.5364690926426188, + "grad_norm": 23473.38671875, + "learning_rate": 2.6209028080553005e-05, + "loss": 4.7215, + "step": 2110 + }, + { + "epoch": 0.5367233433974257, + "grad_norm": 23935.1796875, + "learning_rate": 2.61868609164513e-05, + "loss": 4.7037, + "step": 2111 + }, + { + "epoch": 0.5369775941522327, + "grad_norm": 23354.58203125, + "learning_rate": 2.6164692817076535e-05, + "loss": 4.7077, + "step": 2112 + }, + { + "epoch": 0.5372318449070396, + "grad_norm": 24832.46484375, + "learning_rate": 2.6142523799897683e-05, + "loss": 4.7068, + "step": 2113 + }, + { + "epoch": 0.5374860956618465, + "grad_norm": 23532.0859375, + "learning_rate": 2.612035388238443e-05, + "loss": 4.6967, + "step": 2114 + }, + { + "epoch": 0.5377403464166535, + "grad_norm": 23633.271484375, + "learning_rate": 2.6098183082007155e-05, + "loss": 4.6981, + "step": 2115 + }, + { + "epoch": 0.5379945971714604, + "grad_norm": 23384.09765625, + "learning_rate": 2.6076011416236956e-05, + "loss": 4.7102, + "step": 2116 + }, + { + "epoch": 0.5382488479262673, + "grad_norm": 24732.24609375, + "learning_rate": 2.6053838902545608e-05, + "loss": 4.704, + "step": 2117 + }, + { + "epoch": 0.5385030986810742, + "grad_norm": 23493.09765625, + "learning_rate": 2.6031665558405536e-05, + "loss": 4.6917, + "step": 2118 + }, + { + "epoch": 0.5387573494358812, + "grad_norm": 23800.6171875, + "learning_rate": 2.6009491401289842e-05, + "loss": 4.6953, + "step": 2119 + }, + { + "epoch": 0.5390116001906881, + "grad_norm": 23385.017578125, + "learning_rate": 2.598731644867226e-05, + "loss": 4.695, + "step": 2120 + }, + { + "epoch": 0.539265850945495, + "grad_norm": 25875.88671875, + "learning_rate": 2.5965140718027152e-05, + "loss": 4.6892, + "step": 2121 + }, + { + "epoch": 0.539520101700302, + "grad_norm": 23469.125, + "learning_rate": 2.5942964226829488e-05, + "loss": 4.6889, + "step": 2122 + }, + { + "epoch": 0.5397743524551089, + "grad_norm": 23864.87109375, + "learning_rate": 2.592078699255484e-05, + "loss": 4.6852, + "step": 2123 + }, + { + "epoch": 0.5400286032099157, + "grad_norm": 23716.55078125, + "learning_rate": 2.5898609032679366e-05, + "loss": 4.7008, + "step": 2124 + }, + { + "epoch": 0.5402828539647228, + "grad_norm": 24383.921875, + "learning_rate": 2.58764303646798e-05, + "loss": 4.7043, + "step": 2125 + }, + { + "epoch": 0.5405371047195296, + "grad_norm": 23633.931640625, + "learning_rate": 2.5854251006033426e-05, + "loss": 4.6915, + "step": 2126 + }, + { + "epoch": 0.5407913554743365, + "grad_norm": 23681.015625, + "learning_rate": 2.5832070974218083e-05, + "loss": 4.7042, + "step": 2127 + }, + { + "epoch": 0.5410456062291434, + "grad_norm": 23592.380859375, + "learning_rate": 2.5809890286712136e-05, + "loss": 4.6928, + "step": 2128 + }, + { + "epoch": 0.5412998569839504, + "grad_norm": 23555.6015625, + "learning_rate": 2.578770896099445e-05, + "loss": 4.6773, + "step": 2129 + }, + { + "epoch": 0.5415541077387573, + "grad_norm": 23653.02734375, + "learning_rate": 2.5765527014544416e-05, + "loss": 4.7001, + "step": 2130 + }, + { + "epoch": 0.5418083584935642, + "grad_norm": 23540.796875, + "learning_rate": 2.5743344464841912e-05, + "loss": 4.6952, + "step": 2131 + }, + { + "epoch": 0.5420626092483712, + "grad_norm": 23544.94921875, + "learning_rate": 2.5721161329367278e-05, + "loss": 4.6916, + "step": 2132 + }, + { + "epoch": 0.5423168600031781, + "grad_norm": 23536.80859375, + "learning_rate": 2.5698977625601323e-05, + "loss": 4.6977, + "step": 2133 + }, + { + "epoch": 0.542571110757985, + "grad_norm": 23597.537109375, + "learning_rate": 2.567679337102531e-05, + "loss": 4.6946, + "step": 2134 + }, + { + "epoch": 0.542825361512792, + "grad_norm": 23443.845703125, + "learning_rate": 2.5654608583120922e-05, + "loss": 4.7006, + "step": 2135 + }, + { + "epoch": 0.5430796122675989, + "grad_norm": 23682.0, + "learning_rate": 2.5632423279370272e-05, + "loss": 4.6903, + "step": 2136 + }, + { + "epoch": 0.5433338630224058, + "grad_norm": 23529.369140625, + "learning_rate": 2.5610237477255878e-05, + "loss": 4.7025, + "step": 2137 + }, + { + "epoch": 0.5435881137772127, + "grad_norm": 23552.083984375, + "learning_rate": 2.558805119426065e-05, + "loss": 4.7028, + "step": 2138 + }, + { + "epoch": 0.5438423645320197, + "grad_norm": 23398.064453125, + "learning_rate": 2.5565864447867878e-05, + "loss": 4.6849, + "step": 2139 + }, + { + "epoch": 0.5440966152868266, + "grad_norm": 23539.12109375, + "learning_rate": 2.5543677255561222e-05, + "loss": 4.676, + "step": 2140 + }, + { + "epoch": 0.5443508660416335, + "grad_norm": 23525.49609375, + "learning_rate": 2.5521489634824674e-05, + "loss": 4.6899, + "step": 2141 + }, + { + "epoch": 0.5446051167964405, + "grad_norm": 23321.806640625, + "learning_rate": 2.5499301603142588e-05, + "loss": 4.6916, + "step": 2142 + }, + { + "epoch": 0.5448593675512474, + "grad_norm": 23877.974609375, + "learning_rate": 2.547711317799963e-05, + "loss": 4.6895, + "step": 2143 + }, + { + "epoch": 0.5451136183060543, + "grad_norm": 23400.06640625, + "learning_rate": 2.5454924376880772e-05, + "loss": 4.6848, + "step": 2144 + }, + { + "epoch": 0.5453678690608613, + "grad_norm": 23665.232421875, + "learning_rate": 2.5432735217271297e-05, + "loss": 4.6847, + "step": 2145 + }, + { + "epoch": 0.5456221198156682, + "grad_norm": 23365.0, + "learning_rate": 2.541054571665675e-05, + "loss": 4.6838, + "step": 2146 + }, + { + "epoch": 0.5458763705704751, + "grad_norm": 23369.833984375, + "learning_rate": 2.538835589252296e-05, + "loss": 4.6734, + "step": 2147 + }, + { + "epoch": 0.546130621325282, + "grad_norm": 23808.841796875, + "learning_rate": 2.5366165762356008e-05, + "loss": 4.6907, + "step": 2148 + }, + { + "epoch": 0.546384872080089, + "grad_norm": 23655.90625, + "learning_rate": 2.5343975343642217e-05, + "loss": 4.6985, + "step": 2149 + }, + { + "epoch": 0.5466391228348959, + "grad_norm": 23475.595703125, + "learning_rate": 2.532178465386813e-05, + "loss": 4.6877, + "step": 2150 + }, + { + "epoch": 0.5468933735897028, + "grad_norm": 23527.767578125, + "learning_rate": 2.5299593710520515e-05, + "loss": 4.699, + "step": 2151 + }, + { + "epoch": 0.5471476243445098, + "grad_norm": 23683.05859375, + "learning_rate": 2.527740253108632e-05, + "loss": 4.6851, + "step": 2152 + }, + { + "epoch": 0.5474018750993167, + "grad_norm": 23423.20703125, + "learning_rate": 2.5255211133052703e-05, + "loss": 4.6853, + "step": 2153 + }, + { + "epoch": 0.5476561258541236, + "grad_norm": 23770.255859375, + "learning_rate": 2.5233019533906994e-05, + "loss": 4.6836, + "step": 2154 + }, + { + "epoch": 0.5479103766089306, + "grad_norm": 23272.328125, + "learning_rate": 2.521082775113665e-05, + "loss": 4.6704, + "step": 2155 + }, + { + "epoch": 0.5481646273637375, + "grad_norm": 23532.994140625, + "learning_rate": 2.518863580222931e-05, + "loss": 4.6863, + "step": 2156 + }, + { + "epoch": 0.5484188781185444, + "grad_norm": 23402.43359375, + "learning_rate": 2.516644370467272e-05, + "loss": 4.683, + "step": 2157 + }, + { + "epoch": 0.5486731288733513, + "grad_norm": 23518.423828125, + "learning_rate": 2.5144251475954754e-05, + "loss": 4.6881, + "step": 2158 + }, + { + "epoch": 0.5489273796281583, + "grad_norm": 23648.654296875, + "learning_rate": 2.512205913356339e-05, + "loss": 4.6834, + "step": 2159 + }, + { + "epoch": 0.5491816303829652, + "grad_norm": 23744.015625, + "learning_rate": 2.5099866694986685e-05, + "loss": 4.6927, + "step": 2160 + }, + { + "epoch": 0.5494358811377721, + "grad_norm": 23478.8515625, + "learning_rate": 2.5077674177712785e-05, + "loss": 4.6796, + "step": 2161 + }, + { + "epoch": 0.5496901318925791, + "grad_norm": 23574.271484375, + "learning_rate": 2.5055481599229886e-05, + "loss": 4.685, + "step": 2162 + }, + { + "epoch": 0.549944382647386, + "grad_norm": 23522.96484375, + "learning_rate": 2.5033288977026237e-05, + "loss": 4.6783, + "step": 2163 + }, + { + "epoch": 0.5501986334021929, + "grad_norm": 23506.69140625, + "learning_rate": 2.5011096328590132e-05, + "loss": 4.6865, + "step": 2164 + }, + { + "epoch": 0.5504528841569999, + "grad_norm": 23553.56640625, + "learning_rate": 2.4988903671409873e-05, + "loss": 4.681, + "step": 2165 + }, + { + "epoch": 0.5507071349118068, + "grad_norm": 23498.599609375, + "learning_rate": 2.4966711022973773e-05, + "loss": 4.6759, + "step": 2166 + }, + { + "epoch": 0.5509613856666137, + "grad_norm": 23617.935546875, + "learning_rate": 2.4944518400770123e-05, + "loss": 4.6804, + "step": 2167 + }, + { + "epoch": 0.5512156364214206, + "grad_norm": 23562.78515625, + "learning_rate": 2.492232582228722e-05, + "loss": 4.684, + "step": 2168 + }, + { + "epoch": 0.5514698871762276, + "grad_norm": 23549.787109375, + "learning_rate": 2.4900133305013325e-05, + "loss": 4.6769, + "step": 2169 + }, + { + "epoch": 0.5517241379310345, + "grad_norm": 23639.8984375, + "learning_rate": 2.4877940866436613e-05, + "loss": 4.6659, + "step": 2170 + }, + { + "epoch": 0.5519783886858414, + "grad_norm": 23550.84765625, + "learning_rate": 2.485574852404525e-05, + "loss": 4.6738, + "step": 2171 + }, + { + "epoch": 0.5522326394406484, + "grad_norm": 23430.025390625, + "learning_rate": 2.4833556295327285e-05, + "loss": 4.675, + "step": 2172 + }, + { + "epoch": 0.5524868901954553, + "grad_norm": 24781.015625, + "learning_rate": 2.481136419777069e-05, + "loss": 4.6809, + "step": 2173 + }, + { + "epoch": 0.5527411409502622, + "grad_norm": 23808.51953125, + "learning_rate": 2.4789172248863352e-05, + "loss": 4.6757, + "step": 2174 + }, + { + "epoch": 0.5529953917050692, + "grad_norm": 23670.490234375, + "learning_rate": 2.476698046609302e-05, + "loss": 4.6832, + "step": 2175 + }, + { + "epoch": 0.5532496424598761, + "grad_norm": 23626.3671875, + "learning_rate": 2.4744788866947293e-05, + "loss": 4.68, + "step": 2176 + }, + { + "epoch": 0.553503893214683, + "grad_norm": 23613.134765625, + "learning_rate": 2.4722597468913687e-05, + "loss": 4.6705, + "step": 2177 + }, + { + "epoch": 0.5537581439694899, + "grad_norm": 23579.19140625, + "learning_rate": 2.4700406289479498e-05, + "loss": 4.6899, + "step": 2178 + }, + { + "epoch": 0.5540123947242969, + "grad_norm": 23817.73046875, + "learning_rate": 2.4678215346131874e-05, + "loss": 4.677, + "step": 2179 + }, + { + "epoch": 0.5542666454791038, + "grad_norm": 23665.787109375, + "learning_rate": 2.465602465635779e-05, + "loss": 4.6795, + "step": 2180 + }, + { + "epoch": 0.5545208962339107, + "grad_norm": 24260.82421875, + "learning_rate": 2.4633834237643998e-05, + "loss": 4.693, + "step": 2181 + }, + { + "epoch": 0.5547751469887177, + "grad_norm": 23447.123046875, + "learning_rate": 2.4611644107477043e-05, + "loss": 4.666, + "step": 2182 + }, + { + "epoch": 0.5550293977435246, + "grad_norm": 23537.998046875, + "learning_rate": 2.458945428334325e-05, + "loss": 4.6793, + "step": 2183 + }, + { + "epoch": 0.5552836484983315, + "grad_norm": 23895.3984375, + "learning_rate": 2.456726478272871e-05, + "loss": 4.7007, + "step": 2184 + }, + { + "epoch": 0.5555378992531385, + "grad_norm": 23540.876953125, + "learning_rate": 2.4545075623119227e-05, + "loss": 4.6602, + "step": 2185 + }, + { + "epoch": 0.5557921500079454, + "grad_norm": 24107.580078125, + "learning_rate": 2.4522886822000373e-05, + "loss": 4.6786, + "step": 2186 + }, + { + "epoch": 0.5560464007627522, + "grad_norm": 23539.541015625, + "learning_rate": 2.450069839685742e-05, + "loss": 4.6771, + "step": 2187 + }, + { + "epoch": 0.5563006515175591, + "grad_norm": 23844.34765625, + "learning_rate": 2.4478510365175328e-05, + "loss": 4.6693, + "step": 2188 + }, + { + "epoch": 0.5565549022723661, + "grad_norm": 23687.580078125, + "learning_rate": 2.4456322744438784e-05, + "loss": 4.6809, + "step": 2189 + }, + { + "epoch": 0.556809153027173, + "grad_norm": 23721.546875, + "learning_rate": 2.4434135552132128e-05, + "loss": 4.6725, + "step": 2190 + }, + { + "epoch": 0.5570634037819799, + "grad_norm": 23811.8125, + "learning_rate": 2.4411948805739353e-05, + "loss": 4.6776, + "step": 2191 + }, + { + "epoch": 0.5573176545367869, + "grad_norm": 23693.94921875, + "learning_rate": 2.438976252274413e-05, + "loss": 4.6757, + "step": 2192 + }, + { + "epoch": 0.5575719052915938, + "grad_norm": 23756.41796875, + "learning_rate": 2.4367576720629737e-05, + "loss": 4.6709, + "step": 2193 + }, + { + "epoch": 0.5578261560464007, + "grad_norm": 23525.6171875, + "learning_rate": 2.4345391416879084e-05, + "loss": 4.6678, + "step": 2194 + }, + { + "epoch": 0.5580804068012077, + "grad_norm": 23918.93359375, + "learning_rate": 2.4323206628974697e-05, + "loss": 4.6649, + "step": 2195 + }, + { + "epoch": 0.5583346575560146, + "grad_norm": 23451.505859375, + "learning_rate": 2.4301022374398687e-05, + "loss": 4.678, + "step": 2196 + }, + { + "epoch": 0.5585889083108215, + "grad_norm": 23600.818359375, + "learning_rate": 2.4278838670632738e-05, + "loss": 4.6644, + "step": 2197 + }, + { + "epoch": 0.5588431590656284, + "grad_norm": 23666.921875, + "learning_rate": 2.4256655535158097e-05, + "loss": 4.6787, + "step": 2198 + }, + { + "epoch": 0.5590974098204354, + "grad_norm": 23449.609375, + "learning_rate": 2.423447298545559e-05, + "loss": 4.6602, + "step": 2199 + }, + { + "epoch": 0.5593516605752423, + "grad_norm": 23701.328125, + "learning_rate": 2.421229103900556e-05, + "loss": 4.6676, + "step": 2200 + }, + { + "epoch": 0.5593516605752423, + "eval_loss": 9.412834167480469, + "eval_runtime": 696.9193, + "eval_samples_per_second": 152.078, + "eval_steps_per_second": 9.506, + "step": 2200 + }, + { + "epoch": 0.5596059113300492, + "grad_norm": 23569.12890625, + "learning_rate": 2.4190109713287873e-05, + "loss": 4.6554, + "step": 2201 + }, + { + "epoch": 0.5598601620848562, + "grad_norm": 23646.833984375, + "learning_rate": 2.416792902578192e-05, + "loss": 4.6494, + "step": 2202 + }, + { + "epoch": 0.5601144128396631, + "grad_norm": 23752.40625, + "learning_rate": 2.4145748993966576e-05, + "loss": 4.682, + "step": 2203 + }, + { + "epoch": 0.56036866359447, + "grad_norm": 23682.833984375, + "learning_rate": 2.4123569635320205e-05, + "loss": 4.6673, + "step": 2204 + }, + { + "epoch": 0.560622914349277, + "grad_norm": 23659.744140625, + "learning_rate": 2.410139096732064e-05, + "loss": 4.6584, + "step": 2205 + }, + { + "epoch": 0.5608771651040839, + "grad_norm": 23427.720703125, + "learning_rate": 2.407921300744517e-05, + "loss": 4.6631, + "step": 2206 + }, + { + "epoch": 0.5611314158588908, + "grad_norm": 23747.083984375, + "learning_rate": 2.4057035773170515e-05, + "loss": 4.6799, + "step": 2207 + }, + { + "epoch": 0.5613856666136977, + "grad_norm": 23551.533203125, + "learning_rate": 2.4034859281972854e-05, + "loss": 4.6625, + "step": 2208 + }, + { + "epoch": 0.5616399173685047, + "grad_norm": 23538.8203125, + "learning_rate": 2.4012683551327743e-05, + "loss": 4.6762, + "step": 2209 + }, + { + "epoch": 0.5618941681233116, + "grad_norm": 23587.6953125, + "learning_rate": 2.3990508598710153e-05, + "loss": 4.6676, + "step": 2210 + }, + { + "epoch": 0.5621484188781185, + "grad_norm": 23740.1640625, + "learning_rate": 2.3968334441594466e-05, + "loss": 4.6661, + "step": 2211 + }, + { + "epoch": 0.5624026696329255, + "grad_norm": 23712.07421875, + "learning_rate": 2.3946161097454405e-05, + "loss": 4.675, + "step": 2212 + }, + { + "epoch": 0.5626569203877324, + "grad_norm": 23351.328125, + "learning_rate": 2.3923988583763046e-05, + "loss": 4.6619, + "step": 2213 + }, + { + "epoch": 0.5629111711425393, + "grad_norm": 23578.640625, + "learning_rate": 2.3901816917992854e-05, + "loss": 4.661, + "step": 2214 + }, + { + "epoch": 0.5631654218973463, + "grad_norm": 23539.26171875, + "learning_rate": 2.387964611761558e-05, + "loss": 4.661, + "step": 2215 + }, + { + "epoch": 0.5634196726521532, + "grad_norm": 23750.275390625, + "learning_rate": 2.3857476200102316e-05, + "loss": 4.685, + "step": 2216 + }, + { + "epoch": 0.5636739234069601, + "grad_norm": 23637.703125, + "learning_rate": 2.3835307182923468e-05, + "loss": 4.6693, + "step": 2217 + }, + { + "epoch": 0.563928174161767, + "grad_norm": 23817.5859375, + "learning_rate": 2.3813139083548715e-05, + "loss": 4.656, + "step": 2218 + }, + { + "epoch": 0.564182424916574, + "grad_norm": 23694.33984375, + "learning_rate": 2.3790971919446994e-05, + "loss": 4.6586, + "step": 2219 + }, + { + "epoch": 0.5644366756713809, + "grad_norm": 23545.91015625, + "learning_rate": 2.3768805708086556e-05, + "loss": 4.6573, + "step": 2220 + }, + { + "epoch": 0.5646909264261878, + "grad_norm": 23494.041015625, + "learning_rate": 2.3746640466934868e-05, + "loss": 4.656, + "step": 2221 + }, + { + "epoch": 0.5649451771809948, + "grad_norm": 23604.33203125, + "learning_rate": 2.3724476213458622e-05, + "loss": 4.6702, + "step": 2222 + }, + { + "epoch": 0.5651994279358017, + "grad_norm": 23475.966796875, + "learning_rate": 2.3702312965123768e-05, + "loss": 4.6617, + "step": 2223 + }, + { + "epoch": 0.5654536786906086, + "grad_norm": 23563.611328125, + "learning_rate": 2.3680150739395433e-05, + "loss": 4.6688, + "step": 2224 + }, + { + "epoch": 0.5657079294454156, + "grad_norm": 23558.9921875, + "learning_rate": 2.3657989553737943e-05, + "loss": 4.6619, + "step": 2225 + }, + { + "epoch": 0.5659621802002225, + "grad_norm": 23713.333984375, + "learning_rate": 2.3635829425614816e-05, + "loss": 4.6547, + "step": 2226 + }, + { + "epoch": 0.5662164309550294, + "grad_norm": 23621.0625, + "learning_rate": 2.3613670372488737e-05, + "loss": 4.6585, + "step": 2227 + }, + { + "epoch": 0.5664706817098363, + "grad_norm": 23559.63671875, + "learning_rate": 2.3591512411821533e-05, + "loss": 4.6608, + "step": 2228 + }, + { + "epoch": 0.5667249324646433, + "grad_norm": 23587.990234375, + "learning_rate": 2.3569355561074162e-05, + "loss": 4.6704, + "step": 2229 + }, + { + "epoch": 0.5669791832194502, + "grad_norm": 23695.6875, + "learning_rate": 2.3547199837706733e-05, + "loss": 4.6537, + "step": 2230 + }, + { + "epoch": 0.5672334339742571, + "grad_norm": 23656.74609375, + "learning_rate": 2.3525045259178457e-05, + "loss": 4.6547, + "step": 2231 + }, + { + "epoch": 0.5674876847290641, + "grad_norm": 23655.390625, + "learning_rate": 2.3502891842947623e-05, + "loss": 4.6549, + "step": 2232 + }, + { + "epoch": 0.567741935483871, + "grad_norm": 23649.732421875, + "learning_rate": 2.348073960647164e-05, + "loss": 4.6609, + "step": 2233 + }, + { + "epoch": 0.5679961862386779, + "grad_norm": 23770.0625, + "learning_rate": 2.3458588567206956e-05, + "loss": 4.6475, + "step": 2234 + }, + { + "epoch": 0.5682504369934849, + "grad_norm": 24422.041015625, + "learning_rate": 2.343643874260908e-05, + "loss": 4.6458, + "step": 2235 + }, + { + "epoch": 0.5685046877482918, + "grad_norm": 23832.166015625, + "learning_rate": 2.3414290150132588e-05, + "loss": 4.6625, + "step": 2236 + }, + { + "epoch": 0.5687589385030987, + "grad_norm": 23755.197265625, + "learning_rate": 2.3392142807231066e-05, + "loss": 4.6631, + "step": 2237 + }, + { + "epoch": 0.5690131892579056, + "grad_norm": 23709.3046875, + "learning_rate": 2.33699967313571e-05, + "loss": 4.6583, + "step": 2238 + }, + { + "epoch": 0.5692674400127126, + "grad_norm": 23829.67578125, + "learning_rate": 2.3347851939962317e-05, + "loss": 4.6642, + "step": 2239 + }, + { + "epoch": 0.5695216907675195, + "grad_norm": 23508.193359375, + "learning_rate": 2.3325708450497297e-05, + "loss": 4.6493, + "step": 2240 + }, + { + "epoch": 0.5697759415223264, + "grad_norm": 23656.46484375, + "learning_rate": 2.3303566280411604e-05, + "loss": 4.6611, + "step": 2241 + }, + { + "epoch": 0.5700301922771334, + "grad_norm": 23611.6328125, + "learning_rate": 2.328142544715378e-05, + "loss": 4.659, + "step": 2242 + }, + { + "epoch": 0.5702844430319403, + "grad_norm": 23784.775390625, + "learning_rate": 2.325928596817129e-05, + "loss": 4.647, + "step": 2243 + }, + { + "epoch": 0.5705386937867472, + "grad_norm": 23664.802734375, + "learning_rate": 2.323714786091053e-05, + "loss": 4.6583, + "step": 2244 + }, + { + "epoch": 0.5707929445415542, + "grad_norm": 23850.833984375, + "learning_rate": 2.3215011142816843e-05, + "loss": 4.6484, + "step": 2245 + }, + { + "epoch": 0.571047195296361, + "grad_norm": 23581.072265625, + "learning_rate": 2.3192875831334453e-05, + "loss": 4.6491, + "step": 2246 + }, + { + "epoch": 0.571301446051168, + "grad_norm": 23552.564453125, + "learning_rate": 2.3170741943906476e-05, + "loss": 4.6525, + "step": 2247 + }, + { + "epoch": 0.5715556968059748, + "grad_norm": 23436.001953125, + "learning_rate": 2.3148609497974927e-05, + "loss": 4.6426, + "step": 2248 + }, + { + "epoch": 0.5718099475607819, + "grad_norm": 23562.5, + "learning_rate": 2.312647851098066e-05, + "loss": 4.6552, + "step": 2249 + }, + { + "epoch": 0.5720641983155887, + "grad_norm": 23684.80859375, + "learning_rate": 2.3104349000363383e-05, + "loss": 4.656, + "step": 2250 + }, + { + "epoch": 0.5723184490703956, + "grad_norm": 23678.734375, + "learning_rate": 2.308222098356166e-05, + "loss": 4.6475, + "step": 2251 + }, + { + "epoch": 0.5725726998252026, + "grad_norm": 23693.77734375, + "learning_rate": 2.306009447801287e-05, + "loss": 4.6478, + "step": 2252 + }, + { + "epoch": 0.5728269505800095, + "grad_norm": 23781.884765625, + "learning_rate": 2.3037969501153173e-05, + "loss": 4.6622, + "step": 2253 + }, + { + "epoch": 0.5730812013348164, + "grad_norm": 23766.623046875, + "learning_rate": 2.3015846070417572e-05, + "loss": 4.65, + "step": 2254 + }, + { + "epoch": 0.5733354520896234, + "grad_norm": 23729.931640625, + "learning_rate": 2.2993724203239815e-05, + "loss": 4.6452, + "step": 2255 + }, + { + "epoch": 0.5735897028444303, + "grad_norm": 23766.7109375, + "learning_rate": 2.297160391705242e-05, + "loss": 4.6533, + "step": 2256 + }, + { + "epoch": 0.5738439535992372, + "grad_norm": 23791.107421875, + "learning_rate": 2.294948522928669e-05, + "loss": 4.649, + "step": 2257 + }, + { + "epoch": 0.5740982043540441, + "grad_norm": 23727.583984375, + "learning_rate": 2.292736815737264e-05, + "loss": 4.6383, + "step": 2258 + }, + { + "epoch": 0.5743524551088511, + "grad_norm": 23779.15234375, + "learning_rate": 2.2905252718739017e-05, + "loss": 4.6527, + "step": 2259 + }, + { + "epoch": 0.574606705863658, + "grad_norm": 23620.60546875, + "learning_rate": 2.2883138930813276e-05, + "loss": 4.6423, + "step": 2260 + }, + { + "epoch": 0.5748609566184649, + "grad_norm": 23726.921875, + "learning_rate": 2.286102681102159e-05, + "loss": 4.6445, + "step": 2261 + }, + { + "epoch": 0.5751152073732719, + "grad_norm": 23875.283203125, + "learning_rate": 2.2838916376788806e-05, + "loss": 4.6501, + "step": 2262 + }, + { + "epoch": 0.5753694581280788, + "grad_norm": 23707.943359375, + "learning_rate": 2.281680764553843e-05, + "loss": 4.6516, + "step": 2263 + }, + { + "epoch": 0.5756237088828857, + "grad_norm": 23859.169921875, + "learning_rate": 2.279470063469266e-05, + "loss": 4.6586, + "step": 2264 + }, + { + "epoch": 0.5758779596376927, + "grad_norm": 23808.572265625, + "learning_rate": 2.27725953616723e-05, + "loss": 4.6685, + "step": 2265 + }, + { + "epoch": 0.5761322103924996, + "grad_norm": 23589.271484375, + "learning_rate": 2.2750491843896795e-05, + "loss": 4.6517, + "step": 2266 + }, + { + "epoch": 0.5763864611473065, + "grad_norm": 23687.83203125, + "learning_rate": 2.272839009878423e-05, + "loss": 4.6475, + "step": 2267 + }, + { + "epoch": 0.5766407119021134, + "grad_norm": 23941.404296875, + "learning_rate": 2.2706290143751275e-05, + "loss": 4.656, + "step": 2268 + }, + { + "epoch": 0.5768949626569204, + "grad_norm": 23688.3046875, + "learning_rate": 2.2684191996213173e-05, + "loss": 4.6569, + "step": 2269 + }, + { + "epoch": 0.5771492134117273, + "grad_norm": 23800.994140625, + "learning_rate": 2.266209567358379e-05, + "loss": 4.6521, + "step": 2270 + }, + { + "epoch": 0.5774034641665342, + "grad_norm": 23917.234375, + "learning_rate": 2.26400011932755e-05, + "loss": 4.6535, + "step": 2271 + }, + { + "epoch": 0.5776577149213412, + "grad_norm": 23950.013671875, + "learning_rate": 2.2617908572699255e-05, + "loss": 4.6477, + "step": 2272 + }, + { + "epoch": 0.5779119656761481, + "grad_norm": 23824.01171875, + "learning_rate": 2.259581782926454e-05, + "loss": 4.6452, + "step": 2273 + }, + { + "epoch": 0.578166216430955, + "grad_norm": 23571.326171875, + "learning_rate": 2.257372898037936e-05, + "loss": 4.6371, + "step": 2274 + }, + { + "epoch": 0.578420467185762, + "grad_norm": 23664.302734375, + "learning_rate": 2.2551642043450206e-05, + "loss": 4.6404, + "step": 2275 + }, + { + "epoch": 0.5786747179405689, + "grad_norm": 23698.400390625, + "learning_rate": 2.2529557035882106e-05, + "loss": 4.6437, + "step": 2276 + }, + { + "epoch": 0.5789289686953758, + "grad_norm": 24018.16796875, + "learning_rate": 2.2507473975078525e-05, + "loss": 4.6531, + "step": 2277 + }, + { + "epoch": 0.5791832194501827, + "grad_norm": 23694.31640625, + "learning_rate": 2.2485392878441408e-05, + "loss": 4.6532, + "step": 2278 + }, + { + "epoch": 0.5794374702049897, + "grad_norm": 31096.650390625, + "learning_rate": 2.2463313763371172e-05, + "loss": 4.6481, + "step": 2279 + }, + { + "epoch": 0.5796917209597966, + "grad_norm": 24021.9453125, + "learning_rate": 2.2441236647266643e-05, + "loss": 4.6544, + "step": 2280 + }, + { + "epoch": 0.5799459717146035, + "grad_norm": 23882.1328125, + "learning_rate": 2.241916154752508e-05, + "loss": 4.6595, + "step": 2281 + }, + { + "epoch": 0.5802002224694105, + "grad_norm": 24133.59765625, + "learning_rate": 2.239708848154217e-05, + "loss": 4.6477, + "step": 2282 + }, + { + "epoch": 0.5804544732242174, + "grad_norm": 23939.361328125, + "learning_rate": 2.2375017466711974e-05, + "loss": 4.6486, + "step": 2283 + }, + { + "epoch": 0.5807087239790243, + "grad_norm": 24304.78125, + "learning_rate": 2.2352948520426952e-05, + "loss": 4.6377, + "step": 2284 + }, + { + "epoch": 0.5809629747338313, + "grad_norm": 24256.896484375, + "learning_rate": 2.233088166007793e-05, + "loss": 4.634, + "step": 2285 + }, + { + "epoch": 0.5812172254886382, + "grad_norm": 23963.439453125, + "learning_rate": 2.2308816903054093e-05, + "loss": 4.6451, + "step": 2286 + }, + { + "epoch": 0.5814714762434451, + "grad_norm": 23782.306640625, + "learning_rate": 2.228675426674295e-05, + "loss": 4.6485, + "step": 2287 + }, + { + "epoch": 0.581725726998252, + "grad_norm": 24005.71484375, + "learning_rate": 2.2264693768530365e-05, + "loss": 4.6496, + "step": 2288 + }, + { + "epoch": 0.581979977753059, + "grad_norm": 23960.724609375, + "learning_rate": 2.224263542580051e-05, + "loss": 4.6512, + "step": 2289 + }, + { + "epoch": 0.5822342285078659, + "grad_norm": 23995.58203125, + "learning_rate": 2.222057925593585e-05, + "loss": 4.6439, + "step": 2290 + }, + { + "epoch": 0.5824884792626728, + "grad_norm": 23967.404296875, + "learning_rate": 2.2198525276317132e-05, + "loss": 4.6247, + "step": 2291 + }, + { + "epoch": 0.5827427300174798, + "grad_norm": 23795.951171875, + "learning_rate": 2.2176473504323403e-05, + "loss": 4.6459, + "step": 2292 + }, + { + "epoch": 0.5829969807722867, + "grad_norm": 23767.240234375, + "learning_rate": 2.2154423957331943e-05, + "loss": 4.6333, + "step": 2293 + }, + { + "epoch": 0.5832512315270936, + "grad_norm": 23764.513671875, + "learning_rate": 2.2132376652718293e-05, + "loss": 4.6286, + "step": 2294 + }, + { + "epoch": 0.5835054822819006, + "grad_norm": 24018.6171875, + "learning_rate": 2.2110331607856232e-05, + "loss": 4.6403, + "step": 2295 + }, + { + "epoch": 0.5837597330367075, + "grad_norm": 23816.189453125, + "learning_rate": 2.2088288840117747e-05, + "loss": 4.6459, + "step": 2296 + }, + { + "epoch": 0.5840139837915144, + "grad_norm": 23951.2734375, + "learning_rate": 2.2066248366873016e-05, + "loss": 4.6483, + "step": 2297 + }, + { + "epoch": 0.5842682345463213, + "grad_norm": 23919.416015625, + "learning_rate": 2.204421020549045e-05, + "loss": 4.6433, + "step": 2298 + }, + { + "epoch": 0.5845224853011283, + "grad_norm": 23633.701171875, + "learning_rate": 2.202217437333661e-05, + "loss": 4.629, + "step": 2299 + }, + { + "epoch": 0.5847767360559352, + "grad_norm": 23658.61328125, + "learning_rate": 2.2000140887776217e-05, + "loss": 4.6164, + "step": 2300 + }, + { + "epoch": 0.5847767360559352, + "eval_loss": 9.34914493560791, + "eval_runtime": 696.6936, + "eval_samples_per_second": 152.127, + "eval_steps_per_second": 9.509, + "step": 2300 + }, + { + "epoch": 0.5850309868107421, + "grad_norm": 23826.490234375, + "learning_rate": 2.1978109766172163e-05, + "loss": 4.6445, + "step": 2301 + }, + { + "epoch": 0.5852852375655491, + "grad_norm": 24745.111328125, + "learning_rate": 2.195608102588546e-05, + "loss": 4.656, + "step": 2302 + }, + { + "epoch": 0.585539488320356, + "grad_norm": 24077.908203125, + "learning_rate": 2.193405468427525e-05, + "loss": 4.625, + "step": 2303 + }, + { + "epoch": 0.5857937390751629, + "grad_norm": 24552.796875, + "learning_rate": 2.1912030758698787e-05, + "loss": 4.6509, + "step": 2304 + }, + { + "epoch": 0.5860479898299699, + "grad_norm": 23921.84765625, + "learning_rate": 2.1890009266511426e-05, + "loss": 4.6403, + "step": 2305 + }, + { + "epoch": 0.5863022405847768, + "grad_norm": 24942.193359375, + "learning_rate": 2.1867990225066575e-05, + "loss": 4.6431, + "step": 2306 + }, + { + "epoch": 0.5865564913395837, + "grad_norm": 23944.13671875, + "learning_rate": 2.184597365171576e-05, + "loss": 4.6406, + "step": 2307 + }, + { + "epoch": 0.5868107420943905, + "grad_norm": 24154.078125, + "learning_rate": 2.1823959563808516e-05, + "loss": 4.6455, + "step": 2308 + }, + { + "epoch": 0.5870649928491976, + "grad_norm": 23738.748046875, + "learning_rate": 2.1801947978692442e-05, + "loss": 4.6272, + "step": 2309 + }, + { + "epoch": 0.5873192436040044, + "grad_norm": 24209.685546875, + "learning_rate": 2.1779938913713175e-05, + "loss": 4.6309, + "step": 2310 + }, + { + "epoch": 0.5875734943588113, + "grad_norm": 24121.2421875, + "learning_rate": 2.175793238621434e-05, + "loss": 4.6309, + "step": 2311 + }, + { + "epoch": 0.5878277451136183, + "grad_norm": 23901.662109375, + "learning_rate": 2.173592841353757e-05, + "loss": 4.6364, + "step": 2312 + }, + { + "epoch": 0.5880819958684252, + "grad_norm": 24193.826171875, + "learning_rate": 2.1713927013022504e-05, + "loss": 4.6363, + "step": 2313 + }, + { + "epoch": 0.5883362466232321, + "grad_norm": 23835.875, + "learning_rate": 2.1691928202006727e-05, + "loss": 4.6468, + "step": 2314 + }, + { + "epoch": 0.5885904973780391, + "grad_norm": 24479.708984375, + "learning_rate": 2.1669931997825797e-05, + "loss": 4.6484, + "step": 2315 + }, + { + "epoch": 0.588844748132846, + "grad_norm": 23919.068359375, + "learning_rate": 2.164793841781323e-05, + "loss": 4.6271, + "step": 2316 + }, + { + "epoch": 0.5890989988876529, + "grad_norm": 24080.228515625, + "learning_rate": 2.162594747930045e-05, + "loss": 4.6392, + "step": 2317 + }, + { + "epoch": 0.5893532496424598, + "grad_norm": 23908.3125, + "learning_rate": 2.1603959199616802e-05, + "loss": 4.6322, + "step": 2318 + }, + { + "epoch": 0.5896075003972668, + "grad_norm": 23747.232421875, + "learning_rate": 2.1581973596089557e-05, + "loss": 4.6348, + "step": 2319 + }, + { + "epoch": 0.5898617511520737, + "grad_norm": 24052.70703125, + "learning_rate": 2.1559990686043873e-05, + "loss": 4.6429, + "step": 2320 + }, + { + "epoch": 0.5901160019068806, + "grad_norm": 23932.85546875, + "learning_rate": 2.1538010486802747e-05, + "loss": 4.6391, + "step": 2321 + }, + { + "epoch": 0.5903702526616876, + "grad_norm": 23887.466796875, + "learning_rate": 2.15160330156871e-05, + "loss": 4.6404, + "step": 2322 + }, + { + "epoch": 0.5906245034164945, + "grad_norm": 23930.689453125, + "learning_rate": 2.149405829001566e-05, + "loss": 4.6201, + "step": 2323 + }, + { + "epoch": 0.5908787541713014, + "grad_norm": 23932.43359375, + "learning_rate": 2.1472086327105e-05, + "loss": 4.6257, + "step": 2324 + }, + { + "epoch": 0.5911330049261084, + "grad_norm": 24049.189453125, + "learning_rate": 2.1450117144269518e-05, + "loss": 4.6346, + "step": 2325 + }, + { + "epoch": 0.5913872556809153, + "grad_norm": 23789.107421875, + "learning_rate": 2.142815075882144e-05, + "loss": 4.6313, + "step": 2326 + }, + { + "epoch": 0.5916415064357222, + "grad_norm": 24165.654296875, + "learning_rate": 2.140618718807076e-05, + "loss": 4.6325, + "step": 2327 + }, + { + "epoch": 0.5918957571905291, + "grad_norm": 23930.60546875, + "learning_rate": 2.1384226449325258e-05, + "loss": 4.623, + "step": 2328 + }, + { + "epoch": 0.5921500079453361, + "grad_norm": 30069.75, + "learning_rate": 2.13622685598905e-05, + "loss": 4.6395, + "step": 2329 + }, + { + "epoch": 0.592404258700143, + "grad_norm": 24049.90234375, + "learning_rate": 2.1340313537069794e-05, + "loss": 4.6289, + "step": 2330 + }, + { + "epoch": 0.5926585094549499, + "grad_norm": 24269.625, + "learning_rate": 2.1318361398164184e-05, + "loss": 4.6249, + "step": 2331 + }, + { + "epoch": 0.5929127602097569, + "grad_norm": 23968.32421875, + "learning_rate": 2.1296412160472463e-05, + "loss": 4.6224, + "step": 2332 + }, + { + "epoch": 0.5931670109645638, + "grad_norm": 24813.8671875, + "learning_rate": 2.1274465841291113e-05, + "loss": 4.6206, + "step": 2333 + }, + { + "epoch": 0.5934212617193707, + "grad_norm": 23804.875, + "learning_rate": 2.1252522457914316e-05, + "loss": 4.6267, + "step": 2334 + }, + { + "epoch": 0.5936755124741776, + "grad_norm": 24078.123046875, + "learning_rate": 2.1230582027633966e-05, + "loss": 4.6244, + "step": 2335 + }, + { + "epoch": 0.5939297632289846, + "grad_norm": 24245.869140625, + "learning_rate": 2.1208644567739617e-05, + "loss": 4.626, + "step": 2336 + }, + { + "epoch": 0.5941840139837915, + "grad_norm": 24020.359375, + "learning_rate": 2.1186710095518464e-05, + "loss": 4.6199, + "step": 2337 + }, + { + "epoch": 0.5944382647385984, + "grad_norm": 24579.013671875, + "learning_rate": 2.1164778628255387e-05, + "loss": 4.6299, + "step": 2338 + }, + { + "epoch": 0.5946925154934054, + "grad_norm": 23744.48828125, + "learning_rate": 2.1142850183232848e-05, + "loss": 4.6126, + "step": 2339 + }, + { + "epoch": 0.5949467662482123, + "grad_norm": 24056.154296875, + "learning_rate": 2.112092477773097e-05, + "loss": 4.6177, + "step": 2340 + }, + { + "epoch": 0.5952010170030192, + "grad_norm": 24197.67578125, + "learning_rate": 2.109900242902746e-05, + "loss": 4.6194, + "step": 2341 + }, + { + "epoch": 0.5954552677578262, + "grad_norm": 23995.0859375, + "learning_rate": 2.1077083154397632e-05, + "loss": 4.6236, + "step": 2342 + }, + { + "epoch": 0.5957095185126331, + "grad_norm": 24290.8359375, + "learning_rate": 2.1055166971114345e-05, + "loss": 4.6327, + "step": 2343 + }, + { + "epoch": 0.59596376926744, + "grad_norm": 23867.078125, + "learning_rate": 2.1033253896448062e-05, + "loss": 4.6153, + "step": 2344 + }, + { + "epoch": 0.5962180200222469, + "grad_norm": 24057.21484375, + "learning_rate": 2.101134394766677e-05, + "loss": 4.6227, + "step": 2345 + }, + { + "epoch": 0.5964722707770539, + "grad_norm": 24038.43359375, + "learning_rate": 2.0989437142035998e-05, + "loss": 4.6253, + "step": 2346 + }, + { + "epoch": 0.5967265215318608, + "grad_norm": 24079.84375, + "learning_rate": 2.0967533496818812e-05, + "loss": 4.6344, + "step": 2347 + }, + { + "epoch": 0.5969807722866677, + "grad_norm": 24216.71875, + "learning_rate": 2.0945633029275768e-05, + "loss": 4.6171, + "step": 2348 + }, + { + "epoch": 0.5972350230414747, + "grad_norm": 23881.865234375, + "learning_rate": 2.092373575666492e-05, + "loss": 4.6144, + "step": 2349 + }, + { + "epoch": 0.5974892737962816, + "grad_norm": 24141.783203125, + "learning_rate": 2.0901841696241824e-05, + "loss": 4.6191, + "step": 2350 + }, + { + "epoch": 0.5977435245510885, + "grad_norm": 23909.884765625, + "learning_rate": 2.0879950865259483e-05, + "loss": 4.6283, + "step": 2351 + }, + { + "epoch": 0.5979977753058955, + "grad_norm": 24007.158203125, + "learning_rate": 2.085806328096836e-05, + "loss": 4.6127, + "step": 2352 + }, + { + "epoch": 0.5982520260607024, + "grad_norm": 23884.1640625, + "learning_rate": 2.0836178960616374e-05, + "loss": 4.6076, + "step": 2353 + }, + { + "epoch": 0.5985062768155093, + "grad_norm": 23840.015625, + "learning_rate": 2.081429792144886e-05, + "loss": 4.6197, + "step": 2354 + }, + { + "epoch": 0.5987605275703162, + "grad_norm": 23980.75390625, + "learning_rate": 2.0792420180708556e-05, + "loss": 4.629, + "step": 2355 + }, + { + "epoch": 0.5990147783251232, + "grad_norm": 23877.373046875, + "learning_rate": 2.0770545755635612e-05, + "loss": 4.6283, + "step": 2356 + }, + { + "epoch": 0.5992690290799301, + "grad_norm": 24055.482421875, + "learning_rate": 2.0748674663467584e-05, + "loss": 4.6246, + "step": 2357 + }, + { + "epoch": 0.599523279834737, + "grad_norm": 23922.23046875, + "learning_rate": 2.0726806921439377e-05, + "loss": 4.6318, + "step": 2358 + }, + { + "epoch": 0.599777530589544, + "grad_norm": 24015.642578125, + "learning_rate": 2.0704942546783244e-05, + "loss": 4.6179, + "step": 2359 + }, + { + "epoch": 0.6000317813443509, + "grad_norm": 24127.0625, + "learning_rate": 2.0683081556728834e-05, + "loss": 4.6268, + "step": 2360 + }, + { + "epoch": 0.6002860320991578, + "grad_norm": 23787.287109375, + "learning_rate": 2.0661223968503073e-05, + "loss": 4.6286, + "step": 2361 + }, + { + "epoch": 0.6005402828539648, + "grad_norm": 24056.884765625, + "learning_rate": 2.0639369799330236e-05, + "loss": 4.6387, + "step": 2362 + }, + { + "epoch": 0.6007945336087717, + "grad_norm": 23954.59765625, + "learning_rate": 2.061751906643192e-05, + "loss": 4.6202, + "step": 2363 + }, + { + "epoch": 0.6010487843635786, + "grad_norm": 23982.9140625, + "learning_rate": 2.059567178702697e-05, + "loss": 4.6083, + "step": 2364 + }, + { + "epoch": 0.6013030351183855, + "grad_norm": 23947.17578125, + "learning_rate": 2.0573827978331528e-05, + "loss": 4.6392, + "step": 2365 + }, + { + "epoch": 0.6015572858731925, + "grad_norm": 24015.712890625, + "learning_rate": 2.0551987657559023e-05, + "loss": 4.6405, + "step": 2366 + }, + { + "epoch": 0.6018115366279994, + "grad_norm": 24191.13671875, + "learning_rate": 2.053015084192012e-05, + "loss": 4.6193, + "step": 2367 + }, + { + "epoch": 0.6020657873828063, + "grad_norm": 23959.859375, + "learning_rate": 2.0508317548622703e-05, + "loss": 4.6161, + "step": 2368 + }, + { + "epoch": 0.6023200381376133, + "grad_norm": 23834.861328125, + "learning_rate": 2.0486487794871915e-05, + "loss": 4.6132, + "step": 2369 + }, + { + "epoch": 0.6025742888924202, + "grad_norm": 23883.919921875, + "learning_rate": 2.046466159787008e-05, + "loss": 4.6239, + "step": 2370 + }, + { + "epoch": 0.602828539647227, + "grad_norm": 24051.052734375, + "learning_rate": 2.044283897481673e-05, + "loss": 4.6122, + "step": 2371 + }, + { + "epoch": 0.603082790402034, + "grad_norm": 23979.1796875, + "learning_rate": 2.0421019942908588e-05, + "loss": 4.616, + "step": 2372 + }, + { + "epoch": 0.603337041156841, + "grad_norm": 23985.74609375, + "learning_rate": 2.039920451933955e-05, + "loss": 4.61, + "step": 2373 + }, + { + "epoch": 0.6035912919116478, + "grad_norm": 24028.240234375, + "learning_rate": 2.0377392721300634e-05, + "loss": 4.6134, + "step": 2374 + }, + { + "epoch": 0.6038455426664547, + "grad_norm": 24225.2109375, + "learning_rate": 2.0355584565980055e-05, + "loss": 4.6203, + "step": 2375 + }, + { + "epoch": 0.6040997934212617, + "grad_norm": 24082.880859375, + "learning_rate": 2.033378007056311e-05, + "loss": 4.6098, + "step": 2376 + }, + { + "epoch": 0.6043540441760686, + "grad_norm": 24132.091796875, + "learning_rate": 2.0311979252232233e-05, + "loss": 4.6122, + "step": 2377 + }, + { + "epoch": 0.6046082949308755, + "grad_norm": 23865.46484375, + "learning_rate": 2.029018212816697e-05, + "loss": 4.61, + "step": 2378 + }, + { + "epoch": 0.6048625456856825, + "grad_norm": 24046.63671875, + "learning_rate": 2.026838871554394e-05, + "loss": 4.6025, + "step": 2379 + }, + { + "epoch": 0.6051167964404894, + "grad_norm": 24040.93359375, + "learning_rate": 2.0246599031536826e-05, + "loss": 4.6109, + "step": 2380 + }, + { + "epoch": 0.6053710471952963, + "grad_norm": 24000.0078125, + "learning_rate": 2.022481309331641e-05, + "loss": 4.6087, + "step": 2381 + }, + { + "epoch": 0.6056252979501033, + "grad_norm": 23814.85546875, + "learning_rate": 2.0203030918050485e-05, + "loss": 4.6103, + "step": 2382 + }, + { + "epoch": 0.6058795487049102, + "grad_norm": 24202.513671875, + "learning_rate": 2.0181252522903896e-05, + "loss": 4.6042, + "step": 2383 + }, + { + "epoch": 0.6061337994597171, + "grad_norm": 23995.8203125, + "learning_rate": 2.015947792503852e-05, + "loss": 4.6267, + "step": 2384 + }, + { + "epoch": 0.606388050214524, + "grad_norm": 24072.720703125, + "learning_rate": 2.013770714161322e-05, + "loss": 4.6166, + "step": 2385 + }, + { + "epoch": 0.606642300969331, + "grad_norm": 24216.36328125, + "learning_rate": 2.0115940189783862e-05, + "loss": 4.6323, + "step": 2386 + }, + { + "epoch": 0.6068965517241379, + "grad_norm": 23951.4140625, + "learning_rate": 2.0094177086703284e-05, + "loss": 4.6179, + "step": 2387 + }, + { + "epoch": 0.6071508024789448, + "grad_norm": 24178.369140625, + "learning_rate": 2.0072417849521318e-05, + "loss": 4.6158, + "step": 2388 + }, + { + "epoch": 0.6074050532337518, + "grad_norm": 24516.453125, + "learning_rate": 2.0050662495384726e-05, + "loss": 4.611, + "step": 2389 + }, + { + "epoch": 0.6076593039885587, + "grad_norm": 23935.572265625, + "learning_rate": 2.0028911041437198e-05, + "loss": 4.6106, + "step": 2390 + }, + { + "epoch": 0.6079135547433656, + "grad_norm": 24013.7578125, + "learning_rate": 2.0007163504819394e-05, + "loss": 4.603, + "step": 2391 + }, + { + "epoch": 0.6081678054981726, + "grad_norm": 23881.341796875, + "learning_rate": 1.9985419902668845e-05, + "loss": 4.6108, + "step": 2392 + }, + { + "epoch": 0.6084220562529795, + "grad_norm": 24132.826171875, + "learning_rate": 1.9963680252119992e-05, + "loss": 4.6052, + "step": 2393 + }, + { + "epoch": 0.6086763070077864, + "grad_norm": 23897.7109375, + "learning_rate": 1.9941944570304193e-05, + "loss": 4.6072, + "step": 2394 + }, + { + "epoch": 0.6089305577625933, + "grad_norm": 24085.8671875, + "learning_rate": 1.9920212874349637e-05, + "loss": 4.6108, + "step": 2395 + }, + { + "epoch": 0.6091848085174003, + "grad_norm": 23989.02734375, + "learning_rate": 1.9898485181381377e-05, + "loss": 4.6097, + "step": 2396 + }, + { + "epoch": 0.6094390592722072, + "grad_norm": 23890.033203125, + "learning_rate": 1.9876761508521342e-05, + "loss": 4.608, + "step": 2397 + }, + { + "epoch": 0.6096933100270141, + "grad_norm": 24097.001953125, + "learning_rate": 1.9855041872888273e-05, + "loss": 4.6056, + "step": 2398 + }, + { + "epoch": 0.6099475607818211, + "grad_norm": 23937.810546875, + "learning_rate": 1.9833326291597712e-05, + "loss": 4.6072, + "step": 2399 + }, + { + "epoch": 0.610201811536628, + "grad_norm": 24209.03515625, + "learning_rate": 1.981161478176205e-05, + "loss": 4.5977, + "step": 2400 + }, + { + "epoch": 0.610201811536628, + "eval_loss": 9.29190731048584, + "eval_runtime": 696.0967, + "eval_samples_per_second": 152.258, + "eval_steps_per_second": 9.517, + "step": 2400 + }, + { + "epoch": 0.6104560622914349, + "grad_norm": 24077.4140625, + "learning_rate": 1.978990736049043e-05, + "loss": 4.6121, + "step": 2401 + }, + { + "epoch": 0.6107103130462419, + "grad_norm": 24136.462890625, + "learning_rate": 1.9768204044888778e-05, + "loss": 4.6078, + "step": 2402 + }, + { + "epoch": 0.6109645638010488, + "grad_norm": 23773.525390625, + "learning_rate": 1.974650485205981e-05, + "loss": 4.5966, + "step": 2403 + }, + { + "epoch": 0.6112188145558557, + "grad_norm": 23942.267578125, + "learning_rate": 1.9724809799102974e-05, + "loss": 4.6134, + "step": 2404 + }, + { + "epoch": 0.6114730653106626, + "grad_norm": 24046.46484375, + "learning_rate": 1.970311890311445e-05, + "loss": 4.6057, + "step": 2405 + }, + { + "epoch": 0.6117273160654696, + "grad_norm": 24032.455078125, + "learning_rate": 1.968143218118717e-05, + "loss": 4.6163, + "step": 2406 + }, + { + "epoch": 0.6119815668202765, + "grad_norm": 24019.90234375, + "learning_rate": 1.9659749650410737e-05, + "loss": 4.6133, + "step": 2407 + }, + { + "epoch": 0.6122358175750834, + "grad_norm": 23899.46484375, + "learning_rate": 1.9638071327871483e-05, + "loss": 4.6102, + "step": 2408 + }, + { + "epoch": 0.6124900683298904, + "grad_norm": 23958.88671875, + "learning_rate": 1.9616397230652407e-05, + "loss": 4.6039, + "step": 2409 + }, + { + "epoch": 0.6127443190846973, + "grad_norm": 24100.830078125, + "learning_rate": 1.9594727375833195e-05, + "loss": 4.6165, + "step": 2410 + }, + { + "epoch": 0.6129985698395042, + "grad_norm": 24041.77734375, + "learning_rate": 1.957306178049016e-05, + "loss": 4.6006, + "step": 2411 + }, + { + "epoch": 0.6132528205943112, + "grad_norm": 24166.205078125, + "learning_rate": 1.9551400461696308e-05, + "loss": 4.5937, + "step": 2412 + }, + { + "epoch": 0.6135070713491181, + "grad_norm": 24076.693359375, + "learning_rate": 1.9529743436521212e-05, + "loss": 4.618, + "step": 2413 + }, + { + "epoch": 0.613761322103925, + "grad_norm": 24092.552734375, + "learning_rate": 1.9508090722031104e-05, + "loss": 4.5888, + "step": 2414 + }, + { + "epoch": 0.6140155728587319, + "grad_norm": 23962.7265625, + "learning_rate": 1.9486442335288826e-05, + "loss": 4.6182, + "step": 2415 + }, + { + "epoch": 0.6142698236135389, + "grad_norm": 24037.544921875, + "learning_rate": 1.946479829335378e-05, + "loss": 4.6133, + "step": 2416 + }, + { + "epoch": 0.6145240743683458, + "grad_norm": 23970.84765625, + "learning_rate": 1.9443158613281953e-05, + "loss": 4.6103, + "step": 2417 + }, + { + "epoch": 0.6147783251231527, + "grad_norm": 24136.70703125, + "learning_rate": 1.94215233121259e-05, + "loss": 4.6152, + "step": 2418 + }, + { + "epoch": 0.6150325758779597, + "grad_norm": 23995.876953125, + "learning_rate": 1.9399892406934728e-05, + "loss": 4.5922, + "step": 2419 + }, + { + "epoch": 0.6152868266327666, + "grad_norm": 24161.4453125, + "learning_rate": 1.9378265914754083e-05, + "loss": 4.6051, + "step": 2420 + }, + { + "epoch": 0.6155410773875735, + "grad_norm": 24075.326171875, + "learning_rate": 1.9356643852626104e-05, + "loss": 4.6037, + "step": 2421 + }, + { + "epoch": 0.6157953281423805, + "grad_norm": 24321.203125, + "learning_rate": 1.9335026237589487e-05, + "loss": 4.6047, + "step": 2422 + }, + { + "epoch": 0.6160495788971874, + "grad_norm": 24204.9921875, + "learning_rate": 1.931341308667938e-05, + "loss": 4.5921, + "step": 2423 + }, + { + "epoch": 0.6163038296519943, + "grad_norm": 24214.46875, + "learning_rate": 1.9291804416927436e-05, + "loss": 4.6085, + "step": 2424 + }, + { + "epoch": 0.6165580804068012, + "grad_norm": 24089.654296875, + "learning_rate": 1.9270200245361783e-05, + "loss": 4.5945, + "step": 2425 + }, + { + "epoch": 0.6168123311616082, + "grad_norm": 24021.005859375, + "learning_rate": 1.9248600589006986e-05, + "loss": 4.593, + "step": 2426 + }, + { + "epoch": 0.6170665819164151, + "grad_norm": 24244.95703125, + "learning_rate": 1.9227005464884043e-05, + "loss": 4.596, + "step": 2427 + }, + { + "epoch": 0.617320832671222, + "grad_norm": 24159.51171875, + "learning_rate": 1.9205414890010426e-05, + "loss": 4.6018, + "step": 2428 + }, + { + "epoch": 0.617575083426029, + "grad_norm": 24049.142578125, + "learning_rate": 1.9183828881399968e-05, + "loss": 4.6034, + "step": 2429 + }, + { + "epoch": 0.6178293341808359, + "grad_norm": 24109.869140625, + "learning_rate": 1.9162247456062938e-05, + "loss": 4.5999, + "step": 2430 + }, + { + "epoch": 0.6180835849356427, + "grad_norm": 23990.083984375, + "learning_rate": 1.914067063100599e-05, + "loss": 4.6131, + "step": 2431 + }, + { + "epoch": 0.6183378356904498, + "grad_norm": 24034.0859375, + "learning_rate": 1.911909842323214e-05, + "loss": 4.6053, + "step": 2432 + }, + { + "epoch": 0.6185920864452567, + "grad_norm": 24245.017578125, + "learning_rate": 1.9097530849740763e-05, + "loss": 4.5911, + "step": 2433 + }, + { + "epoch": 0.6188463372000635, + "grad_norm": 24006.162109375, + "learning_rate": 1.907596792752761e-05, + "loss": 4.6051, + "step": 2434 + }, + { + "epoch": 0.6191005879548704, + "grad_norm": 24283.794921875, + "learning_rate": 1.905440967358474e-05, + "loss": 4.589, + "step": 2435 + }, + { + "epoch": 0.6193548387096774, + "grad_norm": 24029.14453125, + "learning_rate": 1.903285610490053e-05, + "loss": 4.6032, + "step": 2436 + }, + { + "epoch": 0.6196090894644843, + "grad_norm": 24080.90234375, + "learning_rate": 1.9011307238459698e-05, + "loss": 4.5921, + "step": 2437 + }, + { + "epoch": 0.6198633402192912, + "grad_norm": 23884.517578125, + "learning_rate": 1.8989763091243214e-05, + "loss": 4.5807, + "step": 2438 + }, + { + "epoch": 0.6201175909740982, + "grad_norm": 24080.244140625, + "learning_rate": 1.8968223680228357e-05, + "loss": 4.5966, + "step": 2439 + }, + { + "epoch": 0.6203718417289051, + "grad_norm": 24096.248046875, + "learning_rate": 1.8946689022388672e-05, + "loss": 4.6143, + "step": 2440 + }, + { + "epoch": 0.620626092483712, + "grad_norm": 24167.19921875, + "learning_rate": 1.892515913469395e-05, + "loss": 4.5989, + "step": 2441 + }, + { + "epoch": 0.620880343238519, + "grad_norm": 24043.50390625, + "learning_rate": 1.890363403411022e-05, + "loss": 4.6023, + "step": 2442 + }, + { + "epoch": 0.6211345939933259, + "grad_norm": 24128.74609375, + "learning_rate": 1.8882113737599752e-05, + "loss": 4.609, + "step": 2443 + }, + { + "epoch": 0.6213888447481328, + "grad_norm": 24154.384765625, + "learning_rate": 1.8860598262121015e-05, + "loss": 4.5962, + "step": 2444 + }, + { + "epoch": 0.6216430955029397, + "grad_norm": 23937.044921875, + "learning_rate": 1.883908762462869e-05, + "loss": 4.5945, + "step": 2445 + }, + { + "epoch": 0.6218973462577467, + "grad_norm": 24179.6875, + "learning_rate": 1.8817581842073653e-05, + "loss": 4.5945, + "step": 2446 + }, + { + "epoch": 0.6221515970125536, + "grad_norm": 24050.8125, + "learning_rate": 1.8796080931402934e-05, + "loss": 4.604, + "step": 2447 + }, + { + "epoch": 0.6224058477673605, + "grad_norm": 23989.33984375, + "learning_rate": 1.8774584909559728e-05, + "loss": 4.5998, + "step": 2448 + }, + { + "epoch": 0.6226600985221675, + "grad_norm": 24366.857421875, + "learning_rate": 1.8753093793483388e-05, + "loss": 4.6063, + "step": 2449 + }, + { + "epoch": 0.6229143492769744, + "grad_norm": 24138.767578125, + "learning_rate": 1.87316076001094e-05, + "loss": 4.598, + "step": 2450 + }, + { + "epoch": 0.6231686000317813, + "grad_norm": 24208.26171875, + "learning_rate": 1.8710126346369367e-05, + "loss": 4.5969, + "step": 2451 + }, + { + "epoch": 0.6234228507865883, + "grad_norm": 24129.83203125, + "learning_rate": 1.868865004919098e-05, + "loss": 4.6048, + "step": 2452 + }, + { + "epoch": 0.6236771015413952, + "grad_norm": 24142.56640625, + "learning_rate": 1.8667178725498074e-05, + "loss": 4.6057, + "step": 2453 + }, + { + "epoch": 0.6239313522962021, + "grad_norm": 24080.140625, + "learning_rate": 1.864571239221051e-05, + "loss": 4.5938, + "step": 2454 + }, + { + "epoch": 0.624185603051009, + "grad_norm": 24112.578125, + "learning_rate": 1.862425106624425e-05, + "loss": 4.6078, + "step": 2455 + }, + { + "epoch": 0.624439853805816, + "grad_norm": 24273.203125, + "learning_rate": 1.8602794764511312e-05, + "loss": 4.6271, + "step": 2456 + }, + { + "epoch": 0.6246941045606229, + "grad_norm": 24075.630859375, + "learning_rate": 1.8581343503919726e-05, + "loss": 4.5954, + "step": 2457 + }, + { + "epoch": 0.6249483553154298, + "grad_norm": 24070.806640625, + "learning_rate": 1.8559897301373567e-05, + "loss": 4.6036, + "step": 2458 + }, + { + "epoch": 0.6252026060702368, + "grad_norm": 24103.83203125, + "learning_rate": 1.8538456173772938e-05, + "loss": 4.5907, + "step": 2459 + }, + { + "epoch": 0.6254568568250437, + "grad_norm": 24324.216796875, + "learning_rate": 1.8517020138013912e-05, + "loss": 4.5965, + "step": 2460 + }, + { + "epoch": 0.6257111075798506, + "grad_norm": 24115.67578125, + "learning_rate": 1.8495589210988575e-05, + "loss": 4.594, + "step": 2461 + }, + { + "epoch": 0.6259653583346576, + "grad_norm": 24061.4140625, + "learning_rate": 1.847416340958499e-05, + "loss": 4.5945, + "step": 2462 + }, + { + "epoch": 0.6262196090894645, + "grad_norm": 24268.869140625, + "learning_rate": 1.8452742750687156e-05, + "loss": 4.589, + "step": 2463 + }, + { + "epoch": 0.6264738598442714, + "grad_norm": 24175.830078125, + "learning_rate": 1.8431327251175028e-05, + "loss": 4.5912, + "step": 2464 + }, + { + "epoch": 0.6267281105990783, + "grad_norm": 23971.470703125, + "learning_rate": 1.840991692792451e-05, + "loss": 4.5901, + "step": 2465 + }, + { + "epoch": 0.6269823613538853, + "grad_norm": 24081.029296875, + "learning_rate": 1.8388511797807423e-05, + "loss": 4.5883, + "step": 2466 + }, + { + "epoch": 0.6272366121086922, + "grad_norm": 24787.693359375, + "learning_rate": 1.8367111877691473e-05, + "loss": 4.5839, + "step": 2467 + }, + { + "epoch": 0.6274908628634991, + "grad_norm": 24246.01171875, + "learning_rate": 1.83457171844403e-05, + "loss": 4.5712, + "step": 2468 + }, + { + "epoch": 0.6277451136183061, + "grad_norm": 24131.01953125, + "learning_rate": 1.8324327734913385e-05, + "loss": 4.5999, + "step": 2469 + }, + { + "epoch": 0.627999364373113, + "grad_norm": 24299.576171875, + "learning_rate": 1.830294354596609e-05, + "loss": 4.6027, + "step": 2470 + }, + { + "epoch": 0.6282536151279199, + "grad_norm": 24074.599609375, + "learning_rate": 1.8281564634449652e-05, + "loss": 4.5765, + "step": 2471 + }, + { + "epoch": 0.6285078658827269, + "grad_norm": 24136.794921875, + "learning_rate": 1.826019101721113e-05, + "loss": 4.5973, + "step": 2472 + }, + { + "epoch": 0.6287621166375338, + "grad_norm": 24252.568359375, + "learning_rate": 1.8238822711093405e-05, + "loss": 4.5895, + "step": 2473 + }, + { + "epoch": 0.6290163673923407, + "grad_norm": 24157.841796875, + "learning_rate": 1.8217459732935194e-05, + "loss": 4.5756, + "step": 2474 + }, + { + "epoch": 0.6292706181471476, + "grad_norm": 24179.923828125, + "learning_rate": 1.8196102099570995e-05, + "loss": 4.5823, + "step": 2475 + }, + { + "epoch": 0.6295248689019546, + "grad_norm": 24248.7578125, + "learning_rate": 1.81747498278311e-05, + "loss": 4.5988, + "step": 2476 + }, + { + "epoch": 0.6297791196567615, + "grad_norm": 24326.951171875, + "learning_rate": 1.815340293454159e-05, + "loss": 4.5746, + "step": 2477 + }, + { + "epoch": 0.6300333704115684, + "grad_norm": 24214.560546875, + "learning_rate": 1.8132061436524296e-05, + "loss": 4.5948, + "step": 2478 + }, + { + "epoch": 0.6302876211663754, + "grad_norm": 24183.5625, + "learning_rate": 1.8110725350596787e-05, + "loss": 4.597, + "step": 2479 + }, + { + "epoch": 0.6305418719211823, + "grad_norm": 24230.060546875, + "learning_rate": 1.808939469357237e-05, + "loss": 4.5812, + "step": 2480 + }, + { + "epoch": 0.6307961226759892, + "grad_norm": 24158.80078125, + "learning_rate": 1.8068069482260102e-05, + "loss": 4.5957, + "step": 2481 + }, + { + "epoch": 0.6310503734307962, + "grad_norm": 24142.5546875, + "learning_rate": 1.8046749733464723e-05, + "loss": 4.5926, + "step": 2482 + }, + { + "epoch": 0.6313046241856031, + "grad_norm": 24118.13671875, + "learning_rate": 1.8025435463986662e-05, + "loss": 4.5935, + "step": 2483 + }, + { + "epoch": 0.63155887494041, + "grad_norm": 24302.646484375, + "learning_rate": 1.8004126690622063e-05, + "loss": 4.5845, + "step": 2484 + }, + { + "epoch": 0.6318131256952169, + "grad_norm": 24153.447265625, + "learning_rate": 1.7982823430162697e-05, + "loss": 4.5869, + "step": 2485 + }, + { + "epoch": 0.6320673764500239, + "grad_norm": 24060.43359375, + "learning_rate": 1.7961525699396025e-05, + "loss": 4.5937, + "step": 2486 + }, + { + "epoch": 0.6323216272048308, + "grad_norm": 24141.41015625, + "learning_rate": 1.7940233515105128e-05, + "loss": 4.5852, + "step": 2487 + }, + { + "epoch": 0.6325758779596377, + "grad_norm": 24227.083984375, + "learning_rate": 1.7918946894068736e-05, + "loss": 4.5877, + "step": 2488 + }, + { + "epoch": 0.6328301287144447, + "grad_norm": 24059.630859375, + "learning_rate": 1.789766585306117e-05, + "loss": 4.5876, + "step": 2489 + }, + { + "epoch": 0.6330843794692516, + "grad_norm": 24174.29296875, + "learning_rate": 1.7876390408852385e-05, + "loss": 4.599, + "step": 2490 + }, + { + "epoch": 0.6333386302240585, + "grad_norm": 24081.818359375, + "learning_rate": 1.785512057820789e-05, + "loss": 4.5783, + "step": 2491 + }, + { + "epoch": 0.6335928809788655, + "grad_norm": 24321.181640625, + "learning_rate": 1.7833856377888796e-05, + "loss": 4.592, + "step": 2492 + }, + { + "epoch": 0.6338471317336724, + "grad_norm": 24287.48828125, + "learning_rate": 1.7812597824651783e-05, + "loss": 4.6032, + "step": 2493 + }, + { + "epoch": 0.6341013824884792, + "grad_norm": 24185.6796875, + "learning_rate": 1.779134493524906e-05, + "loss": 4.5765, + "step": 2494 + }, + { + "epoch": 0.6343556332432861, + "grad_norm": 24248.607421875, + "learning_rate": 1.777009772642837e-05, + "loss": 4.5691, + "step": 2495 + }, + { + "epoch": 0.6346098839980931, + "grad_norm": 24190.3125, + "learning_rate": 1.7748856214933006e-05, + "loss": 4.5883, + "step": 2496 + }, + { + "epoch": 0.6348641347529, + "grad_norm": 24106.21875, + "learning_rate": 1.772762041750175e-05, + "loss": 4.59, + "step": 2497 + }, + { + "epoch": 0.6351183855077069, + "grad_norm": 24152.291015625, + "learning_rate": 1.770639035086888e-05, + "loss": 4.5899, + "step": 2498 + }, + { + "epoch": 0.635372636262514, + "grad_norm": 23959.857421875, + "learning_rate": 1.7685166031764178e-05, + "loss": 4.5926, + "step": 2499 + }, + { + "epoch": 0.6356268870173208, + "grad_norm": 24178.6875, + "learning_rate": 1.7663947476912886e-05, + "loss": 4.5936, + "step": 2500 + }, + { + "epoch": 0.6356268870173208, + "eval_loss": 9.240402221679688, + "eval_runtime": 696.0918, + "eval_samples_per_second": 152.259, + "eval_steps_per_second": 9.517, + "step": 2500 + }, + { + "epoch": 0.6358811377721277, + "grad_norm": 24244.623046875, + "learning_rate": 1.7642734703035675e-05, + "loss": 4.5803, + "step": 2501 + }, + { + "epoch": 0.6361353885269347, + "grad_norm": 24223.193359375, + "learning_rate": 1.7621527726848717e-05, + "loss": 4.5943, + "step": 2502 + }, + { + "epoch": 0.6363896392817416, + "grad_norm": 24211.23828125, + "learning_rate": 1.7600326565063576e-05, + "loss": 4.5932, + "step": 2503 + }, + { + "epoch": 0.6366438900365485, + "grad_norm": 24327.18359375, + "learning_rate": 1.7579131234387238e-05, + "loss": 4.581, + "step": 2504 + }, + { + "epoch": 0.6368981407913554, + "grad_norm": 24221.48828125, + "learning_rate": 1.7557941751522107e-05, + "loss": 4.578, + "step": 2505 + }, + { + "epoch": 0.6371523915461624, + "grad_norm": 24290.9765625, + "learning_rate": 1.7536758133165963e-05, + "loss": 4.5766, + "step": 2506 + }, + { + "epoch": 0.6374066423009693, + "grad_norm": 24239.48046875, + "learning_rate": 1.7515580396011976e-05, + "loss": 4.588, + "step": 2507 + }, + { + "epoch": 0.6376608930557762, + "grad_norm": 24361.490234375, + "learning_rate": 1.7494408556748683e-05, + "loss": 4.589, + "step": 2508 + }, + { + "epoch": 0.6379151438105832, + "grad_norm": 24069.62890625, + "learning_rate": 1.7473242632059972e-05, + "loss": 4.5666, + "step": 2509 + }, + { + "epoch": 0.6381693945653901, + "grad_norm": 24544.166015625, + "learning_rate": 1.7452082638625057e-05, + "loss": 4.6005, + "step": 2510 + }, + { + "epoch": 0.638423645320197, + "grad_norm": 24267.517578125, + "learning_rate": 1.7430928593118483e-05, + "loss": 4.5908, + "step": 2511 + }, + { + "epoch": 0.638677896075004, + "grad_norm": 24144.861328125, + "learning_rate": 1.7409780512210126e-05, + "loss": 4.5816, + "step": 2512 + }, + { + "epoch": 0.6389321468298109, + "grad_norm": 24160.880859375, + "learning_rate": 1.738863841256515e-05, + "loss": 4.5771, + "step": 2513 + }, + { + "epoch": 0.6391863975846178, + "grad_norm": 24155.625, + "learning_rate": 1.7367502310843986e-05, + "loss": 4.5903, + "step": 2514 + }, + { + "epoch": 0.6394406483394247, + "grad_norm": 24405.29296875, + "learning_rate": 1.7346372223702378e-05, + "loss": 4.5811, + "step": 2515 + }, + { + "epoch": 0.6396948990942317, + "grad_norm": 24368.484375, + "learning_rate": 1.7325248167791294e-05, + "loss": 4.5793, + "step": 2516 + }, + { + "epoch": 0.6399491498490386, + "grad_norm": 24171.306640625, + "learning_rate": 1.730413015975697e-05, + "loss": 4.5761, + "step": 2517 + }, + { + "epoch": 0.6402034006038455, + "grad_norm": 24201.806640625, + "learning_rate": 1.7283018216240874e-05, + "loss": 4.5724, + "step": 2518 + }, + { + "epoch": 0.6404576513586525, + "grad_norm": 24133.904296875, + "learning_rate": 1.726191235387969e-05, + "loss": 4.5737, + "step": 2519 + }, + { + "epoch": 0.6407119021134594, + "grad_norm": 24377.591796875, + "learning_rate": 1.72408125893053e-05, + "loss": 4.5854, + "step": 2520 + }, + { + "epoch": 0.6409661528682663, + "grad_norm": 24438.13671875, + "learning_rate": 1.7219718939144812e-05, + "loss": 4.5793, + "step": 2521 + }, + { + "epoch": 0.6412204036230733, + "grad_norm": 24163.6640625, + "learning_rate": 1.7198631420020484e-05, + "loss": 4.5784, + "step": 2522 + }, + { + "epoch": 0.6414746543778802, + "grad_norm": 24440.03125, + "learning_rate": 1.7177550048549746e-05, + "loss": 4.5765, + "step": 2523 + }, + { + "epoch": 0.6417289051326871, + "grad_norm": 24253.587890625, + "learning_rate": 1.7156474841345212e-05, + "loss": 4.5788, + "step": 2524 + }, + { + "epoch": 0.641983155887494, + "grad_norm": 24234.193359375, + "learning_rate": 1.713540581501461e-05, + "loss": 4.5801, + "step": 2525 + }, + { + "epoch": 0.642237406642301, + "grad_norm": 24537.63671875, + "learning_rate": 1.7114342986160797e-05, + "loss": 4.5821, + "step": 2526 + }, + { + "epoch": 0.6424916573971079, + "grad_norm": 24128.314453125, + "learning_rate": 1.709328637138177e-05, + "loss": 4.5654, + "step": 2527 + }, + { + "epoch": 0.6427459081519148, + "grad_norm": 24724.302734375, + "learning_rate": 1.7072235987270603e-05, + "loss": 4.5805, + "step": 2528 + }, + { + "epoch": 0.6430001589067218, + "grad_norm": 24352.306640625, + "learning_rate": 1.7051191850415467e-05, + "loss": 4.5767, + "step": 2529 + }, + { + "epoch": 0.6432544096615287, + "grad_norm": 24312.7578125, + "learning_rate": 1.7030153977399635e-05, + "loss": 4.5926, + "step": 2530 + }, + { + "epoch": 0.6435086604163356, + "grad_norm": 24119.224609375, + "learning_rate": 1.700912238480141e-05, + "loss": 4.5677, + "step": 2531 + }, + { + "epoch": 0.6437629111711426, + "grad_norm": 24259.05859375, + "learning_rate": 1.698809708919415e-05, + "loss": 4.5843, + "step": 2532 + }, + { + "epoch": 0.6440171619259495, + "grad_norm": 24297.130859375, + "learning_rate": 1.696707810714627e-05, + "loss": 4.5754, + "step": 2533 + }, + { + "epoch": 0.6442714126807564, + "grad_norm": 24270.455078125, + "learning_rate": 1.6946065455221213e-05, + "loss": 4.5669, + "step": 2534 + }, + { + "epoch": 0.6445256634355633, + "grad_norm": 24254.466796875, + "learning_rate": 1.6925059149977395e-05, + "loss": 4.5824, + "step": 2535 + }, + { + "epoch": 0.6447799141903703, + "grad_norm": 24486.9921875, + "learning_rate": 1.6904059207968277e-05, + "loss": 4.5811, + "step": 2536 + }, + { + "epoch": 0.6450341649451772, + "grad_norm": 24020.109375, + "learning_rate": 1.6883065645742274e-05, + "loss": 4.5757, + "step": 2537 + }, + { + "epoch": 0.6452884156999841, + "grad_norm": 24878.609375, + "learning_rate": 1.6862078479842778e-05, + "loss": 4.5947, + "step": 2538 + }, + { + "epoch": 0.6455426664547911, + "grad_norm": 24334.31640625, + "learning_rate": 1.684109772680816e-05, + "loss": 4.5691, + "step": 2539 + }, + { + "epoch": 0.645796917209598, + "grad_norm": 24240.169921875, + "learning_rate": 1.6820123403171723e-05, + "loss": 4.5729, + "step": 2540 + }, + { + "epoch": 0.6460511679644049, + "grad_norm": 24486.2265625, + "learning_rate": 1.6799155525461707e-05, + "loss": 4.5685, + "step": 2541 + }, + { + "epoch": 0.6463054187192119, + "grad_norm": 24304.26953125, + "learning_rate": 1.677819411020125e-05, + "loss": 4.5695, + "step": 2542 + }, + { + "epoch": 0.6465596694740188, + "grad_norm": 24576.150390625, + "learning_rate": 1.675723917390844e-05, + "loss": 4.5747, + "step": 2543 + }, + { + "epoch": 0.6468139202288257, + "grad_norm": 24128.58984375, + "learning_rate": 1.6736290733096235e-05, + "loss": 4.5709, + "step": 2544 + }, + { + "epoch": 0.6470681709836326, + "grad_norm": 24498.22265625, + "learning_rate": 1.671534880427246e-05, + "loss": 4.5801, + "step": 2545 + }, + { + "epoch": 0.6473224217384396, + "grad_norm": 24382.521484375, + "learning_rate": 1.669441340393985e-05, + "loss": 4.5552, + "step": 2546 + }, + { + "epoch": 0.6475766724932465, + "grad_norm": 24435.91796875, + "learning_rate": 1.667348454859596e-05, + "loss": 4.5658, + "step": 2547 + }, + { + "epoch": 0.6478309232480534, + "grad_norm": 24412.626953125, + "learning_rate": 1.6652562254733185e-05, + "loss": 4.5683, + "step": 2548 + }, + { + "epoch": 0.6480851740028604, + "grad_norm": 24386.279296875, + "learning_rate": 1.6631646538838774e-05, + "loss": 4.5748, + "step": 2549 + }, + { + "epoch": 0.6483394247576673, + "grad_norm": 24362.896484375, + "learning_rate": 1.6610737417394785e-05, + "loss": 4.5627, + "step": 2550 + }, + { + "epoch": 0.6485936755124742, + "grad_norm": 24008.484375, + "learning_rate": 1.658983490687806e-05, + "loss": 4.5683, + "step": 2551 + }, + { + "epoch": 0.6488479262672812, + "grad_norm": 24282.52734375, + "learning_rate": 1.656893902376027e-05, + "loss": 4.5629, + "step": 2552 + }, + { + "epoch": 0.6491021770220881, + "grad_norm": 24190.306640625, + "learning_rate": 1.654804978450782e-05, + "loss": 4.562, + "step": 2553 + }, + { + "epoch": 0.649356427776895, + "grad_norm": 24304.244140625, + "learning_rate": 1.6527167205581903e-05, + "loss": 4.5686, + "step": 2554 + }, + { + "epoch": 0.6496106785317018, + "grad_norm": 24525.248046875, + "learning_rate": 1.6506291303438464e-05, + "loss": 4.5531, + "step": 2555 + }, + { + "epoch": 0.6498649292865089, + "grad_norm": 24319.943359375, + "learning_rate": 1.648542209452819e-05, + "loss": 4.5697, + "step": 2556 + }, + { + "epoch": 0.6501191800413157, + "grad_norm": 24227.51953125, + "learning_rate": 1.6464559595296462e-05, + "loss": 4.5717, + "step": 2557 + }, + { + "epoch": 0.6503734307961226, + "grad_norm": 24229.46484375, + "learning_rate": 1.6443703822183428e-05, + "loss": 4.5778, + "step": 2558 + }, + { + "epoch": 0.6506276815509296, + "grad_norm": 24172.23046875, + "learning_rate": 1.6422854791623886e-05, + "loss": 4.559, + "step": 2559 + }, + { + "epoch": 0.6508819323057365, + "grad_norm": 24447.97265625, + "learning_rate": 1.640201252004734e-05, + "loss": 4.5815, + "step": 2560 + }, + { + "epoch": 0.6511361830605434, + "grad_norm": 24136.083984375, + "learning_rate": 1.638117702387798e-05, + "loss": 4.5608, + "step": 2561 + }, + { + "epoch": 0.6513904338153504, + "grad_norm": 24441.25, + "learning_rate": 1.636034831953464e-05, + "loss": 4.5791, + "step": 2562 + }, + { + "epoch": 0.6516446845701573, + "grad_norm": 24246.384765625, + "learning_rate": 1.6339526423430797e-05, + "loss": 4.5755, + "step": 2563 + }, + { + "epoch": 0.6518989353249642, + "grad_norm": 24261.48828125, + "learning_rate": 1.631871135197459e-05, + "loss": 4.573, + "step": 2564 + }, + { + "epoch": 0.6521531860797711, + "grad_norm": 24247.67578125, + "learning_rate": 1.6297903121568747e-05, + "loss": 4.5796, + "step": 2565 + }, + { + "epoch": 0.6524074368345781, + "grad_norm": 24322.1015625, + "learning_rate": 1.6277101748610622e-05, + "loss": 4.5577, + "step": 2566 + }, + { + "epoch": 0.652661687589385, + "grad_norm": 24325.9609375, + "learning_rate": 1.6256307249492177e-05, + "loss": 4.5744, + "step": 2567 + }, + { + "epoch": 0.6529159383441919, + "grad_norm": 24275.09375, + "learning_rate": 1.6235519640599938e-05, + "loss": 4.5701, + "step": 2568 + }, + { + "epoch": 0.6531701890989989, + "grad_norm": 24240.07421875, + "learning_rate": 1.6214738938314994e-05, + "loss": 4.5682, + "step": 2569 + }, + { + "epoch": 0.6534244398538058, + "grad_norm": 24224.517578125, + "learning_rate": 1.6193965159013023e-05, + "loss": 4.5723, + "step": 2570 + }, + { + "epoch": 0.6536786906086127, + "grad_norm": 24453.654296875, + "learning_rate": 1.6173198319064227e-05, + "loss": 4.5777, + "step": 2571 + }, + { + "epoch": 0.6539329413634196, + "grad_norm": 24286.72265625, + "learning_rate": 1.6152438434833337e-05, + "loss": 4.5645, + "step": 2572 + }, + { + "epoch": 0.6541871921182266, + "grad_norm": 24249.177734375, + "learning_rate": 1.61316855226796e-05, + "loss": 4.5678, + "step": 2573 + }, + { + "epoch": 0.6544414428730335, + "grad_norm": 24488.66015625, + "learning_rate": 1.611093959895679e-05, + "loss": 4.5705, + "step": 2574 + }, + { + "epoch": 0.6546956936278404, + "grad_norm": 24407.724609375, + "learning_rate": 1.609020068001316e-05, + "loss": 4.5644, + "step": 2575 + }, + { + "epoch": 0.6549499443826474, + "grad_norm": 24337.296875, + "learning_rate": 1.606946878219143e-05, + "loss": 4.566, + "step": 2576 + }, + { + "epoch": 0.6552041951374543, + "grad_norm": 24338.869140625, + "learning_rate": 1.6048743921828825e-05, + "loss": 4.5549, + "step": 2577 + }, + { + "epoch": 0.6554584458922612, + "grad_norm": 24311.572265625, + "learning_rate": 1.6028026115256984e-05, + "loss": 4.5816, + "step": 2578 + }, + { + "epoch": 0.6557126966470682, + "grad_norm": 24314.982421875, + "learning_rate": 1.6007315378801997e-05, + "loss": 4.5662, + "step": 2579 + }, + { + "epoch": 0.6559669474018751, + "grad_norm": 24301.125, + "learning_rate": 1.5986611728784404e-05, + "loss": 4.5642, + "step": 2580 + }, + { + "epoch": 0.656221198156682, + "grad_norm": 24435.6171875, + "learning_rate": 1.5965915181519144e-05, + "loss": 4.5719, + "step": 2581 + }, + { + "epoch": 0.6564754489114889, + "grad_norm": 24352.328125, + "learning_rate": 1.5945225753315544e-05, + "loss": 4.5722, + "step": 2582 + }, + { + "epoch": 0.6567296996662959, + "grad_norm": 24420.17578125, + "learning_rate": 1.592454346047737e-05, + "loss": 4.5657, + "step": 2583 + }, + { + "epoch": 0.6569839504211028, + "grad_norm": 24249.03515625, + "learning_rate": 1.5903868319302704e-05, + "loss": 4.5693, + "step": 2584 + }, + { + "epoch": 0.6572382011759097, + "grad_norm": 24325.220703125, + "learning_rate": 1.5883200346084032e-05, + "loss": 4.5522, + "step": 2585 + }, + { + "epoch": 0.6574924519307167, + "grad_norm": 24389.767578125, + "learning_rate": 1.5862539557108182e-05, + "loss": 4.5815, + "step": 2586 + }, + { + "epoch": 0.6577467026855236, + "grad_norm": 24442.4453125, + "learning_rate": 1.584188596865632e-05, + "loss": 4.5644, + "step": 2587 + }, + { + "epoch": 0.6580009534403305, + "grad_norm": 24160.30078125, + "learning_rate": 1.5821239597003933e-05, + "loss": 4.5477, + "step": 2588 + }, + { + "epoch": 0.6582552041951375, + "grad_norm": 24312.27734375, + "learning_rate": 1.5800600458420834e-05, + "loss": 4.5595, + "step": 2589 + }, + { + "epoch": 0.6585094549499444, + "grad_norm": 24372.12890625, + "learning_rate": 1.5779968569171118e-05, + "loss": 4.5732, + "step": 2590 + }, + { + "epoch": 0.6587637057047513, + "grad_norm": 24259.5546875, + "learning_rate": 1.5759343945513173e-05, + "loss": 4.5705, + "step": 2591 + }, + { + "epoch": 0.6590179564595582, + "grad_norm": 24217.7421875, + "learning_rate": 1.5738726603699684e-05, + "loss": 4.5595, + "step": 2592 + }, + { + "epoch": 0.6592722072143652, + "grad_norm": 24322.662109375, + "learning_rate": 1.571811655997757e-05, + "loss": 4.5703, + "step": 2593 + }, + { + "epoch": 0.6595264579691721, + "grad_norm": 24315.576171875, + "learning_rate": 1.5697513830587995e-05, + "loss": 4.5585, + "step": 2594 + }, + { + "epoch": 0.659780708723979, + "grad_norm": 24202.53515625, + "learning_rate": 1.56769184317664e-05, + "loss": 4.5604, + "step": 2595 + }, + { + "epoch": 0.660034959478786, + "grad_norm": 24433.025390625, + "learning_rate": 1.5656330379742397e-05, + "loss": 4.5706, + "step": 2596 + }, + { + "epoch": 0.6602892102335929, + "grad_norm": 24239.861328125, + "learning_rate": 1.5635749690739838e-05, + "loss": 4.5695, + "step": 2597 + }, + { + "epoch": 0.6605434609883998, + "grad_norm": 24277.451171875, + "learning_rate": 1.561517638097678e-05, + "loss": 4.5684, + "step": 2598 + }, + { + "epoch": 0.6607977117432068, + "grad_norm": 24537.86328125, + "learning_rate": 1.5594610466665442e-05, + "loss": 4.5681, + "step": 2599 + }, + { + "epoch": 0.6610519624980137, + "grad_norm": 24327.55078125, + "learning_rate": 1.5574051964012226e-05, + "loss": 4.5717, + "step": 2600 + }, + { + "epoch": 0.6610519624980137, + "eval_loss": 9.194845199584961, + "eval_runtime": 695.2131, + "eval_samples_per_second": 152.451, + "eval_steps_per_second": 9.529, + "step": 2600 + }, + { + "epoch": 0.6613062132528206, + "grad_norm": 24295.04296875, + "learning_rate": 1.55535008892177e-05, + "loss": 4.5415, + "step": 2601 + }, + { + "epoch": 0.6615604640076275, + "grad_norm": 24349.5859375, + "learning_rate": 1.5532957258476577e-05, + "loss": 4.5631, + "step": 2602 + }, + { + "epoch": 0.6618147147624345, + "grad_norm": 24385.060546875, + "learning_rate": 1.551242108797769e-05, + "loss": 4.5622, + "step": 2603 + }, + { + "epoch": 0.6620689655172414, + "grad_norm": 24276.966796875, + "learning_rate": 1.549189239390399e-05, + "loss": 4.5616, + "step": 2604 + }, + { + "epoch": 0.6623232162720483, + "grad_norm": 24408.85546875, + "learning_rate": 1.5471371192432577e-05, + "loss": 4.5501, + "step": 2605 + }, + { + "epoch": 0.6625774670268553, + "grad_norm": 24281.455078125, + "learning_rate": 1.54508574997346e-05, + "loss": 4.5605, + "step": 2606 + }, + { + "epoch": 0.6628317177816622, + "grad_norm": 24378.90625, + "learning_rate": 1.5430351331975305e-05, + "loss": 4.5622, + "step": 2607 + }, + { + "epoch": 0.6630859685364691, + "grad_norm": 24312.474609375, + "learning_rate": 1.5409852705314037e-05, + "loss": 4.5591, + "step": 2608 + }, + { + "epoch": 0.6633402192912761, + "grad_norm": 24265.546875, + "learning_rate": 1.5389361635904153e-05, + "loss": 4.5532, + "step": 2609 + }, + { + "epoch": 0.663594470046083, + "grad_norm": 24375.109375, + "learning_rate": 1.5368878139893076e-05, + "loss": 4.5521, + "step": 2610 + }, + { + "epoch": 0.6638487208008899, + "grad_norm": 24323.583984375, + "learning_rate": 1.534840223342227e-05, + "loss": 4.5615, + "step": 2611 + }, + { + "epoch": 0.6641029715556968, + "grad_norm": 24380.0703125, + "learning_rate": 1.532793393262721e-05, + "loss": 4.5621, + "step": 2612 + }, + { + "epoch": 0.6643572223105038, + "grad_norm": 24382.95703125, + "learning_rate": 1.530747325363736e-05, + "loss": 4.5606, + "step": 2613 + }, + { + "epoch": 0.6646114730653107, + "grad_norm": 24253.33984375, + "learning_rate": 1.5287020212576216e-05, + "loss": 4.5492, + "step": 2614 + }, + { + "epoch": 0.6648657238201175, + "grad_norm": 24287.13671875, + "learning_rate": 1.5266574825561224e-05, + "loss": 4.5605, + "step": 2615 + }, + { + "epoch": 0.6651199745749246, + "grad_norm": 24491.078125, + "learning_rate": 1.5246137108703801e-05, + "loss": 4.5533, + "step": 2616 + }, + { + "epoch": 0.6653742253297315, + "grad_norm": 24305.341796875, + "learning_rate": 1.5225707078109336e-05, + "loss": 4.5504, + "step": 2617 + }, + { + "epoch": 0.6656284760845383, + "grad_norm": 24401.724609375, + "learning_rate": 1.5205284749877153e-05, + "loss": 4.5652, + "step": 2618 + }, + { + "epoch": 0.6658827268393454, + "grad_norm": 24451.150390625, + "learning_rate": 1.5184870140100493e-05, + "loss": 4.5577, + "step": 2619 + }, + { + "epoch": 0.6661369775941522, + "grad_norm": 24463.044921875, + "learning_rate": 1.5164463264866546e-05, + "loss": 4.5466, + "step": 2620 + }, + { + "epoch": 0.6663912283489591, + "grad_norm": 24340.640625, + "learning_rate": 1.5144064140256374e-05, + "loss": 4.5506, + "step": 2621 + }, + { + "epoch": 0.666645479103766, + "grad_norm": 24280.7734375, + "learning_rate": 1.5123672782344943e-05, + "loss": 4.575, + "step": 2622 + }, + { + "epoch": 0.666899729858573, + "grad_norm": 24522.998046875, + "learning_rate": 1.510328920720111e-05, + "loss": 4.5535, + "step": 2623 + }, + { + "epoch": 0.6671539806133799, + "grad_norm": 24471.98828125, + "learning_rate": 1.5082913430887591e-05, + "loss": 4.5568, + "step": 2624 + }, + { + "epoch": 0.6674082313681868, + "grad_norm": 24511.259765625, + "learning_rate": 1.506254546946094e-05, + "loss": 4.5615, + "step": 2625 + }, + { + "epoch": 0.6676624821229938, + "grad_norm": 24492.615234375, + "learning_rate": 1.5042185338971588e-05, + "loss": 4.5573, + "step": 2626 + }, + { + "epoch": 0.6679167328778007, + "grad_norm": 24245.654296875, + "learning_rate": 1.502183305546376e-05, + "loss": 4.5409, + "step": 2627 + }, + { + "epoch": 0.6681709836326076, + "grad_norm": 24393.26953125, + "learning_rate": 1.5001488634975514e-05, + "loss": 4.5431, + "step": 2628 + }, + { + "epoch": 0.6684252343874146, + "grad_norm": 24628.59765625, + "learning_rate": 1.4981152093538723e-05, + "loss": 4.5683, + "step": 2629 + }, + { + "epoch": 0.6686794851422215, + "grad_norm": 24390.611328125, + "learning_rate": 1.4960823447179029e-05, + "loss": 4.5587, + "step": 2630 + }, + { + "epoch": 0.6689337358970284, + "grad_norm": 24350.181640625, + "learning_rate": 1.4940502711915852e-05, + "loss": 4.5523, + "step": 2631 + }, + { + "epoch": 0.6691879866518353, + "grad_norm": 24268.439453125, + "learning_rate": 1.4920189903762403e-05, + "loss": 4.5557, + "step": 2632 + }, + { + "epoch": 0.6694422374066423, + "grad_norm": 24424.794921875, + "learning_rate": 1.4899885038725628e-05, + "loss": 4.5592, + "step": 2633 + }, + { + "epoch": 0.6696964881614492, + "grad_norm": 24477.333984375, + "learning_rate": 1.4879588132806205e-05, + "loss": 4.5689, + "step": 2634 + }, + { + "epoch": 0.6699507389162561, + "grad_norm": 24517.03125, + "learning_rate": 1.4859299201998572e-05, + "loss": 4.5476, + "step": 2635 + }, + { + "epoch": 0.6702049896710631, + "grad_norm": 24364.654296875, + "learning_rate": 1.483901826229085e-05, + "loss": 4.5441, + "step": 2636 + }, + { + "epoch": 0.67045924042587, + "grad_norm": 24339.876953125, + "learning_rate": 1.4818745329664868e-05, + "loss": 4.5553, + "step": 2637 + }, + { + "epoch": 0.6707134911806769, + "grad_norm": 24150.935546875, + "learning_rate": 1.4798480420096156e-05, + "loss": 4.537, + "step": 2638 + }, + { + "epoch": 0.6709677419354839, + "grad_norm": 24323.576171875, + "learning_rate": 1.4778223549553929e-05, + "loss": 4.5627, + "step": 2639 + }, + { + "epoch": 0.6712219926902908, + "grad_norm": 24327.634765625, + "learning_rate": 1.4757974734001051e-05, + "loss": 4.5621, + "step": 2640 + }, + { + "epoch": 0.6714762434450977, + "grad_norm": 24511.54296875, + "learning_rate": 1.4737733989394025e-05, + "loss": 4.5444, + "step": 2641 + }, + { + "epoch": 0.6717304941999046, + "grad_norm": 24349.28515625, + "learning_rate": 1.4717501331683037e-05, + "loss": 4.5447, + "step": 2642 + }, + { + "epoch": 0.6719847449547116, + "grad_norm": 24370.83984375, + "learning_rate": 1.4697276776811871e-05, + "loss": 4.5309, + "step": 2643 + }, + { + "epoch": 0.6722389957095185, + "grad_norm": 24459.435546875, + "learning_rate": 1.4677060340717913e-05, + "loss": 4.5427, + "step": 2644 + }, + { + "epoch": 0.6724932464643254, + "grad_norm": 24393.083984375, + "learning_rate": 1.4656852039332194e-05, + "loss": 4.5578, + "step": 2645 + }, + { + "epoch": 0.6727474972191324, + "grad_norm": 24398.8359375, + "learning_rate": 1.4636651888579294e-05, + "loss": 4.5465, + "step": 2646 + }, + { + "epoch": 0.6730017479739393, + "grad_norm": 24499.3359375, + "learning_rate": 1.4616459904377377e-05, + "loss": 4.5464, + "step": 2647 + }, + { + "epoch": 0.6732559987287462, + "grad_norm": 24575.685546875, + "learning_rate": 1.4596276102638196e-05, + "loss": 4.5551, + "step": 2648 + }, + { + "epoch": 0.6735102494835532, + "grad_norm": 24342.509765625, + "learning_rate": 1.457610049926704e-05, + "loss": 4.5442, + "step": 2649 + }, + { + "epoch": 0.6737645002383601, + "grad_norm": 24487.666015625, + "learning_rate": 1.4555933110162719e-05, + "loss": 4.5456, + "step": 2650 + }, + { + "epoch": 0.674018750993167, + "grad_norm": 24358.662109375, + "learning_rate": 1.4535773951217612e-05, + "loss": 4.5463, + "step": 2651 + }, + { + "epoch": 0.6742730017479739, + "grad_norm": 24565.26953125, + "learning_rate": 1.451562303831758e-05, + "loss": 4.5506, + "step": 2652 + }, + { + "epoch": 0.6745272525027809, + "grad_norm": 24423.33984375, + "learning_rate": 1.449548038734198e-05, + "loss": 4.5644, + "step": 2653 + }, + { + "epoch": 0.6747815032575878, + "grad_norm": 24559.173828125, + "learning_rate": 1.4475346014163698e-05, + "loss": 4.5515, + "step": 2654 + }, + { + "epoch": 0.6750357540123947, + "grad_norm": 24355.234375, + "learning_rate": 1.445521993464905e-05, + "loss": 4.5475, + "step": 2655 + }, + { + "epoch": 0.6752900047672017, + "grad_norm": 24532.427734375, + "learning_rate": 1.443510216465786e-05, + "loss": 4.5595, + "step": 2656 + }, + { + "epoch": 0.6755442555220086, + "grad_norm": 24590.47265625, + "learning_rate": 1.4414992720043357e-05, + "loss": 4.5485, + "step": 2657 + }, + { + "epoch": 0.6757985062768155, + "grad_norm": 24426.86328125, + "learning_rate": 1.4394891616652261e-05, + "loss": 4.5451, + "step": 2658 + }, + { + "epoch": 0.6760527570316225, + "grad_norm": 24510.662109375, + "learning_rate": 1.437479887032467e-05, + "loss": 4.5413, + "step": 2659 + }, + { + "epoch": 0.6763070077864294, + "grad_norm": 48439.36328125, + "learning_rate": 1.4354714496894142e-05, + "loss": 4.5473, + "step": 2660 + }, + { + "epoch": 0.6765612585412363, + "grad_norm": 24661.67578125, + "learning_rate": 1.4334638512187602e-05, + "loss": 4.5498, + "step": 2661 + }, + { + "epoch": 0.6768155092960432, + "grad_norm": 24507.7578125, + "learning_rate": 1.4314570932025365e-05, + "loss": 4.5628, + "step": 2662 + }, + { + "epoch": 0.6770697600508502, + "grad_norm": 24600.8125, + "learning_rate": 1.4294511772221156e-05, + "loss": 4.5441, + "step": 2663 + }, + { + "epoch": 0.6773240108056571, + "grad_norm": 24368.47265625, + "learning_rate": 1.4274461048582036e-05, + "loss": 4.5402, + "step": 2664 + }, + { + "epoch": 0.677578261560464, + "grad_norm": 24495.548828125, + "learning_rate": 1.4254418776908412e-05, + "loss": 4.5376, + "step": 2665 + }, + { + "epoch": 0.677832512315271, + "grad_norm": 24475.486328125, + "learning_rate": 1.4234384972994055e-05, + "loss": 4.554, + "step": 2666 + }, + { + "epoch": 0.6780867630700779, + "grad_norm": 24396.6171875, + "learning_rate": 1.4214359652626064e-05, + "loss": 4.5461, + "step": 2667 + }, + { + "epoch": 0.6783410138248848, + "grad_norm": 24350.16796875, + "learning_rate": 1.4194342831584829e-05, + "loss": 4.5499, + "step": 2668 + }, + { + "epoch": 0.6785952645796918, + "grad_norm": 24518.283203125, + "learning_rate": 1.4174334525644045e-05, + "loss": 4.5446, + "step": 2669 + }, + { + "epoch": 0.6788495153344987, + "grad_norm": 24546.09375, + "learning_rate": 1.4154334750570727e-05, + "loss": 4.5571, + "step": 2670 + }, + { + "epoch": 0.6791037660893056, + "grad_norm": 24416.140625, + "learning_rate": 1.4134343522125138e-05, + "loss": 4.5519, + "step": 2671 + }, + { + "epoch": 0.6793580168441125, + "grad_norm": 24436.51171875, + "learning_rate": 1.41143608560608e-05, + "loss": 4.5402, + "step": 2672 + }, + { + "epoch": 0.6796122675989195, + "grad_norm": 24666.904296875, + "learning_rate": 1.4094386768124527e-05, + "loss": 4.5506, + "step": 2673 + }, + { + "epoch": 0.6798665183537264, + "grad_norm": 24486.943359375, + "learning_rate": 1.4074421274056337e-05, + "loss": 4.5514, + "step": 2674 + }, + { + "epoch": 0.6801207691085333, + "grad_norm": 24392.087890625, + "learning_rate": 1.4054464389589478e-05, + "loss": 4.5483, + "step": 2675 + }, + { + "epoch": 0.6803750198633403, + "grad_norm": 24516.591796875, + "learning_rate": 1.4034516130450431e-05, + "loss": 4.5409, + "step": 2676 + }, + { + "epoch": 0.6806292706181472, + "grad_norm": 24615.484375, + "learning_rate": 1.401457651235889e-05, + "loss": 4.5404, + "step": 2677 + }, + { + "epoch": 0.680883521372954, + "grad_norm": 24461.681640625, + "learning_rate": 1.3994645551027692e-05, + "loss": 4.5538, + "step": 2678 + }, + { + "epoch": 0.681137772127761, + "grad_norm": 24424.5, + "learning_rate": 1.3974723262162902e-05, + "loss": 4.532, + "step": 2679 + }, + { + "epoch": 0.681392022882568, + "grad_norm": 24341.431640625, + "learning_rate": 1.3954809661463731e-05, + "loss": 4.5371, + "step": 2680 + }, + { + "epoch": 0.6816462736373748, + "grad_norm": 24641.0390625, + "learning_rate": 1.3934904764622525e-05, + "loss": 4.5517, + "step": 2681 + }, + { + "epoch": 0.6819005243921817, + "grad_norm": 24360.1015625, + "learning_rate": 1.3915008587324812e-05, + "loss": 4.5391, + "step": 2682 + }, + { + "epoch": 0.6821547751469887, + "grad_norm": 24652.677734375, + "learning_rate": 1.3895121145249218e-05, + "loss": 4.5477, + "step": 2683 + }, + { + "epoch": 0.6824090259017956, + "grad_norm": 24644.857421875, + "learning_rate": 1.387524245406748e-05, + "loss": 4.543, + "step": 2684 + }, + { + "epoch": 0.6826632766566025, + "grad_norm": 24428.333984375, + "learning_rate": 1.3855372529444477e-05, + "loss": 4.5368, + "step": 2685 + }, + { + "epoch": 0.6829175274114095, + "grad_norm": 24593.79296875, + "learning_rate": 1.383551138703813e-05, + "loss": 4.536, + "step": 2686 + }, + { + "epoch": 0.6831717781662164, + "grad_norm": 24470.54296875, + "learning_rate": 1.3815659042499495e-05, + "loss": 4.5488, + "step": 2687 + }, + { + "epoch": 0.6834260289210233, + "grad_norm": 24542.51953125, + "learning_rate": 1.3795815511472634e-05, + "loss": 4.5381, + "step": 2688 + }, + { + "epoch": 0.6836802796758303, + "grad_norm": 24658.890625, + "learning_rate": 1.3775980809594725e-05, + "loss": 4.5445, + "step": 2689 + }, + { + "epoch": 0.6839345304306372, + "grad_norm": 24542.85546875, + "learning_rate": 1.3756154952495932e-05, + "loss": 4.5614, + "step": 2690 + }, + { + "epoch": 0.6841887811854441, + "grad_norm": 24484.181640625, + "learning_rate": 1.3736337955799495e-05, + "loss": 4.5477, + "step": 2691 + }, + { + "epoch": 0.684443031940251, + "grad_norm": 30898.072265625, + "learning_rate": 1.3716529835121644e-05, + "loss": 4.5439, + "step": 2692 + }, + { + "epoch": 0.684697282695058, + "grad_norm": 26642.958984375, + "learning_rate": 1.3696730606071617e-05, + "loss": 4.5383, + "step": 2693 + }, + { + "epoch": 0.6849515334498649, + "grad_norm": 24576.06640625, + "learning_rate": 1.3676940284251666e-05, + "loss": 4.5404, + "step": 2694 + }, + { + "epoch": 0.6852057842046718, + "grad_norm": 24803.51953125, + "learning_rate": 1.3657158885256998e-05, + "loss": 4.5414, + "step": 2695 + }, + { + "epoch": 0.6854600349594788, + "grad_norm": 25177.0390625, + "learning_rate": 1.3637386424675793e-05, + "loss": 4.5463, + "step": 2696 + }, + { + "epoch": 0.6857142857142857, + "grad_norm": 24413.080078125, + "learning_rate": 1.3617622918089215e-05, + "loss": 4.5353, + "step": 2697 + }, + { + "epoch": 0.6859685364690926, + "grad_norm": 25677.380859375, + "learning_rate": 1.3597868381071327e-05, + "loss": 4.5294, + "step": 2698 + }, + { + "epoch": 0.6862227872238996, + "grad_norm": 24625.978515625, + "learning_rate": 1.3578122829189168e-05, + "loss": 4.541, + "step": 2699 + }, + { + "epoch": 0.6864770379787065, + "grad_norm": 24854.724609375, + "learning_rate": 1.355838627800266e-05, + "loss": 4.5493, + "step": 2700 + }, + { + "epoch": 0.6864770379787065, + "eval_loss": 9.15518856048584, + "eval_runtime": 695.3568, + "eval_samples_per_second": 152.42, + "eval_steps_per_second": 9.527, + "step": 2700 + }, + { + "epoch": 0.6867312887335134, + "grad_norm": 25059.322265625, + "learning_rate": 1.3538658743064667e-05, + "loss": 4.548, + "step": 2701 + }, + { + "epoch": 0.6869855394883203, + "grad_norm": 24568.916015625, + "learning_rate": 1.3518940239920916e-05, + "loss": 4.5444, + "step": 2702 + }, + { + "epoch": 0.6872397902431273, + "grad_norm": 25235.271484375, + "learning_rate": 1.3499230784110024e-05, + "loss": 4.5602, + "step": 2703 + }, + { + "epoch": 0.6874940409979342, + "grad_norm": 24682.8671875, + "learning_rate": 1.3479530391163504e-05, + "loss": 4.5261, + "step": 2704 + }, + { + "epoch": 0.6877482917527411, + "grad_norm": 24491.275390625, + "learning_rate": 1.3459839076605696e-05, + "loss": 4.5383, + "step": 2705 + }, + { + "epoch": 0.6880025425075481, + "grad_norm": 24557.62109375, + "learning_rate": 1.344015685595379e-05, + "loss": 4.5407, + "step": 2706 + }, + { + "epoch": 0.688256793262355, + "grad_norm": 24460.927734375, + "learning_rate": 1.3420483744717838e-05, + "loss": 4.5492, + "step": 2707 + }, + { + "epoch": 0.6885110440171619, + "grad_norm": 24695.630859375, + "learning_rate": 1.340081975840067e-05, + "loss": 4.5299, + "step": 2708 + }, + { + "epoch": 0.6887652947719689, + "grad_norm": 24426.888671875, + "learning_rate": 1.3381164912497962e-05, + "loss": 4.5339, + "step": 2709 + }, + { + "epoch": 0.6890195455267758, + "grad_norm": 24603.654296875, + "learning_rate": 1.3361519222498187e-05, + "loss": 4.5303, + "step": 2710 + }, + { + "epoch": 0.6892737962815827, + "grad_norm": 24566.6015625, + "learning_rate": 1.3341882703882572e-05, + "loss": 4.5382, + "step": 2711 + }, + { + "epoch": 0.6895280470363896, + "grad_norm": 24491.1953125, + "learning_rate": 1.3322255372125131e-05, + "loss": 4.5327, + "step": 2712 + }, + { + "epoch": 0.6897822977911966, + "grad_norm": 24574.552734375, + "learning_rate": 1.3302637242692656e-05, + "loss": 4.5427, + "step": 2713 + }, + { + "epoch": 0.6900365485460035, + "grad_norm": 24334.91015625, + "learning_rate": 1.328302833104467e-05, + "loss": 4.5375, + "step": 2714 + }, + { + "epoch": 0.6902907993008104, + "grad_norm": 24664.884765625, + "learning_rate": 1.326342865263342e-05, + "loss": 4.5488, + "step": 2715 + }, + { + "epoch": 0.6905450500556174, + "grad_norm": 24596.033203125, + "learning_rate": 1.324383822290392e-05, + "loss": 4.5357, + "step": 2716 + }, + { + "epoch": 0.6907993008104243, + "grad_norm": 24578.140625, + "learning_rate": 1.3224257057293848e-05, + "loss": 4.5223, + "step": 2717 + }, + { + "epoch": 0.6910535515652312, + "grad_norm": 24637.0625, + "learning_rate": 1.3204685171233602e-05, + "loss": 4.5457, + "step": 2718 + }, + { + "epoch": 0.6913078023200382, + "grad_norm": 24653.853515625, + "learning_rate": 1.3185122580146274e-05, + "loss": 4.5372, + "step": 2719 + }, + { + "epoch": 0.6915620530748451, + "grad_norm": 24514.70703125, + "learning_rate": 1.3165569299447633e-05, + "loss": 4.5363, + "step": 2720 + }, + { + "epoch": 0.691816303829652, + "grad_norm": 24774.013671875, + "learning_rate": 1.3146025344546087e-05, + "loss": 4.5438, + "step": 2721 + }, + { + "epoch": 0.6920705545844589, + "grad_norm": 24428.32421875, + "learning_rate": 1.3126490730842727e-05, + "loss": 4.5334, + "step": 2722 + }, + { + "epoch": 0.6923248053392659, + "grad_norm": 24422.35546875, + "learning_rate": 1.310696547373126e-05, + "loss": 4.5392, + "step": 2723 + }, + { + "epoch": 0.6925790560940728, + "grad_norm": 24679.43359375, + "learning_rate": 1.3087449588598016e-05, + "loss": 4.5539, + "step": 2724 + }, + { + "epoch": 0.6928333068488797, + "grad_norm": 25650.291015625, + "learning_rate": 1.3067943090821971e-05, + "loss": 4.5398, + "step": 2725 + }, + { + "epoch": 0.6930875576036867, + "grad_norm": 24690.90625, + "learning_rate": 1.3048445995774672e-05, + "loss": 4.5331, + "step": 2726 + }, + { + "epoch": 0.6933418083584936, + "grad_norm": 24565.85546875, + "learning_rate": 1.302895831882026e-05, + "loss": 4.5292, + "step": 2727 + }, + { + "epoch": 0.6935960591133005, + "grad_norm": 24578.10546875, + "learning_rate": 1.3009480075315481e-05, + "loss": 4.5213, + "step": 2728 + }, + { + "epoch": 0.6938503098681075, + "grad_norm": 24551.95703125, + "learning_rate": 1.2990011280609607e-05, + "loss": 4.5345, + "step": 2729 + }, + { + "epoch": 0.6941045606229144, + "grad_norm": 24626.541015625, + "learning_rate": 1.2970551950044507e-05, + "loss": 4.5317, + "step": 2730 + }, + { + "epoch": 0.6943588113777213, + "grad_norm": 24581.595703125, + "learning_rate": 1.295110209895455e-05, + "loss": 4.5413, + "step": 2731 + }, + { + "epoch": 0.6946130621325282, + "grad_norm": 24518.07421875, + "learning_rate": 1.2931661742666676e-05, + "loss": 4.5164, + "step": 2732 + }, + { + "epoch": 0.6948673128873352, + "grad_norm": 24475.59375, + "learning_rate": 1.291223089650031e-05, + "loss": 4.5473, + "step": 2733 + }, + { + "epoch": 0.6951215636421421, + "grad_norm": 24565.189453125, + "learning_rate": 1.2892809575767389e-05, + "loss": 4.525, + "step": 2734 + }, + { + "epoch": 0.695375814396949, + "grad_norm": 24648.650390625, + "learning_rate": 1.2873397795772363e-05, + "loss": 4.5428, + "step": 2735 + }, + { + "epoch": 0.695630065151756, + "grad_norm": 24591.53515625, + "learning_rate": 1.2853995571812146e-05, + "loss": 4.5326, + "step": 2736 + }, + { + "epoch": 0.6958843159065629, + "grad_norm": 24548.4140625, + "learning_rate": 1.2834602919176117e-05, + "loss": 4.5234, + "step": 2737 + }, + { + "epoch": 0.6961385666613698, + "grad_norm": 24423.3125, + "learning_rate": 1.2815219853146137e-05, + "loss": 4.5396, + "step": 2738 + }, + { + "epoch": 0.6963928174161768, + "grad_norm": 24610.857421875, + "learning_rate": 1.2795846388996482e-05, + "loss": 4.5322, + "step": 2739 + }, + { + "epoch": 0.6966470681709837, + "grad_norm": 24601.65625, + "learning_rate": 1.2776482541993884e-05, + "loss": 4.5336, + "step": 2740 + }, + { + "epoch": 0.6969013189257905, + "grad_norm": 24320.134765625, + "learning_rate": 1.27571283273975e-05, + "loss": 4.5347, + "step": 2741 + }, + { + "epoch": 0.6971555696805974, + "grad_norm": 24588.26171875, + "learning_rate": 1.273778376045887e-05, + "loss": 4.5438, + "step": 2742 + }, + { + "epoch": 0.6974098204354044, + "grad_norm": 24585.6328125, + "learning_rate": 1.271844885642195e-05, + "loss": 4.5348, + "step": 2743 + }, + { + "epoch": 0.6976640711902113, + "grad_norm": 24780.6953125, + "learning_rate": 1.2699123630523086e-05, + "loss": 4.5626, + "step": 2744 + }, + { + "epoch": 0.6979183219450182, + "grad_norm": 24794.88671875, + "learning_rate": 1.2679808097990986e-05, + "loss": 4.5423, + "step": 2745 + }, + { + "epoch": 0.6981725726998252, + "grad_norm": 24593.896484375, + "learning_rate": 1.2660502274046714e-05, + "loss": 4.5297, + "step": 2746 + }, + { + "epoch": 0.6984268234546321, + "grad_norm": 24634.080078125, + "learning_rate": 1.2641206173903708e-05, + "loss": 4.5079, + "step": 2747 + }, + { + "epoch": 0.698681074209439, + "grad_norm": 24642.599609375, + "learning_rate": 1.2621919812767724e-05, + "loss": 4.5281, + "step": 2748 + }, + { + "epoch": 0.698935324964246, + "grad_norm": 24419.669921875, + "learning_rate": 1.260264320583683e-05, + "loss": 4.5357, + "step": 2749 + }, + { + "epoch": 0.6991895757190529, + "grad_norm": 24614.642578125, + "learning_rate": 1.2583376368301442e-05, + "loss": 4.5272, + "step": 2750 + }, + { + "epoch": 0.6994438264738598, + "grad_norm": 24744.416015625, + "learning_rate": 1.256411931534427e-05, + "loss": 4.5245, + "step": 2751 + }, + { + "epoch": 0.6996980772286667, + "grad_norm": 24517.322265625, + "learning_rate": 1.2544872062140281e-05, + "loss": 4.5282, + "step": 2752 + }, + { + "epoch": 0.6999523279834737, + "grad_norm": 24756.703125, + "learning_rate": 1.2525634623856763e-05, + "loss": 4.5379, + "step": 2753 + }, + { + "epoch": 0.7002065787382806, + "grad_norm": 24605.791015625, + "learning_rate": 1.2506407015653244e-05, + "loss": 4.5336, + "step": 2754 + }, + { + "epoch": 0.7004608294930875, + "grad_norm": 24491.95703125, + "learning_rate": 1.2487189252681491e-05, + "loss": 4.5312, + "step": 2755 + }, + { + "epoch": 0.7007150802478945, + "grad_norm": 24499.0859375, + "learning_rate": 1.246798135008556e-05, + "loss": 4.5382, + "step": 2756 + }, + { + "epoch": 0.7009693310027014, + "grad_norm": 24607.48828125, + "learning_rate": 1.2448783323001703e-05, + "loss": 4.526, + "step": 2757 + }, + { + "epoch": 0.7012235817575083, + "grad_norm": 24581.46484375, + "learning_rate": 1.242959518655838e-05, + "loss": 4.5362, + "step": 2758 + }, + { + "epoch": 0.7014778325123153, + "grad_norm": 24611.47265625, + "learning_rate": 1.2410416955876294e-05, + "loss": 4.5352, + "step": 2759 + }, + { + "epoch": 0.7017320832671222, + "grad_norm": 24762.625, + "learning_rate": 1.2391248646068304e-05, + "loss": 4.5494, + "step": 2760 + }, + { + "epoch": 0.7019863340219291, + "grad_norm": 24322.44140625, + "learning_rate": 1.2372090272239483e-05, + "loss": 4.5271, + "step": 2761 + }, + { + "epoch": 0.702240584776736, + "grad_norm": 24410.3125, + "learning_rate": 1.2352941849487048e-05, + "loss": 4.5409, + "step": 2762 + }, + { + "epoch": 0.702494835531543, + "grad_norm": 24564.642578125, + "learning_rate": 1.23338033929004e-05, + "loss": 4.5246, + "step": 2763 + }, + { + "epoch": 0.7027490862863499, + "grad_norm": 24566.39453125, + "learning_rate": 1.2314674917561067e-05, + "loss": 4.5204, + "step": 2764 + }, + { + "epoch": 0.7030033370411568, + "grad_norm": 24650.7421875, + "learning_rate": 1.2295556438542702e-05, + "loss": 4.5454, + "step": 2765 + }, + { + "epoch": 0.7032575877959638, + "grad_norm": 24607.2578125, + "learning_rate": 1.2276447970911118e-05, + "loss": 4.5204, + "step": 2766 + }, + { + "epoch": 0.7035118385507707, + "grad_norm": 24574.49609375, + "learning_rate": 1.2257349529724208e-05, + "loss": 4.5153, + "step": 2767 + }, + { + "epoch": 0.7037660893055776, + "grad_norm": 24731.025390625, + "learning_rate": 1.2238261130031959e-05, + "loss": 4.5328, + "step": 2768 + }, + { + "epoch": 0.7040203400603846, + "grad_norm": 24577.15625, + "learning_rate": 1.2219182786876482e-05, + "loss": 4.5213, + "step": 2769 + }, + { + "epoch": 0.7042745908151915, + "grad_norm": 24677.474609375, + "learning_rate": 1.220011451529192e-05, + "loss": 4.5264, + "step": 2770 + }, + { + "epoch": 0.7045288415699984, + "grad_norm": 24732.84375, + "learning_rate": 1.2181056330304505e-05, + "loss": 4.5402, + "step": 2771 + }, + { + "epoch": 0.7047830923248053, + "grad_norm": 24607.443359375, + "learning_rate": 1.2162008246932527e-05, + "loss": 4.5284, + "step": 2772 + }, + { + "epoch": 0.7050373430796123, + "grad_norm": 24714.83984375, + "learning_rate": 1.2142970280186295e-05, + "loss": 4.5423, + "step": 2773 + }, + { + "epoch": 0.7052915938344192, + "grad_norm": 24522.263671875, + "learning_rate": 1.212394244506814e-05, + "loss": 4.5264, + "step": 2774 + }, + { + "epoch": 0.7055458445892261, + "grad_norm": 24505.8671875, + "learning_rate": 1.210492475657245e-05, + "loss": 4.5334, + "step": 2775 + }, + { + "epoch": 0.7058000953440331, + "grad_norm": 24583.64453125, + "learning_rate": 1.2085917229685573e-05, + "loss": 4.5403, + "step": 2776 + }, + { + "epoch": 0.70605434609884, + "grad_norm": 24423.26171875, + "learning_rate": 1.2066919879385864e-05, + "loss": 4.5262, + "step": 2777 + }, + { + "epoch": 0.7063085968536469, + "grad_norm": 24572.904296875, + "learning_rate": 1.2047932720643676e-05, + "loss": 4.5344, + "step": 2778 + }, + { + "epoch": 0.7065628476084539, + "grad_norm": 24621.18359375, + "learning_rate": 1.2028955768421307e-05, + "loss": 4.5229, + "step": 2779 + }, + { + "epoch": 0.7068170983632608, + "grad_norm": 24553.146484375, + "learning_rate": 1.2009989037673017e-05, + "loss": 4.5353, + "step": 2780 + }, + { + "epoch": 0.7070713491180677, + "grad_norm": 24619.4765625, + "learning_rate": 1.1991032543345019e-05, + "loss": 4.5441, + "step": 2781 + }, + { + "epoch": 0.7073255998728746, + "grad_norm": 24470.681640625, + "learning_rate": 1.1972086300375468e-05, + "loss": 4.5282, + "step": 2782 + }, + { + "epoch": 0.7075798506276816, + "grad_norm": 24457.095703125, + "learning_rate": 1.1953150323694413e-05, + "loss": 4.5171, + "step": 2783 + }, + { + "epoch": 0.7078341013824885, + "grad_norm": 24458.388671875, + "learning_rate": 1.1934224628223841e-05, + "loss": 4.5305, + "step": 2784 + }, + { + "epoch": 0.7080883521372954, + "grad_norm": 24587.169921875, + "learning_rate": 1.1915309228877622e-05, + "loss": 4.5183, + "step": 2785 + }, + { + "epoch": 0.7083426028921024, + "grad_norm": 24627.361328125, + "learning_rate": 1.1896404140561504e-05, + "loss": 4.5284, + "step": 2786 + }, + { + "epoch": 0.7085968536469093, + "grad_norm": 24575.806640625, + "learning_rate": 1.1877509378173137e-05, + "loss": 4.5147, + "step": 2787 + }, + { + "epoch": 0.7088511044017162, + "grad_norm": 24511.853515625, + "learning_rate": 1.1858624956602013e-05, + "loss": 4.5349, + "step": 2788 + }, + { + "epoch": 0.7091053551565232, + "grad_norm": 24568.107421875, + "learning_rate": 1.1839750890729467e-05, + "loss": 4.5262, + "step": 2789 + }, + { + "epoch": 0.7093596059113301, + "grad_norm": 24516.11328125, + "learning_rate": 1.1820887195428707e-05, + "loss": 4.5246, + "step": 2790 + }, + { + "epoch": 0.709613856666137, + "grad_norm": 24561.26953125, + "learning_rate": 1.1802033885564732e-05, + "loss": 4.5291, + "step": 2791 + }, + { + "epoch": 0.7098681074209439, + "grad_norm": 24440.9921875, + "learning_rate": 1.1783190975994388e-05, + "loss": 4.5282, + "step": 2792 + }, + { + "epoch": 0.7101223581757509, + "grad_norm": 24644.482421875, + "learning_rate": 1.1764358481566293e-05, + "loss": 4.5257, + "step": 2793 + }, + { + "epoch": 0.7103766089305578, + "grad_norm": 24571.787109375, + "learning_rate": 1.1745536417120895e-05, + "loss": 4.5246, + "step": 2794 + }, + { + "epoch": 0.7106308596853647, + "grad_norm": 24504.5703125, + "learning_rate": 1.1726724797490396e-05, + "loss": 4.5357, + "step": 2795 + }, + { + "epoch": 0.7108851104401717, + "grad_norm": 24616.7890625, + "learning_rate": 1.1707923637498763e-05, + "loss": 4.5271, + "step": 2796 + }, + { + "epoch": 0.7111393611949786, + "grad_norm": 24491.08984375, + "learning_rate": 1.1689132951961751e-05, + "loss": 4.5203, + "step": 2797 + }, + { + "epoch": 0.7113936119497855, + "grad_norm": 24620.1015625, + "learning_rate": 1.1670352755686834e-05, + "loss": 4.5337, + "step": 2798 + }, + { + "epoch": 0.7116478627045923, + "grad_norm": 24555.818359375, + "learning_rate": 1.1651583063473217e-05, + "loss": 4.5284, + "step": 2799 + }, + { + "epoch": 0.7119021134593994, + "grad_norm": 24425.498046875, + "learning_rate": 1.163282389011186e-05, + "loss": 4.5173, + "step": 2800 + }, + { + "epoch": 0.7119021134593994, + "eval_loss": 9.120746612548828, + "eval_runtime": 698.1799, + "eval_samples_per_second": 151.803, + "eval_steps_per_second": 9.489, + "step": 2800 + }, + { + "epoch": 0.7121563642142063, + "grad_norm": 24571.57421875, + "learning_rate": 1.1614075250385392e-05, + "loss": 4.5257, + "step": 2801 + }, + { + "epoch": 0.7124106149690131, + "grad_norm": 24655.33203125, + "learning_rate": 1.1595337159068173e-05, + "loss": 4.5185, + "step": 2802 + }, + { + "epoch": 0.7126648657238202, + "grad_norm": 24661.380859375, + "learning_rate": 1.1576609630926246e-05, + "loss": 4.5329, + "step": 2803 + }, + { + "epoch": 0.712919116478627, + "grad_norm": 24512.60546875, + "learning_rate": 1.155789268071732e-05, + "loss": 4.5347, + "step": 2804 + }, + { + "epoch": 0.7131733672334339, + "grad_norm": 24715.060546875, + "learning_rate": 1.1539186323190757e-05, + "loss": 4.5342, + "step": 2805 + }, + { + "epoch": 0.713427617988241, + "grad_norm": 24664.0546875, + "learning_rate": 1.1520490573087605e-05, + "loss": 4.5177, + "step": 2806 + }, + { + "epoch": 0.7136818687430478, + "grad_norm": 24562.830078125, + "learning_rate": 1.1501805445140531e-05, + "loss": 4.5267, + "step": 2807 + }, + { + "epoch": 0.7139361194978547, + "grad_norm": 24562.740234375, + "learning_rate": 1.1483130954073826e-05, + "loss": 4.5263, + "step": 2808 + }, + { + "epoch": 0.7141903702526616, + "grad_norm": 24506.6171875, + "learning_rate": 1.1464467114603419e-05, + "loss": 4.5133, + "step": 2809 + }, + { + "epoch": 0.7144446210074686, + "grad_norm": 24463.29296875, + "learning_rate": 1.1445813941436833e-05, + "loss": 4.5253, + "step": 2810 + }, + { + "epoch": 0.7146988717622755, + "grad_norm": 24612.748046875, + "learning_rate": 1.1427171449273175e-05, + "loss": 4.5164, + "step": 2811 + }, + { + "epoch": 0.7149531225170824, + "grad_norm": 24495.36328125, + "learning_rate": 1.1408539652803157e-05, + "loss": 4.518, + "step": 2812 + }, + { + "epoch": 0.7152073732718894, + "grad_norm": 24609.59765625, + "learning_rate": 1.138991856670906e-05, + "loss": 4.531, + "step": 2813 + }, + { + "epoch": 0.7154616240266963, + "grad_norm": 24586.6640625, + "learning_rate": 1.1371308205664705e-05, + "loss": 4.5214, + "step": 2814 + }, + { + "epoch": 0.7157158747815032, + "grad_norm": 24626.869140625, + "learning_rate": 1.1352708584335486e-05, + "loss": 4.5195, + "step": 2815 + }, + { + "epoch": 0.7159701255363102, + "grad_norm": 24586.49609375, + "learning_rate": 1.133411971737832e-05, + "loss": 4.5166, + "step": 2816 + }, + { + "epoch": 0.7162243762911171, + "grad_norm": 24509.5234375, + "learning_rate": 1.1315541619441636e-05, + "loss": 4.5202, + "step": 2817 + }, + { + "epoch": 0.716478627045924, + "grad_norm": 24551.453125, + "learning_rate": 1.1296974305165414e-05, + "loss": 4.526, + "step": 2818 + }, + { + "epoch": 0.7167328778007309, + "grad_norm": 24518.50390625, + "learning_rate": 1.1278417789181104e-05, + "loss": 4.5134, + "step": 2819 + }, + { + "epoch": 0.7169871285555379, + "grad_norm": 24575.814453125, + "learning_rate": 1.125987208611165e-05, + "loss": 4.526, + "step": 2820 + }, + { + "epoch": 0.7172413793103448, + "grad_norm": 24721.21875, + "learning_rate": 1.1241337210571498e-05, + "loss": 4.5257, + "step": 2821 + }, + { + "epoch": 0.7174956300651517, + "grad_norm": 24604.32421875, + "learning_rate": 1.122281317716653e-05, + "loss": 4.5222, + "step": 2822 + }, + { + "epoch": 0.7177498808199587, + "grad_norm": 24642.431640625, + "learning_rate": 1.1204300000494117e-05, + "loss": 4.5227, + "step": 2823 + }, + { + "epoch": 0.7180041315747656, + "grad_norm": 24788.953125, + "learning_rate": 1.118579769514304e-05, + "loss": 4.5049, + "step": 2824 + }, + { + "epoch": 0.7182583823295725, + "grad_norm": 24547.365234375, + "learning_rate": 1.1167306275693553e-05, + "loss": 4.4986, + "step": 2825 + }, + { + "epoch": 0.7185126330843795, + "grad_norm": 24592.56640625, + "learning_rate": 1.1148825756717296e-05, + "loss": 4.5227, + "step": 2826 + }, + { + "epoch": 0.7187668838391864, + "grad_norm": 24771.935546875, + "learning_rate": 1.1130356152777324e-05, + "loss": 4.5322, + "step": 2827 + }, + { + "epoch": 0.7190211345939933, + "grad_norm": 24512.740234375, + "learning_rate": 1.1111897478428118e-05, + "loss": 4.5186, + "step": 2828 + }, + { + "epoch": 0.7192753853488002, + "grad_norm": 24559.509765625, + "learning_rate": 1.1093449748215522e-05, + "loss": 4.514, + "step": 2829 + }, + { + "epoch": 0.7195296361036072, + "grad_norm": 24490.0625, + "learning_rate": 1.1075012976676748e-05, + "loss": 4.5148, + "step": 2830 + }, + { + "epoch": 0.7197838868584141, + "grad_norm": 24711.111328125, + "learning_rate": 1.1056587178340408e-05, + "loss": 4.518, + "step": 2831 + }, + { + "epoch": 0.720038137613221, + "grad_norm": 24496.298828125, + "learning_rate": 1.1038172367726424e-05, + "loss": 4.5162, + "step": 2832 + }, + { + "epoch": 0.720292388368028, + "grad_norm": 24688.23046875, + "learning_rate": 1.10197685593461e-05, + "loss": 4.515, + "step": 2833 + }, + { + "epoch": 0.7205466391228349, + "grad_norm": 24696.15625, + "learning_rate": 1.100137576770203e-05, + "loss": 4.5215, + "step": 2834 + }, + { + "epoch": 0.7208008898776418, + "grad_norm": 24514.96484375, + "learning_rate": 1.0982994007288166e-05, + "loss": 4.5088, + "step": 2835 + }, + { + "epoch": 0.7210551406324488, + "grad_norm": 24648.576171875, + "learning_rate": 1.0964623292589728e-05, + "loss": 4.5073, + "step": 2836 + }, + { + "epoch": 0.7213093913872557, + "grad_norm": 24753.626953125, + "learning_rate": 1.0946263638083276e-05, + "loss": 4.5111, + "step": 2837 + }, + { + "epoch": 0.7215636421420626, + "grad_norm": 24724.482421875, + "learning_rate": 1.0927915058236615e-05, + "loss": 4.52, + "step": 2838 + }, + { + "epoch": 0.7218178928968695, + "grad_norm": 24680.513671875, + "learning_rate": 1.090957756750883e-05, + "loss": 4.5266, + "step": 2839 + }, + { + "epoch": 0.7220721436516765, + "grad_norm": 24543.712890625, + "learning_rate": 1.0891251180350295e-05, + "loss": 4.5166, + "step": 2840 + }, + { + "epoch": 0.7223263944064834, + "grad_norm": 24585.509765625, + "learning_rate": 1.0872935911202603e-05, + "loss": 4.5157, + "step": 2841 + }, + { + "epoch": 0.7225806451612903, + "grad_norm": 24542.85546875, + "learning_rate": 1.0854631774498591e-05, + "loss": 4.523, + "step": 2842 + }, + { + "epoch": 0.7228348959160973, + "grad_norm": 24622.171875, + "learning_rate": 1.0836338784662348e-05, + "loss": 4.5346, + "step": 2843 + }, + { + "epoch": 0.7230891466709042, + "grad_norm": 24633.845703125, + "learning_rate": 1.0818056956109138e-05, + "loss": 4.5032, + "step": 2844 + }, + { + "epoch": 0.7233433974257111, + "grad_norm": 24724.28515625, + "learning_rate": 1.0799786303245465e-05, + "loss": 4.513, + "step": 2845 + }, + { + "epoch": 0.7235976481805181, + "grad_norm": 24725.634765625, + "learning_rate": 1.0781526840469022e-05, + "loss": 4.5248, + "step": 2846 + }, + { + "epoch": 0.723851898935325, + "grad_norm": 24581.431640625, + "learning_rate": 1.0763278582168671e-05, + "loss": 4.5124, + "step": 2847 + }, + { + "epoch": 0.7241061496901319, + "grad_norm": 24616.732421875, + "learning_rate": 1.0745041542724431e-05, + "loss": 4.52, + "step": 2848 + }, + { + "epoch": 0.7243604004449388, + "grad_norm": 24578.033203125, + "learning_rate": 1.0726815736507526e-05, + "loss": 4.5095, + "step": 2849 + }, + { + "epoch": 0.7246146511997458, + "grad_norm": 24537.380859375, + "learning_rate": 1.0708601177880284e-05, + "loss": 4.5105, + "step": 2850 + }, + { + "epoch": 0.7248689019545527, + "grad_norm": 24634.66015625, + "learning_rate": 1.0690397881196182e-05, + "loss": 4.5236, + "step": 2851 + }, + { + "epoch": 0.7251231527093596, + "grad_norm": 24801.009765625, + "learning_rate": 1.0672205860799841e-05, + "loss": 4.5062, + "step": 2852 + }, + { + "epoch": 0.7253774034641666, + "grad_norm": 24844.7421875, + "learning_rate": 1.0654025131026976e-05, + "loss": 4.5098, + "step": 2853 + }, + { + "epoch": 0.7256316542189735, + "grad_norm": 24626.2578125, + "learning_rate": 1.06358557062044e-05, + "loss": 4.5281, + "step": 2854 + }, + { + "epoch": 0.7258859049737804, + "grad_norm": 24679.78515625, + "learning_rate": 1.0617697600650033e-05, + "loss": 4.5121, + "step": 2855 + }, + { + "epoch": 0.7261401557285874, + "grad_norm": 24709.298828125, + "learning_rate": 1.0599550828672886e-05, + "loss": 4.5254, + "step": 2856 + }, + { + "epoch": 0.7263944064833943, + "grad_norm": 24571.041015625, + "learning_rate": 1.0581415404573008e-05, + "loss": 4.5176, + "step": 2857 + }, + { + "epoch": 0.7266486572382012, + "grad_norm": 24846.720703125, + "learning_rate": 1.0563291342641515e-05, + "loss": 4.5203, + "step": 2858 + }, + { + "epoch": 0.726902907993008, + "grad_norm": 24602.583984375, + "learning_rate": 1.054517865716059e-05, + "loss": 4.5012, + "step": 2859 + }, + { + "epoch": 0.7271571587478151, + "grad_norm": 24707.646484375, + "learning_rate": 1.052707736240343e-05, + "loss": 4.5106, + "step": 2860 + }, + { + "epoch": 0.727411409502622, + "grad_norm": 24561.548828125, + "learning_rate": 1.0508987472634249e-05, + "loss": 4.5209, + "step": 2861 + }, + { + "epoch": 0.7276656602574288, + "grad_norm": 24707.3828125, + "learning_rate": 1.0490909002108303e-05, + "loss": 4.4947, + "step": 2862 + }, + { + "epoch": 0.7279199110122359, + "grad_norm": 24711.82421875, + "learning_rate": 1.0472841965071831e-05, + "loss": 4.5269, + "step": 2863 + }, + { + "epoch": 0.7281741617670427, + "grad_norm": 24861.1484375, + "learning_rate": 1.0454786375762049e-05, + "loss": 4.5303, + "step": 2864 + }, + { + "epoch": 0.7284284125218496, + "grad_norm": 24730.208984375, + "learning_rate": 1.0436742248407177e-05, + "loss": 4.5274, + "step": 2865 + }, + { + "epoch": 0.7286826632766567, + "grad_norm": 24674.330078125, + "learning_rate": 1.04187095972264e-05, + "loss": 4.5066, + "step": 2866 + }, + { + "epoch": 0.7289369140314635, + "grad_norm": 24659.705078125, + "learning_rate": 1.0400688436429837e-05, + "loss": 4.5086, + "step": 2867 + }, + { + "epoch": 0.7291911647862704, + "grad_norm": 24667.701171875, + "learning_rate": 1.0382678780218585e-05, + "loss": 4.5005, + "step": 2868 + }, + { + "epoch": 0.7294454155410773, + "grad_norm": 24320.171875, + "learning_rate": 1.0364680642784646e-05, + "loss": 4.52, + "step": 2869 + }, + { + "epoch": 0.7296996662958843, + "grad_norm": 24739.58984375, + "learning_rate": 1.034669403831095e-05, + "loss": 4.5138, + "step": 2870 + }, + { + "epoch": 0.7299539170506912, + "grad_norm": 24748.58984375, + "learning_rate": 1.0328718980971361e-05, + "loss": 4.5272, + "step": 2871 + }, + { + "epoch": 0.7302081678054981, + "grad_norm": 24510.892578125, + "learning_rate": 1.0310755484930621e-05, + "loss": 4.507, + "step": 2872 + }, + { + "epoch": 0.7304624185603051, + "grad_norm": 24731.041015625, + "learning_rate": 1.0292803564344358e-05, + "loss": 4.5131, + "step": 2873 + }, + { + "epoch": 0.730716669315112, + "grad_norm": 24497.859375, + "learning_rate": 1.0274863233359106e-05, + "loss": 4.5071, + "step": 2874 + }, + { + "epoch": 0.7309709200699189, + "grad_norm": 24641.732421875, + "learning_rate": 1.0256934506112228e-05, + "loss": 4.5205, + "step": 2875 + }, + { + "epoch": 0.7312251708247259, + "grad_norm": 24699.0546875, + "learning_rate": 1.0239017396731978e-05, + "loss": 4.5069, + "step": 2876 + }, + { + "epoch": 0.7314794215795328, + "grad_norm": 24471.77734375, + "learning_rate": 1.0221111919337451e-05, + "loss": 4.5017, + "step": 2877 + }, + { + "epoch": 0.7317336723343397, + "grad_norm": 24584.58203125, + "learning_rate": 1.0203218088038546e-05, + "loss": 4.5266, + "step": 2878 + }, + { + "epoch": 0.7319879230891466, + "grad_norm": 24751.947265625, + "learning_rate": 1.0185335916936006e-05, + "loss": 4.5191, + "step": 2879 + }, + { + "epoch": 0.7322421738439536, + "grad_norm": 24693.267578125, + "learning_rate": 1.0167465420121394e-05, + "loss": 4.5181, + "step": 2880 + }, + { + "epoch": 0.7324964245987605, + "grad_norm": 24565.5859375, + "learning_rate": 1.0149606611677057e-05, + "loss": 4.5209, + "step": 2881 + }, + { + "epoch": 0.7327506753535674, + "grad_norm": 24728.08984375, + "learning_rate": 1.0131759505676128e-05, + "loss": 4.5137, + "step": 2882 + }, + { + "epoch": 0.7330049261083744, + "grad_norm": 24549.197265625, + "learning_rate": 1.0113924116182545e-05, + "loss": 4.5203, + "step": 2883 + }, + { + "epoch": 0.7332591768631813, + "grad_norm": 24702.048828125, + "learning_rate": 1.0096100457250982e-05, + "loss": 4.5081, + "step": 2884 + }, + { + "epoch": 0.7335134276179882, + "grad_norm": 24607.986328125, + "learning_rate": 1.0078288542926881e-05, + "loss": 4.5141, + "step": 2885 + }, + { + "epoch": 0.7337676783727952, + "grad_norm": 24852.66015625, + "learning_rate": 1.0060488387246433e-05, + "loss": 4.4966, + "step": 2886 + }, + { + "epoch": 0.7340219291276021, + "grad_norm": 24571.57421875, + "learning_rate": 1.0042700004236574e-05, + "loss": 4.5055, + "step": 2887 + }, + { + "epoch": 0.734276179882409, + "grad_norm": 24748.0625, + "learning_rate": 1.0024923407914937e-05, + "loss": 4.5174, + "step": 2888 + }, + { + "epoch": 0.7345304306372159, + "grad_norm": 24663.998046875, + "learning_rate": 1.0007158612289875e-05, + "loss": 4.5018, + "step": 2889 + }, + { + "epoch": 0.7347846813920229, + "grad_norm": 24664.279296875, + "learning_rate": 9.989405631360454e-06, + "loss": 4.518, + "step": 2890 + }, + { + "epoch": 0.7350389321468298, + "grad_norm": 24721.41796875, + "learning_rate": 9.971664479116424e-06, + "loss": 4.5194, + "step": 2891 + }, + { + "epoch": 0.7352931829016367, + "grad_norm": 24771.279296875, + "learning_rate": 9.953935169538195e-06, + "loss": 4.5195, + "step": 2892 + }, + { + "epoch": 0.7355474336564437, + "grad_norm": 24779.580078125, + "learning_rate": 9.93621771659688e-06, + "loss": 4.5154, + "step": 2893 + }, + { + "epoch": 0.7358016844112506, + "grad_norm": 24772.01171875, + "learning_rate": 9.918512134254224e-06, + "loss": 4.5082, + "step": 2894 + }, + { + "epoch": 0.7360559351660575, + "grad_norm": 24735.28515625, + "learning_rate": 9.900818436462608e-06, + "loss": 4.5153, + "step": 2895 + }, + { + "epoch": 0.7363101859208645, + "grad_norm": 24682.697265625, + "learning_rate": 9.88313663716508e-06, + "loss": 4.5095, + "step": 2896 + }, + { + "epoch": 0.7365644366756714, + "grad_norm": 24713.830078125, + "learning_rate": 9.865466750295299e-06, + "loss": 4.5085, + "step": 2897 + }, + { + "epoch": 0.7368186874304783, + "grad_norm": 24520.05078125, + "learning_rate": 9.847808789777516e-06, + "loss": 4.5033, + "step": 2898 + }, + { + "epoch": 0.7370729381852852, + "grad_norm": 24690.03125, + "learning_rate": 9.830162769526616e-06, + "loss": 4.5131, + "step": 2899 + }, + { + "epoch": 0.7373271889400922, + "grad_norm": 24717.259765625, + "learning_rate": 9.812528703448053e-06, + "loss": 4.5158, + "step": 2900 + }, + { + "epoch": 0.7373271889400922, + "eval_loss": 9.091280937194824, + "eval_runtime": 699.6815, + "eval_samples_per_second": 151.477, + "eval_steps_per_second": 9.469, + "step": 2900 + }, + { + "epoch": 0.7375814396948991, + "grad_norm": 24643.61328125, + "learning_rate": 9.79490660543786e-06, + "loss": 4.5028, + "step": 2901 + }, + { + "epoch": 0.737835690449706, + "grad_norm": 24695.962890625, + "learning_rate": 9.77729648938266e-06, + "loss": 4.5053, + "step": 2902 + }, + { + "epoch": 0.738089941204513, + "grad_norm": 24660.46484375, + "learning_rate": 9.759698369159608e-06, + "loss": 4.5258, + "step": 2903 + }, + { + "epoch": 0.7383441919593199, + "grad_norm": 24826.34375, + "learning_rate": 9.742112258636415e-06, + "loss": 4.5041, + "step": 2904 + }, + { + "epoch": 0.7385984427141268, + "grad_norm": 24553.201171875, + "learning_rate": 9.72453817167135e-06, + "loss": 4.5012, + "step": 2905 + }, + { + "epoch": 0.7388526934689338, + "grad_norm": 24939.783203125, + "learning_rate": 9.706976122113162e-06, + "loss": 4.519, + "step": 2906 + }, + { + "epoch": 0.7391069442237407, + "grad_norm": 24790.2109375, + "learning_rate": 9.689426123801157e-06, + "loss": 4.5105, + "step": 2907 + }, + { + "epoch": 0.7393611949785476, + "grad_norm": 24654.67578125, + "learning_rate": 9.671888190565132e-06, + "loss": 4.509, + "step": 2908 + }, + { + "epoch": 0.7396154457333545, + "grad_norm": 24797.306640625, + "learning_rate": 9.654362336225368e-06, + "loss": 4.5131, + "step": 2909 + }, + { + "epoch": 0.7398696964881615, + "grad_norm": 24856.2265625, + "learning_rate": 9.636848574592616e-06, + "loss": 4.4883, + "step": 2910 + }, + { + "epoch": 0.7401239472429684, + "grad_norm": 24707.658203125, + "learning_rate": 9.619346919468136e-06, + "loss": 4.5157, + "step": 2911 + }, + { + "epoch": 0.7403781979977753, + "grad_norm": 24921.486328125, + "learning_rate": 9.601857384643617e-06, + "loss": 4.5139, + "step": 2912 + }, + { + "epoch": 0.7406324487525823, + "grad_norm": 25025.767578125, + "learning_rate": 9.584379983901193e-06, + "loss": 4.509, + "step": 2913 + }, + { + "epoch": 0.7408866995073892, + "grad_norm": 24735.580078125, + "learning_rate": 9.566914731013469e-06, + "loss": 4.507, + "step": 2914 + }, + { + "epoch": 0.7411409502621961, + "grad_norm": 24881.625, + "learning_rate": 9.549461639743445e-06, + "loss": 4.5001, + "step": 2915 + }, + { + "epoch": 0.7413952010170031, + "grad_norm": 24822.53125, + "learning_rate": 9.53202072384454e-06, + "loss": 4.5144, + "step": 2916 + }, + { + "epoch": 0.74164945177181, + "grad_norm": 24711.216796875, + "learning_rate": 9.5145919970606e-06, + "loss": 4.5049, + "step": 2917 + }, + { + "epoch": 0.7419037025266169, + "grad_norm": 24874.71875, + "learning_rate": 9.497175473125854e-06, + "loss": 4.511, + "step": 2918 + }, + { + "epoch": 0.7421579532814238, + "grad_norm": 24836.255859375, + "learning_rate": 9.479771165764916e-06, + "loss": 4.5082, + "step": 2919 + }, + { + "epoch": 0.7424122040362308, + "grad_norm": 24491.640625, + "learning_rate": 9.462379088692752e-06, + "loss": 4.4925, + "step": 2920 + }, + { + "epoch": 0.7426664547910377, + "grad_norm": 24887.7265625, + "learning_rate": 9.444999255614734e-06, + "loss": 4.4999, + "step": 2921 + }, + { + "epoch": 0.7429207055458446, + "grad_norm": 24762.064453125, + "learning_rate": 9.427631680226548e-06, + "loss": 4.505, + "step": 2922 + }, + { + "epoch": 0.7431749563006516, + "grad_norm": 24669.28515625, + "learning_rate": 9.410276376214225e-06, + "loss": 4.498, + "step": 2923 + }, + { + "epoch": 0.7434292070554585, + "grad_norm": 24822.990234375, + "learning_rate": 9.392933357254151e-06, + "loss": 4.5174, + "step": 2924 + }, + { + "epoch": 0.7436834578102653, + "grad_norm": 24809.12890625, + "learning_rate": 9.37560263701301e-06, + "loss": 4.5119, + "step": 2925 + }, + { + "epoch": 0.7439377085650724, + "grad_norm": 24701.10546875, + "learning_rate": 9.358284229147785e-06, + "loss": 4.5113, + "step": 2926 + }, + { + "epoch": 0.7441919593198792, + "grad_norm": 24921.001953125, + "learning_rate": 9.34097814730578e-06, + "loss": 4.5091, + "step": 2927 + }, + { + "epoch": 0.7444462100746861, + "grad_norm": 24737.771484375, + "learning_rate": 9.323684405124586e-06, + "loss": 4.5099, + "step": 2928 + }, + { + "epoch": 0.744700460829493, + "grad_norm": 24839.58984375, + "learning_rate": 9.306403016232042e-06, + "loss": 4.5022, + "step": 2929 + }, + { + "epoch": 0.7449547115843, + "grad_norm": 24923.19921875, + "learning_rate": 9.289133994246288e-06, + "loss": 4.5225, + "step": 2930 + }, + { + "epoch": 0.7452089623391069, + "grad_norm": 24689.068359375, + "learning_rate": 9.271877352775693e-06, + "loss": 4.507, + "step": 2931 + }, + { + "epoch": 0.7454632130939138, + "grad_norm": 24819.75390625, + "learning_rate": 9.25463310541887e-06, + "loss": 4.511, + "step": 2932 + }, + { + "epoch": 0.7457174638487208, + "grad_norm": 24796.404296875, + "learning_rate": 9.237401265764687e-06, + "loss": 4.5036, + "step": 2933 + }, + { + "epoch": 0.7459717146035277, + "grad_norm": 24705.724609375, + "learning_rate": 9.220181847392215e-06, + "loss": 4.5241, + "step": 2934 + }, + { + "epoch": 0.7462259653583346, + "grad_norm": 24488.142578125, + "learning_rate": 9.202974863870734e-06, + "loss": 4.5104, + "step": 2935 + }, + { + "epoch": 0.7464802161131416, + "grad_norm": 24748.849609375, + "learning_rate": 9.185780328759746e-06, + "loss": 4.487, + "step": 2936 + }, + { + "epoch": 0.7467344668679485, + "grad_norm": 24702.623046875, + "learning_rate": 9.168598255608917e-06, + "loss": 4.5082, + "step": 2937 + }, + { + "epoch": 0.7469887176227554, + "grad_norm": 24454.08203125, + "learning_rate": 9.151428657958119e-06, + "loss": 4.4932, + "step": 2938 + }, + { + "epoch": 0.7472429683775623, + "grad_norm": 24816.328125, + "learning_rate": 9.13427154933738e-06, + "loss": 4.5216, + "step": 2939 + }, + { + "epoch": 0.7474972191323693, + "grad_norm": 24859.419921875, + "learning_rate": 9.117126943266887e-06, + "loss": 4.5138, + "step": 2940 + }, + { + "epoch": 0.7477514698871762, + "grad_norm": 24800.67578125, + "learning_rate": 9.09999485325696e-06, + "loss": 4.5021, + "step": 2941 + }, + { + "epoch": 0.7480057206419831, + "grad_norm": 24711.828125, + "learning_rate": 9.082875292808091e-06, + "loss": 4.5023, + "step": 2942 + }, + { + "epoch": 0.7482599713967901, + "grad_norm": 24823.349609375, + "learning_rate": 9.065768275410865e-06, + "loss": 4.5068, + "step": 2943 + }, + { + "epoch": 0.748514222151597, + "grad_norm": 24625.912109375, + "learning_rate": 9.048673814545994e-06, + "loss": 4.4879, + "step": 2944 + }, + { + "epoch": 0.7487684729064039, + "grad_norm": 24630.763671875, + "learning_rate": 9.03159192368431e-06, + "loss": 4.5103, + "step": 2945 + }, + { + "epoch": 0.7490227236612109, + "grad_norm": 24870.470703125, + "learning_rate": 9.014522616286717e-06, + "loss": 4.514, + "step": 2946 + }, + { + "epoch": 0.7492769744160178, + "grad_norm": 24770.916015625, + "learning_rate": 8.997465905804205e-06, + "loss": 4.4901, + "step": 2947 + }, + { + "epoch": 0.7495312251708247, + "grad_norm": 24852.30859375, + "learning_rate": 8.980421805677855e-06, + "loss": 4.4983, + "step": 2948 + }, + { + "epoch": 0.7497854759256316, + "grad_norm": 24658.908203125, + "learning_rate": 8.963390329338808e-06, + "loss": 4.5004, + "step": 2949 + }, + { + "epoch": 0.7500397266804386, + "grad_norm": 24816.4453125, + "learning_rate": 8.946371490208241e-06, + "loss": 4.5079, + "step": 2950 + }, + { + "epoch": 0.7502939774352455, + "grad_norm": 24840.474609375, + "learning_rate": 8.929365301697373e-06, + "loss": 4.495, + "step": 2951 + }, + { + "epoch": 0.7505482281900524, + "grad_norm": 24781.185546875, + "learning_rate": 8.912371777207478e-06, + "loss": 4.496, + "step": 2952 + }, + { + "epoch": 0.7508024789448594, + "grad_norm": 24763.560546875, + "learning_rate": 8.89539093012983e-06, + "loss": 4.5163, + "step": 2953 + }, + { + "epoch": 0.7510567296996663, + "grad_norm": 24786.45703125, + "learning_rate": 8.878422773845704e-06, + "loss": 4.5086, + "step": 2954 + }, + { + "epoch": 0.7513109804544732, + "grad_norm": 24827.353515625, + "learning_rate": 8.86146732172641e-06, + "loss": 4.4979, + "step": 2955 + }, + { + "epoch": 0.7515652312092802, + "grad_norm": 24724.4140625, + "learning_rate": 8.844524587133216e-06, + "loss": 4.4954, + "step": 2956 + }, + { + "epoch": 0.7518194819640871, + "grad_norm": 24726.474609375, + "learning_rate": 8.827594583417365e-06, + "loss": 4.5073, + "step": 2957 + }, + { + "epoch": 0.752073732718894, + "grad_norm": 24840.26171875, + "learning_rate": 8.81067732392009e-06, + "loss": 4.475, + "step": 2958 + }, + { + "epoch": 0.7523279834737009, + "grad_norm": 24783.5703125, + "learning_rate": 8.793772821972582e-06, + "loss": 4.5137, + "step": 2959 + }, + { + "epoch": 0.7525822342285079, + "grad_norm": 24860.142578125, + "learning_rate": 8.77688109089595e-06, + "loss": 4.5024, + "step": 2960 + }, + { + "epoch": 0.7528364849833148, + "grad_norm": 24594.4921875, + "learning_rate": 8.760002144001272e-06, + "loss": 4.4995, + "step": 2961 + }, + { + "epoch": 0.7530907357381217, + "grad_norm": 24607.1484375, + "learning_rate": 8.743135994589533e-06, + "loss": 4.4845, + "step": 2962 + }, + { + "epoch": 0.7533449864929287, + "grad_norm": 24703.84375, + "learning_rate": 8.726282655951625e-06, + "loss": 4.4897, + "step": 2963 + }, + { + "epoch": 0.7535992372477356, + "grad_norm": 24817.990234375, + "learning_rate": 8.70944214136838e-06, + "loss": 4.4955, + "step": 2964 + }, + { + "epoch": 0.7538534880025425, + "grad_norm": 24610.814453125, + "learning_rate": 8.692614464110486e-06, + "loss": 4.5021, + "step": 2965 + }, + { + "epoch": 0.7541077387573495, + "grad_norm": 24879.591796875, + "learning_rate": 8.675799637438522e-06, + "loss": 4.5059, + "step": 2966 + }, + { + "epoch": 0.7543619895121564, + "grad_norm": 24550.423828125, + "learning_rate": 8.658997674602973e-06, + "loss": 4.4913, + "step": 2967 + }, + { + "epoch": 0.7546162402669633, + "grad_norm": 24834.60546875, + "learning_rate": 8.642208588844139e-06, + "loss": 4.5187, + "step": 2968 + }, + { + "epoch": 0.7548704910217702, + "grad_norm": 24763.9765625, + "learning_rate": 8.625432393392219e-06, + "loss": 4.5108, + "step": 2969 + }, + { + "epoch": 0.7551247417765772, + "grad_norm": 24771.94921875, + "learning_rate": 8.60866910146721e-06, + "loss": 4.5151, + "step": 2970 + }, + { + "epoch": 0.7553789925313841, + "grad_norm": 24773.953125, + "learning_rate": 8.591918726278982e-06, + "loss": 4.509, + "step": 2971 + }, + { + "epoch": 0.755633243286191, + "grad_norm": 24784.517578125, + "learning_rate": 8.575181281027192e-06, + "loss": 4.5014, + "step": 2972 + }, + { + "epoch": 0.755887494040998, + "grad_norm": 24757.48828125, + "learning_rate": 8.558456778901333e-06, + "loss": 4.5124, + "step": 2973 + }, + { + "epoch": 0.7561417447958049, + "grad_norm": 24694.72265625, + "learning_rate": 8.541745233080687e-06, + "loss": 4.5031, + "step": 2974 + }, + { + "epoch": 0.7563959955506118, + "grad_norm": 24608.060546875, + "learning_rate": 8.525046656734312e-06, + "loss": 4.5034, + "step": 2975 + }, + { + "epoch": 0.7566502463054188, + "grad_norm": 24574.517578125, + "learning_rate": 8.508361063021084e-06, + "loss": 4.4907, + "step": 2976 + }, + { + "epoch": 0.7569044970602257, + "grad_norm": 24649.9609375, + "learning_rate": 8.491688465089612e-06, + "loss": 4.5132, + "step": 2977 + }, + { + "epoch": 0.7571587478150326, + "grad_norm": 24596.609375, + "learning_rate": 8.475028876078272e-06, + "loss": 4.4972, + "step": 2978 + }, + { + "epoch": 0.7574129985698395, + "grad_norm": 24690.931640625, + "learning_rate": 8.458382309115212e-06, + "loss": 4.5074, + "step": 2979 + }, + { + "epoch": 0.7576672493246465, + "grad_norm": 24754.642578125, + "learning_rate": 8.44174877731828e-06, + "loss": 4.4943, + "step": 2980 + }, + { + "epoch": 0.7579215000794534, + "grad_norm": 24791.7421875, + "learning_rate": 8.425128293795095e-06, + "loss": 4.5083, + "step": 2981 + }, + { + "epoch": 0.7581757508342603, + "grad_norm": 24782.58203125, + "learning_rate": 8.40852087164295e-06, + "loss": 4.4886, + "step": 2982 + }, + { + "epoch": 0.7584300015890673, + "grad_norm": 24663.4140625, + "learning_rate": 8.391926523948883e-06, + "loss": 4.5062, + "step": 2983 + }, + { + "epoch": 0.7586842523438742, + "grad_norm": 24547.46875, + "learning_rate": 8.375345263789608e-06, + "loss": 4.4846, + "step": 2984 + }, + { + "epoch": 0.758938503098681, + "grad_norm": 24608.7109375, + "learning_rate": 8.358777104231516e-06, + "loss": 4.4955, + "step": 2985 + }, + { + "epoch": 0.7591927538534881, + "grad_norm": 24791.587890625, + "learning_rate": 8.342222058330708e-06, + "loss": 4.495, + "step": 2986 + }, + { + "epoch": 0.759447004608295, + "grad_norm": 25059.408203125, + "learning_rate": 8.32568013913293e-06, + "loss": 4.5227, + "step": 2987 + }, + { + "epoch": 0.7597012553631018, + "grad_norm": 24596.451171875, + "learning_rate": 8.309151359673567e-06, + "loss": 4.5026, + "step": 2988 + }, + { + "epoch": 0.7599555061179087, + "grad_norm": 24864.330078125, + "learning_rate": 8.29263573297769e-06, + "loss": 4.4971, + "step": 2989 + }, + { + "epoch": 0.7602097568727157, + "grad_norm": 24871.751953125, + "learning_rate": 8.276133272059968e-06, + "loss": 4.5126, + "step": 2990 + }, + { + "epoch": 0.7604640076275226, + "grad_norm": 24624.2265625, + "learning_rate": 8.259643989924718e-06, + "loss": 4.4852, + "step": 2991 + }, + { + "epoch": 0.7607182583823295, + "grad_norm": 24952.94921875, + "learning_rate": 8.243167899565874e-06, + "loss": 4.515, + "step": 2992 + }, + { + "epoch": 0.7609725091371365, + "grad_norm": 24933.1796875, + "learning_rate": 8.22670501396695e-06, + "loss": 4.5153, + "step": 2993 + }, + { + "epoch": 0.7612267598919434, + "grad_norm": 24772.37109375, + "learning_rate": 8.210255346101073e-06, + "loss": 4.4977, + "step": 2994 + }, + { + "epoch": 0.7614810106467503, + "grad_norm": 24786.470703125, + "learning_rate": 8.19381890893096e-06, + "loss": 4.4987, + "step": 2995 + }, + { + "epoch": 0.7617352614015573, + "grad_norm": 24628.978515625, + "learning_rate": 8.177395715408881e-06, + "loss": 4.4863, + "step": 2996 + }, + { + "epoch": 0.7619895121563642, + "grad_norm": 24777.875, + "learning_rate": 8.16098577847668e-06, + "loss": 4.4865, + "step": 2997 + }, + { + "epoch": 0.7622437629111711, + "grad_norm": 24735.517578125, + "learning_rate": 8.144589111065767e-06, + "loss": 4.4895, + "step": 2998 + }, + { + "epoch": 0.762498013665978, + "grad_norm": 24796.296875, + "learning_rate": 8.12820572609708e-06, + "loss": 4.5068, + "step": 2999 + }, + { + "epoch": 0.762752264420785, + "grad_norm": 24632.31640625, + "learning_rate": 8.111835636481083e-06, + "loss": 4.5016, + "step": 3000 + }, + { + "epoch": 0.762752264420785, + "eval_loss": 9.066652297973633, + "eval_runtime": 700.0909, + "eval_samples_per_second": 151.389, + "eval_steps_per_second": 9.463, + "step": 3000 + }, + { + "epoch": 0.7630065151755919, + "grad_norm": 24812.98046875, + "learning_rate": 8.095478855117786e-06, + "loss": 4.4949, + "step": 3001 + }, + { + "epoch": 0.7632607659303988, + "grad_norm": 24744.34765625, + "learning_rate": 8.079135394896704e-06, + "loss": 4.513, + "step": 3002 + }, + { + "epoch": 0.7635150166852058, + "grad_norm": 24719.552734375, + "learning_rate": 8.062805268696836e-06, + "loss": 4.4922, + "step": 3003 + }, + { + "epoch": 0.7637692674400127, + "grad_norm": 24782.23046875, + "learning_rate": 8.046488489386703e-06, + "loss": 4.4881, + "step": 3004 + }, + { + "epoch": 0.7640235181948196, + "grad_norm": 24790.525390625, + "learning_rate": 8.030185069824286e-06, + "loss": 4.5201, + "step": 3005 + }, + { + "epoch": 0.7642777689496266, + "grad_norm": 24780.771484375, + "learning_rate": 8.013895022857041e-06, + "loss": 4.4881, + "step": 3006 + }, + { + "epoch": 0.7645320197044335, + "grad_norm": 24717.3828125, + "learning_rate": 7.9976183613219e-06, + "loss": 4.4951, + "step": 3007 + }, + { + "epoch": 0.7647862704592404, + "grad_norm": 24726.548828125, + "learning_rate": 7.98135509804524e-06, + "loss": 4.4978, + "step": 3008 + }, + { + "epoch": 0.7650405212140473, + "grad_norm": 24805.623046875, + "learning_rate": 7.965105245842861e-06, + "loss": 4.5157, + "step": 3009 + }, + { + "epoch": 0.7652947719688543, + "grad_norm": 24728.56640625, + "learning_rate": 7.948868817520028e-06, + "loss": 4.5039, + "step": 3010 + }, + { + "epoch": 0.7655490227236612, + "grad_norm": 24716.193359375, + "learning_rate": 7.932645825871397e-06, + "loss": 4.5065, + "step": 3011 + }, + { + "epoch": 0.7658032734784681, + "grad_norm": 24814.25390625, + "learning_rate": 7.916436283681064e-06, + "loss": 4.5049, + "step": 3012 + }, + { + "epoch": 0.7660575242332751, + "grad_norm": 24650.6328125, + "learning_rate": 7.9002402037225e-06, + "loss": 4.4915, + "step": 3013 + }, + { + "epoch": 0.766311774988082, + "grad_norm": 24666.185546875, + "learning_rate": 7.884057598758594e-06, + "loss": 4.4985, + "step": 3014 + }, + { + "epoch": 0.7665660257428889, + "grad_norm": 24725.87109375, + "learning_rate": 7.867888481541592e-06, + "loss": 4.5065, + "step": 3015 + }, + { + "epoch": 0.7668202764976959, + "grad_norm": 24871.794921875, + "learning_rate": 7.851732864813116e-06, + "loss": 4.4943, + "step": 3016 + }, + { + "epoch": 0.7670745272525028, + "grad_norm": 24606.994140625, + "learning_rate": 7.835590761304168e-06, + "loss": 4.4942, + "step": 3017 + }, + { + "epoch": 0.7673287780073097, + "grad_norm": 24957.333984375, + "learning_rate": 7.819462183735083e-06, + "loss": 4.507, + "step": 3018 + }, + { + "epoch": 0.7675830287621166, + "grad_norm": 24821.9296875, + "learning_rate": 7.803347144815531e-06, + "loss": 4.4932, + "step": 3019 + }, + { + "epoch": 0.7678372795169236, + "grad_norm": 24766.341796875, + "learning_rate": 7.787245657244543e-06, + "loss": 4.4862, + "step": 3020 + }, + { + "epoch": 0.7680915302717305, + "grad_norm": 24696.41796875, + "learning_rate": 7.771157733710435e-06, + "loss": 4.4828, + "step": 3021 + }, + { + "epoch": 0.7683457810265374, + "grad_norm": 24908.318359375, + "learning_rate": 7.755083386890864e-06, + "loss": 4.5207, + "step": 3022 + }, + { + "epoch": 0.7686000317813444, + "grad_norm": 24870.7265625, + "learning_rate": 7.739022629452777e-06, + "loss": 4.4882, + "step": 3023 + }, + { + "epoch": 0.7688542825361513, + "grad_norm": 24655.09375, + "learning_rate": 7.72297547405241e-06, + "loss": 4.4829, + "step": 3024 + }, + { + "epoch": 0.7691085332909582, + "grad_norm": 25012.5, + "learning_rate": 7.70694193333527e-06, + "loss": 4.5009, + "step": 3025 + }, + { + "epoch": 0.7693627840457652, + "grad_norm": 24816.1953125, + "learning_rate": 7.690922019936164e-06, + "loss": 4.4962, + "step": 3026 + }, + { + "epoch": 0.7696170348005721, + "grad_norm": 24783.46484375, + "learning_rate": 7.67491574647914e-06, + "loss": 4.4841, + "step": 3027 + }, + { + "epoch": 0.769871285555379, + "grad_norm": 24930.853515625, + "learning_rate": 7.658923125577483e-06, + "loss": 4.4999, + "step": 3028 + }, + { + "epoch": 0.7701255363101859, + "grad_norm": 24910.69921875, + "learning_rate": 7.642944169833763e-06, + "loss": 4.4916, + "step": 3029 + }, + { + "epoch": 0.7703797870649929, + "grad_norm": 24702.07421875, + "learning_rate": 7.626978891839742e-06, + "loss": 4.4825, + "step": 3030 + }, + { + "epoch": 0.7706340378197998, + "grad_norm": 24853.23046875, + "learning_rate": 7.61102730417641e-06, + "loss": 4.4849, + "step": 3031 + }, + { + "epoch": 0.7708882885746067, + "grad_norm": 24865.048828125, + "learning_rate": 7.595089419413989e-06, + "loss": 4.4957, + "step": 3032 + }, + { + "epoch": 0.7711425393294137, + "grad_norm": 24514.96875, + "learning_rate": 7.579165250111894e-06, + "loss": 4.4942, + "step": 3033 + }, + { + "epoch": 0.7713967900842206, + "grad_norm": 24968.3828125, + "learning_rate": 7.563254808818712e-06, + "loss": 4.5064, + "step": 3034 + }, + { + "epoch": 0.7716510408390275, + "grad_norm": 24628.16015625, + "learning_rate": 7.547358108072244e-06, + "loss": 4.5035, + "step": 3035 + }, + { + "epoch": 0.7719052915938344, + "grad_norm": 24699.173828125, + "learning_rate": 7.53147516039944e-06, + "loss": 4.4921, + "step": 3036 + }, + { + "epoch": 0.7721595423486414, + "grad_norm": 24687.427734375, + "learning_rate": 7.515605978316412e-06, + "loss": 4.4798, + "step": 3037 + }, + { + "epoch": 0.7724137931034483, + "grad_norm": 24806.796875, + "learning_rate": 7.499750574328446e-06, + "loss": 4.4803, + "step": 3038 + }, + { + "epoch": 0.7726680438582552, + "grad_norm": 24806.646484375, + "learning_rate": 7.483908960929947e-06, + "loss": 4.4885, + "step": 3039 + }, + { + "epoch": 0.7729222946130622, + "grad_norm": 24857.46484375, + "learning_rate": 7.4680811506044615e-06, + "loss": 4.4924, + "step": 3040 + }, + { + "epoch": 0.7731765453678691, + "grad_norm": 24780.79296875, + "learning_rate": 7.452267155824666e-06, + "loss": 4.5027, + "step": 3041 + }, + { + "epoch": 0.773430796122676, + "grad_norm": 24662.123046875, + "learning_rate": 7.436466989052332e-06, + "loss": 4.4883, + "step": 3042 + }, + { + "epoch": 0.773685046877483, + "grad_norm": 24766.751953125, + "learning_rate": 7.420680662738363e-06, + "loss": 4.5038, + "step": 3043 + }, + { + "epoch": 0.7739392976322899, + "grad_norm": 24775.390625, + "learning_rate": 7.40490818932272e-06, + "loss": 4.4829, + "step": 3044 + }, + { + "epoch": 0.7741935483870968, + "grad_norm": 24670.78515625, + "learning_rate": 7.389149581234484e-06, + "loss": 4.4984, + "step": 3045 + }, + { + "epoch": 0.7744477991419036, + "grad_norm": 24701.46484375, + "learning_rate": 7.3734048508917826e-06, + "loss": 4.5015, + "step": 3046 + }, + { + "epoch": 0.7747020498967107, + "grad_norm": 24848.77734375, + "learning_rate": 7.357674010701807e-06, + "loss": 4.5075, + "step": 3047 + }, + { + "epoch": 0.7749563006515175, + "grad_norm": 24823.4453125, + "learning_rate": 7.341957073060832e-06, + "loss": 4.4872, + "step": 3048 + }, + { + "epoch": 0.7752105514063244, + "grad_norm": 24722.748046875, + "learning_rate": 7.326254050354151e-06, + "loss": 4.4839, + "step": 3049 + }, + { + "epoch": 0.7754648021611315, + "grad_norm": 24862.36328125, + "learning_rate": 7.310564954956084e-06, + "loss": 4.4994, + "step": 3050 + }, + { + "epoch": 0.7757190529159383, + "grad_norm": 24736.33203125, + "learning_rate": 7.2948897992300094e-06, + "loss": 4.4816, + "step": 3051 + }, + { + "epoch": 0.7759733036707452, + "grad_norm": 24862.775390625, + "learning_rate": 7.2792285955282896e-06, + "loss": 4.4932, + "step": 3052 + }, + { + "epoch": 0.7762275544255522, + "grad_norm": 24790.2890625, + "learning_rate": 7.263581356192306e-06, + "loss": 4.496, + "step": 3053 + }, + { + "epoch": 0.7764818051803591, + "grad_norm": 24886.8984375, + "learning_rate": 7.247948093552448e-06, + "loss": 4.4923, + "step": 3054 + }, + { + "epoch": 0.776736055935166, + "grad_norm": 24792.615234375, + "learning_rate": 7.232328819928069e-06, + "loss": 4.5034, + "step": 3055 + }, + { + "epoch": 0.7769903066899729, + "grad_norm": 24817.869140625, + "learning_rate": 7.216723547627496e-06, + "loss": 4.4993, + "step": 3056 + }, + { + "epoch": 0.7772445574447799, + "grad_norm": 24770.5859375, + "learning_rate": 7.201132288948051e-06, + "loss": 4.4996, + "step": 3057 + }, + { + "epoch": 0.7774988081995868, + "grad_norm": 24777.591796875, + "learning_rate": 7.185555056175991e-06, + "loss": 4.4889, + "step": 3058 + }, + { + "epoch": 0.7777530589543937, + "grad_norm": 24638.62109375, + "learning_rate": 7.169991861586514e-06, + "loss": 4.4882, + "step": 3059 + }, + { + "epoch": 0.7780073097092007, + "grad_norm": 24901.322265625, + "learning_rate": 7.154442717443785e-06, + "loss": 4.4843, + "step": 3060 + }, + { + "epoch": 0.7782615604640076, + "grad_norm": 24750.486328125, + "learning_rate": 7.138907636000866e-06, + "loss": 4.4842, + "step": 3061 + }, + { + "epoch": 0.7785158112188145, + "grad_norm": 24643.51953125, + "learning_rate": 7.123386629499748e-06, + "loss": 4.4899, + "step": 3062 + }, + { + "epoch": 0.7787700619736215, + "grad_norm": 24659.4453125, + "learning_rate": 7.107879710171339e-06, + "loss": 4.4771, + "step": 3063 + }, + { + "epoch": 0.7790243127284284, + "grad_norm": 24744.48828125, + "learning_rate": 7.092386890235444e-06, + "loss": 4.4763, + "step": 3064 + }, + { + "epoch": 0.7792785634832353, + "grad_norm": 24783.048828125, + "learning_rate": 7.076908181900741e-06, + "loss": 4.4981, + "step": 3065 + }, + { + "epoch": 0.7795328142380422, + "grad_norm": 24890.05078125, + "learning_rate": 7.061443597364814e-06, + "loss": 4.4992, + "step": 3066 + }, + { + "epoch": 0.7797870649928492, + "grad_norm": 24672.982421875, + "learning_rate": 7.045993148814095e-06, + "loss": 4.4753, + "step": 3067 + }, + { + "epoch": 0.7800413157476561, + "grad_norm": 24757.85546875, + "learning_rate": 7.030556848423875e-06, + "loss": 4.4962, + "step": 3068 + }, + { + "epoch": 0.780295566502463, + "grad_norm": 24777.28125, + "learning_rate": 7.0151347083583255e-06, + "loss": 4.4848, + "step": 3069 + }, + { + "epoch": 0.78054981725727, + "grad_norm": 24790.796875, + "learning_rate": 6.9997267407704265e-06, + "loss": 4.4847, + "step": 3070 + }, + { + "epoch": 0.7808040680120769, + "grad_norm": 24711.962890625, + "learning_rate": 6.984332957801998e-06, + "loss": 4.488, + "step": 3071 + }, + { + "epoch": 0.7810583187668838, + "grad_norm": 24655.369140625, + "learning_rate": 6.968953371583697e-06, + "loss": 4.4777, + "step": 3072 + }, + { + "epoch": 0.7813125695216908, + "grad_norm": 24812.095703125, + "learning_rate": 6.9535879942349755e-06, + "loss": 4.4973, + "step": 3073 + }, + { + "epoch": 0.7815668202764977, + "grad_norm": 24676.564453125, + "learning_rate": 6.938236837864104e-06, + "loss": 4.5004, + "step": 3074 + }, + { + "epoch": 0.7818210710313046, + "grad_norm": 24703.044921875, + "learning_rate": 6.922899914568126e-06, + "loss": 4.4871, + "step": 3075 + }, + { + "epoch": 0.7820753217861115, + "grad_norm": 24819.53125, + "learning_rate": 6.907577236432896e-06, + "loss": 4.4941, + "step": 3076 + }, + { + "epoch": 0.7823295725409185, + "grad_norm": 24601.890625, + "learning_rate": 6.892268815533021e-06, + "loss": 4.4922, + "step": 3077 + }, + { + "epoch": 0.7825838232957254, + "grad_norm": 24802.498046875, + "learning_rate": 6.876974663931873e-06, + "loss": 4.4829, + "step": 3078 + }, + { + "epoch": 0.7828380740505323, + "grad_norm": 24886.794921875, + "learning_rate": 6.861694793681603e-06, + "loss": 4.4925, + "step": 3079 + }, + { + "epoch": 0.7830923248053393, + "grad_norm": 24908.9609375, + "learning_rate": 6.846429216823083e-06, + "loss": 4.4968, + "step": 3080 + }, + { + "epoch": 0.7833465755601462, + "grad_norm": 24722.369140625, + "learning_rate": 6.831177945385925e-06, + "loss": 4.4965, + "step": 3081 + }, + { + "epoch": 0.7836008263149531, + "grad_norm": 24695.556640625, + "learning_rate": 6.815940991388484e-06, + "loss": 4.498, + "step": 3082 + }, + { + "epoch": 0.7838550770697601, + "grad_norm": 24929.9296875, + "learning_rate": 6.800718366837808e-06, + "loss": 4.5075, + "step": 3083 + }, + { + "epoch": 0.784109327824567, + "grad_norm": 24741.54296875, + "learning_rate": 6.7855100837296765e-06, + "loss": 4.4785, + "step": 3084 + }, + { + "epoch": 0.7843635785793739, + "grad_norm": 24830.31640625, + "learning_rate": 6.770316154048567e-06, + "loss": 4.4762, + "step": 3085 + }, + { + "epoch": 0.7846178293341808, + "grad_norm": 24922.259765625, + "learning_rate": 6.755136589767624e-06, + "loss": 4.4859, + "step": 3086 + }, + { + "epoch": 0.7848720800889878, + "grad_norm": 24668.982421875, + "learning_rate": 6.739971402848683e-06, + "loss": 4.4886, + "step": 3087 + }, + { + "epoch": 0.7851263308437947, + "grad_norm": 24701.212890625, + "learning_rate": 6.724820605242263e-06, + "loss": 4.4833, + "step": 3088 + }, + { + "epoch": 0.7853805815986016, + "grad_norm": 24762.29296875, + "learning_rate": 6.7096842088875285e-06, + "loss": 4.4913, + "step": 3089 + }, + { + "epoch": 0.7856348323534086, + "grad_norm": 24827.857421875, + "learning_rate": 6.694562225712292e-06, + "loss": 4.479, + "step": 3090 + }, + { + "epoch": 0.7858890831082155, + "grad_norm": 24873.296875, + "learning_rate": 6.679454667633025e-06, + "loss": 4.4921, + "step": 3091 + }, + { + "epoch": 0.7861433338630224, + "grad_norm": 24919.337890625, + "learning_rate": 6.664361546554823e-06, + "loss": 4.4928, + "step": 3092 + }, + { + "epoch": 0.7863975846178294, + "grad_norm": 24770.166015625, + "learning_rate": 6.6492828743713904e-06, + "loss": 4.4951, + "step": 3093 + }, + { + "epoch": 0.7866518353726363, + "grad_norm": 24725.470703125, + "learning_rate": 6.634218662965072e-06, + "loss": 4.4937, + "step": 3094 + }, + { + "epoch": 0.7869060861274432, + "grad_norm": 24801.416015625, + "learning_rate": 6.619168924206809e-06, + "loss": 4.4901, + "step": 3095 + }, + { + "epoch": 0.7871603368822501, + "grad_norm": 24770.974609375, + "learning_rate": 6.6041336699561165e-06, + "loss": 4.4758, + "step": 3096 + }, + { + "epoch": 0.7874145876370571, + "grad_norm": 24936.57421875, + "learning_rate": 6.5891129120611325e-06, + "loss": 4.4823, + "step": 3097 + }, + { + "epoch": 0.787668838391864, + "grad_norm": 25135.271484375, + "learning_rate": 6.574106662358542e-06, + "loss": 4.4974, + "step": 3098 + }, + { + "epoch": 0.7879230891466709, + "grad_norm": 24939.65625, + "learning_rate": 6.559114932673599e-06, + "loss": 4.4753, + "step": 3099 + }, + { + "epoch": 0.7881773399014779, + "grad_norm": 24745.9609375, + "learning_rate": 6.544137734820136e-06, + "loss": 4.4644, + "step": 3100 + }, + { + "epoch": 0.7881773399014779, + "eval_loss": 9.046455383300781, + "eval_runtime": 699.1793, + "eval_samples_per_second": 151.586, + "eval_steps_per_second": 9.475, + "step": 3100 + }, + { + "epoch": 0.7884315906562848, + "grad_norm": 25031.130859375, + "learning_rate": 6.529175080600516e-06, + "loss": 4.4777, + "step": 3101 + }, + { + "epoch": 0.7886858414110917, + "grad_norm": 25009.912109375, + "learning_rate": 6.514226981805638e-06, + "loss": 4.495, + "step": 3102 + }, + { + "epoch": 0.7889400921658987, + "grad_norm": 24806.14453125, + "learning_rate": 6.499293450214955e-06, + "loss": 4.4785, + "step": 3103 + }, + { + "epoch": 0.7891943429207056, + "grad_norm": 24932.083984375, + "learning_rate": 6.484374497596413e-06, + "loss": 4.5037, + "step": 3104 + }, + { + "epoch": 0.7894485936755125, + "grad_norm": 24777.408203125, + "learning_rate": 6.46947013570649e-06, + "loss": 4.4774, + "step": 3105 + }, + { + "epoch": 0.7897028444303194, + "grad_norm": 24946.05859375, + "learning_rate": 6.454580376290148e-06, + "loss": 4.4738, + "step": 3106 + }, + { + "epoch": 0.7899570951851264, + "grad_norm": 24691.1796875, + "learning_rate": 6.439705231080867e-06, + "loss": 4.4738, + "step": 3107 + }, + { + "epoch": 0.7902113459399333, + "grad_norm": 24852.51953125, + "learning_rate": 6.424844711800584e-06, + "loss": 4.4829, + "step": 3108 + }, + { + "epoch": 0.7904655966947401, + "grad_norm": 24964.654296875, + "learning_rate": 6.409998830159716e-06, + "loss": 4.4898, + "step": 3109 + }, + { + "epoch": 0.7907198474495472, + "grad_norm": 24723.6171875, + "learning_rate": 6.395167597857169e-06, + "loss": 4.4787, + "step": 3110 + }, + { + "epoch": 0.790974098204354, + "grad_norm": 24948.20703125, + "learning_rate": 6.380351026580275e-06, + "loss": 4.4911, + "step": 3111 + }, + { + "epoch": 0.7912283489591609, + "grad_norm": 24780.328125, + "learning_rate": 6.365549128004822e-06, + "loss": 4.4809, + "step": 3112 + }, + { + "epoch": 0.791482599713968, + "grad_norm": 24843.33203125, + "learning_rate": 6.350761913795048e-06, + "loss": 4.4865, + "step": 3113 + }, + { + "epoch": 0.7917368504687748, + "grad_norm": 24697.119140625, + "learning_rate": 6.335989395603598e-06, + "loss": 4.4911, + "step": 3114 + }, + { + "epoch": 0.7919911012235817, + "grad_norm": 24744.1328125, + "learning_rate": 6.321231585071563e-06, + "loss": 4.4764, + "step": 3115 + }, + { + "epoch": 0.7922453519783886, + "grad_norm": 24878.234375, + "learning_rate": 6.306488493828416e-06, + "loss": 4.4725, + "step": 3116 + }, + { + "epoch": 0.7924996027331956, + "grad_norm": 24903.19140625, + "learning_rate": 6.291760133492056e-06, + "loss": 4.4926, + "step": 3117 + }, + { + "epoch": 0.7927538534880025, + "grad_norm": 24664.0, + "learning_rate": 6.2770465156687466e-06, + "loss": 4.4831, + "step": 3118 + }, + { + "epoch": 0.7930081042428094, + "grad_norm": 24822.92578125, + "learning_rate": 6.262347651953162e-06, + "loss": 4.4884, + "step": 3119 + }, + { + "epoch": 0.7932623549976164, + "grad_norm": 24909.0, + "learning_rate": 6.247663553928338e-06, + "loss": 4.481, + "step": 3120 + }, + { + "epoch": 0.7935166057524233, + "grad_norm": 24926.3125, + "learning_rate": 6.232994233165653e-06, + "loss": 4.4773, + "step": 3121 + }, + { + "epoch": 0.7937708565072302, + "grad_norm": 24859.541015625, + "learning_rate": 6.218339701224887e-06, + "loss": 4.4863, + "step": 3122 + }, + { + "epoch": 0.7940251072620372, + "grad_norm": 24714.427734375, + "learning_rate": 6.203699969654131e-06, + "loss": 4.49, + "step": 3123 + }, + { + "epoch": 0.7942793580168441, + "grad_norm": 24748.134765625, + "learning_rate": 6.189075049989809e-06, + "loss": 4.4661, + "step": 3124 + }, + { + "epoch": 0.794533608771651, + "grad_norm": 24945.236328125, + "learning_rate": 6.174464953756706e-06, + "loss": 4.5007, + "step": 3125 + }, + { + "epoch": 0.7947878595264579, + "grad_norm": 24647.71484375, + "learning_rate": 6.15986969246789e-06, + "loss": 4.4893, + "step": 3126 + }, + { + "epoch": 0.7950421102812649, + "grad_norm": 24716.8828125, + "learning_rate": 6.145289277624761e-06, + "loss": 4.4911, + "step": 3127 + }, + { + "epoch": 0.7952963610360718, + "grad_norm": 24663.099609375, + "learning_rate": 6.130723720717021e-06, + "loss": 4.4836, + "step": 3128 + }, + { + "epoch": 0.7955506117908787, + "grad_norm": 24772.46875, + "learning_rate": 6.116173033222649e-06, + "loss": 4.4936, + "step": 3129 + }, + { + "epoch": 0.7958048625456857, + "grad_norm": 24830.66796875, + "learning_rate": 6.1016372266079065e-06, + "loss": 4.4754, + "step": 3130 + }, + { + "epoch": 0.7960591133004926, + "grad_norm": 24866.197265625, + "learning_rate": 6.087116312327348e-06, + "loss": 4.489, + "step": 3131 + }, + { + "epoch": 0.7963133640552995, + "grad_norm": 24880.685546875, + "learning_rate": 6.072610301823775e-06, + "loss": 4.4933, + "step": 3132 + }, + { + "epoch": 0.7965676148101065, + "grad_norm": 24822.49609375, + "learning_rate": 6.058119206528243e-06, + "loss": 4.4874, + "step": 3133 + }, + { + "epoch": 0.7968218655649134, + "grad_norm": 25177.599609375, + "learning_rate": 6.043643037860072e-06, + "loss": 4.4707, + "step": 3134 + }, + { + "epoch": 0.7970761163197203, + "grad_norm": 24640.3671875, + "learning_rate": 6.029181807226794e-06, + "loss": 4.4792, + "step": 3135 + }, + { + "epoch": 0.7973303670745272, + "grad_norm": 25037.650390625, + "learning_rate": 6.0147355260241986e-06, + "loss": 4.4933, + "step": 3136 + }, + { + "epoch": 0.7975846178293342, + "grad_norm": 24882.111328125, + "learning_rate": 6.000304205636265e-06, + "loss": 4.4712, + "step": 3137 + }, + { + "epoch": 0.7978388685841411, + "grad_norm": 24760.849609375, + "learning_rate": 5.985887857435213e-06, + "loss": 4.4829, + "step": 3138 + }, + { + "epoch": 0.798093119338948, + "grad_norm": 24810.79296875, + "learning_rate": 5.971486492781442e-06, + "loss": 4.4848, + "step": 3139 + }, + { + "epoch": 0.798347370093755, + "grad_norm": 24867.193359375, + "learning_rate": 5.957100123023543e-06, + "loss": 4.4876, + "step": 3140 + }, + { + "epoch": 0.7986016208485619, + "grad_norm": 24791.056640625, + "learning_rate": 5.942728759498309e-06, + "loss": 4.4807, + "step": 3141 + }, + { + "epoch": 0.7988558716033688, + "grad_norm": 24756.990234375, + "learning_rate": 5.9283724135306946e-06, + "loss": 4.4777, + "step": 3142 + }, + { + "epoch": 0.7991101223581758, + "grad_norm": 24763.775390625, + "learning_rate": 5.9140310964338145e-06, + "loss": 4.4831, + "step": 3143 + }, + { + "epoch": 0.7993643731129827, + "grad_norm": 24711.83984375, + "learning_rate": 5.899704819508964e-06, + "loss": 4.4806, + "step": 3144 + }, + { + "epoch": 0.7996186238677896, + "grad_norm": 24668.044921875, + "learning_rate": 5.885393594045552e-06, + "loss": 4.4817, + "step": 3145 + }, + { + "epoch": 0.7998728746225965, + "grad_norm": 24867.08203125, + "learning_rate": 5.871097431321165e-06, + "loss": 4.4777, + "step": 3146 + }, + { + "epoch": 0.8001271253774035, + "grad_norm": 25030.48046875, + "learning_rate": 5.856816342601481e-06, + "loss": 4.4844, + "step": 3147 + }, + { + "epoch": 0.8003813761322104, + "grad_norm": 24788.541015625, + "learning_rate": 5.84255033914034e-06, + "loss": 4.4786, + "step": 3148 + }, + { + "epoch": 0.8006356268870173, + "grad_norm": 24939.25, + "learning_rate": 5.828299432179652e-06, + "loss": 4.4946, + "step": 3149 + }, + { + "epoch": 0.8008898776418243, + "grad_norm": 25031.431640625, + "learning_rate": 5.814063632949468e-06, + "loss": 4.4854, + "step": 3150 + }, + { + "epoch": 0.8011441283966312, + "grad_norm": 24842.267578125, + "learning_rate": 5.799842952667911e-06, + "loss": 4.4764, + "step": 3151 + }, + { + "epoch": 0.8013983791514381, + "grad_norm": 24891.974609375, + "learning_rate": 5.785637402541189e-06, + "loss": 4.4838, + "step": 3152 + }, + { + "epoch": 0.8016526299062451, + "grad_norm": 24802.287109375, + "learning_rate": 5.77144699376361e-06, + "loss": 4.4853, + "step": 3153 + }, + { + "epoch": 0.801906880661052, + "grad_norm": 24971.9296875, + "learning_rate": 5.757271737517525e-06, + "loss": 4.4829, + "step": 3154 + }, + { + "epoch": 0.8021611314158589, + "grad_norm": 24811.19921875, + "learning_rate": 5.743111644973348e-06, + "loss": 4.4875, + "step": 3155 + }, + { + "epoch": 0.8024153821706658, + "grad_norm": 24843.765625, + "learning_rate": 5.728966727289564e-06, + "loss": 4.4785, + "step": 3156 + }, + { + "epoch": 0.8026696329254728, + "grad_norm": 24808.470703125, + "learning_rate": 5.714836995612671e-06, + "loss": 4.4925, + "step": 3157 + }, + { + "epoch": 0.8029238836802797, + "grad_norm": 24870.248046875, + "learning_rate": 5.700722461077226e-06, + "loss": 4.491, + "step": 3158 + }, + { + "epoch": 0.8031781344350866, + "grad_norm": 24869.529296875, + "learning_rate": 5.686623134805802e-06, + "loss": 4.4811, + "step": 3159 + }, + { + "epoch": 0.8034323851898936, + "grad_norm": 24906.11328125, + "learning_rate": 5.672539027908977e-06, + "loss": 4.4887, + "step": 3160 + }, + { + "epoch": 0.8036866359447005, + "grad_norm": 24823.68359375, + "learning_rate": 5.658470151485337e-06, + "loss": 4.4793, + "step": 3161 + }, + { + "epoch": 0.8039408866995074, + "grad_norm": 24841.802734375, + "learning_rate": 5.64441651662149e-06, + "loss": 4.4762, + "step": 3162 + }, + { + "epoch": 0.8041951374543144, + "grad_norm": 24834.0, + "learning_rate": 5.630378134392006e-06, + "loss": 4.4692, + "step": 3163 + }, + { + "epoch": 0.8044493882091213, + "grad_norm": 24878.4765625, + "learning_rate": 5.616355015859437e-06, + "loss": 4.4963, + "step": 3164 + }, + { + "epoch": 0.8047036389639282, + "grad_norm": 24599.439453125, + "learning_rate": 5.602347172074332e-06, + "loss": 4.4856, + "step": 3165 + }, + { + "epoch": 0.804957889718735, + "grad_norm": 24957.349609375, + "learning_rate": 5.588354614075178e-06, + "loss": 4.491, + "step": 3166 + }, + { + "epoch": 0.8052121404735421, + "grad_norm": 24786.6328125, + "learning_rate": 5.5743773528884185e-06, + "loss": 4.4675, + "step": 3167 + }, + { + "epoch": 0.805466391228349, + "grad_norm": 24639.76171875, + "learning_rate": 5.560415399528457e-06, + "loss": 4.4866, + "step": 3168 + }, + { + "epoch": 0.8057206419831558, + "grad_norm": 24835.48046875, + "learning_rate": 5.546468764997631e-06, + "loss": 4.4809, + "step": 3169 + }, + { + "epoch": 0.8059748927379629, + "grad_norm": 24779.017578125, + "learning_rate": 5.532537460286194e-06, + "loss": 4.4727, + "step": 3170 + }, + { + "epoch": 0.8062291434927698, + "grad_norm": 24812.84765625, + "learning_rate": 5.518621496372323e-06, + "loss": 4.4769, + "step": 3171 + }, + { + "epoch": 0.8064833942475766, + "grad_norm": 25013.306640625, + "learning_rate": 5.5047208842221225e-06, + "loss": 4.4864, + "step": 3172 + }, + { + "epoch": 0.8067376450023837, + "grad_norm": 24901.984375, + "learning_rate": 5.4908356347895816e-06, + "loss": 4.4833, + "step": 3173 + }, + { + "epoch": 0.8069918957571905, + "grad_norm": 24812.544921875, + "learning_rate": 5.476965759016581e-06, + "loss": 4.485, + "step": 3174 + }, + { + "epoch": 0.8072461465119974, + "grad_norm": 24836.447265625, + "learning_rate": 5.463111267832904e-06, + "loss": 4.49, + "step": 3175 + }, + { + "epoch": 0.8075003972668043, + "grad_norm": 24987.533203125, + "learning_rate": 5.449272172156197e-06, + "loss": 4.4879, + "step": 3176 + }, + { + "epoch": 0.8077546480216113, + "grad_norm": 24953.025390625, + "learning_rate": 5.435448482891975e-06, + "loss": 4.4846, + "step": 3177 + }, + { + "epoch": 0.8080088987764182, + "grad_norm": 24866.1796875, + "learning_rate": 5.421640210933615e-06, + "loss": 4.4769, + "step": 3178 + }, + { + "epoch": 0.8082631495312251, + "grad_norm": 24921.35546875, + "learning_rate": 5.407847367162358e-06, + "loss": 4.4794, + "step": 3179 + }, + { + "epoch": 0.8085174002860321, + "grad_norm": 24947.603515625, + "learning_rate": 5.3940699624472595e-06, + "loss": 4.4749, + "step": 3180 + }, + { + "epoch": 0.808771651040839, + "grad_norm": 24988.021484375, + "learning_rate": 5.380308007645235e-06, + "loss": 4.4747, + "step": 3181 + }, + { + "epoch": 0.8090259017956459, + "grad_norm": 24958.92578125, + "learning_rate": 5.36656151360101e-06, + "loss": 4.4825, + "step": 3182 + }, + { + "epoch": 0.8092801525504529, + "grad_norm": 24902.267578125, + "learning_rate": 5.352830491147121e-06, + "loss": 4.4793, + "step": 3183 + }, + { + "epoch": 0.8095344033052598, + "grad_norm": 24851.865234375, + "learning_rate": 5.339114951103935e-06, + "loss": 4.4864, + "step": 3184 + }, + { + "epoch": 0.8097886540600667, + "grad_norm": 24732.71484375, + "learning_rate": 5.3254149042796045e-06, + "loss": 4.4716, + "step": 3185 + }, + { + "epoch": 0.8100429048148736, + "grad_norm": 24873.80078125, + "learning_rate": 5.31173036147006e-06, + "loss": 4.479, + "step": 3186 + }, + { + "epoch": 0.8102971555696806, + "grad_norm": 24935.572265625, + "learning_rate": 5.298061333459048e-06, + "loss": 4.4895, + "step": 3187 + }, + { + "epoch": 0.8105514063244875, + "grad_norm": 24816.404296875, + "learning_rate": 5.2844078310180536e-06, + "loss": 4.4987, + "step": 3188 + }, + { + "epoch": 0.8108056570792944, + "grad_norm": 24858.3984375, + "learning_rate": 5.270769864906349e-06, + "loss": 4.4749, + "step": 3189 + }, + { + "epoch": 0.8110599078341014, + "grad_norm": 24886.70703125, + "learning_rate": 5.257147445870963e-06, + "loss": 4.4784, + "step": 3190 + }, + { + "epoch": 0.8113141585889083, + "grad_norm": 24765.677734375, + "learning_rate": 5.243540584646667e-06, + "loss": 4.4639, + "step": 3191 + }, + { + "epoch": 0.8115684093437152, + "grad_norm": 25030.677734375, + "learning_rate": 5.22994929195596e-06, + "loss": 4.4984, + "step": 3192 + }, + { + "epoch": 0.8118226600985222, + "grad_norm": 24748.40625, + "learning_rate": 5.216373578509104e-06, + "loss": 4.4716, + "step": 3193 + }, + { + "epoch": 0.8120769108533291, + "grad_norm": 24849.619140625, + "learning_rate": 5.2028134550040545e-06, + "loss": 4.4788, + "step": 3194 + }, + { + "epoch": 0.812331161608136, + "grad_norm": 24659.5, + "learning_rate": 5.1892689321264925e-06, + "loss": 4.4666, + "step": 3195 + }, + { + "epoch": 0.8125854123629429, + "grad_norm": 24920.703125, + "learning_rate": 5.175740020549813e-06, + "loss": 4.4742, + "step": 3196 + }, + { + "epoch": 0.8128396631177499, + "grad_norm": 24827.90625, + "learning_rate": 5.1622267309350995e-06, + "loss": 4.478, + "step": 3197 + }, + { + "epoch": 0.8130939138725568, + "grad_norm": 24984.4140625, + "learning_rate": 5.1487290739311216e-06, + "loss": 4.4702, + "step": 3198 + }, + { + "epoch": 0.8133481646273637, + "grad_norm": 24786.87109375, + "learning_rate": 5.135247060174339e-06, + "loss": 4.4666, + "step": 3199 + }, + { + "epoch": 0.8136024153821707, + "grad_norm": 24834.40625, + "learning_rate": 5.121780700288892e-06, + "loss": 4.4742, + "step": 3200 + }, + { + "epoch": 0.8136024153821707, + "eval_loss": 9.030352592468262, + "eval_runtime": 696.4169, + "eval_samples_per_second": 152.188, + "eval_steps_per_second": 9.513, + "step": 3200 + } + ], + "logging_steps": 1, + "max_steps": 3933, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3225070809907200.0, + "train_batch_size": 64, + "trial_name": null, + "trial_params": null +}