{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0002126483222047,
"eval_steps": 500,
"global_step": 1470,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.000680474631055161,
"grad_norm": 0.3922363365520713,
"learning_rate": 3.3333333333333335e-07,
"loss": 1.4926,
"step": 1
},
{
"epoch": 0.001360949262110322,
"grad_norm": 0.3852464156727402,
"learning_rate": 6.666666666666667e-07,
"loss": 1.4852,
"step": 2
},
{
"epoch": 0.002041423893165483,
"grad_norm": 0.3802201192349553,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.4661,
"step": 3
},
{
"epoch": 0.002721898524220644,
"grad_norm": 0.33400621697524313,
"learning_rate": 1.3333333333333334e-06,
"loss": 1.3367,
"step": 4
},
{
"epoch": 0.003402373155275805,
"grad_norm": 0.27651807402246553,
"learning_rate": 1.6666666666666667e-06,
"loss": 1.4286,
"step": 5
},
{
"epoch": 0.004082847786330966,
"grad_norm": 0.23107642417160407,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.3464,
"step": 6
},
{
"epoch": 0.004763322417386127,
"grad_norm": 0.12939937756268355,
"learning_rate": 2.3333333333333336e-06,
"loss": 1.378,
"step": 7
},
{
"epoch": 0.005443797048441288,
"grad_norm": 0.10378812673486397,
"learning_rate": 2.666666666666667e-06,
"loss": 1.3512,
"step": 8
},
{
"epoch": 0.006124271679496448,
"grad_norm": 0.08387789988316384,
"learning_rate": 3e-06,
"loss": 1.2501,
"step": 9
},
{
"epoch": 0.00680474631055161,
"grad_norm": 0.1018334804799599,
"learning_rate": 3.3333333333333333e-06,
"loss": 1.2649,
"step": 10
},
{
"epoch": 0.007485220941606771,
"grad_norm": 0.10048274036790146,
"learning_rate": 3.6666666666666666e-06,
"loss": 1.3417,
"step": 11
},
{
"epoch": 0.008165695572661932,
"grad_norm": 0.08622738816646529,
"learning_rate": 4.000000000000001e-06,
"loss": 1.3535,
"step": 12
},
{
"epoch": 0.008846170203717092,
"grad_norm": 0.1046684072379548,
"learning_rate": 4.333333333333334e-06,
"loss": 1.353,
"step": 13
},
{
"epoch": 0.009526644834772254,
"grad_norm": 0.08990830971852798,
"learning_rate": 4.666666666666667e-06,
"loss": 1.298,
"step": 14
},
{
"epoch": 0.010207119465827415,
"grad_norm": 0.08061928498710874,
"learning_rate": 5e-06,
"loss": 1.404,
"step": 15
},
{
"epoch": 0.010887594096882575,
"grad_norm": 0.17008730521534624,
"learning_rate": 5.333333333333334e-06,
"loss": 1.2939,
"step": 16
},
{
"epoch": 0.011568068727937737,
"grad_norm": 0.07599401871200454,
"learning_rate": 5.666666666666667e-06,
"loss": 1.3472,
"step": 17
},
{
"epoch": 0.012248543358992897,
"grad_norm": 0.06793991432852008,
"learning_rate": 6e-06,
"loss": 1.3321,
"step": 18
},
{
"epoch": 0.012929017990048058,
"grad_norm": 0.06953577040604254,
"learning_rate": 6.333333333333333e-06,
"loss": 1.2434,
"step": 19
},
{
"epoch": 0.01360949262110322,
"grad_norm": 0.06821163536615973,
"learning_rate": 6.666666666666667e-06,
"loss": 1.2727,
"step": 20
},
{
"epoch": 0.01428996725215838,
"grad_norm": 0.09241778948737919,
"learning_rate": 7e-06,
"loss": 1.321,
"step": 21
},
{
"epoch": 0.014970441883213541,
"grad_norm": 0.0651787370812163,
"learning_rate": 7.333333333333333e-06,
"loss": 1.2779,
"step": 22
},
{
"epoch": 0.0156509165142687,
"grad_norm": 0.11311481394537864,
"learning_rate": 7.666666666666667e-06,
"loss": 1.2984,
"step": 23
},
{
"epoch": 0.016331391145323865,
"grad_norm": 0.06144410930285514,
"learning_rate": 8.000000000000001e-06,
"loss": 1.265,
"step": 24
},
{
"epoch": 0.017011865776379025,
"grad_norm": 0.06635273793167143,
"learning_rate": 8.333333333333334e-06,
"loss": 1.2723,
"step": 25
},
{
"epoch": 0.017692340407434184,
"grad_norm": 0.06955837771876457,
"learning_rate": 8.666666666666668e-06,
"loss": 1.2648,
"step": 26
},
{
"epoch": 0.018372815038489348,
"grad_norm": 0.06124084590785051,
"learning_rate": 9e-06,
"loss": 1.2987,
"step": 27
},
{
"epoch": 0.019053289669544508,
"grad_norm": 0.05644177076460923,
"learning_rate": 9.333333333333334e-06,
"loss": 1.3354,
"step": 28
},
{
"epoch": 0.019733764300599668,
"grad_norm": 0.11074686439531949,
"learning_rate": 9.666666666666667e-06,
"loss": 1.2433,
"step": 29
},
{
"epoch": 0.02041423893165483,
"grad_norm": 0.056636755992912104,
"learning_rate": 1e-05,
"loss": 1.2879,
"step": 30
},
{
"epoch": 0.02109471356270999,
"grad_norm": 0.05782845307576006,
"learning_rate": 9.99999708223099e-06,
"loss": 1.3396,
"step": 31
},
{
"epoch": 0.02177518819376515,
"grad_norm": 0.05482925037031388,
"learning_rate": 9.999988328927362e-06,
"loss": 1.2671,
"step": 32
},
{
"epoch": 0.02245566282482031,
"grad_norm": 0.06210231699691654,
"learning_rate": 9.999973740099334e-06,
"loss": 1.2895,
"step": 33
},
{
"epoch": 0.023136137455875474,
"grad_norm": 0.05493409882469449,
"learning_rate": 9.999953315763929e-06,
"loss": 1.3145,
"step": 34
},
{
"epoch": 0.023816612086930634,
"grad_norm": 0.05683824350074583,
"learning_rate": 9.99992705594499e-06,
"loss": 1.2525,
"step": 35
},
{
"epoch": 0.024497086717985794,
"grad_norm": 0.04839156028188221,
"learning_rate": 9.999894960673162e-06,
"loss": 1.2359,
"step": 36
},
{
"epoch": 0.025177561349040957,
"grad_norm": 0.046656613876928314,
"learning_rate": 9.999857029985903e-06,
"loss": 1.2446,
"step": 37
},
{
"epoch": 0.025858035980096117,
"grad_norm": 0.05183835608104621,
"learning_rate": 9.999813263927483e-06,
"loss": 1.2929,
"step": 38
},
{
"epoch": 0.026538510611151277,
"grad_norm": 0.04777762814434946,
"learning_rate": 9.999763662548982e-06,
"loss": 1.2444,
"step": 39
},
{
"epoch": 0.02721898524220644,
"grad_norm": 0.05059603999112179,
"learning_rate": 9.999708225908292e-06,
"loss": 1.1663,
"step": 40
},
{
"epoch": 0.0278994598732616,
"grad_norm": 0.045382847875702514,
"learning_rate": 9.99964695407011e-06,
"loss": 1.2801,
"step": 41
},
{
"epoch": 0.02857993450431676,
"grad_norm": 0.04343908128703206,
"learning_rate": 9.999579847105947e-06,
"loss": 1.185,
"step": 42
},
{
"epoch": 0.029260409135371923,
"grad_norm": 0.0489584485386619,
"learning_rate": 9.999506905094128e-06,
"loss": 1.2427,
"step": 43
},
{
"epoch": 0.029940883766427083,
"grad_norm": 0.04731927561730429,
"learning_rate": 9.999428128119779e-06,
"loss": 1.3339,
"step": 44
},
{
"epoch": 0.030621358397482243,
"grad_norm": 0.04807552297741366,
"learning_rate": 9.999343516274844e-06,
"loss": 1.2842,
"step": 45
},
{
"epoch": 0.0313018330285374,
"grad_norm": 0.046345202194362445,
"learning_rate": 9.999253069658074e-06,
"loss": 1.2616,
"step": 46
},
{
"epoch": 0.03198230765959256,
"grad_norm": 0.0454525828723615,
"learning_rate": 9.999156788375033e-06,
"loss": 1.2253,
"step": 47
},
{
"epoch": 0.03266278229064773,
"grad_norm": 0.04821779562867459,
"learning_rate": 9.999054672538085e-06,
"loss": 1.2836,
"step": 48
},
{
"epoch": 0.03334325692170289,
"grad_norm": 0.0646325797782212,
"learning_rate": 9.998946722266415e-06,
"loss": 1.274,
"step": 49
},
{
"epoch": 0.03402373155275805,
"grad_norm": 0.049126858090645584,
"learning_rate": 9.99883293768601e-06,
"loss": 1.1964,
"step": 50
},
{
"epoch": 0.03470420618381321,
"grad_norm": 0.07363174361547103,
"learning_rate": 9.998713318929672e-06,
"loss": 1.3436,
"step": 51
},
{
"epoch": 0.03538468081486837,
"grad_norm": 0.04548345898433307,
"learning_rate": 9.998587866137005e-06,
"loss": 1.2717,
"step": 52
},
{
"epoch": 0.03606515544592353,
"grad_norm": 0.046133384420950434,
"learning_rate": 9.99845657945443e-06,
"loss": 1.1653,
"step": 53
},
{
"epoch": 0.036745630076978696,
"grad_norm": 0.049760880417649325,
"learning_rate": 9.998319459035168e-06,
"loss": 1.2607,
"step": 54
},
{
"epoch": 0.037426104708033855,
"grad_norm": 0.11363675889769449,
"learning_rate": 9.998176505039257e-06,
"loss": 1.2633,
"step": 55
},
{
"epoch": 0.038106579339089015,
"grad_norm": 0.046032893223258195,
"learning_rate": 9.998027717633539e-06,
"loss": 1.1964,
"step": 56
},
{
"epoch": 0.038787053970144175,
"grad_norm": 0.04922590729969489,
"learning_rate": 9.997873096991663e-06,
"loss": 1.2569,
"step": 57
},
{
"epoch": 0.039467528601199335,
"grad_norm": 0.057702657772928254,
"learning_rate": 9.997712643294093e-06,
"loss": 1.2314,
"step": 58
},
{
"epoch": 0.040148003232254495,
"grad_norm": 0.044442212757030586,
"learning_rate": 9.99754635672809e-06,
"loss": 1.1801,
"step": 59
},
{
"epoch": 0.04082847786330966,
"grad_norm": 0.046764303530808723,
"learning_rate": 9.997374237487729e-06,
"loss": 1.1928,
"step": 60
},
{
"epoch": 0.04150895249436482,
"grad_norm": 0.12382314732314198,
"learning_rate": 9.997196285773894e-06,
"loss": 1.2907,
"step": 61
},
{
"epoch": 0.04218942712541998,
"grad_norm": 0.0460098561617423,
"learning_rate": 9.997012501794273e-06,
"loss": 1.2551,
"step": 62
},
{
"epoch": 0.04286990175647514,
"grad_norm": 0.05635815384217967,
"learning_rate": 9.996822885763364e-06,
"loss": 1.2534,
"step": 63
},
{
"epoch": 0.0435503763875303,
"grad_norm": 0.054155741053703646,
"learning_rate": 9.996627437902465e-06,
"loss": 1.2121,
"step": 64
},
{
"epoch": 0.04423085101858546,
"grad_norm": 0.05278557656770515,
"learning_rate": 9.996426158439685e-06,
"loss": 1.2403,
"step": 65
},
{
"epoch": 0.04491132564964062,
"grad_norm": 0.04559523672664366,
"learning_rate": 9.996219047609943e-06,
"loss": 1.2213,
"step": 66
},
{
"epoch": 0.04559180028069579,
"grad_norm": 0.05817741367927157,
"learning_rate": 9.996006105654955e-06,
"loss": 1.215,
"step": 67
},
{
"epoch": 0.04627227491175095,
"grad_norm": 0.053498531951879036,
"learning_rate": 9.99578733282325e-06,
"loss": 1.1988,
"step": 68
},
{
"epoch": 0.04695274954280611,
"grad_norm": 0.04777681754375196,
"learning_rate": 9.995562729370158e-06,
"loss": 1.2246,
"step": 69
},
{
"epoch": 0.04763322417386127,
"grad_norm": 0.05565357509635904,
"learning_rate": 9.995332295557818e-06,
"loss": 1.2654,
"step": 70
},
{
"epoch": 0.04831369880491643,
"grad_norm": 0.06680224945846398,
"learning_rate": 9.995096031655167e-06,
"loss": 1.2684,
"step": 71
},
{
"epoch": 0.04899417343597159,
"grad_norm": 0.048424669107112364,
"learning_rate": 9.994853937937954e-06,
"loss": 1.2133,
"step": 72
},
{
"epoch": 0.049674648067026754,
"grad_norm": 0.04928739246746449,
"learning_rate": 9.994606014688726e-06,
"loss": 1.2882,
"step": 73
},
{
"epoch": 0.050355122698081914,
"grad_norm": 0.055922248042688465,
"learning_rate": 9.994352262196839e-06,
"loss": 1.266,
"step": 74
},
{
"epoch": 0.051035597329137074,
"grad_norm": 0.057124012344938746,
"learning_rate": 9.994092680758443e-06,
"loss": 1.2561,
"step": 75
},
{
"epoch": 0.051716071960192234,
"grad_norm": 0.04679564952989396,
"learning_rate": 9.993827270676507e-06,
"loss": 1.2215,
"step": 76
},
{
"epoch": 0.052396546591247393,
"grad_norm": 0.05061952666080301,
"learning_rate": 9.993556032260785e-06,
"loss": 1.2794,
"step": 77
},
{
"epoch": 0.05307702122230255,
"grad_norm": 0.0692075710954883,
"learning_rate": 9.993278965827844e-06,
"loss": 1.2755,
"step": 78
},
{
"epoch": 0.05375749585335772,
"grad_norm": 0.056697791581012555,
"learning_rate": 9.992996071701052e-06,
"loss": 1.1994,
"step": 79
},
{
"epoch": 0.05443797048441288,
"grad_norm": 0.042996286514274486,
"learning_rate": 9.992707350210577e-06,
"loss": 1.2539,
"step": 80
},
{
"epoch": 0.05511844511546804,
"grad_norm": 0.04485835290046384,
"learning_rate": 9.992412801693385e-06,
"loss": 1.2139,
"step": 81
},
{
"epoch": 0.0557989197465232,
"grad_norm": 0.04908203465634852,
"learning_rate": 9.992112426493247e-06,
"loss": 1.1792,
"step": 82
},
{
"epoch": 0.05647939437757836,
"grad_norm": 0.05456588235730342,
"learning_rate": 9.991806224960736e-06,
"loss": 1.2868,
"step": 83
},
{
"epoch": 0.05715986900863352,
"grad_norm": 0.0545054800403904,
"learning_rate": 9.991494197453219e-06,
"loss": 1.2093,
"step": 84
},
{
"epoch": 0.05784034363968868,
"grad_norm": 0.0661242392524205,
"learning_rate": 9.991176344334866e-06,
"loss": 1.2699,
"step": 85
},
{
"epoch": 0.058520818270743846,
"grad_norm": 0.05577376587360494,
"learning_rate": 9.990852665976648e-06,
"loss": 1.1748,
"step": 86
},
{
"epoch": 0.059201292901799006,
"grad_norm": 0.04479015473245748,
"learning_rate": 9.990523162756329e-06,
"loss": 1.1457,
"step": 87
},
{
"epoch": 0.059881767532854166,
"grad_norm": 0.05030583354226186,
"learning_rate": 9.990187835058475e-06,
"loss": 1.2363,
"step": 88
},
{
"epoch": 0.060562242163909326,
"grad_norm": 0.04663224750040068,
"learning_rate": 9.989846683274453e-06,
"loss": 1.2362,
"step": 89
},
{
"epoch": 0.061242716794964486,
"grad_norm": 0.06233780842730507,
"learning_rate": 9.989499707802424e-06,
"loss": 1.2263,
"step": 90
},
{
"epoch": 0.061923191426019646,
"grad_norm": 0.10496041411997244,
"learning_rate": 9.989146909047341e-06,
"loss": 1.2967,
"step": 91
},
{
"epoch": 0.0626036660570748,
"grad_norm": 0.04543916977442651,
"learning_rate": 9.988788287420961e-06,
"loss": 1.2112,
"step": 92
},
{
"epoch": 0.06328414068812997,
"grad_norm": 0.06986447974229824,
"learning_rate": 9.988423843341834e-06,
"loss": 1.2516,
"step": 93
},
{
"epoch": 0.06396461531918513,
"grad_norm": 0.0496941211098862,
"learning_rate": 9.988053577235306e-06,
"loss": 1.1914,
"step": 94
},
{
"epoch": 0.06464508995024029,
"grad_norm": 0.05228331387236259,
"learning_rate": 9.987677489533516e-06,
"loss": 1.1709,
"step": 95
},
{
"epoch": 0.06532556458129546,
"grad_norm": 0.04354293223274572,
"learning_rate": 9.987295580675398e-06,
"loss": 1.2004,
"step": 96
},
{
"epoch": 0.06600603921235061,
"grad_norm": 0.06710084618116975,
"learning_rate": 9.986907851106684e-06,
"loss": 1.3077,
"step": 97
},
{
"epoch": 0.06668651384340578,
"grad_norm": 0.04143778101716894,
"learning_rate": 9.986514301279894e-06,
"loss": 1.2371,
"step": 98
},
{
"epoch": 0.06736698847446093,
"grad_norm": 0.047100741300005265,
"learning_rate": 9.986114931654343e-06,
"loss": 1.1714,
"step": 99
},
{
"epoch": 0.0680474631055161,
"grad_norm": 0.04613376762288349,
"learning_rate": 9.985709742696138e-06,
"loss": 1.2521,
"step": 100
},
{
"epoch": 0.06872793773657127,
"grad_norm": 0.0699090612598444,
"learning_rate": 9.985298734878179e-06,
"loss": 1.2199,
"step": 101
},
{
"epoch": 0.06940841236762642,
"grad_norm": 0.04465069136936077,
"learning_rate": 9.984881908680157e-06,
"loss": 1.1824,
"step": 102
},
{
"epoch": 0.07008888699868158,
"grad_norm": 0.04424081951286644,
"learning_rate": 9.984459264588551e-06,
"loss": 1.268,
"step": 103
},
{
"epoch": 0.07076936162973674,
"grad_norm": 0.050746883751994334,
"learning_rate": 9.984030803096633e-06,
"loss": 1.1606,
"step": 104
},
{
"epoch": 0.0714498362607919,
"grad_norm": 0.04390734965937593,
"learning_rate": 9.983596524704466e-06,
"loss": 1.2813,
"step": 105
},
{
"epoch": 0.07213031089184706,
"grad_norm": 0.05194970705074101,
"learning_rate": 9.983156429918895e-06,
"loss": 1.1676,
"step": 106
},
{
"epoch": 0.07281078552290222,
"grad_norm": 0.05135562045759976,
"learning_rate": 9.982710519253563e-06,
"loss": 1.2715,
"step": 107
},
{
"epoch": 0.07349126015395739,
"grad_norm": 0.04499333459904921,
"learning_rate": 9.982258793228889e-06,
"loss": 1.277,
"step": 108
},
{
"epoch": 0.07417173478501254,
"grad_norm": 0.049375961854300106,
"learning_rate": 9.981801252372094e-06,
"loss": 1.2406,
"step": 109
},
{
"epoch": 0.07485220941606771,
"grad_norm": 0.04974189474524789,
"learning_rate": 9.981337897217171e-06,
"loss": 1.2534,
"step": 110
},
{
"epoch": 0.07553268404712286,
"grad_norm": 0.10352651592275892,
"learning_rate": 9.98086872830491e-06,
"loss": 1.2137,
"step": 111
},
{
"epoch": 0.07621315867817803,
"grad_norm": 0.0424449387883707,
"learning_rate": 9.98039374618288e-06,
"loss": 1.2032,
"step": 112
},
{
"epoch": 0.07689363330923318,
"grad_norm": 0.05251256238992522,
"learning_rate": 9.979912951405433e-06,
"loss": 1.2882,
"step": 113
},
{
"epoch": 0.07757410794028835,
"grad_norm": 0.045005876875762664,
"learning_rate": 9.979426344533712e-06,
"loss": 1.193,
"step": 114
},
{
"epoch": 0.07825458257134352,
"grad_norm": 0.04628698168330066,
"learning_rate": 9.978933926135637e-06,
"loss": 1.208,
"step": 115
},
{
"epoch": 0.07893505720239867,
"grad_norm": 0.049788103173579595,
"learning_rate": 9.978435696785918e-06,
"loss": 1.2284,
"step": 116
},
{
"epoch": 0.07961553183345384,
"grad_norm": 0.06249884166750058,
"learning_rate": 9.977931657066035e-06,
"loss": 1.2145,
"step": 117
},
{
"epoch": 0.08029600646450899,
"grad_norm": 0.05637445736291963,
"learning_rate": 9.977421807564264e-06,
"loss": 1.215,
"step": 118
},
{
"epoch": 0.08097648109556416,
"grad_norm": 0.10253444716580337,
"learning_rate": 9.97690614887565e-06,
"loss": 1.2171,
"step": 119
},
{
"epoch": 0.08165695572661932,
"grad_norm": 0.07539804013220634,
"learning_rate": 9.976384681602023e-06,
"loss": 1.2934,
"step": 120
},
{
"epoch": 0.08233743035767448,
"grad_norm": 0.04739495861507311,
"learning_rate": 9.975857406351989e-06,
"loss": 1.2177,
"step": 121
},
{
"epoch": 0.08301790498872964,
"grad_norm": 0.05116793533886343,
"learning_rate": 9.97532432374094e-06,
"loss": 1.2643,
"step": 122
},
{
"epoch": 0.0836983796197848,
"grad_norm": 0.046820360115744135,
"learning_rate": 9.974785434391039e-06,
"loss": 1.2435,
"step": 123
},
{
"epoch": 0.08437885425083996,
"grad_norm": 0.04507944227068329,
"learning_rate": 9.974240738931224e-06,
"loss": 1.2415,
"step": 124
},
{
"epoch": 0.08505932888189512,
"grad_norm": 0.04090241249528373,
"learning_rate": 9.973690237997219e-06,
"loss": 1.2593,
"step": 125
},
{
"epoch": 0.08573980351295028,
"grad_norm": 0.04479964424134036,
"learning_rate": 9.973133932231514e-06,
"loss": 1.2366,
"step": 126
},
{
"epoch": 0.08642027814400545,
"grad_norm": 0.045830171128613484,
"learning_rate": 9.972571822283377e-06,
"loss": 1.1708,
"step": 127
},
{
"epoch": 0.0871007527750606,
"grad_norm": 0.04542526816421652,
"learning_rate": 9.972003908808854e-06,
"loss": 1.1654,
"step": 128
},
{
"epoch": 0.08778122740611577,
"grad_norm": 0.04442527448494435,
"learning_rate": 9.97143019247076e-06,
"loss": 1.1908,
"step": 129
},
{
"epoch": 0.08846170203717092,
"grad_norm": 0.047830426000382995,
"learning_rate": 9.970850673938684e-06,
"loss": 1.2238,
"step": 130
},
{
"epoch": 0.08914217666822609,
"grad_norm": 0.04634296002464362,
"learning_rate": 9.970265353888984e-06,
"loss": 1.2184,
"step": 131
},
{
"epoch": 0.08982265129928124,
"grad_norm": 0.06603137079206937,
"learning_rate": 9.969674233004794e-06,
"loss": 1.2198,
"step": 132
},
{
"epoch": 0.09050312593033641,
"grad_norm": 0.05155382756399447,
"learning_rate": 9.969077311976017e-06,
"loss": 1.2494,
"step": 133
},
{
"epoch": 0.09118360056139158,
"grad_norm": 0.04496152538308371,
"learning_rate": 9.96847459149932e-06,
"loss": 1.2012,
"step": 134
},
{
"epoch": 0.09186407519244673,
"grad_norm": 0.06359552431281164,
"learning_rate": 9.967866072278143e-06,
"loss": 1.2318,
"step": 135
},
{
"epoch": 0.0925445498235019,
"grad_norm": 0.04505084814063137,
"learning_rate": 9.967251755022697e-06,
"loss": 1.1958,
"step": 136
},
{
"epoch": 0.09322502445455705,
"grad_norm": 0.05653878193789085,
"learning_rate": 9.966631640449957e-06,
"loss": 1.1779,
"step": 137
},
{
"epoch": 0.09390549908561222,
"grad_norm": 0.050975422603819855,
"learning_rate": 9.966005729283658e-06,
"loss": 1.2907,
"step": 138
},
{
"epoch": 0.09458597371666738,
"grad_norm": 0.04751783774889539,
"learning_rate": 9.965374022254308e-06,
"loss": 1.2631,
"step": 139
},
{
"epoch": 0.09526644834772253,
"grad_norm": 0.05430873799441208,
"learning_rate": 9.96473652009918e-06,
"loss": 1.2217,
"step": 140
},
{
"epoch": 0.0959469229787777,
"grad_norm": 0.04302277589419479,
"learning_rate": 9.964093223562303e-06,
"loss": 1.2189,
"step": 141
},
{
"epoch": 0.09662739760983285,
"grad_norm": 0.05547820845328659,
"learning_rate": 9.963444133394478e-06,
"loss": 1.1957,
"step": 142
},
{
"epoch": 0.09730787224088802,
"grad_norm": 0.046119760443999236,
"learning_rate": 9.96278925035326e-06,
"loss": 1.1941,
"step": 143
},
{
"epoch": 0.09798834687194317,
"grad_norm": 0.0425094476231613,
"learning_rate": 9.962128575202967e-06,
"loss": 1.2133,
"step": 144
},
{
"epoch": 0.09866882150299834,
"grad_norm": 0.05969426342259366,
"learning_rate": 9.961462108714682e-06,
"loss": 1.2265,
"step": 145
},
{
"epoch": 0.09934929613405351,
"grad_norm": 0.05148433733859586,
"learning_rate": 9.960789851666237e-06,
"loss": 1.1851,
"step": 146
},
{
"epoch": 0.10002977076510866,
"grad_norm": 0.047879792473061435,
"learning_rate": 9.960111804842236e-06,
"loss": 1.1993,
"step": 147
},
{
"epoch": 0.10071024539616383,
"grad_norm": 0.043575578986480415,
"learning_rate": 9.959427969034025e-06,
"loss": 1.2409,
"step": 148
},
{
"epoch": 0.10139072002721898,
"grad_norm": 0.04853533402339514,
"learning_rate": 9.95873834503972e-06,
"loss": 1.18,
"step": 149
},
{
"epoch": 0.10207119465827415,
"grad_norm": 0.04148668368535789,
"learning_rate": 9.958042933664186e-06,
"loss": 1.2072,
"step": 150
},
{
"epoch": 0.1027516692893293,
"grad_norm": 0.050262677145106,
"learning_rate": 9.957341735719038e-06,
"loss": 1.2421,
"step": 151
},
{
"epoch": 0.10343214392038447,
"grad_norm": 0.05381296667313772,
"learning_rate": 9.956634752022651e-06,
"loss": 1.1788,
"step": 152
},
{
"epoch": 0.10411261855143963,
"grad_norm": 0.0460656819463052,
"learning_rate": 9.955921983400154e-06,
"loss": 1.268,
"step": 153
},
{
"epoch": 0.10479309318249479,
"grad_norm": 0.04629006602169957,
"learning_rate": 9.955203430683425e-06,
"loss": 1.2141,
"step": 154
},
{
"epoch": 0.10547356781354995,
"grad_norm": 0.10610645634311688,
"learning_rate": 9.954479094711087e-06,
"loss": 1.1513,
"step": 155
},
{
"epoch": 0.1061540424446051,
"grad_norm": 0.047759609676506984,
"learning_rate": 9.953748976328524e-06,
"loss": 1.2524,
"step": 156
},
{
"epoch": 0.10683451707566027,
"grad_norm": 0.06066630318457405,
"learning_rate": 9.95301307638786e-06,
"loss": 1.2156,
"step": 157
},
{
"epoch": 0.10751499170671544,
"grad_norm": 0.060137314636137626,
"learning_rate": 9.952271395747969e-06,
"loss": 1.2911,
"step": 158
},
{
"epoch": 0.1081954663377706,
"grad_norm": 0.04525634411513257,
"learning_rate": 9.951523935274472e-06,
"loss": 1.2769,
"step": 159
},
{
"epoch": 0.10887594096882576,
"grad_norm": 0.08126804083378383,
"learning_rate": 9.950770695839737e-06,
"loss": 1.2353,
"step": 160
},
{
"epoch": 0.10955641559988091,
"grad_norm": 0.041722971864926184,
"learning_rate": 9.950011678322874e-06,
"loss": 1.242,
"step": 161
},
{
"epoch": 0.11023689023093608,
"grad_norm": 0.051351975206530606,
"learning_rate": 9.949246883609743e-06,
"loss": 1.2496,
"step": 162
},
{
"epoch": 0.11091736486199123,
"grad_norm": 0.04577763125389706,
"learning_rate": 9.948476312592934e-06,
"loss": 1.1765,
"step": 163
},
{
"epoch": 0.1115978394930464,
"grad_norm": 0.04938183947356605,
"learning_rate": 9.94769996617179e-06,
"loss": 1.2442,
"step": 164
},
{
"epoch": 0.11227831412410157,
"grad_norm": 0.055581793213831124,
"learning_rate": 9.946917845252394e-06,
"loss": 1.2078,
"step": 165
},
{
"epoch": 0.11295878875515672,
"grad_norm": 0.044103376011085695,
"learning_rate": 9.94612995074756e-06,
"loss": 1.2132,
"step": 166
},
{
"epoch": 0.11363926338621189,
"grad_norm": 0.04664849459604924,
"learning_rate": 9.945336283576849e-06,
"loss": 1.2146,
"step": 167
},
{
"epoch": 0.11431973801726704,
"grad_norm": 0.04580570356372435,
"learning_rate": 9.944536844666554e-06,
"loss": 1.2057,
"step": 168
},
{
"epoch": 0.1150002126483222,
"grad_norm": 0.06380708184717693,
"learning_rate": 9.943731634949706e-06,
"loss": 1.1959,
"step": 169
},
{
"epoch": 0.11568068727937736,
"grad_norm": 0.04507640293433379,
"learning_rate": 9.942920655366075e-06,
"loss": 1.217,
"step": 170
},
{
"epoch": 0.11636116191043253,
"grad_norm": 0.04623512542605048,
"learning_rate": 9.942103906862158e-06,
"loss": 1.256,
"step": 171
},
{
"epoch": 0.11704163654148769,
"grad_norm": 0.05119569489839227,
"learning_rate": 9.941281390391189e-06,
"loss": 1.2385,
"step": 172
},
{
"epoch": 0.11772211117254285,
"grad_norm": 0.04129595751504458,
"learning_rate": 9.940453106913133e-06,
"loss": 1.1411,
"step": 173
},
{
"epoch": 0.11840258580359801,
"grad_norm": 0.050324914581852453,
"learning_rate": 9.939619057394687e-06,
"loss": 1.1612,
"step": 174
},
{
"epoch": 0.11908306043465317,
"grad_norm": 0.04642915301072864,
"learning_rate": 9.938779242809275e-06,
"loss": 1.2602,
"step": 175
},
{
"epoch": 0.11976353506570833,
"grad_norm": 0.06544881435642524,
"learning_rate": 9.937933664137054e-06,
"loss": 1.2322,
"step": 176
},
{
"epoch": 0.1204440096967635,
"grad_norm": 0.044569198571487,
"learning_rate": 9.937082322364901e-06,
"loss": 1.1582,
"step": 177
},
{
"epoch": 0.12112448432781865,
"grad_norm": 0.044360855871756734,
"learning_rate": 9.936225218486428e-06,
"loss": 1.1625,
"step": 178
},
{
"epoch": 0.12180495895887382,
"grad_norm": 0.08199554083425169,
"learning_rate": 9.935362353501964e-06,
"loss": 1.2174,
"step": 179
},
{
"epoch": 0.12248543358992897,
"grad_norm": 0.05309082729598163,
"learning_rate": 9.934493728418567e-06,
"loss": 1.2118,
"step": 180
},
{
"epoch": 0.12316590822098414,
"grad_norm": 0.041864080294537066,
"learning_rate": 9.933619344250015e-06,
"loss": 1.1722,
"step": 181
},
{
"epoch": 0.12384638285203929,
"grad_norm": 0.04446226582689757,
"learning_rate": 9.93273920201681e-06,
"loss": 1.2495,
"step": 182
},
{
"epoch": 0.12452685748309446,
"grad_norm": 0.047990304215477446,
"learning_rate": 9.931853302746169e-06,
"loss": 1.234,
"step": 183
},
{
"epoch": 0.1252073321141496,
"grad_norm": 0.04212367388032856,
"learning_rate": 9.930961647472038e-06,
"loss": 1.1731,
"step": 184
},
{
"epoch": 0.12588780674520478,
"grad_norm": 0.06449124084893802,
"learning_rate": 9.930064237235068e-06,
"loss": 1.2172,
"step": 185
},
{
"epoch": 0.12656828137625994,
"grad_norm": 0.08477554547026132,
"learning_rate": 9.929161073082636e-06,
"loss": 1.2148,
"step": 186
},
{
"epoch": 0.1272487560073151,
"grad_norm": 0.052427645171519884,
"learning_rate": 9.928252156068834e-06,
"loss": 1.1658,
"step": 187
},
{
"epoch": 0.12792923063837025,
"grad_norm": 0.047378342779356215,
"learning_rate": 9.927337487254463e-06,
"loss": 1.2226,
"step": 188
},
{
"epoch": 0.12860970526942542,
"grad_norm": 0.04679296935388863,
"learning_rate": 9.926417067707042e-06,
"loss": 1.1998,
"step": 189
},
{
"epoch": 0.12929017990048058,
"grad_norm": 0.04455647990857127,
"learning_rate": 9.925490898500796e-06,
"loss": 1.2327,
"step": 190
},
{
"epoch": 0.12997065453153575,
"grad_norm": 0.044070283911945306,
"learning_rate": 9.92455898071667e-06,
"loss": 1.2344,
"step": 191
},
{
"epoch": 0.13065112916259092,
"grad_norm": 0.04539911979386842,
"learning_rate": 9.923621315442307e-06,
"loss": 1.2403,
"step": 192
},
{
"epoch": 0.13133160379364606,
"grad_norm": 0.0514164363522223,
"learning_rate": 9.922677903772064e-06,
"loss": 1.2672,
"step": 193
},
{
"epoch": 0.13201207842470122,
"grad_norm": 0.09067422957089419,
"learning_rate": 9.921728746807008e-06,
"loss": 1.1436,
"step": 194
},
{
"epoch": 0.1326925530557564,
"grad_norm": 0.045517924932712686,
"learning_rate": 9.920773845654904e-06,
"loss": 1.2372,
"step": 195
},
{
"epoch": 0.13337302768681156,
"grad_norm": 0.05196755683813489,
"learning_rate": 9.919813201430224e-06,
"loss": 1.2409,
"step": 196
},
{
"epoch": 0.13405350231786672,
"grad_norm": 0.05169407064154835,
"learning_rate": 9.918846815254145e-06,
"loss": 1.2079,
"step": 197
},
{
"epoch": 0.13473397694892186,
"grad_norm": 0.04363688042005936,
"learning_rate": 9.917874688254542e-06,
"loss": 1.1753,
"step": 198
},
{
"epoch": 0.13541445157997703,
"grad_norm": 0.04334945706219354,
"learning_rate": 9.916896821565993e-06,
"loss": 1.118,
"step": 199
},
{
"epoch": 0.1360949262110322,
"grad_norm": 0.04552794490125636,
"learning_rate": 9.915913216329774e-06,
"loss": 1.2277,
"step": 200
},
{
"epoch": 0.13677540084208736,
"grad_norm": 0.05101757914560778,
"learning_rate": 9.914923873693857e-06,
"loss": 1.1945,
"step": 201
},
{
"epoch": 0.13745587547314253,
"grad_norm": 0.048828585397579644,
"learning_rate": 9.913928794812909e-06,
"loss": 1.2251,
"step": 202
},
{
"epoch": 0.13813635010419767,
"grad_norm": 0.04138830247218384,
"learning_rate": 9.9129279808483e-06,
"loss": 1.2602,
"step": 203
},
{
"epoch": 0.13881682473525284,
"grad_norm": 0.057028225152475985,
"learning_rate": 9.911921432968084e-06,
"loss": 1.1871,
"step": 204
},
{
"epoch": 0.139497299366308,
"grad_norm": 0.04860831660885843,
"learning_rate": 9.91090915234701e-06,
"loss": 1.1847,
"step": 205
},
{
"epoch": 0.14017777399736317,
"grad_norm": 0.042541636250671894,
"learning_rate": 9.90989114016652e-06,
"loss": 1.2557,
"step": 206
},
{
"epoch": 0.1408582486284183,
"grad_norm": 0.04376838000627094,
"learning_rate": 9.908867397614744e-06,
"loss": 1.2278,
"step": 207
},
{
"epoch": 0.14153872325947348,
"grad_norm": 0.0437006298461342,
"learning_rate": 9.907837925886498e-06,
"loss": 1.2318,
"step": 208
},
{
"epoch": 0.14221919789052864,
"grad_norm": 0.0415712474078778,
"learning_rate": 9.906802726183287e-06,
"loss": 1.1363,
"step": 209
},
{
"epoch": 0.1428996725215838,
"grad_norm": 0.04311235737970915,
"learning_rate": 9.905761799713302e-06,
"loss": 1.2332,
"step": 210
},
{
"epoch": 0.14358014715263898,
"grad_norm": 0.042894303106773696,
"learning_rate": 9.904715147691414e-06,
"loss": 1.1575,
"step": 211
},
{
"epoch": 0.14426062178369412,
"grad_norm": 0.05761703850924853,
"learning_rate": 9.90366277133918e-06,
"loss": 1.158,
"step": 212
},
{
"epoch": 0.14494109641474928,
"grad_norm": 0.042582887002480536,
"learning_rate": 9.902604671884835e-06,
"loss": 1.1791,
"step": 213
},
{
"epoch": 0.14562157104580445,
"grad_norm": 0.07215445203960888,
"learning_rate": 9.901540850563295e-06,
"loss": 1.2191,
"step": 214
},
{
"epoch": 0.14630204567685962,
"grad_norm": 0.09549043059285575,
"learning_rate": 9.900471308616158e-06,
"loss": 1.2162,
"step": 215
},
{
"epoch": 0.14698252030791478,
"grad_norm": 0.048833759803382525,
"learning_rate": 9.899396047291689e-06,
"loss": 1.2463,
"step": 216
},
{
"epoch": 0.14766299493896992,
"grad_norm": 0.05937202127360593,
"learning_rate": 9.898315067844838e-06,
"loss": 1.2395,
"step": 217
},
{
"epoch": 0.1483434695700251,
"grad_norm": 0.04988675707454313,
"learning_rate": 9.89722837153722e-06,
"loss": 1.2342,
"step": 218
},
{
"epoch": 0.14902394420108026,
"grad_norm": 0.04301984592186057,
"learning_rate": 9.89613595963713e-06,
"loss": 1.2118,
"step": 219
},
{
"epoch": 0.14970441883213542,
"grad_norm": 0.06245932533144223,
"learning_rate": 9.895037833419529e-06,
"loss": 1.1783,
"step": 220
},
{
"epoch": 0.1503848934631906,
"grad_norm": 0.07128420567084552,
"learning_rate": 9.893933994166047e-06,
"loss": 1.102,
"step": 221
},
{
"epoch": 0.15106536809424573,
"grad_norm": 0.05079938460169106,
"learning_rate": 9.892824443164987e-06,
"loss": 1.2147,
"step": 222
},
{
"epoch": 0.1517458427253009,
"grad_norm": 0.05144702512972884,
"learning_rate": 9.89170918171131e-06,
"loss": 1.2153,
"step": 223
},
{
"epoch": 0.15242631735635606,
"grad_norm": 0.049782140308005665,
"learning_rate": 9.89058821110665e-06,
"loss": 1.2774,
"step": 224
},
{
"epoch": 0.15310679198741123,
"grad_norm": 0.052741452049593476,
"learning_rate": 9.889461532659297e-06,
"loss": 1.2052,
"step": 225
},
{
"epoch": 0.15378726661846637,
"grad_norm": 0.19564397372349063,
"learning_rate": 9.88832914768421e-06,
"loss": 1.2134,
"step": 226
},
{
"epoch": 0.15446774124952153,
"grad_norm": 0.06625440045571748,
"learning_rate": 9.887191057503001e-06,
"loss": 1.1719,
"step": 227
},
{
"epoch": 0.1551482158805767,
"grad_norm": 0.04784858786092269,
"learning_rate": 9.886047263443943e-06,
"loss": 1.2114,
"step": 228
},
{
"epoch": 0.15582869051163187,
"grad_norm": 0.05382621869617639,
"learning_rate": 9.884897766841967e-06,
"loss": 1.2844,
"step": 229
},
{
"epoch": 0.15650916514268703,
"grad_norm": 0.05462666626066152,
"learning_rate": 9.883742569038663e-06,
"loss": 1.1639,
"step": 230
},
{
"epoch": 0.15718963977374217,
"grad_norm": 0.05422579385130804,
"learning_rate": 9.882581671382267e-06,
"loss": 1.2443,
"step": 231
},
{
"epoch": 0.15787011440479734,
"grad_norm": 0.04760945352489805,
"learning_rate": 9.881415075227674e-06,
"loss": 1.1816,
"step": 232
},
{
"epoch": 0.1585505890358525,
"grad_norm": 0.044763636579990364,
"learning_rate": 9.880242781936426e-06,
"loss": 1.1893,
"step": 233
},
{
"epoch": 0.15923106366690767,
"grad_norm": 0.06334514242152707,
"learning_rate": 9.879064792876717e-06,
"loss": 1.2101,
"step": 234
},
{
"epoch": 0.15991153829796284,
"grad_norm": 0.04715853107602709,
"learning_rate": 9.877881109423383e-06,
"loss": 1.2493,
"step": 235
},
{
"epoch": 0.16059201292901798,
"grad_norm": 0.052667670524755336,
"learning_rate": 9.876691732957913e-06,
"loss": 1.2147,
"step": 236
},
{
"epoch": 0.16127248756007315,
"grad_norm": 0.05018947690087094,
"learning_rate": 9.875496664868437e-06,
"loss": 1.2356,
"step": 237
},
{
"epoch": 0.1619529621911283,
"grad_norm": 0.04964589440245944,
"learning_rate": 9.874295906549728e-06,
"loss": 1.2312,
"step": 238
},
{
"epoch": 0.16263343682218348,
"grad_norm": 0.10087586960994378,
"learning_rate": 9.8730894594032e-06,
"loss": 1.2022,
"step": 239
},
{
"epoch": 0.16331391145323865,
"grad_norm": 0.06957648656259531,
"learning_rate": 9.871877324836906e-06,
"loss": 1.2022,
"step": 240
},
{
"epoch": 0.1639943860842938,
"grad_norm": 0.05476431304114644,
"learning_rate": 9.87065950426554e-06,
"loss": 1.2631,
"step": 241
},
{
"epoch": 0.16467486071534895,
"grad_norm": 0.054645531400643356,
"learning_rate": 9.869435999110428e-06,
"loss": 1.175,
"step": 242
},
{
"epoch": 0.16535533534640412,
"grad_norm": 0.04841286729948775,
"learning_rate": 9.868206810799532e-06,
"loss": 1.2281,
"step": 243
},
{
"epoch": 0.1660358099774593,
"grad_norm": 0.06671782922853765,
"learning_rate": 9.866971940767447e-06,
"loss": 1.2494,
"step": 244
},
{
"epoch": 0.16671628460851443,
"grad_norm": 0.05664726380742502,
"learning_rate": 9.865731390455398e-06,
"loss": 1.208,
"step": 245
},
{
"epoch": 0.1673967592395696,
"grad_norm": 0.04523763510128454,
"learning_rate": 9.864485161311242e-06,
"loss": 1.1982,
"step": 246
},
{
"epoch": 0.16807723387062476,
"grad_norm": 0.05127005431853173,
"learning_rate": 9.863233254789463e-06,
"loss": 1.1681,
"step": 247
},
{
"epoch": 0.16875770850167993,
"grad_norm": 0.04424702438605936,
"learning_rate": 9.861975672351172e-06,
"loss": 1.1789,
"step": 248
},
{
"epoch": 0.1694381831327351,
"grad_norm": 0.05100375949853296,
"learning_rate": 9.860712415464097e-06,
"loss": 1.2632,
"step": 249
},
{
"epoch": 0.17011865776379023,
"grad_norm": 0.05953363240829583,
"learning_rate": 9.859443485602603e-06,
"loss": 1.2008,
"step": 250
},
{
"epoch": 0.1707991323948454,
"grad_norm": 0.054042165361209704,
"learning_rate": 9.85816888424766e-06,
"loss": 1.235,
"step": 251
},
{
"epoch": 0.17147960702590057,
"grad_norm": 0.04707724622685731,
"learning_rate": 9.856888612886872e-06,
"loss": 1.2035,
"step": 252
},
{
"epoch": 0.17216008165695573,
"grad_norm": 0.047616861849738026,
"learning_rate": 9.855602673014448e-06,
"loss": 1.151,
"step": 253
},
{
"epoch": 0.1728405562880109,
"grad_norm": 0.043705026997731614,
"learning_rate": 9.85431106613122e-06,
"loss": 1.1755,
"step": 254
},
{
"epoch": 0.17352103091906604,
"grad_norm": 0.04245727665918003,
"learning_rate": 9.853013793744632e-06,
"loss": 1.1992,
"step": 255
},
{
"epoch": 0.1742015055501212,
"grad_norm": 0.06445634670806195,
"learning_rate": 9.851710857368741e-06,
"loss": 1.2532,
"step": 256
},
{
"epoch": 0.17488198018117637,
"grad_norm": 0.04579696157112681,
"learning_rate": 9.850402258524215e-06,
"loss": 1.152,
"step": 257
},
{
"epoch": 0.17556245481223154,
"grad_norm": 0.04700554126760869,
"learning_rate": 9.849087998738328e-06,
"loss": 1.1999,
"step": 258
},
{
"epoch": 0.1762429294432867,
"grad_norm": 0.05177514136462001,
"learning_rate": 9.847768079544962e-06,
"loss": 1.2184,
"step": 259
},
{
"epoch": 0.17692340407434184,
"grad_norm": 0.0691656394965647,
"learning_rate": 9.846442502484608e-06,
"loss": 1.1522,
"step": 260
},
{
"epoch": 0.177603878705397,
"grad_norm": 0.0448531515266734,
"learning_rate": 9.845111269104353e-06,
"loss": 1.1852,
"step": 261
},
{
"epoch": 0.17828435333645218,
"grad_norm": 0.05284284218380461,
"learning_rate": 9.84377438095789e-06,
"loss": 1.2506,
"step": 262
},
{
"epoch": 0.17896482796750735,
"grad_norm": 0.04821826066415016,
"learning_rate": 9.842431839605516e-06,
"loss": 1.1945,
"step": 263
},
{
"epoch": 0.17964530259856248,
"grad_norm": 0.04937302535184858,
"learning_rate": 9.841083646614117e-06,
"loss": 1.2315,
"step": 264
},
{
"epoch": 0.18032577722961765,
"grad_norm": 0.052269506285700276,
"learning_rate": 9.839729803557178e-06,
"loss": 1.1814,
"step": 265
},
{
"epoch": 0.18100625186067282,
"grad_norm": 0.051431348651855786,
"learning_rate": 9.838370312014783e-06,
"loss": 1.2109,
"step": 266
},
{
"epoch": 0.18168672649172798,
"grad_norm": 0.04675037466687018,
"learning_rate": 9.837005173573603e-06,
"loss": 1.2236,
"step": 267
},
{
"epoch": 0.18236720112278315,
"grad_norm": 0.04876302678884042,
"learning_rate": 9.835634389826905e-06,
"loss": 1.213,
"step": 268
},
{
"epoch": 0.1830476757538383,
"grad_norm": 0.047487981056904216,
"learning_rate": 9.834257962374536e-06,
"loss": 1.1964,
"step": 269
},
{
"epoch": 0.18372815038489346,
"grad_norm": 0.046084020203186686,
"learning_rate": 9.832875892822937e-06,
"loss": 1.1624,
"step": 270
},
{
"epoch": 0.18440862501594862,
"grad_norm": 0.3042850666890626,
"learning_rate": 9.831488182785134e-06,
"loss": 1.1522,
"step": 271
},
{
"epoch": 0.1850890996470038,
"grad_norm": 0.043991134946647886,
"learning_rate": 9.83009483388073e-06,
"loss": 1.1931,
"step": 272
},
{
"epoch": 0.18576957427805896,
"grad_norm": 0.05453712505153417,
"learning_rate": 9.828695847735916e-06,
"loss": 1.1765,
"step": 273
},
{
"epoch": 0.1864500489091141,
"grad_norm": 0.050719360959302975,
"learning_rate": 9.827291225983458e-06,
"loss": 1.1895,
"step": 274
},
{
"epoch": 0.18713052354016926,
"grad_norm": 0.050213041260094814,
"learning_rate": 9.825880970262703e-06,
"loss": 1.1846,
"step": 275
},
{
"epoch": 0.18781099817122443,
"grad_norm": 0.05430648584833771,
"learning_rate": 9.824465082219567e-06,
"loss": 1.1972,
"step": 276
},
{
"epoch": 0.1884914728022796,
"grad_norm": 0.057977753358927374,
"learning_rate": 9.823043563506547e-06,
"loss": 1.1812,
"step": 277
},
{
"epoch": 0.18917194743333476,
"grad_norm": 0.05570573203783343,
"learning_rate": 9.821616415782708e-06,
"loss": 1.207,
"step": 278
},
{
"epoch": 0.1898524220643899,
"grad_norm": 0.04558771138495919,
"learning_rate": 9.820183640713685e-06,
"loss": 1.204,
"step": 279
},
{
"epoch": 0.19053289669544507,
"grad_norm": 0.07199615810821337,
"learning_rate": 9.818745239971679e-06,
"loss": 1.2522,
"step": 280
},
{
"epoch": 0.19121337132650024,
"grad_norm": 0.042919127224821454,
"learning_rate": 9.817301215235459e-06,
"loss": 1.2091,
"step": 281
},
{
"epoch": 0.1918938459575554,
"grad_norm": 0.05419946981753212,
"learning_rate": 9.815851568190358e-06,
"loss": 1.17,
"step": 282
},
{
"epoch": 0.19257432058861054,
"grad_norm": 0.061238252534998636,
"learning_rate": 9.81439630052827e-06,
"loss": 1.2271,
"step": 283
},
{
"epoch": 0.1932547952196657,
"grad_norm": 0.04663661207277225,
"learning_rate": 9.812935413947649e-06,
"loss": 1.2129,
"step": 284
},
{
"epoch": 0.19393526985072088,
"grad_norm": 0.047774001967259005,
"learning_rate": 9.811468910153507e-06,
"loss": 1.1284,
"step": 285
},
{
"epoch": 0.19461574448177604,
"grad_norm": 0.05448933158184023,
"learning_rate": 9.80999679085741e-06,
"loss": 1.1589,
"step": 286
},
{
"epoch": 0.1952962191128312,
"grad_norm": 0.03907919503901335,
"learning_rate": 9.808519057777484e-06,
"loss": 1.2586,
"step": 287
},
{
"epoch": 0.19597669374388635,
"grad_norm": 0.0429913843647421,
"learning_rate": 9.807035712638397e-06,
"loss": 1.2313,
"step": 288
},
{
"epoch": 0.19665716837494152,
"grad_norm": 0.045825796250954225,
"learning_rate": 9.805546757171376e-06,
"loss": 1.2343,
"step": 289
},
{
"epoch": 0.19733764300599668,
"grad_norm": 0.08101836247529706,
"learning_rate": 9.80405219311419e-06,
"loss": 1.1668,
"step": 290
},
{
"epoch": 0.19801811763705185,
"grad_norm": 0.04192874905440864,
"learning_rate": 9.802552022211157e-06,
"loss": 1.2155,
"step": 291
},
{
"epoch": 0.19869859226810702,
"grad_norm": 0.05504083460811146,
"learning_rate": 9.801046246213139e-06,
"loss": 1.1945,
"step": 292
},
{
"epoch": 0.19937906689916216,
"grad_norm": 0.04839230603866535,
"learning_rate": 9.799534866877538e-06,
"loss": 1.2198,
"step": 293
},
{
"epoch": 0.20005954153021732,
"grad_norm": 0.13950074251104766,
"learning_rate": 9.798017885968295e-06,
"loss": 1.2247,
"step": 294
},
{
"epoch": 0.2007400161612725,
"grad_norm": 0.04898866813757472,
"learning_rate": 9.796495305255893e-06,
"loss": 1.1844,
"step": 295
},
{
"epoch": 0.20142049079232766,
"grad_norm": 0.05470315462133326,
"learning_rate": 9.794967126517342e-06,
"loss": 1.1631,
"step": 296
},
{
"epoch": 0.20210096542338282,
"grad_norm": 0.05224619111865233,
"learning_rate": 9.793433351536199e-06,
"loss": 1.1829,
"step": 297
},
{
"epoch": 0.20278144005443796,
"grad_norm": 0.045803400776143144,
"learning_rate": 9.791893982102537e-06,
"loss": 1.1372,
"step": 298
},
{
"epoch": 0.20346191468549313,
"grad_norm": 0.05008591147736681,
"learning_rate": 9.790349020012969e-06,
"loss": 1.2191,
"step": 299
},
{
"epoch": 0.2041423893165483,
"grad_norm": 0.05594808080061322,
"learning_rate": 9.788798467070633e-06,
"loss": 1.2266,
"step": 300
},
{
"epoch": 0.20482286394760346,
"grad_norm": 0.045045756947026755,
"learning_rate": 9.787242325085189e-06,
"loss": 1.224,
"step": 301
},
{
"epoch": 0.2055033385786586,
"grad_norm": 0.04409673624514148,
"learning_rate": 9.785680595872824e-06,
"loss": 1.224,
"step": 302
},
{
"epoch": 0.20618381320971377,
"grad_norm": 0.04074518560860722,
"learning_rate": 9.78411328125624e-06,
"loss": 1.1849,
"step": 303
},
{
"epoch": 0.20686428784076893,
"grad_norm": 0.08415190666671774,
"learning_rate": 9.782540383064668e-06,
"loss": 1.1846,
"step": 304
},
{
"epoch": 0.2075447624718241,
"grad_norm": 0.07213446530578103,
"learning_rate": 9.780961903133845e-06,
"loss": 1.1588,
"step": 305
},
{
"epoch": 0.20822523710287927,
"grad_norm": 0.05122335062166945,
"learning_rate": 9.77937784330603e-06,
"loss": 1.1319,
"step": 306
},
{
"epoch": 0.2089057117339344,
"grad_norm": 0.045982551577382964,
"learning_rate": 9.777788205429988e-06,
"loss": 1.1896,
"step": 307
},
{
"epoch": 0.20958618636498957,
"grad_norm": 0.04972258893092704,
"learning_rate": 9.776192991360998e-06,
"loss": 1.2052,
"step": 308
},
{
"epoch": 0.21026666099604474,
"grad_norm": 0.05070740014134389,
"learning_rate": 9.774592202960849e-06,
"loss": 1.1848,
"step": 309
},
{
"epoch": 0.2109471356270999,
"grad_norm": 0.04060056294082241,
"learning_rate": 9.772985842097832e-06,
"loss": 1.1554,
"step": 310
},
{
"epoch": 0.21162761025815507,
"grad_norm": 0.04286891666827905,
"learning_rate": 9.771373910646742e-06,
"loss": 1.1595,
"step": 311
},
{
"epoch": 0.2123080848892102,
"grad_norm": 0.045021190520363845,
"learning_rate": 9.769756410488877e-06,
"loss": 1.1497,
"step": 312
},
{
"epoch": 0.21298855952026538,
"grad_norm": 0.048611355949365866,
"learning_rate": 9.768133343512034e-06,
"loss": 1.2751,
"step": 313
},
{
"epoch": 0.21366903415132055,
"grad_norm": 0.044831247346836245,
"learning_rate": 9.766504711610507e-06,
"loss": 1.1666,
"step": 314
},
{
"epoch": 0.21434950878237571,
"grad_norm": 0.044769242161053394,
"learning_rate": 9.764870516685085e-06,
"loss": 1.175,
"step": 315
},
{
"epoch": 0.21502998341343088,
"grad_norm": 0.06166834973309736,
"learning_rate": 9.763230760643048e-06,
"loss": 1.2267,
"step": 316
},
{
"epoch": 0.21571045804448602,
"grad_norm": 0.045496850620496514,
"learning_rate": 9.761585445398168e-06,
"loss": 1.1725,
"step": 317
},
{
"epoch": 0.2163909326755412,
"grad_norm": 0.050049929676678366,
"learning_rate": 9.759934572870706e-06,
"loss": 1.1582,
"step": 318
},
{
"epoch": 0.21707140730659635,
"grad_norm": 0.04231191912768742,
"learning_rate": 9.758278144987408e-06,
"loss": 1.2227,
"step": 319
},
{
"epoch": 0.21775188193765152,
"grad_norm": 0.332446688550663,
"learning_rate": 9.756616163681503e-06,
"loss": 1.1243,
"step": 320
},
{
"epoch": 0.21843235656870666,
"grad_norm": 0.046179771954275556,
"learning_rate": 9.7549486308927e-06,
"loss": 1.2498,
"step": 321
},
{
"epoch": 0.21911283119976183,
"grad_norm": 0.0623421181732995,
"learning_rate": 9.753275548567192e-06,
"loss": 1.2113,
"step": 322
},
{
"epoch": 0.219793305830817,
"grad_norm": 0.053361270032192355,
"learning_rate": 9.751596918657646e-06,
"loss": 1.246,
"step": 323
},
{
"epoch": 0.22047378046187216,
"grad_norm": 0.04867134806348243,
"learning_rate": 9.749912743123202e-06,
"loss": 1.1905,
"step": 324
},
{
"epoch": 0.22115425509292733,
"grad_norm": 0.060255684810490574,
"learning_rate": 9.748223023929476e-06,
"loss": 1.1761,
"step": 325
},
{
"epoch": 0.22183472972398247,
"grad_norm": 0.05715294954556239,
"learning_rate": 9.74652776304855e-06,
"loss": 1.2812,
"step": 326
},
{
"epoch": 0.22251520435503763,
"grad_norm": 0.05164227157483807,
"learning_rate": 9.744826962458977e-06,
"loss": 1.1956,
"step": 327
},
{
"epoch": 0.2231956789860928,
"grad_norm": 0.04774119620059092,
"learning_rate": 9.743120624145776e-06,
"loss": 1.1538,
"step": 328
},
{
"epoch": 0.22387615361714797,
"grad_norm": 0.05129453385290744,
"learning_rate": 9.741408750100424e-06,
"loss": 1.1453,
"step": 329
},
{
"epoch": 0.22455662824820313,
"grad_norm": 0.06734783625068147,
"learning_rate": 9.739691342320866e-06,
"loss": 1.2209,
"step": 330
},
{
"epoch": 0.22523710287925827,
"grad_norm": 0.04789002077924269,
"learning_rate": 9.737968402811497e-06,
"loss": 1.2396,
"step": 331
},
{
"epoch": 0.22591757751031344,
"grad_norm": 0.04367749038463707,
"learning_rate": 9.736239933583177e-06,
"loss": 1.2372,
"step": 332
},
{
"epoch": 0.2265980521413686,
"grad_norm": 0.06025885176615586,
"learning_rate": 9.734505936653214e-06,
"loss": 1.1022,
"step": 333
},
{
"epoch": 0.22727852677242377,
"grad_norm": 0.04799522348098899,
"learning_rate": 9.732766414045368e-06,
"loss": 1.2403,
"step": 334
},
{
"epoch": 0.22795900140347894,
"grad_norm": 0.05320768993231738,
"learning_rate": 9.73102136778985e-06,
"loss": 1.1591,
"step": 335
},
{
"epoch": 0.22863947603453408,
"grad_norm": 0.05659410183258992,
"learning_rate": 9.729270799923319e-06,
"loss": 1.1582,
"step": 336
},
{
"epoch": 0.22931995066558925,
"grad_norm": 0.05562485812021002,
"learning_rate": 9.727514712488871e-06,
"loss": 1.1808,
"step": 337
},
{
"epoch": 0.2300004252966444,
"grad_norm": 0.05688262826735776,
"learning_rate": 9.725753107536053e-06,
"loss": 1.2197,
"step": 338
},
{
"epoch": 0.23068089992769958,
"grad_norm": 0.04336195320780082,
"learning_rate": 9.723985987120848e-06,
"loss": 1.2233,
"step": 339
},
{
"epoch": 0.23136137455875472,
"grad_norm": 0.05006619094350356,
"learning_rate": 9.722213353305672e-06,
"loss": 1.2157,
"step": 340
},
{
"epoch": 0.23204184918980988,
"grad_norm": 0.051087758971063546,
"learning_rate": 9.720435208159382e-06,
"loss": 1.2709,
"step": 341
},
{
"epoch": 0.23272232382086505,
"grad_norm": 0.046987562802023916,
"learning_rate": 9.718651553757266e-06,
"loss": 1.2585,
"step": 342
},
{
"epoch": 0.23340279845192022,
"grad_norm": 0.04766257543097715,
"learning_rate": 9.716862392181036e-06,
"loss": 1.2761,
"step": 343
},
{
"epoch": 0.23408327308297538,
"grad_norm": 0.09361014366689609,
"learning_rate": 9.715067725518842e-06,
"loss": 1.1961,
"step": 344
},
{
"epoch": 0.23476374771403052,
"grad_norm": 0.04360733792595929,
"learning_rate": 9.713267555865247e-06,
"loss": 1.1781,
"step": 345
},
{
"epoch": 0.2354442223450857,
"grad_norm": 0.057158041969233964,
"learning_rate": 9.711461885321247e-06,
"loss": 1.2386,
"step": 346
},
{
"epoch": 0.23612469697614086,
"grad_norm": 0.045100749986830585,
"learning_rate": 9.709650715994253e-06,
"loss": 1.1951,
"step": 347
},
{
"epoch": 0.23680517160719602,
"grad_norm": 0.05740293932782018,
"learning_rate": 9.707834049998093e-06,
"loss": 1.1706,
"step": 348
},
{
"epoch": 0.2374856462382512,
"grad_norm": 0.05431722318838959,
"learning_rate": 9.706011889453013e-06,
"loss": 1.1906,
"step": 349
},
{
"epoch": 0.23816612086930633,
"grad_norm": 0.07193219027684113,
"learning_rate": 9.704184236485672e-06,
"loss": 1.1017,
"step": 350
},
{
"epoch": 0.2388465955003615,
"grad_norm": 0.05114798899715518,
"learning_rate": 9.702351093229133e-06,
"loss": 1.2499,
"step": 351
},
{
"epoch": 0.23952707013141666,
"grad_norm": 0.04748191234685159,
"learning_rate": 9.700512461822875e-06,
"loss": 1.1908,
"step": 352
},
{
"epoch": 0.24020754476247183,
"grad_norm": 0.08356574869948455,
"learning_rate": 9.69866834441278e-06,
"loss": 1.1585,
"step": 353
},
{
"epoch": 0.240888019393527,
"grad_norm": 0.046493368628761834,
"learning_rate": 9.696818743151128e-06,
"loss": 1.1491,
"step": 354
},
{
"epoch": 0.24156849402458214,
"grad_norm": 0.045313373883776854,
"learning_rate": 9.694963660196603e-06,
"loss": 1.1125,
"step": 355
},
{
"epoch": 0.2422489686556373,
"grad_norm": 0.044233082198614604,
"learning_rate": 9.69310309771429e-06,
"loss": 1.2128,
"step": 356
},
{
"epoch": 0.24292944328669247,
"grad_norm": 0.053393583715076506,
"learning_rate": 9.691237057875662e-06,
"loss": 1.1631,
"step": 357
},
{
"epoch": 0.24360991791774764,
"grad_norm": 0.05491977568490273,
"learning_rate": 9.68936554285859e-06,
"loss": 1.1712,
"step": 358
},
{
"epoch": 0.24429039254880278,
"grad_norm": 0.06425210798070492,
"learning_rate": 9.687488554847332e-06,
"loss": 1.184,
"step": 359
},
{
"epoch": 0.24497086717985794,
"grad_norm": 0.062965712116679,
"learning_rate": 9.685606096032536e-06,
"loss": 1.1326,
"step": 360
},
{
"epoch": 0.2456513418109131,
"grad_norm": 0.048778316344698425,
"learning_rate": 9.683718168611233e-06,
"loss": 1.167,
"step": 361
},
{
"epoch": 0.24633181644196828,
"grad_norm": 0.049233982480725684,
"learning_rate": 9.68182477478684e-06,
"loss": 1.2294,
"step": 362
},
{
"epoch": 0.24701229107302344,
"grad_norm": 0.0443112999178113,
"learning_rate": 9.67992591676915e-06,
"loss": 1.218,
"step": 363
},
{
"epoch": 0.24769276570407858,
"grad_norm": 0.04869766258013373,
"learning_rate": 9.678021596774332e-06,
"loss": 1.2027,
"step": 364
},
{
"epoch": 0.24837324033513375,
"grad_norm": 0.052032926949957266,
"learning_rate": 9.676111817024935e-06,
"loss": 1.2244,
"step": 365
},
{
"epoch": 0.24905371496618892,
"grad_norm": 0.04940706754832445,
"learning_rate": 9.67419657974988e-06,
"loss": 1.2217,
"step": 366
},
{
"epoch": 0.24973418959724408,
"grad_norm": 0.08905408551465017,
"learning_rate": 9.672275887184449e-06,
"loss": 1.1463,
"step": 367
},
{
"epoch": 0.2504146642282992,
"grad_norm": 0.0420124504182442,
"learning_rate": 9.670349741570302e-06,
"loss": 1.1608,
"step": 368
},
{
"epoch": 0.2510951388593544,
"grad_norm": 0.047640638525214364,
"learning_rate": 9.668418145155453e-06,
"loss": 1.1924,
"step": 369
},
{
"epoch": 0.25177561349040956,
"grad_norm": 0.04411995714560134,
"learning_rate": 9.66648110019429e-06,
"loss": 1.1924,
"step": 370
},
{
"epoch": 0.2524560881214647,
"grad_norm": 0.0954266561963091,
"learning_rate": 9.664538608947547e-06,
"loss": 1.194,
"step": 371
},
{
"epoch": 0.2531365627525199,
"grad_norm": 0.04832973819001474,
"learning_rate": 9.662590673682322e-06,
"loss": 1.1294,
"step": 372
},
{
"epoch": 0.25381703738357503,
"grad_norm": 0.0907247205159215,
"learning_rate": 9.660637296672065e-06,
"loss": 1.203,
"step": 373
},
{
"epoch": 0.2544975120146302,
"grad_norm": 0.046786600325019814,
"learning_rate": 9.658678480196579e-06,
"loss": 1.246,
"step": 374
},
{
"epoch": 0.25517798664568536,
"grad_norm": 0.05704081940461117,
"learning_rate": 9.65671422654201e-06,
"loss": 1.1417,
"step": 375
},
{
"epoch": 0.2558584612767405,
"grad_norm": 0.051937899288344185,
"learning_rate": 9.654744538000857e-06,
"loss": 1.1278,
"step": 376
},
{
"epoch": 0.2565389359077957,
"grad_norm": 0.04508971069443915,
"learning_rate": 9.652769416871956e-06,
"loss": 1.1809,
"step": 377
},
{
"epoch": 0.25721941053885083,
"grad_norm": 0.054979374091871036,
"learning_rate": 9.650788865460487e-06,
"loss": 1.2295,
"step": 378
},
{
"epoch": 0.25789988516990603,
"grad_norm": 0.049044653730871066,
"learning_rate": 9.648802886077968e-06,
"loss": 1.224,
"step": 379
},
{
"epoch": 0.25858035980096117,
"grad_norm": 0.044961130694949406,
"learning_rate": 9.646811481042246e-06,
"loss": 1.1668,
"step": 380
},
{
"epoch": 0.2592608344320163,
"grad_norm": 0.04699265461262461,
"learning_rate": 9.64481465267751e-06,
"loss": 1.2107,
"step": 381
},
{
"epoch": 0.2599413090630715,
"grad_norm": 0.0577640863727447,
"learning_rate": 9.642812403314272e-06,
"loss": 1.251,
"step": 382
},
{
"epoch": 0.26062178369412664,
"grad_norm": 0.04912205190629825,
"learning_rate": 9.640804735289371e-06,
"loss": 1.1478,
"step": 383
},
{
"epoch": 0.26130225832518184,
"grad_norm": 0.04635375436327543,
"learning_rate": 9.638791650945974e-06,
"loss": 1.2266,
"step": 384
},
{
"epoch": 0.261982732956237,
"grad_norm": 0.06771008302412035,
"learning_rate": 9.636773152633564e-06,
"loss": 1.1653,
"step": 385
},
{
"epoch": 0.2626632075872921,
"grad_norm": 0.0421485079049122,
"learning_rate": 9.634749242707948e-06,
"loss": 1.1996,
"step": 386
},
{
"epoch": 0.2633436822183473,
"grad_norm": 0.046003609818600986,
"learning_rate": 9.632719923531246e-06,
"loss": 1.2091,
"step": 387
},
{
"epoch": 0.26402415684940245,
"grad_norm": 0.041416979164638455,
"learning_rate": 9.630685197471893e-06,
"loss": 1.1664,
"step": 388
},
{
"epoch": 0.26470463148045764,
"grad_norm": 0.04464379751775676,
"learning_rate": 9.628645066904631e-06,
"loss": 1.2814,
"step": 389
},
{
"epoch": 0.2653851061115128,
"grad_norm": 0.04506408101193006,
"learning_rate": 9.626599534210514e-06,
"loss": 1.1831,
"step": 390
},
{
"epoch": 0.2660655807425679,
"grad_norm": 0.041696760747256145,
"learning_rate": 9.624548601776897e-06,
"loss": 1.1438,
"step": 391
},
{
"epoch": 0.2667460553736231,
"grad_norm": 0.04180033834556875,
"learning_rate": 9.62249227199744e-06,
"loss": 1.1683,
"step": 392
},
{
"epoch": 0.26742653000467825,
"grad_norm": 0.0653569019317656,
"learning_rate": 9.620430547272101e-06,
"loss": 1.1412,
"step": 393
},
{
"epoch": 0.26810700463573345,
"grad_norm": 0.0508490046254163,
"learning_rate": 9.618363430007134e-06,
"loss": 1.1703,
"step": 394
},
{
"epoch": 0.2687874792667886,
"grad_norm": 0.04192674002870443,
"learning_rate": 9.616290922615089e-06,
"loss": 1.1928,
"step": 395
},
{
"epoch": 0.2694679538978437,
"grad_norm": 0.045135315573927554,
"learning_rate": 9.614213027514802e-06,
"loss": 1.1368,
"step": 396
},
{
"epoch": 0.2701484285288989,
"grad_norm": 0.04283907706648696,
"learning_rate": 9.612129747131403e-06,
"loss": 1.2029,
"step": 397
},
{
"epoch": 0.27082890315995406,
"grad_norm": 0.047826740996491424,
"learning_rate": 9.610041083896304e-06,
"loss": 1.1292,
"step": 398
},
{
"epoch": 0.27150937779100925,
"grad_norm": 0.042865391928449584,
"learning_rate": 9.6079470402472e-06,
"loss": 1.1786,
"step": 399
},
{
"epoch": 0.2721898524220644,
"grad_norm": 0.04272434572582662,
"learning_rate": 9.60584761862806e-06,
"loss": 1.1714,
"step": 400
},
{
"epoch": 0.27287032705311953,
"grad_norm": 0.047643431915423036,
"learning_rate": 9.603742821489143e-06,
"loss": 1.1807,
"step": 401
},
{
"epoch": 0.2735508016841747,
"grad_norm": 0.04305667573692588,
"learning_rate": 9.60163265128697e-06,
"loss": 1.2203,
"step": 402
},
{
"epoch": 0.27423127631522987,
"grad_norm": 0.05262690402608385,
"learning_rate": 9.599517110484335e-06,
"loss": 1.1727,
"step": 403
},
{
"epoch": 0.27491175094628506,
"grad_norm": 0.04326143595632641,
"learning_rate": 9.597396201550307e-06,
"loss": 1.1839,
"step": 404
},
{
"epoch": 0.2755922255773402,
"grad_norm": 0.04480016734569184,
"learning_rate": 9.595269926960207e-06,
"loss": 1.132,
"step": 405
},
{
"epoch": 0.27627270020839534,
"grad_norm": 0.07616330002731382,
"learning_rate": 9.593138289195634e-06,
"loss": 1.1795,
"step": 406
},
{
"epoch": 0.27695317483945053,
"grad_norm": 0.0463946895471615,
"learning_rate": 9.591001290744433e-06,
"loss": 1.1722,
"step": 407
},
{
"epoch": 0.2776336494705057,
"grad_norm": 0.06517585252270597,
"learning_rate": 9.588858934100715e-06,
"loss": 1.2092,
"step": 408
},
{
"epoch": 0.2783141241015608,
"grad_norm": 0.04668726252763982,
"learning_rate": 9.58671122176484e-06,
"loss": 1.1454,
"step": 409
},
{
"epoch": 0.278994598732616,
"grad_norm": 0.044076602793079296,
"learning_rate": 9.584558156243418e-06,
"loss": 1.2057,
"step": 410
},
{
"epoch": 0.27967507336367115,
"grad_norm": 0.05755797965942513,
"learning_rate": 9.582399740049309e-06,
"loss": 1.1866,
"step": 411
},
{
"epoch": 0.28035554799472634,
"grad_norm": 0.04909298957494496,
"learning_rate": 9.580235975701615e-06,
"loss": 1.1608,
"step": 412
},
{
"epoch": 0.2810360226257815,
"grad_norm": 0.04760184092450246,
"learning_rate": 9.578066865725685e-06,
"loss": 1.2187,
"step": 413
},
{
"epoch": 0.2817164972568366,
"grad_norm": 0.041651343733572356,
"learning_rate": 9.575892412653102e-06,
"loss": 1.1959,
"step": 414
},
{
"epoch": 0.2823969718878918,
"grad_norm": 0.04152859840201504,
"learning_rate": 9.573712619021687e-06,
"loss": 1.1998,
"step": 415
},
{
"epoch": 0.28307744651894695,
"grad_norm": 0.39015214046356017,
"learning_rate": 9.571527487375494e-06,
"loss": 1.2253,
"step": 416
},
{
"epoch": 0.28375792115000215,
"grad_norm": 0.043403289521190484,
"learning_rate": 9.569337020264805e-06,
"loss": 1.1732,
"step": 417
},
{
"epoch": 0.2844383957810573,
"grad_norm": 0.04464700023306462,
"learning_rate": 9.567141220246136e-06,
"loss": 1.1997,
"step": 418
},
{
"epoch": 0.2851188704121124,
"grad_norm": 0.048346937249993505,
"learning_rate": 9.564940089882215e-06,
"loss": 1.1355,
"step": 419
},
{
"epoch": 0.2857993450431676,
"grad_norm": 0.046040306984666865,
"learning_rate": 9.562733631742003e-06,
"loss": 1.1288,
"step": 420
},
{
"epoch": 0.28647981967422276,
"grad_norm": 0.04680136513725823,
"learning_rate": 9.560521848400672e-06,
"loss": 1.1765,
"step": 421
},
{
"epoch": 0.28716029430527795,
"grad_norm": 0.06484527922937486,
"learning_rate": 9.55830474243961e-06,
"loss": 1.1584,
"step": 422
},
{
"epoch": 0.2878407689363331,
"grad_norm": 0.04158095847228888,
"learning_rate": 9.55608231644642e-06,
"loss": 1.1548,
"step": 423
},
{
"epoch": 0.28852124356738823,
"grad_norm": 0.04628073262185198,
"learning_rate": 9.553854573014913e-06,
"loss": 1.205,
"step": 424
},
{
"epoch": 0.2892017181984434,
"grad_norm": 0.05281944608701771,
"learning_rate": 9.551621514745104e-06,
"loss": 1.1915,
"step": 425
},
{
"epoch": 0.28988219282949856,
"grad_norm": 0.045211220884478216,
"learning_rate": 9.549383144243213e-06,
"loss": 1.2551,
"step": 426
},
{
"epoch": 0.29056266746055376,
"grad_norm": 0.04715624838570465,
"learning_rate": 9.547139464121658e-06,
"loss": 1.2004,
"step": 427
},
{
"epoch": 0.2912431420916089,
"grad_norm": 0.04606784209554793,
"learning_rate": 9.544890476999056e-06,
"loss": 1.2783,
"step": 428
},
{
"epoch": 0.29192361672266404,
"grad_norm": 0.0475193656164289,
"learning_rate": 9.542636185500216e-06,
"loss": 1.1615,
"step": 429
},
{
"epoch": 0.29260409135371923,
"grad_norm": 0.047942679226840895,
"learning_rate": 9.540376592256142e-06,
"loss": 1.1779,
"step": 430
},
{
"epoch": 0.29328456598477437,
"grad_norm": 0.045498747981138,
"learning_rate": 9.538111699904018e-06,
"loss": 1.1603,
"step": 431
},
{
"epoch": 0.29396504061582956,
"grad_norm": 0.044436179641190246,
"learning_rate": 9.53584151108722e-06,
"loss": 1.2382,
"step": 432
},
{
"epoch": 0.2946455152468847,
"grad_norm": 0.04879803811355419,
"learning_rate": 9.533566028455303e-06,
"loss": 1.1918,
"step": 433
},
{
"epoch": 0.29532598987793984,
"grad_norm": 0.04653543136888824,
"learning_rate": 9.531285254663997e-06,
"loss": 1.2306,
"step": 434
},
{
"epoch": 0.29600646450899504,
"grad_norm": 0.06928602870846744,
"learning_rate": 9.528999192375213e-06,
"loss": 1.1711,
"step": 435
},
{
"epoch": 0.2966869391400502,
"grad_norm": 0.048157969271924075,
"learning_rate": 9.526707844257031e-06,
"loss": 1.1653,
"step": 436
},
{
"epoch": 0.29736741377110537,
"grad_norm": 0.04703493966240612,
"learning_rate": 9.5244112129837e-06,
"loss": 1.1545,
"step": 437
},
{
"epoch": 0.2980478884021605,
"grad_norm": 0.048950716795177485,
"learning_rate": 9.522109301235637e-06,
"loss": 1.1691,
"step": 438
},
{
"epoch": 0.29872836303321565,
"grad_norm": 0.04164149250851006,
"learning_rate": 9.519802111699423e-06,
"loss": 1.1418,
"step": 439
},
{
"epoch": 0.29940883766427084,
"grad_norm": 0.04543803394559561,
"learning_rate": 9.51748964706779e-06,
"loss": 1.1861,
"step": 440
},
{
"epoch": 0.300089312295326,
"grad_norm": 0.07807641340896274,
"learning_rate": 9.51517191003964e-06,
"loss": 1.2208,
"step": 441
},
{
"epoch": 0.3007697869263812,
"grad_norm": 0.10605080357630506,
"learning_rate": 9.512848903320017e-06,
"loss": 1.2167,
"step": 442
},
{
"epoch": 0.3014502615574363,
"grad_norm": 0.04651371237040995,
"learning_rate": 9.51052062962012e-06,
"loss": 1.1747,
"step": 443
},
{
"epoch": 0.30213073618849146,
"grad_norm": 0.04203698338216257,
"learning_rate": 9.508187091657297e-06,
"loss": 1.211,
"step": 444
},
{
"epoch": 0.30281121081954665,
"grad_norm": 0.05027100823388268,
"learning_rate": 9.505848292155037e-06,
"loss": 1.2042,
"step": 445
},
{
"epoch": 0.3034916854506018,
"grad_norm": 0.04398023883831587,
"learning_rate": 9.503504233842973e-06,
"loss": 1.198,
"step": 446
},
{
"epoch": 0.30417216008165693,
"grad_norm": 0.046003976924450816,
"learning_rate": 9.501154919456867e-06,
"loss": 1.2206,
"step": 447
},
{
"epoch": 0.3048526347127121,
"grad_norm": 0.04649602460300257,
"learning_rate": 9.498800351738629e-06,
"loss": 1.2181,
"step": 448
},
{
"epoch": 0.30553310934376726,
"grad_norm": 0.04705078110397738,
"learning_rate": 9.496440533436289e-06,
"loss": 1.1873,
"step": 449
},
{
"epoch": 0.30621358397482246,
"grad_norm": 0.050720730862168675,
"learning_rate": 9.494075467304007e-06,
"loss": 1.2447,
"step": 450
},
{
"epoch": 0.3068940586058776,
"grad_norm": 0.04370216331760348,
"learning_rate": 9.491705156102075e-06,
"loss": 1.1935,
"step": 451
},
{
"epoch": 0.30757453323693273,
"grad_norm": 0.04589375799644584,
"learning_rate": 9.489329602596898e-06,
"loss": 1.2247,
"step": 452
},
{
"epoch": 0.30825500786798793,
"grad_norm": 0.03939021373478932,
"learning_rate": 9.486948809561001e-06,
"loss": 1.1618,
"step": 453
},
{
"epoch": 0.30893548249904307,
"grad_norm": 0.05050778826919447,
"learning_rate": 9.484562779773027e-06,
"loss": 1.1586,
"step": 454
},
{
"epoch": 0.30961595713009826,
"grad_norm": 0.04298915132168142,
"learning_rate": 9.482171516017733e-06,
"loss": 1.0995,
"step": 455
},
{
"epoch": 0.3102964317611534,
"grad_norm": 0.06024263160262278,
"learning_rate": 9.479775021085977e-06,
"loss": 1.1715,
"step": 456
},
{
"epoch": 0.31097690639220854,
"grad_norm": 0.04047944888546823,
"learning_rate": 9.477373297774729e-06,
"loss": 1.1799,
"step": 457
},
{
"epoch": 0.31165738102326374,
"grad_norm": 0.04576627432425469,
"learning_rate": 9.474966348887055e-06,
"loss": 1.1236,
"step": 458
},
{
"epoch": 0.3123378556543189,
"grad_norm": 0.04608165755019771,
"learning_rate": 9.472554177232126e-06,
"loss": 1.1901,
"step": 459
},
{
"epoch": 0.31301833028537407,
"grad_norm": 0.04572503154737356,
"learning_rate": 9.470136785625206e-06,
"loss": 1.2308,
"step": 460
},
{
"epoch": 0.3136988049164292,
"grad_norm": 0.04703098129663038,
"learning_rate": 9.46771417688765e-06,
"loss": 1.1518,
"step": 461
},
{
"epoch": 0.31437927954748435,
"grad_norm": 0.057810641121546665,
"learning_rate": 9.465286353846905e-06,
"loss": 1.1359,
"step": 462
},
{
"epoch": 0.31505975417853954,
"grad_norm": 0.04007218378292392,
"learning_rate": 9.462853319336498e-06,
"loss": 1.2377,
"step": 463
},
{
"epoch": 0.3157402288095947,
"grad_norm": 0.04044711828026744,
"learning_rate": 9.460415076196046e-06,
"loss": 1.1777,
"step": 464
},
{
"epoch": 0.3164207034406499,
"grad_norm": 0.039984993410920745,
"learning_rate": 9.457971627271239e-06,
"loss": 1.1856,
"step": 465
},
{
"epoch": 0.317101178071705,
"grad_norm": 0.04595448821615046,
"learning_rate": 9.455522975413846e-06,
"loss": 1.1121,
"step": 466
},
{
"epoch": 0.31778165270276015,
"grad_norm": 0.06911224440534725,
"learning_rate": 9.453069123481706e-06,
"loss": 1.1776,
"step": 467
},
{
"epoch": 0.31846212733381535,
"grad_norm": 0.0496062942406621,
"learning_rate": 9.45061007433873e-06,
"loss": 1.2229,
"step": 468
},
{
"epoch": 0.3191426019648705,
"grad_norm": 0.05285388792564055,
"learning_rate": 9.44814583085489e-06,
"loss": 1.2224,
"step": 469
},
{
"epoch": 0.3198230765959257,
"grad_norm": 0.04266731996830014,
"learning_rate": 9.445676395906226e-06,
"loss": 1.1885,
"step": 470
},
{
"epoch": 0.3205035512269808,
"grad_norm": 0.04794045206701465,
"learning_rate": 9.443201772374834e-06,
"loss": 1.1875,
"step": 471
},
{
"epoch": 0.32118402585803596,
"grad_norm": 0.04276835398618904,
"learning_rate": 9.440721963148864e-06,
"loss": 1.186,
"step": 472
},
{
"epoch": 0.32186450048909115,
"grad_norm": 0.04059867789220637,
"learning_rate": 9.438236971122523e-06,
"loss": 1.1746,
"step": 473
},
{
"epoch": 0.3225449751201463,
"grad_norm": 0.05427813930596811,
"learning_rate": 9.435746799196061e-06,
"loss": 1.2627,
"step": 474
},
{
"epoch": 0.3232254497512015,
"grad_norm": 0.06052704105903962,
"learning_rate": 9.43325145027578e-06,
"loss": 1.1896,
"step": 475
},
{
"epoch": 0.3239059243822566,
"grad_norm": 0.05502079517421927,
"learning_rate": 9.430750927274018e-06,
"loss": 1.2125,
"step": 476
},
{
"epoch": 0.32458639901331177,
"grad_norm": 0.05404712729764853,
"learning_rate": 9.428245233109154e-06,
"loss": 1.1966,
"step": 477
},
{
"epoch": 0.32526687364436696,
"grad_norm": 0.047489497158267946,
"learning_rate": 9.425734370705606e-06,
"loss": 1.153,
"step": 478
},
{
"epoch": 0.3259473482754221,
"grad_norm": 0.05518270929161413,
"learning_rate": 9.42321834299382e-06,
"loss": 1.1214,
"step": 479
},
{
"epoch": 0.3266278229064773,
"grad_norm": 0.04183323442062128,
"learning_rate": 9.420697152910268e-06,
"loss": 1.1737,
"step": 480
},
{
"epoch": 0.32730829753753243,
"grad_norm": 0.047353508292032734,
"learning_rate": 9.41817080339745e-06,
"loss": 1.1852,
"step": 481
},
{
"epoch": 0.3279887721685876,
"grad_norm": 0.045798051211923665,
"learning_rate": 9.415639297403891e-06,
"loss": 1.1832,
"step": 482
},
{
"epoch": 0.32866924679964277,
"grad_norm": 0.04709531868030345,
"learning_rate": 9.413102637884131e-06,
"loss": 1.2718,
"step": 483
},
{
"epoch": 0.3293497214306979,
"grad_norm": 0.04915959430799309,
"learning_rate": 9.410560827798721e-06,
"loss": 1.148,
"step": 484
},
{
"epoch": 0.3300301960617531,
"grad_norm": 0.09966635739398834,
"learning_rate": 9.40801387011423e-06,
"loss": 1.163,
"step": 485
},
{
"epoch": 0.33071067069280824,
"grad_norm": 0.049745442405997646,
"learning_rate": 9.40546176780323e-06,
"loss": 1.1862,
"step": 486
},
{
"epoch": 0.3313911453238634,
"grad_norm": 0.04671464675596574,
"learning_rate": 9.402904523844301e-06,
"loss": 1.2282,
"step": 487
},
{
"epoch": 0.3320716199549186,
"grad_norm": 0.05486432598075293,
"learning_rate": 9.400342141222019e-06,
"loss": 1.1915,
"step": 488
},
{
"epoch": 0.3327520945859737,
"grad_norm": 0.05033392877701764,
"learning_rate": 9.397774622926963e-06,
"loss": 1.169,
"step": 489
},
{
"epoch": 0.33343256921702885,
"grad_norm": 0.04237024604944351,
"learning_rate": 9.395201971955701e-06,
"loss": 1.1909,
"step": 490
},
{
"epoch": 0.33411304384808405,
"grad_norm": 0.05093227421957238,
"learning_rate": 9.392624191310795e-06,
"loss": 1.2164,
"step": 491
},
{
"epoch": 0.3347935184791392,
"grad_norm": 0.046659153381013445,
"learning_rate": 9.390041284000793e-06,
"loss": 1.116,
"step": 492
},
{
"epoch": 0.3354739931101944,
"grad_norm": 0.04607818909638517,
"learning_rate": 9.387453253040221e-06,
"loss": 1.1698,
"step": 493
},
{
"epoch": 0.3361544677412495,
"grad_norm": 0.04582104423266983,
"learning_rate": 9.384860101449598e-06,
"loss": 1.1578,
"step": 494
},
{
"epoch": 0.33683494237230466,
"grad_norm": 0.055032457656983766,
"learning_rate": 9.382261832255402e-06,
"loss": 1.2005,
"step": 495
},
{
"epoch": 0.33751541700335985,
"grad_norm": 0.045420525383448394,
"learning_rate": 9.3796584484901e-06,
"loss": 1.1812,
"step": 496
},
{
"epoch": 0.338195891634415,
"grad_norm": 0.04479691200348206,
"learning_rate": 9.377049953192114e-06,
"loss": 1.1975,
"step": 497
},
{
"epoch": 0.3388763662654702,
"grad_norm": 0.04262624876760037,
"learning_rate": 9.374436349405847e-06,
"loss": 1.1979,
"step": 498
},
{
"epoch": 0.3395568408965253,
"grad_norm": 0.05984559336645253,
"learning_rate": 9.371817640181649e-06,
"loss": 1.1928,
"step": 499
},
{
"epoch": 0.34023731552758046,
"grad_norm": 0.04895162934117913,
"learning_rate": 9.369193828575838e-06,
"loss": 1.2148,
"step": 500
},
{
"epoch": 0.34091779015863566,
"grad_norm": 0.05483533156140584,
"learning_rate": 9.366564917650685e-06,
"loss": 1.1474,
"step": 501
},
{
"epoch": 0.3415982647896908,
"grad_norm": 0.04565128017297075,
"learning_rate": 9.36393091047441e-06,
"loss": 1.1957,
"step": 502
},
{
"epoch": 0.342278739420746,
"grad_norm": 0.05336905029274791,
"learning_rate": 9.361291810121184e-06,
"loss": 1.1451,
"step": 503
},
{
"epoch": 0.34295921405180113,
"grad_norm": 0.040537767793770316,
"learning_rate": 9.358647619671123e-06,
"loss": 1.1766,
"step": 504
},
{
"epoch": 0.34363968868285627,
"grad_norm": 0.043800628506550594,
"learning_rate": 9.355998342210278e-06,
"loss": 1.1679,
"step": 505
},
{
"epoch": 0.34432016331391146,
"grad_norm": 0.23872842344128314,
"learning_rate": 9.353343980830644e-06,
"loss": 1.1807,
"step": 506
},
{
"epoch": 0.3450006379449666,
"grad_norm": 0.18365918447007845,
"learning_rate": 9.350684538630146e-06,
"loss": 1.0977,
"step": 507
},
{
"epoch": 0.3456811125760218,
"grad_norm": 0.04344688652790131,
"learning_rate": 9.348020018712636e-06,
"loss": 1.1744,
"step": 508
},
{
"epoch": 0.34636158720707694,
"grad_norm": 0.06815347742969964,
"learning_rate": 9.3453504241879e-06,
"loss": 1.1922,
"step": 509
},
{
"epoch": 0.3470420618381321,
"grad_norm": 0.17668464807174913,
"learning_rate": 9.342675758171638e-06,
"loss": 1.1527,
"step": 510
},
{
"epoch": 0.34772253646918727,
"grad_norm": 0.043762024230477835,
"learning_rate": 9.339996023785477e-06,
"loss": 1.1789,
"step": 511
},
{
"epoch": 0.3484030111002424,
"grad_norm": 0.04645872519379631,
"learning_rate": 9.337311224156952e-06,
"loss": 1.2431,
"step": 512
},
{
"epoch": 0.3490834857312976,
"grad_norm": 0.05467355097949484,
"learning_rate": 9.334621362419516e-06,
"loss": 1.19,
"step": 513
},
{
"epoch": 0.34976396036235274,
"grad_norm": 0.04532675094347275,
"learning_rate": 9.331926441712522e-06,
"loss": 1.1435,
"step": 514
},
{
"epoch": 0.3504444349934079,
"grad_norm": 0.19520076722462312,
"learning_rate": 9.32922646518124e-06,
"loss": 1.2298,
"step": 515
},
{
"epoch": 0.3511249096244631,
"grad_norm": 0.04371007629560449,
"learning_rate": 9.326521435976827e-06,
"loss": 1.1918,
"step": 516
},
{
"epoch": 0.3518053842555182,
"grad_norm": 0.041218605155981726,
"learning_rate": 9.323811357256344e-06,
"loss": 1.1903,
"step": 517
},
{
"epoch": 0.3524858588865734,
"grad_norm": 0.04769161322978696,
"learning_rate": 9.32109623218275e-06,
"loss": 1.1919,
"step": 518
},
{
"epoch": 0.35316633351762855,
"grad_norm": 0.04352852116577712,
"learning_rate": 9.31837606392488e-06,
"loss": 1.2385,
"step": 519
},
{
"epoch": 0.3538468081486837,
"grad_norm": 0.04296085894799078,
"learning_rate": 9.315650855657468e-06,
"loss": 1.1762,
"step": 520
},
{
"epoch": 0.3545272827797389,
"grad_norm": 0.062008282831599724,
"learning_rate": 9.312920610561125e-06,
"loss": 1.1741,
"step": 521
},
{
"epoch": 0.355207757410794,
"grad_norm": 0.04475337460583143,
"learning_rate": 9.310185331822338e-06,
"loss": 1.2936,
"step": 522
},
{
"epoch": 0.3558882320418492,
"grad_norm": 0.04335601547887752,
"learning_rate": 9.307445022633476e-06,
"loss": 1.1844,
"step": 523
},
{
"epoch": 0.35656870667290436,
"grad_norm": 0.04763279336067209,
"learning_rate": 9.304699686192771e-06,
"loss": 1.1406,
"step": 524
},
{
"epoch": 0.3572491813039595,
"grad_norm": 0.047035165273336066,
"learning_rate": 9.301949325704326e-06,
"loss": 1.1617,
"step": 525
},
{
"epoch": 0.3579296559350147,
"grad_norm": 0.04427665615849973,
"learning_rate": 9.299193944378112e-06,
"loss": 1.1685,
"step": 526
},
{
"epoch": 0.35861013056606983,
"grad_norm": 0.04803984704079458,
"learning_rate": 9.296433545429951e-06,
"loss": 1.1551,
"step": 527
},
{
"epoch": 0.35929060519712497,
"grad_norm": 0.04228609178520076,
"learning_rate": 9.293668132081528e-06,
"loss": 1.2,
"step": 528
},
{
"epoch": 0.35997107982818016,
"grad_norm": 0.04327688602770503,
"learning_rate": 9.290897707560376e-06,
"loss": 1.1496,
"step": 529
},
{
"epoch": 0.3606515544592353,
"grad_norm": 0.03945827293494703,
"learning_rate": 9.28812227509988e-06,
"loss": 1.1688,
"step": 530
},
{
"epoch": 0.3613320290902905,
"grad_norm": 0.06770668944349958,
"learning_rate": 9.285341837939267e-06,
"loss": 1.1949,
"step": 531
},
{
"epoch": 0.36201250372134564,
"grad_norm": 0.04952482056953356,
"learning_rate": 9.282556399323608e-06,
"loss": 1.1547,
"step": 532
},
{
"epoch": 0.3626929783524008,
"grad_norm": 0.04691822525955649,
"learning_rate": 9.279765962503809e-06,
"loss": 1.2035,
"step": 533
},
{
"epoch": 0.36337345298345597,
"grad_norm": 0.04894000875415613,
"learning_rate": 9.27697053073661e-06,
"loss": 1.145,
"step": 534
},
{
"epoch": 0.3640539276145111,
"grad_norm": 0.04336478998694143,
"learning_rate": 9.27417010728458e-06,
"loss": 1.1325,
"step": 535
},
{
"epoch": 0.3647344022455663,
"grad_norm": 0.04176773876592002,
"learning_rate": 9.271364695416115e-06,
"loss": 1.1835,
"step": 536
},
{
"epoch": 0.36541487687662144,
"grad_norm": 0.05622345966356037,
"learning_rate": 9.268554298405434e-06,
"loss": 1.1717,
"step": 537
},
{
"epoch": 0.3660953515076766,
"grad_norm": 0.043400920584928455,
"learning_rate": 9.26573891953257e-06,
"loss": 1.2127,
"step": 538
},
{
"epoch": 0.3667758261387318,
"grad_norm": 0.05286638961353637,
"learning_rate": 9.262918562083374e-06,
"loss": 1.2216,
"step": 539
},
{
"epoch": 0.3674563007697869,
"grad_norm": 0.056795432935915866,
"learning_rate": 9.260093229349507e-06,
"loss": 1.1489,
"step": 540
},
{
"epoch": 0.3681367754008421,
"grad_norm": 0.04668482939527108,
"learning_rate": 9.25726292462844e-06,
"loss": 1.1994,
"step": 541
},
{
"epoch": 0.36881725003189725,
"grad_norm": 0.044822594591671316,
"learning_rate": 9.254427651223434e-06,
"loss": 1.1824,
"step": 542
},
{
"epoch": 0.3694977246629524,
"grad_norm": 0.04345980827207413,
"learning_rate": 9.251587412443567e-06,
"loss": 1.1459,
"step": 543
},
{
"epoch": 0.3701781992940076,
"grad_norm": 0.06220612289602904,
"learning_rate": 9.248742211603699e-06,
"loss": 1.0962,
"step": 544
},
{
"epoch": 0.3708586739250627,
"grad_norm": 0.04464179454313693,
"learning_rate": 9.245892052024486e-06,
"loss": 1.2087,
"step": 545
},
{
"epoch": 0.3715391485561179,
"grad_norm": 0.0482955122912646,
"learning_rate": 9.243036937032373e-06,
"loss": 1.125,
"step": 546
},
{
"epoch": 0.37221962318717305,
"grad_norm": 0.044665776636380314,
"learning_rate": 9.240176869959582e-06,
"loss": 1.138,
"step": 547
},
{
"epoch": 0.3729000978182282,
"grad_norm": 0.04680059833998865,
"learning_rate": 9.237311854144125e-06,
"loss": 1.2355,
"step": 548
},
{
"epoch": 0.3735805724492834,
"grad_norm": 0.10018674747989316,
"learning_rate": 9.23444189292978e-06,
"loss": 1.1918,
"step": 549
},
{
"epoch": 0.3742610470803385,
"grad_norm": 0.04306609979305801,
"learning_rate": 9.2315669896661e-06,
"loss": 1.1281,
"step": 550
},
{
"epoch": 0.3749415217113937,
"grad_norm": 0.05047293198274693,
"learning_rate": 9.228687147708409e-06,
"loss": 1.1483,
"step": 551
},
{
"epoch": 0.37562199634244886,
"grad_norm": 0.04210396323953163,
"learning_rate": 9.225802370417789e-06,
"loss": 1.1321,
"step": 552
},
{
"epoch": 0.376302470973504,
"grad_norm": 0.06228495287048316,
"learning_rate": 9.222912661161088e-06,
"loss": 1.17,
"step": 553
},
{
"epoch": 0.3769829456045592,
"grad_norm": 0.04060494627163399,
"learning_rate": 9.220018023310908e-06,
"loss": 1.1338,
"step": 554
},
{
"epoch": 0.37766342023561433,
"grad_norm": 0.04117475908146937,
"learning_rate": 9.217118460245602e-06,
"loss": 1.116,
"step": 555
},
{
"epoch": 0.37834389486666953,
"grad_norm": 0.05357625308494766,
"learning_rate": 9.214213975349272e-06,
"loss": 1.154,
"step": 556
},
{
"epoch": 0.37902436949772467,
"grad_norm": 0.04262138167104594,
"learning_rate": 9.211304572011765e-06,
"loss": 1.1779,
"step": 557
},
{
"epoch": 0.3797048441287798,
"grad_norm": 0.05022095152403962,
"learning_rate": 9.208390253628667e-06,
"loss": 1.2275,
"step": 558
},
{
"epoch": 0.380385318759835,
"grad_norm": 0.038697641779660466,
"learning_rate": 9.205471023601302e-06,
"loss": 1.2025,
"step": 559
},
{
"epoch": 0.38106579339089014,
"grad_norm": 0.04837293979857492,
"learning_rate": 9.202546885336725e-06,
"loss": 1.2113,
"step": 560
},
{
"epoch": 0.38174626802194533,
"grad_norm": 0.04709673453076869,
"learning_rate": 9.199617842247718e-06,
"loss": 1.2322,
"step": 561
},
{
"epoch": 0.3824267426530005,
"grad_norm": 0.038353871634371726,
"learning_rate": 9.196683897752794e-06,
"loss": 1.14,
"step": 562
},
{
"epoch": 0.3831072172840556,
"grad_norm": 0.03856451637837848,
"learning_rate": 9.193745055276177e-06,
"loss": 1.1752,
"step": 563
},
{
"epoch": 0.3837876919151108,
"grad_norm": 0.04029222501667547,
"learning_rate": 9.190801318247817e-06,
"loss": 1.134,
"step": 564
},
{
"epoch": 0.38446816654616595,
"grad_norm": 0.04718440155396733,
"learning_rate": 9.18785269010337e-06,
"loss": 1.1904,
"step": 565
},
{
"epoch": 0.3851486411772211,
"grad_norm": 0.047157298136945865,
"learning_rate": 9.184899174284201e-06,
"loss": 1.2019,
"step": 566
},
{
"epoch": 0.3858291158082763,
"grad_norm": 0.040554851143715896,
"learning_rate": 9.181940774237383e-06,
"loss": 1.1578,
"step": 567
},
{
"epoch": 0.3865095904393314,
"grad_norm": 0.04395329490897282,
"learning_rate": 9.178977493415684e-06,
"loss": 1.1691,
"step": 568
},
{
"epoch": 0.3871900650703866,
"grad_norm": 0.0433727263541022,
"learning_rate": 9.176009335277575e-06,
"loss": 1.1697,
"step": 569
},
{
"epoch": 0.38787053970144175,
"grad_norm": 0.04385387344621323,
"learning_rate": 9.173036303287215e-06,
"loss": 1.2413,
"step": 570
},
{
"epoch": 0.3885510143324969,
"grad_norm": 0.04299760073456161,
"learning_rate": 9.17005840091445e-06,
"loss": 1.2259,
"step": 571
},
{
"epoch": 0.3892314889635521,
"grad_norm": 0.04243566940825901,
"learning_rate": 9.167075631634816e-06,
"loss": 1.1939,
"step": 572
},
{
"epoch": 0.3899119635946072,
"grad_norm": 0.04414891882046903,
"learning_rate": 9.164087998929523e-06,
"loss": 1.1575,
"step": 573
},
{
"epoch": 0.3905924382256624,
"grad_norm": 0.03882576400514526,
"learning_rate": 9.16109550628546e-06,
"loss": 1.1904,
"step": 574
},
{
"epoch": 0.39127291285671756,
"grad_norm": 0.05228919194953363,
"learning_rate": 9.15809815719519e-06,
"loss": 1.2166,
"step": 575
},
{
"epoch": 0.3919533874877727,
"grad_norm": 0.03962957960394459,
"learning_rate": 9.155095955156941e-06,
"loss": 1.1888,
"step": 576
},
{
"epoch": 0.3926338621188279,
"grad_norm": 0.041080403985258,
"learning_rate": 9.152088903674605e-06,
"loss": 1.1468,
"step": 577
},
{
"epoch": 0.39331433674988303,
"grad_norm": 0.03904444499346437,
"learning_rate": 9.149077006257734e-06,
"loss": 1.2097,
"step": 578
},
{
"epoch": 0.3939948113809382,
"grad_norm": 0.055258121836283904,
"learning_rate": 9.14606026642154e-06,
"loss": 1.183,
"step": 579
},
{
"epoch": 0.39467528601199336,
"grad_norm": 0.043950342680780415,
"learning_rate": 9.143038687686877e-06,
"loss": 1.2286,
"step": 580
},
{
"epoch": 0.3953557606430485,
"grad_norm": 0.04326934798265069,
"learning_rate": 9.140012273580261e-06,
"loss": 1.1461,
"step": 581
},
{
"epoch": 0.3960362352741037,
"grad_norm": 0.11342425414721487,
"learning_rate": 9.136981027633834e-06,
"loss": 1.1621,
"step": 582
},
{
"epoch": 0.39671670990515884,
"grad_norm": 0.045256571639823724,
"learning_rate": 9.133944953385392e-06,
"loss": 1.1601,
"step": 583
},
{
"epoch": 0.39739718453621403,
"grad_norm": 0.052853385155972675,
"learning_rate": 9.130904054378358e-06,
"loss": 1.1266,
"step": 584
},
{
"epoch": 0.39807765916726917,
"grad_norm": 0.044595902031428894,
"learning_rate": 9.127858334161789e-06,
"loss": 1.1782,
"step": 585
},
{
"epoch": 0.3987581337983243,
"grad_norm": 0.050516667255890005,
"learning_rate": 9.124807796290366e-06,
"loss": 1.2294,
"step": 586
},
{
"epoch": 0.3994386084293795,
"grad_norm": 0.03737860442207068,
"learning_rate": 9.1217524443244e-06,
"loss": 1.1784,
"step": 587
},
{
"epoch": 0.40011908306043464,
"grad_norm": 0.04225329960507096,
"learning_rate": 9.118692281829813e-06,
"loss": 1.1505,
"step": 588
},
{
"epoch": 0.40079955769148984,
"grad_norm": 0.050763619906775,
"learning_rate": 9.115627312378141e-06,
"loss": 1.1469,
"step": 589
},
{
"epoch": 0.401480032322545,
"grad_norm": 0.05617411574200467,
"learning_rate": 9.112557539546535e-06,
"loss": 1.1776,
"step": 590
},
{
"epoch": 0.4021605069536001,
"grad_norm": 0.07658515208497231,
"learning_rate": 9.109482966917753e-06,
"loss": 1.1436,
"step": 591
},
{
"epoch": 0.4028409815846553,
"grad_norm": 0.05122117400200576,
"learning_rate": 9.10640359808015e-06,
"loss": 1.2159,
"step": 592
},
{
"epoch": 0.40352145621571045,
"grad_norm": 0.042423364800815176,
"learning_rate": 9.10331943662768e-06,
"loss": 1.1828,
"step": 593
},
{
"epoch": 0.40420193084676564,
"grad_norm": 0.05370052513597702,
"learning_rate": 9.100230486159893e-06,
"loss": 1.1537,
"step": 594
},
{
"epoch": 0.4048824054778208,
"grad_norm": 0.043549532367692975,
"learning_rate": 9.097136750281925e-06,
"loss": 1.1416,
"step": 595
},
{
"epoch": 0.4055628801088759,
"grad_norm": 0.04895296341337781,
"learning_rate": 9.094038232604499e-06,
"loss": 1.1718,
"step": 596
},
{
"epoch": 0.4062433547399311,
"grad_norm": 0.05538114190435457,
"learning_rate": 9.090934936743919e-06,
"loss": 1.162,
"step": 597
},
{
"epoch": 0.40692382937098626,
"grad_norm": 0.04557565748071047,
"learning_rate": 9.087826866322065e-06,
"loss": 1.1355,
"step": 598
},
{
"epoch": 0.40760430400204145,
"grad_norm": 0.04378712655891265,
"learning_rate": 9.084714024966387e-06,
"loss": 1.2115,
"step": 599
},
{
"epoch": 0.4082847786330966,
"grad_norm": 0.04170201899514444,
"learning_rate": 9.081596416309913e-06,
"loss": 1.2017,
"step": 600
},
{
"epoch": 0.40896525326415173,
"grad_norm": 0.04861323345657344,
"learning_rate": 9.07847404399122e-06,
"loss": 1.1319,
"step": 601
},
{
"epoch": 0.4096457278952069,
"grad_norm": 0.041723471269353364,
"learning_rate": 9.075346911654456e-06,
"loss": 1.2286,
"step": 602
},
{
"epoch": 0.41032620252626206,
"grad_norm": 0.04335204371295228,
"learning_rate": 9.072215022949323e-06,
"loss": 1.1921,
"step": 603
},
{
"epoch": 0.4110066771573172,
"grad_norm": 0.04528272419703535,
"learning_rate": 9.069078381531067e-06,
"loss": 1.2223,
"step": 604
},
{
"epoch": 0.4116871517883724,
"grad_norm": 0.05715384377035942,
"learning_rate": 9.06593699106049e-06,
"loss": 1.0998,
"step": 605
},
{
"epoch": 0.41236762641942754,
"grad_norm": 0.04669873043406728,
"learning_rate": 9.062790855203932e-06,
"loss": 1.2269,
"step": 606
},
{
"epoch": 0.41304810105048273,
"grad_norm": 0.04030013224679412,
"learning_rate": 9.059639977633272e-06,
"loss": 1.1337,
"step": 607
},
{
"epoch": 0.41372857568153787,
"grad_norm": 0.04483399795097766,
"learning_rate": 9.056484362025922e-06,
"loss": 1.1496,
"step": 608
},
{
"epoch": 0.414409050312593,
"grad_norm": 0.04135748479654297,
"learning_rate": 9.053324012064826e-06,
"loss": 1.1519,
"step": 609
},
{
"epoch": 0.4150895249436482,
"grad_norm": 0.038852153018098134,
"learning_rate": 9.050158931438451e-06,
"loss": 1.2013,
"step": 610
},
{
"epoch": 0.41576999957470334,
"grad_norm": 0.04120197609440938,
"learning_rate": 9.046989123840787e-06,
"loss": 1.092,
"step": 611
},
{
"epoch": 0.41645047420575854,
"grad_norm": 0.0632098125332872,
"learning_rate": 9.043814592971345e-06,
"loss": 1.1927,
"step": 612
},
{
"epoch": 0.4171309488368137,
"grad_norm": 0.05564769512803907,
"learning_rate": 9.040635342535138e-06,
"loss": 1.156,
"step": 613
},
{
"epoch": 0.4178114234678688,
"grad_norm": 0.04572723207846192,
"learning_rate": 9.037451376242696e-06,
"loss": 1.1422,
"step": 614
},
{
"epoch": 0.418491898098924,
"grad_norm": 0.04550948417572969,
"learning_rate": 9.03426269781005e-06,
"loss": 1.1934,
"step": 615
},
{
"epoch": 0.41917237272997915,
"grad_norm": 0.04731619409086017,
"learning_rate": 9.031069310958733e-06,
"loss": 1.1733,
"step": 616
},
{
"epoch": 0.41985284736103434,
"grad_norm": 0.044246541580062755,
"learning_rate": 9.027871219415768e-06,
"loss": 1.1082,
"step": 617
},
{
"epoch": 0.4205333219920895,
"grad_norm": 0.04820982477572937,
"learning_rate": 9.024668426913671e-06,
"loss": 1.1309,
"step": 618
},
{
"epoch": 0.4212137966231446,
"grad_norm": 0.08383658208257669,
"learning_rate": 9.021460937190452e-06,
"loss": 1.1564,
"step": 619
},
{
"epoch": 0.4218942712541998,
"grad_norm": 0.05604400126597725,
"learning_rate": 9.018248753989589e-06,
"loss": 1.1474,
"step": 620
},
{
"epoch": 0.42257474588525495,
"grad_norm": 0.05356260018739531,
"learning_rate": 9.015031881060049e-06,
"loss": 1.1796,
"step": 621
},
{
"epoch": 0.42325522051631015,
"grad_norm": 0.04238920741626305,
"learning_rate": 9.011810322156269e-06,
"loss": 1.2104,
"step": 622
},
{
"epoch": 0.4239356951473653,
"grad_norm": 0.04894978259042282,
"learning_rate": 9.008584081038154e-06,
"loss": 1.1594,
"step": 623
},
{
"epoch": 0.4246161697784204,
"grad_norm": 0.04426277481058919,
"learning_rate": 9.005353161471075e-06,
"loss": 1.2169,
"step": 624
},
{
"epoch": 0.4252966444094756,
"grad_norm": 0.043070948426717666,
"learning_rate": 9.002117567225864e-06,
"loss": 1.1211,
"step": 625
},
{
"epoch": 0.42597711904053076,
"grad_norm": 0.05303928667483388,
"learning_rate": 8.998877302078803e-06,
"loss": 1.149,
"step": 626
},
{
"epoch": 0.42665759367158596,
"grad_norm": 0.04516784837517743,
"learning_rate": 8.995632369811637e-06,
"loss": 1.1671,
"step": 627
},
{
"epoch": 0.4273380683026411,
"grad_norm": 0.04548405871251726,
"learning_rate": 8.992382774211546e-06,
"loss": 1.1877,
"step": 628
},
{
"epoch": 0.42801854293369623,
"grad_norm": 0.04781155444112428,
"learning_rate": 8.98912851907116e-06,
"loss": 1.1783,
"step": 629
},
{
"epoch": 0.42869901756475143,
"grad_norm": 0.052231441639144574,
"learning_rate": 8.985869608188545e-06,
"loss": 1.1696,
"step": 630
},
{
"epoch": 0.42937949219580657,
"grad_norm": 0.0431022135401387,
"learning_rate": 8.982606045367197e-06,
"loss": 1.1694,
"step": 631
},
{
"epoch": 0.43005996682686176,
"grad_norm": 0.04575883042207237,
"learning_rate": 8.97933783441605e-06,
"loss": 1.1323,
"step": 632
},
{
"epoch": 0.4307404414579169,
"grad_norm": 0.04896257001899479,
"learning_rate": 8.976064979149455e-06,
"loss": 1.1249,
"step": 633
},
{
"epoch": 0.43142091608897204,
"grad_norm": 0.04389504944422267,
"learning_rate": 8.97278748338719e-06,
"loss": 1.159,
"step": 634
},
{
"epoch": 0.43210139072002723,
"grad_norm": 0.05622702110796912,
"learning_rate": 8.969505350954437e-06,
"loss": 1.2003,
"step": 635
},
{
"epoch": 0.4327818653510824,
"grad_norm": 0.04040018206262394,
"learning_rate": 8.966218585681807e-06,
"loss": 1.1349,
"step": 636
},
{
"epoch": 0.43346233998213757,
"grad_norm": 0.047477365580878915,
"learning_rate": 8.962927191405303e-06,
"loss": 1.2024,
"step": 637
},
{
"epoch": 0.4341428146131927,
"grad_norm": 0.048555827930069,
"learning_rate": 8.95963117196634e-06,
"loss": 1.2557,
"step": 638
},
{
"epoch": 0.43482328924424785,
"grad_norm": 0.054659375585268836,
"learning_rate": 8.956330531211722e-06,
"loss": 1.188,
"step": 639
},
{
"epoch": 0.43550376387530304,
"grad_norm": 0.040645473042345025,
"learning_rate": 8.953025272993658e-06,
"loss": 1.1376,
"step": 640
},
{
"epoch": 0.4361842385063582,
"grad_norm": 0.046111292563565255,
"learning_rate": 8.949715401169736e-06,
"loss": 1.2019,
"step": 641
},
{
"epoch": 0.4368647131374133,
"grad_norm": 0.054079384073597254,
"learning_rate": 8.946400919602933e-06,
"loss": 1.0977,
"step": 642
},
{
"epoch": 0.4375451877684685,
"grad_norm": 0.05731924786742128,
"learning_rate": 8.943081832161609e-06,
"loss": 1.1544,
"step": 643
},
{
"epoch": 0.43822566239952365,
"grad_norm": 0.05630874065668251,
"learning_rate": 8.939758142719492e-06,
"loss": 1.1918,
"step": 644
},
{
"epoch": 0.43890613703057885,
"grad_norm": 0.03816990143767904,
"learning_rate": 8.936429855155689e-06,
"loss": 1.2012,
"step": 645
},
{
"epoch": 0.439586611661634,
"grad_norm": 0.04400978621701978,
"learning_rate": 8.933096973354665e-06,
"loss": 1.1453,
"step": 646
},
{
"epoch": 0.4402670862926891,
"grad_norm": 0.04665880271204441,
"learning_rate": 8.929759501206256e-06,
"loss": 1.1265,
"step": 647
},
{
"epoch": 0.4409475609237443,
"grad_norm": 0.043030911029214435,
"learning_rate": 8.926417442605648e-06,
"loss": 1.135,
"step": 648
},
{
"epoch": 0.44162803555479946,
"grad_norm": 0.04123644734185817,
"learning_rate": 8.923070801453387e-06,
"loss": 1.2076,
"step": 649
},
{
"epoch": 0.44230851018585465,
"grad_norm": 0.04506167142447191,
"learning_rate": 8.919719581655357e-06,
"loss": 1.1844,
"step": 650
},
{
"epoch": 0.4429889848169098,
"grad_norm": 0.046921451751882365,
"learning_rate": 8.916363787122799e-06,
"loss": 1.1617,
"step": 651
},
{
"epoch": 0.44366945944796493,
"grad_norm": 0.04190142210943615,
"learning_rate": 8.913003421772281e-06,
"loss": 1.1999,
"step": 652
},
{
"epoch": 0.4443499340790201,
"grad_norm": 0.04257383871526422,
"learning_rate": 8.909638489525716e-06,
"loss": 1.1247,
"step": 653
},
{
"epoch": 0.44503040871007526,
"grad_norm": 0.04284067496617002,
"learning_rate": 8.906268994310339e-06,
"loss": 1.126,
"step": 654
},
{
"epoch": 0.44571088334113046,
"grad_norm": 0.04146605514930847,
"learning_rate": 8.902894940058711e-06,
"loss": 1.147,
"step": 655
},
{
"epoch": 0.4463913579721856,
"grad_norm": 0.039662388499429706,
"learning_rate": 8.89951633070872e-06,
"loss": 1.1941,
"step": 656
},
{
"epoch": 0.44707183260324074,
"grad_norm": 0.04418598970569444,
"learning_rate": 8.896133170203568e-06,
"loss": 1.2457,
"step": 657
},
{
"epoch": 0.44775230723429593,
"grad_norm": 0.10906792539732048,
"learning_rate": 8.892745462491763e-06,
"loss": 1.1104,
"step": 658
},
{
"epoch": 0.44843278186535107,
"grad_norm": 0.0424440730081983,
"learning_rate": 8.889353211527127e-06,
"loss": 1.2128,
"step": 659
},
{
"epoch": 0.44911325649640627,
"grad_norm": 0.03940467016530127,
"learning_rate": 8.88595642126878e-06,
"loss": 1.1276,
"step": 660
},
{
"epoch": 0.4497937311274614,
"grad_norm": 0.04242304735361312,
"learning_rate": 8.882555095681146e-06,
"loss": 1.153,
"step": 661
},
{
"epoch": 0.45047420575851654,
"grad_norm": 0.045990540093621315,
"learning_rate": 8.879149238733932e-06,
"loss": 1.1676,
"step": 662
},
{
"epoch": 0.45115468038957174,
"grad_norm": 0.038752849130074755,
"learning_rate": 8.875738854402145e-06,
"loss": 1.1658,
"step": 663
},
{
"epoch": 0.4518351550206269,
"grad_norm": 0.049091711940893935,
"learning_rate": 8.872323946666068e-06,
"loss": 1.2261,
"step": 664
},
{
"epoch": 0.45251562965168207,
"grad_norm": 0.04436510007180478,
"learning_rate": 8.868904519511265e-06,
"loss": 1.1589,
"step": 665
},
{
"epoch": 0.4531961042827372,
"grad_norm": 0.04529044535448883,
"learning_rate": 8.865480576928578e-06,
"loss": 1.1663,
"step": 666
},
{
"epoch": 0.45387657891379235,
"grad_norm": 0.04350982910961256,
"learning_rate": 8.862052122914113e-06,
"loss": 1.0842,
"step": 667
},
{
"epoch": 0.45455705354484754,
"grad_norm": 0.04415118922290587,
"learning_rate": 8.858619161469246e-06,
"loss": 1.1682,
"step": 668
},
{
"epoch": 0.4552375281759027,
"grad_norm": 0.04102662678874476,
"learning_rate": 8.855181696600615e-06,
"loss": 1.225,
"step": 669
},
{
"epoch": 0.4559180028069579,
"grad_norm": 0.03978611381793182,
"learning_rate": 8.851739732320109e-06,
"loss": 1.2356,
"step": 670
},
{
"epoch": 0.456598477438013,
"grad_norm": 0.03808971840706306,
"learning_rate": 8.84829327264487e-06,
"loss": 1.1477,
"step": 671
},
{
"epoch": 0.45727895206906816,
"grad_norm": 0.04093130739375452,
"learning_rate": 8.844842321597289e-06,
"loss": 1.1476,
"step": 672
},
{
"epoch": 0.45795942670012335,
"grad_norm": 0.0397782234125982,
"learning_rate": 8.841386883204996e-06,
"loss": 1.1601,
"step": 673
},
{
"epoch": 0.4586399013311785,
"grad_norm": 0.04360741298869491,
"learning_rate": 8.83792696150086e-06,
"loss": 1.1521,
"step": 674
},
{
"epoch": 0.4593203759622337,
"grad_norm": 0.03973179592791577,
"learning_rate": 8.834462560522983e-06,
"loss": 1.1552,
"step": 675
},
{
"epoch": 0.4600008505932888,
"grad_norm": 0.0537116714743348,
"learning_rate": 8.83099368431469e-06,
"loss": 1.1733,
"step": 676
},
{
"epoch": 0.46068132522434396,
"grad_norm": 0.04194800221614982,
"learning_rate": 8.827520336924539e-06,
"loss": 1.1144,
"step": 677
},
{
"epoch": 0.46136179985539916,
"grad_norm": 0.03962763960599941,
"learning_rate": 8.824042522406295e-06,
"loss": 1.1887,
"step": 678
},
{
"epoch": 0.4620422744864543,
"grad_norm": 0.05396619916684024,
"learning_rate": 8.820560244818943e-06,
"loss": 1.186,
"step": 679
},
{
"epoch": 0.46272274911750944,
"grad_norm": 0.03922059133562145,
"learning_rate": 8.817073508226677e-06,
"loss": 1.1002,
"step": 680
},
{
"epoch": 0.46340322374856463,
"grad_norm": 0.04739443119639226,
"learning_rate": 8.813582316698892e-06,
"loss": 1.2155,
"step": 681
},
{
"epoch": 0.46408369837961977,
"grad_norm": 0.04637500866593254,
"learning_rate": 8.810086674310184e-06,
"loss": 1.1297,
"step": 682
},
{
"epoch": 0.46476417301067496,
"grad_norm": 0.04244486346065373,
"learning_rate": 8.806586585140346e-06,
"loss": 1.1765,
"step": 683
},
{
"epoch": 0.4654446476417301,
"grad_norm": 0.04328360465585307,
"learning_rate": 8.803082053274357e-06,
"loss": 1.2268,
"step": 684
},
{
"epoch": 0.46612512227278524,
"grad_norm": 0.05890001367764087,
"learning_rate": 8.799573082802384e-06,
"loss": 1.1906,
"step": 685
},
{
"epoch": 0.46680559690384044,
"grad_norm": 0.041636831966172676,
"learning_rate": 8.796059677819773e-06,
"loss": 1.1916,
"step": 686
},
{
"epoch": 0.4674860715348956,
"grad_norm": 0.045474654484024504,
"learning_rate": 8.792541842427043e-06,
"loss": 1.1641,
"step": 687
},
{
"epoch": 0.46816654616595077,
"grad_norm": 0.04576863707481716,
"learning_rate": 8.789019580729889e-06,
"loss": 1.1878,
"step": 688
},
{
"epoch": 0.4688470207970059,
"grad_norm": 0.06478144940333726,
"learning_rate": 8.78549289683917e-06,
"loss": 1.2141,
"step": 689
},
{
"epoch": 0.46952749542806105,
"grad_norm": 0.043456374490342724,
"learning_rate": 8.781961794870903e-06,
"loss": 1.2166,
"step": 690
},
{
"epoch": 0.47020797005911624,
"grad_norm": 0.049269380160544426,
"learning_rate": 8.778426278946266e-06,
"loss": 1.1901,
"step": 691
},
{
"epoch": 0.4708884446901714,
"grad_norm": 0.044225703845258736,
"learning_rate": 8.774886353191587e-06,
"loss": 1.1272,
"step": 692
},
{
"epoch": 0.4715689193212266,
"grad_norm": 0.05178632319183577,
"learning_rate": 8.771342021738338e-06,
"loss": 1.165,
"step": 693
},
{
"epoch": 0.4722493939522817,
"grad_norm": 0.04521221449695577,
"learning_rate": 8.767793288723137e-06,
"loss": 1.1625,
"step": 694
},
{
"epoch": 0.47292986858333685,
"grad_norm": 0.04707321365536953,
"learning_rate": 8.764240158287738e-06,
"loss": 1.2437,
"step": 695
},
{
"epoch": 0.47361034321439205,
"grad_norm": 0.07074717036076185,
"learning_rate": 8.760682634579023e-06,
"loss": 1.1775,
"step": 696
},
{
"epoch": 0.4742908178454472,
"grad_norm": 0.040897170854665654,
"learning_rate": 8.757120721749008e-06,
"loss": 1.1933,
"step": 697
},
{
"epoch": 0.4749712924765024,
"grad_norm": 0.048850521888701544,
"learning_rate": 8.753554423954828e-06,
"loss": 1.1267,
"step": 698
},
{
"epoch": 0.4756517671075575,
"grad_norm": 0.06210572694991292,
"learning_rate": 8.749983745358737e-06,
"loss": 1.1855,
"step": 699
},
{
"epoch": 0.47633224173861266,
"grad_norm": 0.048857526652923464,
"learning_rate": 8.746408690128098e-06,
"loss": 1.1665,
"step": 700
},
{
"epoch": 0.47701271636966786,
"grad_norm": 0.0414540516695226,
"learning_rate": 8.74282926243539e-06,
"loss": 1.1863,
"step": 701
},
{
"epoch": 0.477693191000723,
"grad_norm": 0.055584695327975414,
"learning_rate": 8.739245466458187e-06,
"loss": 1.1967,
"step": 702
},
{
"epoch": 0.4783736656317782,
"grad_norm": 0.04030451886055771,
"learning_rate": 8.735657306379163e-06,
"loss": 1.2083,
"step": 703
},
{
"epoch": 0.47905414026283333,
"grad_norm": 0.04719451383173251,
"learning_rate": 8.73206478638609e-06,
"loss": 1.1757,
"step": 704
},
{
"epoch": 0.47973461489388847,
"grad_norm": 0.04763825535086207,
"learning_rate": 8.728467910671824e-06,
"loss": 1.1618,
"step": 705
},
{
"epoch": 0.48041508952494366,
"grad_norm": 0.044435320565230585,
"learning_rate": 8.72486668343431e-06,
"loss": 1.1431,
"step": 706
},
{
"epoch": 0.4810955641559988,
"grad_norm": 0.040290417518756755,
"learning_rate": 8.72126110887656e-06,
"loss": 1.209,
"step": 707
},
{
"epoch": 0.481776038787054,
"grad_norm": 0.04438079005488205,
"learning_rate": 8.717651191206675e-06,
"loss": 1.1499,
"step": 708
},
{
"epoch": 0.48245651341810913,
"grad_norm": 0.04056501410888882,
"learning_rate": 8.714036934637811e-06,
"loss": 1.1974,
"step": 709
},
{
"epoch": 0.4831369880491643,
"grad_norm": 0.04206798497439385,
"learning_rate": 8.7104183433882e-06,
"loss": 1.2026,
"step": 710
},
{
"epoch": 0.48381746268021947,
"grad_norm": 0.05388403958303696,
"learning_rate": 8.706795421681123e-06,
"loss": 1.1419,
"step": 711
},
{
"epoch": 0.4844979373112746,
"grad_norm": 0.05393628641759076,
"learning_rate": 8.703168173744922e-06,
"loss": 1.1001,
"step": 712
},
{
"epoch": 0.4851784119423298,
"grad_norm": 0.038014835242398314,
"learning_rate": 8.699536603812985e-06,
"loss": 1.1714,
"step": 713
},
{
"epoch": 0.48585888657338494,
"grad_norm": 0.04159366635634062,
"learning_rate": 8.695900716123744e-06,
"loss": 1.1331,
"step": 714
},
{
"epoch": 0.4865393612044401,
"grad_norm": 0.04599652383543539,
"learning_rate": 8.692260514920673e-06,
"loss": 1.1626,
"step": 715
},
{
"epoch": 0.4872198358354953,
"grad_norm": 0.04701383530546749,
"learning_rate": 8.688616004452277e-06,
"loss": 1.1981,
"step": 716
},
{
"epoch": 0.4879003104665504,
"grad_norm": 0.04314451118569952,
"learning_rate": 8.684967188972092e-06,
"loss": 1.2204,
"step": 717
},
{
"epoch": 0.48858078509760555,
"grad_norm": 0.04650286004160289,
"learning_rate": 8.681314072738678e-06,
"loss": 1.1316,
"step": 718
},
{
"epoch": 0.48926125972866075,
"grad_norm": 0.051839407512247186,
"learning_rate": 8.677656660015616e-06,
"loss": 1.1252,
"step": 719
},
{
"epoch": 0.4899417343597159,
"grad_norm": 0.046793968679010596,
"learning_rate": 8.6739949550715e-06,
"loss": 1.1836,
"step": 720
},
{
"epoch": 0.4906222089907711,
"grad_norm": 0.05835913550371476,
"learning_rate": 8.670328962179933e-06,
"loss": 1.1668,
"step": 721
},
{
"epoch": 0.4913026836218262,
"grad_norm": 0.05277610249888075,
"learning_rate": 8.666658685619523e-06,
"loss": 1.1915,
"step": 722
},
{
"epoch": 0.49198315825288136,
"grad_norm": 0.0439462908397666,
"learning_rate": 8.662984129673878e-06,
"loss": 1.2047,
"step": 723
},
{
"epoch": 0.49266363288393655,
"grad_norm": 0.04834786737069348,
"learning_rate": 8.6593052986316e-06,
"loss": 1.1874,
"step": 724
},
{
"epoch": 0.4933441075149917,
"grad_norm": 0.04629847715145555,
"learning_rate": 8.655622196786281e-06,
"loss": 1.2091,
"step": 725
},
{
"epoch": 0.4940245821460469,
"grad_norm": 0.05364431842807492,
"learning_rate": 8.651934828436497e-06,
"loss": 1.137,
"step": 726
},
{
"epoch": 0.494705056777102,
"grad_norm": 0.044196454811106574,
"learning_rate": 8.648243197885805e-06,
"loss": 1.1291,
"step": 727
},
{
"epoch": 0.49538553140815716,
"grad_norm": 0.05481533638606025,
"learning_rate": 8.644547309442734e-06,
"loss": 1.2078,
"step": 728
},
{
"epoch": 0.49606600603921236,
"grad_norm": 0.03746499887219231,
"learning_rate": 8.640847167420782e-06,
"loss": 1.1583,
"step": 729
},
{
"epoch": 0.4967464806702675,
"grad_norm": 0.04187639740119247,
"learning_rate": 8.637142776138415e-06,
"loss": 1.173,
"step": 730
},
{
"epoch": 0.4974269553013227,
"grad_norm": 0.03998133984926249,
"learning_rate": 8.633434139919054e-06,
"loss": 1.1314,
"step": 731
},
{
"epoch": 0.49810742993237783,
"grad_norm": 0.04252691510243095,
"learning_rate": 8.62972126309108e-06,
"loss": 1.1895,
"step": 732
},
{
"epoch": 0.49878790456343297,
"grad_norm": 0.04184153795045174,
"learning_rate": 8.626004149987816e-06,
"loss": 1.1987,
"step": 733
},
{
"epoch": 0.49946837919448817,
"grad_norm": 0.041100921428193674,
"learning_rate": 8.622282804947537e-06,
"loss": 1.1745,
"step": 734
},
{
"epoch": 0.5001488538255433,
"grad_norm": 0.04508926878431586,
"learning_rate": 8.61855723231345e-06,
"loss": 1.1887,
"step": 735
},
{
"epoch": 0.5008293284565984,
"grad_norm": 0.04409577879983121,
"learning_rate": 8.614827436433699e-06,
"loss": 1.1979,
"step": 736
},
{
"epoch": 0.5015098030876537,
"grad_norm": 0.03695764179065035,
"learning_rate": 8.611093421661358e-06,
"loss": 1.1751,
"step": 737
},
{
"epoch": 0.5021902777187088,
"grad_norm": 0.053081097299524024,
"learning_rate": 8.607355192354425e-06,
"loss": 1.1036,
"step": 738
},
{
"epoch": 0.502870752349764,
"grad_norm": 0.04241717636125699,
"learning_rate": 8.603612752875816e-06,
"loss": 1.1677,
"step": 739
},
{
"epoch": 0.5035512269808191,
"grad_norm": 0.03994841133844269,
"learning_rate": 8.599866107593358e-06,
"loss": 1.1408,
"step": 740
},
{
"epoch": 0.5042317016118743,
"grad_norm": 0.04832878049821652,
"learning_rate": 8.596115260879792e-06,
"loss": 1.1854,
"step": 741
},
{
"epoch": 0.5049121762429294,
"grad_norm": 0.043935479378240015,
"learning_rate": 8.592360217112759e-06,
"loss": 1.1735,
"step": 742
},
{
"epoch": 0.5055926508739846,
"grad_norm": 0.03876754371980919,
"learning_rate": 8.588600980674796e-06,
"loss": 1.1186,
"step": 743
},
{
"epoch": 0.5062731255050398,
"grad_norm": 0.04326584981845795,
"learning_rate": 8.584837555953342e-06,
"loss": 1.1096,
"step": 744
},
{
"epoch": 0.5069536001360949,
"grad_norm": 0.04322087234927761,
"learning_rate": 8.581069947340715e-06,
"loss": 1.2033,
"step": 745
},
{
"epoch": 0.5076340747671501,
"grad_norm": 0.05173559138768165,
"learning_rate": 8.57729815923412e-06,
"loss": 1.1938,
"step": 746
},
{
"epoch": 0.5083145493982052,
"grad_norm": 0.06449777733963566,
"learning_rate": 8.57352219603564e-06,
"loss": 1.1486,
"step": 747
},
{
"epoch": 0.5089950240292604,
"grad_norm": 0.04675747699296703,
"learning_rate": 8.569742062152229e-06,
"loss": 1.1599,
"step": 748
},
{
"epoch": 0.5096754986603156,
"grad_norm": 0.03849730253733784,
"learning_rate": 8.565957761995713e-06,
"loss": 1.1435,
"step": 749
},
{
"epoch": 0.5103559732913707,
"grad_norm": 0.03856499927206122,
"learning_rate": 8.562169299982776e-06,
"loss": 1.1163,
"step": 750
},
{
"epoch": 0.5110364479224259,
"grad_norm": 0.03896826300900588,
"learning_rate": 8.558376680534959e-06,
"loss": 1.1434,
"step": 751
},
{
"epoch": 0.511716922553481,
"grad_norm": 0.04122931215472077,
"learning_rate": 8.55457990807866e-06,
"loss": 1.1932,
"step": 752
},
{
"epoch": 0.5123973971845363,
"grad_norm": 0.05047069333478561,
"learning_rate": 8.55077898704512e-06,
"loss": 1.2146,
"step": 753
},
{
"epoch": 0.5130778718155914,
"grad_norm": 0.06501739740811575,
"learning_rate": 8.546973921870421e-06,
"loss": 1.1243,
"step": 754
},
{
"epoch": 0.5137583464466465,
"grad_norm": 0.04916876628412371,
"learning_rate": 8.543164716995485e-06,
"loss": 1.1483,
"step": 755
},
{
"epoch": 0.5144388210777017,
"grad_norm": 0.04363604007516074,
"learning_rate": 8.539351376866066e-06,
"loss": 1.1375,
"step": 756
},
{
"epoch": 0.5151192957087568,
"grad_norm": 0.04021328761311774,
"learning_rate": 8.535533905932739e-06,
"loss": 1.191,
"step": 757
},
{
"epoch": 0.5157997703398121,
"grad_norm": 0.05141232392473441,
"learning_rate": 8.531712308650904e-06,
"loss": 1.2164,
"step": 758
},
{
"epoch": 0.5164802449708672,
"grad_norm": 0.045271342705341014,
"learning_rate": 8.527886589480779e-06,
"loss": 1.168,
"step": 759
},
{
"epoch": 0.5171607196019223,
"grad_norm": 0.03874295659270498,
"learning_rate": 8.524056752887385e-06,
"loss": 1.199,
"step": 760
},
{
"epoch": 0.5178411942329775,
"grad_norm": 0.05126557194804418,
"learning_rate": 8.520222803340557e-06,
"loss": 1.1758,
"step": 761
},
{
"epoch": 0.5185216688640326,
"grad_norm": 0.05042131118814898,
"learning_rate": 8.516384745314926e-06,
"loss": 1.2066,
"step": 762
},
{
"epoch": 0.5192021434950879,
"grad_norm": 0.045232732616894734,
"learning_rate": 8.512542583289918e-06,
"loss": 1.2174,
"step": 763
},
{
"epoch": 0.519882618126143,
"grad_norm": 0.04619546370697277,
"learning_rate": 8.508696321749752e-06,
"loss": 1.0822,
"step": 764
},
{
"epoch": 0.5205630927571981,
"grad_norm": 0.04503407035517795,
"learning_rate": 8.504845965183425e-06,
"loss": 1.1598,
"step": 765
},
{
"epoch": 0.5212435673882533,
"grad_norm": 0.046533137015248104,
"learning_rate": 8.50099151808472e-06,
"loss": 1.1587,
"step": 766
},
{
"epoch": 0.5219240420193084,
"grad_norm": 0.04467925059354398,
"learning_rate": 8.497132984952193e-06,
"loss": 1.1864,
"step": 767
},
{
"epoch": 0.5226045166503637,
"grad_norm": 0.07178437484801423,
"learning_rate": 8.493270370289164e-06,
"loss": 1.1798,
"step": 768
},
{
"epoch": 0.5232849912814188,
"grad_norm": 0.043296657654657934,
"learning_rate": 8.489403678603722e-06,
"loss": 1.155,
"step": 769
},
{
"epoch": 0.523965465912474,
"grad_norm": 0.04222421990019537,
"learning_rate": 8.485532914408712e-06,
"loss": 1.1231,
"step": 770
},
{
"epoch": 0.5246459405435291,
"grad_norm": 0.1841013854808659,
"learning_rate": 8.481658082221731e-06,
"loss": 1.1786,
"step": 771
},
{
"epoch": 0.5253264151745842,
"grad_norm": 0.04266946865706051,
"learning_rate": 8.477779186565125e-06,
"loss": 1.1543,
"step": 772
},
{
"epoch": 0.5260068898056395,
"grad_norm": 0.04197901267584969,
"learning_rate": 8.473896231965986e-06,
"loss": 1.1929,
"step": 773
},
{
"epoch": 0.5266873644366946,
"grad_norm": 0.05041076462281314,
"learning_rate": 8.470009222956138e-06,
"loss": 1.1678,
"step": 774
},
{
"epoch": 0.5273678390677498,
"grad_norm": 0.041756207467147366,
"learning_rate": 8.466118164072136e-06,
"loss": 1.1429,
"step": 775
},
{
"epoch": 0.5280483136988049,
"grad_norm": 0.04053005066881133,
"learning_rate": 8.462223059855268e-06,
"loss": 1.173,
"step": 776
},
{
"epoch": 0.52872878832986,
"grad_norm": 0.04058960808847781,
"learning_rate": 8.458323914851538e-06,
"loss": 1.1985,
"step": 777
},
{
"epoch": 0.5294092629609153,
"grad_norm": 0.07665799540243598,
"learning_rate": 8.45442073361167e-06,
"loss": 1.1475,
"step": 778
},
{
"epoch": 0.5300897375919704,
"grad_norm": 0.041452895159887655,
"learning_rate": 8.450513520691092e-06,
"loss": 1.1557,
"step": 779
},
{
"epoch": 0.5307702122230256,
"grad_norm": 0.05551330076968528,
"learning_rate": 8.446602280649947e-06,
"loss": 1.1192,
"step": 780
},
{
"epoch": 0.5314506868540807,
"grad_norm": 0.05125405070156681,
"learning_rate": 8.442687018053071e-06,
"loss": 1.1216,
"step": 781
},
{
"epoch": 0.5321311614851358,
"grad_norm": 0.050472767925999515,
"learning_rate": 8.438767737469995e-06,
"loss": 1.1259,
"step": 782
},
{
"epoch": 0.5328116361161911,
"grad_norm": 0.046872210857747565,
"learning_rate": 8.434844443474943e-06,
"loss": 1.1789,
"step": 783
},
{
"epoch": 0.5334921107472462,
"grad_norm": 0.061293610059609004,
"learning_rate": 8.430917140646821e-06,
"loss": 1.1632,
"step": 784
},
{
"epoch": 0.5341725853783014,
"grad_norm": 0.03912377936631386,
"learning_rate": 8.426985833569214e-06,
"loss": 1.2133,
"step": 785
},
{
"epoch": 0.5348530600093565,
"grad_norm": 0.04117780131854185,
"learning_rate": 8.42305052683038e-06,
"loss": 1.2032,
"step": 786
},
{
"epoch": 0.5355335346404116,
"grad_norm": 0.04132479790723434,
"learning_rate": 8.419111225023246e-06,
"loss": 1.138,
"step": 787
},
{
"epoch": 0.5362140092714669,
"grad_norm": 0.03842549922082187,
"learning_rate": 8.4151679327454e-06,
"loss": 1.1066,
"step": 788
},
{
"epoch": 0.536894483902522,
"grad_norm": 0.04080954335709224,
"learning_rate": 8.411220654599091e-06,
"loss": 1.1718,
"step": 789
},
{
"epoch": 0.5375749585335772,
"grad_norm": 0.043858387197321595,
"learning_rate": 8.407269395191216e-06,
"loss": 1.1219,
"step": 790
},
{
"epoch": 0.5382554331646323,
"grad_norm": 0.040515133708555966,
"learning_rate": 8.403314159133318e-06,
"loss": 1.2029,
"step": 791
},
{
"epoch": 0.5389359077956875,
"grad_norm": 0.0390219527076627,
"learning_rate": 8.399354951041584e-06,
"loss": 1.1463,
"step": 792
},
{
"epoch": 0.5396163824267427,
"grad_norm": 0.04574993504539064,
"learning_rate": 8.395391775536836e-06,
"loss": 1.173,
"step": 793
},
{
"epoch": 0.5402968570577978,
"grad_norm": 0.04294242766768708,
"learning_rate": 8.391424637244528e-06,
"loss": 1.1496,
"step": 794
},
{
"epoch": 0.540977331688853,
"grad_norm": 0.053654917019259465,
"learning_rate": 8.387453540794736e-06,
"loss": 1.1548,
"step": 795
},
{
"epoch": 0.5416578063199081,
"grad_norm": 0.04494280593223373,
"learning_rate": 8.383478490822157e-06,
"loss": 1.1672,
"step": 796
},
{
"epoch": 0.5423382809509633,
"grad_norm": 0.04793116519219443,
"learning_rate": 8.379499491966101e-06,
"loss": 1.1856,
"step": 797
},
{
"epoch": 0.5430187555820185,
"grad_norm": 0.04378062326698956,
"learning_rate": 8.375516548870489e-06,
"loss": 1.1574,
"step": 798
},
{
"epoch": 0.5436992302130736,
"grad_norm": 0.04142629682629962,
"learning_rate": 8.371529666183844e-06,
"loss": 1.2067,
"step": 799
},
{
"epoch": 0.5443797048441288,
"grad_norm": 0.04170169929693674,
"learning_rate": 8.367538848559287e-06,
"loss": 1.1378,
"step": 800
},
{
"epoch": 0.5450601794751839,
"grad_norm": 0.03970775100468027,
"learning_rate": 8.36354410065453e-06,
"loss": 1.0798,
"step": 801
},
{
"epoch": 0.5457406541062391,
"grad_norm": 0.04648578513446181,
"learning_rate": 8.359545427131876e-06,
"loss": 1.1443,
"step": 802
},
{
"epoch": 0.5464211287372943,
"grad_norm": 0.03858386824567043,
"learning_rate": 8.355542832658208e-06,
"loss": 1.0972,
"step": 803
},
{
"epoch": 0.5471016033683495,
"grad_norm": 0.04975907375302422,
"learning_rate": 8.351536321904983e-06,
"loss": 1.1998,
"step": 804
},
{
"epoch": 0.5477820779994046,
"grad_norm": 0.04287409473941564,
"learning_rate": 8.347525899548227e-06,
"loss": 1.1532,
"step": 805
},
{
"epoch": 0.5484625526304597,
"grad_norm": 0.04837832904766864,
"learning_rate": 8.343511570268541e-06,
"loss": 1.1088,
"step": 806
},
{
"epoch": 0.5491430272615149,
"grad_norm": 0.040964713063851035,
"learning_rate": 8.339493338751074e-06,
"loss": 1.1229,
"step": 807
},
{
"epoch": 0.5498235018925701,
"grad_norm": 0.04588890740643771,
"learning_rate": 8.335471209685538e-06,
"loss": 1.1096,
"step": 808
},
{
"epoch": 0.5505039765236253,
"grad_norm": 0.03744543486694232,
"learning_rate": 8.331445187766187e-06,
"loss": 1.1837,
"step": 809
},
{
"epoch": 0.5511844511546804,
"grad_norm": 0.05037176808744111,
"learning_rate": 8.327415277691824e-06,
"loss": 1.1767,
"step": 810
},
{
"epoch": 0.5518649257857355,
"grad_norm": 0.04432193944851549,
"learning_rate": 8.323381484165786e-06,
"loss": 1.1407,
"step": 811
},
{
"epoch": 0.5525454004167907,
"grad_norm": 0.04201447987523638,
"learning_rate": 8.319343811895946e-06,
"loss": 1.1581,
"step": 812
},
{
"epoch": 0.5532258750478459,
"grad_norm": 0.10203501157632323,
"learning_rate": 8.315302265594703e-06,
"loss": 1.1178,
"step": 813
},
{
"epoch": 0.5539063496789011,
"grad_norm": 0.05339937261467158,
"learning_rate": 8.311256849978974e-06,
"loss": 1.163,
"step": 814
},
{
"epoch": 0.5545868243099562,
"grad_norm": 0.044099665331117054,
"learning_rate": 8.307207569770193e-06,
"loss": 1.1013,
"step": 815
},
{
"epoch": 0.5552672989410113,
"grad_norm": 0.03931050767646318,
"learning_rate": 8.303154429694311e-06,
"loss": 1.123,
"step": 816
},
{
"epoch": 0.5559477735720665,
"grad_norm": 0.05335957788738985,
"learning_rate": 8.299097434481773e-06,
"loss": 1.2092,
"step": 817
},
{
"epoch": 0.5566282482031216,
"grad_norm": 0.0636990809879659,
"learning_rate": 8.295036588867533e-06,
"loss": 1.1319,
"step": 818
},
{
"epoch": 0.5573087228341769,
"grad_norm": 0.040551829004693854,
"learning_rate": 8.290971897591034e-06,
"loss": 1.136,
"step": 819
},
{
"epoch": 0.557989197465232,
"grad_norm": 0.03780136567377228,
"learning_rate": 8.286903365396205e-06,
"loss": 1.1784,
"step": 820
},
{
"epoch": 0.5586696720962872,
"grad_norm": 0.060039887239810956,
"learning_rate": 8.282830997031464e-06,
"loss": 1.1906,
"step": 821
},
{
"epoch": 0.5593501467273423,
"grad_norm": 0.04217363382120905,
"learning_rate": 8.278754797249702e-06,
"loss": 1.1789,
"step": 822
},
{
"epoch": 0.5600306213583974,
"grad_norm": 0.040100264548723535,
"learning_rate": 8.274674770808282e-06,
"loss": 1.1957,
"step": 823
},
{
"epoch": 0.5607110959894527,
"grad_norm": 0.04773398339267,
"learning_rate": 8.270590922469037e-06,
"loss": 1.1811,
"step": 824
},
{
"epoch": 0.5613915706205078,
"grad_norm": 0.045457042631204134,
"learning_rate": 8.266503256998256e-06,
"loss": 1.1744,
"step": 825
},
{
"epoch": 0.562072045251563,
"grad_norm": 0.04476890640827831,
"learning_rate": 8.262411779166681e-06,
"loss": 1.1706,
"step": 826
},
{
"epoch": 0.5627525198826181,
"grad_norm": 0.03782510037969898,
"learning_rate": 8.25831649374951e-06,
"loss": 1.1531,
"step": 827
},
{
"epoch": 0.5634329945136732,
"grad_norm": 0.043194762074185,
"learning_rate": 8.254217405526383e-06,
"loss": 1.1873,
"step": 828
},
{
"epoch": 0.5641134691447285,
"grad_norm": 0.04524945915938302,
"learning_rate": 8.250114519281374e-06,
"loss": 1.1377,
"step": 829
},
{
"epoch": 0.5647939437757836,
"grad_norm": 0.04165386202173816,
"learning_rate": 8.246007839802997e-06,
"loss": 1.1915,
"step": 830
},
{
"epoch": 0.5654744184068388,
"grad_norm": 0.050892456215113235,
"learning_rate": 8.241897371884183e-06,
"loss": 1.1828,
"step": 831
},
{
"epoch": 0.5661548930378939,
"grad_norm": 0.041485798471332706,
"learning_rate": 8.237783120322293e-06,
"loss": 1.1548,
"step": 832
},
{
"epoch": 0.566835367668949,
"grad_norm": 0.039661123365582446,
"learning_rate": 8.233665089919105e-06,
"loss": 1.1862,
"step": 833
},
{
"epoch": 0.5675158423000043,
"grad_norm": 0.04154176571972414,
"learning_rate": 8.229543285480797e-06,
"loss": 1.1389,
"step": 834
},
{
"epoch": 0.5681963169310594,
"grad_norm": 0.0424519182968462,
"learning_rate": 8.225417711817965e-06,
"loss": 1.1691,
"step": 835
},
{
"epoch": 0.5688767915621146,
"grad_norm": 0.042775332017904656,
"learning_rate": 8.221288373745591e-06,
"loss": 1.1646,
"step": 836
},
{
"epoch": 0.5695572661931697,
"grad_norm": 0.04110139091161163,
"learning_rate": 8.217155276083059e-06,
"loss": 1.1696,
"step": 837
},
{
"epoch": 0.5702377408242248,
"grad_norm": 0.04488592848228418,
"learning_rate": 8.213018423654144e-06,
"loss": 1.179,
"step": 838
},
{
"epoch": 0.5709182154552801,
"grad_norm": 0.04747192432701841,
"learning_rate": 8.20887782128699e-06,
"loss": 1.1498,
"step": 839
},
{
"epoch": 0.5715986900863352,
"grad_norm": 0.04711201935733248,
"learning_rate": 8.20473347381413e-06,
"loss": 1.1074,
"step": 840
},
{
"epoch": 0.5722791647173904,
"grad_norm": 0.044368574866158736,
"learning_rate": 8.200585386072464e-06,
"loss": 1.115,
"step": 841
},
{
"epoch": 0.5729596393484455,
"grad_norm": 0.04243245457403774,
"learning_rate": 8.196433562903252e-06,
"loss": 1.2098,
"step": 842
},
{
"epoch": 0.5736401139795007,
"grad_norm": 0.0411309807077596,
"learning_rate": 8.192278009152124e-06,
"loss": 1.1829,
"step": 843
},
{
"epoch": 0.5743205886105559,
"grad_norm": 0.04434792075597989,
"learning_rate": 8.188118729669054e-06,
"loss": 1.1559,
"step": 844
},
{
"epoch": 0.575001063241611,
"grad_norm": 0.04531013671919297,
"learning_rate": 8.183955729308373e-06,
"loss": 1.1544,
"step": 845
},
{
"epoch": 0.5756815378726662,
"grad_norm": 0.04415644124258442,
"learning_rate": 8.179789012928747e-06,
"loss": 1.1403,
"step": 846
},
{
"epoch": 0.5763620125037213,
"grad_norm": 0.041849175257147715,
"learning_rate": 8.175618585393183e-06,
"loss": 1.1735,
"step": 847
},
{
"epoch": 0.5770424871347765,
"grad_norm": 0.050924581330910657,
"learning_rate": 8.171444451569019e-06,
"loss": 1.1479,
"step": 848
},
{
"epoch": 0.5777229617658317,
"grad_norm": 0.03974823826416723,
"learning_rate": 8.167266616327921e-06,
"loss": 1.17,
"step": 849
},
{
"epoch": 0.5784034363968868,
"grad_norm": 0.042109166474074936,
"learning_rate": 8.163085084545867e-06,
"loss": 1.1912,
"step": 850
},
{
"epoch": 0.579083911027942,
"grad_norm": 0.043426415287164635,
"learning_rate": 8.158899861103159e-06,
"loss": 1.1243,
"step": 851
},
{
"epoch": 0.5797643856589971,
"grad_norm": 0.04306964323847212,
"learning_rate": 8.1547109508844e-06,
"loss": 1.1525,
"step": 852
},
{
"epoch": 0.5804448602900523,
"grad_norm": 0.04274847934636146,
"learning_rate": 8.150518358778501e-06,
"loss": 1.1823,
"step": 853
},
{
"epoch": 0.5811253349211075,
"grad_norm": 0.043729941677878165,
"learning_rate": 8.146322089678668e-06,
"loss": 1.2141,
"step": 854
},
{
"epoch": 0.5818058095521627,
"grad_norm": 0.04315912213214879,
"learning_rate": 8.142122148482397e-06,
"loss": 1.1823,
"step": 855
},
{
"epoch": 0.5824862841832178,
"grad_norm": 0.04231782214011899,
"learning_rate": 8.137918540091473e-06,
"loss": 1.2023,
"step": 856
},
{
"epoch": 0.5831667588142729,
"grad_norm": 0.0475718942693006,
"learning_rate": 8.13371126941196e-06,
"loss": 1.2363,
"step": 857
},
{
"epoch": 0.5838472334453281,
"grad_norm": 0.04626266969743917,
"learning_rate": 8.129500341354192e-06,
"loss": 1.1519,
"step": 858
},
{
"epoch": 0.5845277080763833,
"grad_norm": 0.038413332073930426,
"learning_rate": 8.125285760832778e-06,
"loss": 1.1381,
"step": 859
},
{
"epoch": 0.5852081827074385,
"grad_norm": 0.04042454155496313,
"learning_rate": 8.121067532766587e-06,
"loss": 1.1217,
"step": 860
},
{
"epoch": 0.5858886573384936,
"grad_norm": 0.04085148523310831,
"learning_rate": 8.116845662078744e-06,
"loss": 1.1591,
"step": 861
},
{
"epoch": 0.5865691319695487,
"grad_norm": 0.04377818146863928,
"learning_rate": 8.11262015369663e-06,
"loss": 1.2255,
"step": 862
},
{
"epoch": 0.5872496066006039,
"grad_norm": 0.039014134138926265,
"learning_rate": 8.10839101255186e-06,
"loss": 1.2082,
"step": 863
},
{
"epoch": 0.5879300812316591,
"grad_norm": 0.04200666559613162,
"learning_rate": 8.104158243580305e-06,
"loss": 1.112,
"step": 864
},
{
"epoch": 0.5886105558627143,
"grad_norm": 0.03911561242988071,
"learning_rate": 8.099921851722057e-06,
"loss": 1.1382,
"step": 865
},
{
"epoch": 0.5892910304937694,
"grad_norm": 0.04045230119388504,
"learning_rate": 8.095681841921441e-06,
"loss": 1.1266,
"step": 866
},
{
"epoch": 0.5899715051248245,
"grad_norm": 0.040116238529695106,
"learning_rate": 8.09143821912701e-06,
"loss": 1.1853,
"step": 867
},
{
"epoch": 0.5906519797558797,
"grad_norm": 0.04062447640292501,
"learning_rate": 8.087190988291523e-06,
"loss": 1.14,
"step": 868
},
{
"epoch": 0.5913324543869349,
"grad_norm": 0.04283812178287952,
"learning_rate": 8.082940154371956e-06,
"loss": 1.1273,
"step": 869
},
{
"epoch": 0.5920129290179901,
"grad_norm": 0.05571713855883823,
"learning_rate": 8.07868572232949e-06,
"loss": 1.049,
"step": 870
},
{
"epoch": 0.5926934036490452,
"grad_norm": 0.04016667844328468,
"learning_rate": 8.07442769712951e-06,
"loss": 1.1482,
"step": 871
},
{
"epoch": 0.5933738782801004,
"grad_norm": 0.038282189867659924,
"learning_rate": 8.070166083741583e-06,
"loss": 1.0814,
"step": 872
},
{
"epoch": 0.5940543529111555,
"grad_norm": 0.04240818562157267,
"learning_rate": 8.065900887139473e-06,
"loss": 1.1789,
"step": 873
},
{
"epoch": 0.5947348275422107,
"grad_norm": 0.03941580524487074,
"learning_rate": 8.061632112301122e-06,
"loss": 1.1855,
"step": 874
},
{
"epoch": 0.5954153021732659,
"grad_norm": 0.04368634784808724,
"learning_rate": 8.057359764208652e-06,
"loss": 1.1038,
"step": 875
},
{
"epoch": 0.596095776804321,
"grad_norm": 0.04874993152125096,
"learning_rate": 8.053083847848351e-06,
"loss": 1.1881,
"step": 876
},
{
"epoch": 0.5967762514353762,
"grad_norm": 0.04283185831118963,
"learning_rate": 8.048804368210675e-06,
"loss": 1.2021,
"step": 877
},
{
"epoch": 0.5974567260664313,
"grad_norm": 0.04963061963040367,
"learning_rate": 8.044521330290235e-06,
"loss": 1.1638,
"step": 878
},
{
"epoch": 0.5981372006974865,
"grad_norm": 0.04194658535985922,
"learning_rate": 8.040234739085799e-06,
"loss": 1.1806,
"step": 879
},
{
"epoch": 0.5988176753285417,
"grad_norm": 0.045232655719741964,
"learning_rate": 8.03594459960028e-06,
"loss": 1.1432,
"step": 880
},
{
"epoch": 0.5994981499595968,
"grad_norm": 0.041615174460038984,
"learning_rate": 8.031650916840732e-06,
"loss": 1.1535,
"step": 881
},
{
"epoch": 0.600178624590652,
"grad_norm": 0.04311597640305818,
"learning_rate": 8.027353695818345e-06,
"loss": 1.1866,
"step": 882
},
{
"epoch": 0.6008590992217071,
"grad_norm": 0.04648275659267898,
"learning_rate": 8.023052941548437e-06,
"loss": 1.2174,
"step": 883
},
{
"epoch": 0.6015395738527624,
"grad_norm": 0.041183174687582695,
"learning_rate": 8.018748659050456e-06,
"loss": 1.1922,
"step": 884
},
{
"epoch": 0.6022200484838175,
"grad_norm": 0.042641768711080094,
"learning_rate": 8.014440853347956e-06,
"loss": 1.1547,
"step": 885
},
{
"epoch": 0.6029005231148726,
"grad_norm": 0.041451603388394896,
"learning_rate": 8.010129529468614e-06,
"loss": 1.1317,
"step": 886
},
{
"epoch": 0.6035809977459278,
"grad_norm": 0.041272961165301054,
"learning_rate": 8.005814692444205e-06,
"loss": 1.1355,
"step": 887
},
{
"epoch": 0.6042614723769829,
"grad_norm": 0.04057221028324148,
"learning_rate": 8.001496347310614e-06,
"loss": 1.1873,
"step": 888
},
{
"epoch": 0.6049419470080382,
"grad_norm": 0.04135206077100455,
"learning_rate": 7.99717449910781e-06,
"loss": 1.1646,
"step": 889
},
{
"epoch": 0.6056224216390933,
"grad_norm": 0.040316079764168004,
"learning_rate": 7.992849152879857e-06,
"loss": 1.1466,
"step": 890
},
{
"epoch": 0.6063028962701484,
"grad_norm": 0.04092957611261813,
"learning_rate": 7.988520313674897e-06,
"loss": 1.151,
"step": 891
},
{
"epoch": 0.6069833709012036,
"grad_norm": 0.039374332097630076,
"learning_rate": 7.984187986545154e-06,
"loss": 1.1883,
"step": 892
},
{
"epoch": 0.6076638455322587,
"grad_norm": 0.03922720540085873,
"learning_rate": 7.97985217654692e-06,
"loss": 1.1423,
"step": 893
},
{
"epoch": 0.6083443201633139,
"grad_norm": 0.09045429744114127,
"learning_rate": 7.97551288874055e-06,
"loss": 1.0891,
"step": 894
},
{
"epoch": 0.6090247947943691,
"grad_norm": 0.04254293782715242,
"learning_rate": 7.97117012819046e-06,
"loss": 1.1897,
"step": 895
},
{
"epoch": 0.6097052694254242,
"grad_norm": 0.03929737573713484,
"learning_rate": 7.96682389996512e-06,
"loss": 1.22,
"step": 896
},
{
"epoch": 0.6103857440564794,
"grad_norm": 0.1188554224437093,
"learning_rate": 7.962474209137044e-06,
"loss": 1.0757,
"step": 897
},
{
"epoch": 0.6110662186875345,
"grad_norm": 0.041241461473570426,
"learning_rate": 7.95812106078279e-06,
"loss": 1.0574,
"step": 898
},
{
"epoch": 0.6117466933185897,
"grad_norm": 0.050560273673583955,
"learning_rate": 7.953764459982951e-06,
"loss": 1.1539,
"step": 899
},
{
"epoch": 0.6124271679496449,
"grad_norm": 0.04401496162998291,
"learning_rate": 7.94940441182215e-06,
"loss": 1.2764,
"step": 900
},
{
"epoch": 0.6131076425807,
"grad_norm": 0.040619036348773584,
"learning_rate": 7.945040921389032e-06,
"loss": 1.1664,
"step": 901
},
{
"epoch": 0.6137881172117552,
"grad_norm": 0.08684240225413613,
"learning_rate": 7.940673993776258e-06,
"loss": 1.1613,
"step": 902
},
{
"epoch": 0.6144685918428103,
"grad_norm": 0.047950332726318455,
"learning_rate": 7.936303634080505e-06,
"loss": 1.1246,
"step": 903
},
{
"epoch": 0.6151490664738655,
"grad_norm": 0.044138108104567156,
"learning_rate": 7.93192984740245e-06,
"loss": 1.1557,
"step": 904
},
{
"epoch": 0.6158295411049207,
"grad_norm": 0.04205279272151933,
"learning_rate": 7.927552638846776e-06,
"loss": 1.1822,
"step": 905
},
{
"epoch": 0.6165100157359759,
"grad_norm": 0.037151685883760985,
"learning_rate": 7.923172013522153e-06,
"loss": 1.1407,
"step": 906
},
{
"epoch": 0.617190490367031,
"grad_norm": 0.05084646852036924,
"learning_rate": 7.918787976541246e-06,
"loss": 1.1158,
"step": 907
},
{
"epoch": 0.6178709649980861,
"grad_norm": 0.03683258235559903,
"learning_rate": 7.914400533020695e-06,
"loss": 1.1817,
"step": 908
},
{
"epoch": 0.6185514396291413,
"grad_norm": 0.03971414515196644,
"learning_rate": 7.91000968808112e-06,
"loss": 1.1357,
"step": 909
},
{
"epoch": 0.6192319142601965,
"grad_norm": 0.0432587997178803,
"learning_rate": 7.905615446847107e-06,
"loss": 1.1191,
"step": 910
},
{
"epoch": 0.6199123888912517,
"grad_norm": 0.04881605419073854,
"learning_rate": 7.901217814447212e-06,
"loss": 1.2064,
"step": 911
},
{
"epoch": 0.6205928635223068,
"grad_norm": 0.03993595120249041,
"learning_rate": 7.896816796013943e-06,
"loss": 1.1752,
"step": 912
},
{
"epoch": 0.6212733381533619,
"grad_norm": 0.03966047093864206,
"learning_rate": 7.892412396683764e-06,
"loss": 1.12,
"step": 913
},
{
"epoch": 0.6219538127844171,
"grad_norm": 0.04878533637282711,
"learning_rate": 7.888004621597079e-06,
"loss": 1.2101,
"step": 914
},
{
"epoch": 0.6226342874154723,
"grad_norm": 0.039415215318022966,
"learning_rate": 7.88359347589824e-06,
"loss": 1.1367,
"step": 915
},
{
"epoch": 0.6233147620465275,
"grad_norm": 0.03780897863930396,
"learning_rate": 7.879178964735528e-06,
"loss": 1.1652,
"step": 916
},
{
"epoch": 0.6239952366775826,
"grad_norm": 0.04060525430372581,
"learning_rate": 7.87476109326115e-06,
"loss": 1.147,
"step": 917
},
{
"epoch": 0.6246757113086377,
"grad_norm": 0.036870223512593225,
"learning_rate": 7.87033986663124e-06,
"loss": 1.1562,
"step": 918
},
{
"epoch": 0.6253561859396929,
"grad_norm": 0.03936602479028799,
"learning_rate": 7.865915290005844e-06,
"loss": 1.1624,
"step": 919
},
{
"epoch": 0.6260366605707481,
"grad_norm": 0.04111384698224138,
"learning_rate": 7.86148736854892e-06,
"loss": 1.1901,
"step": 920
},
{
"epoch": 0.6267171352018033,
"grad_norm": 0.04290620077150831,
"learning_rate": 7.857056107428327e-06,
"loss": 1.1619,
"step": 921
},
{
"epoch": 0.6273976098328584,
"grad_norm": 0.04056550194174836,
"learning_rate": 7.852621511815825e-06,
"loss": 1.1564,
"step": 922
},
{
"epoch": 0.6280780844639136,
"grad_norm": 0.03639283604390965,
"learning_rate": 7.848183586887065e-06,
"loss": 1.1189,
"step": 923
},
{
"epoch": 0.6287585590949687,
"grad_norm": 0.04463427786180277,
"learning_rate": 7.84374233782158e-06,
"loss": 1.1558,
"step": 924
},
{
"epoch": 0.6294390337260239,
"grad_norm": 0.040588305003388336,
"learning_rate": 7.839297769802789e-06,
"loss": 1.1413,
"step": 925
},
{
"epoch": 0.6301195083570791,
"grad_norm": 0.04145526084985425,
"learning_rate": 7.834849888017979e-06,
"loss": 1.1731,
"step": 926
},
{
"epoch": 0.6307999829881342,
"grad_norm": 0.04064670523198378,
"learning_rate": 7.830398697658308e-06,
"loss": 1.1576,
"step": 927
},
{
"epoch": 0.6314804576191894,
"grad_norm": 0.08504837862750378,
"learning_rate": 7.825944203918792e-06,
"loss": 1.2067,
"step": 928
},
{
"epoch": 0.6321609322502445,
"grad_norm": 0.03618607526419996,
"learning_rate": 7.821486411998307e-06,
"loss": 1.0653,
"step": 929
},
{
"epoch": 0.6328414068812998,
"grad_norm": 0.041163906158028174,
"learning_rate": 7.817025327099574e-06,
"loss": 1.1231,
"step": 930
},
{
"epoch": 0.6335218815123549,
"grad_norm": 0.04096146701418479,
"learning_rate": 7.812560954429159e-06,
"loss": 1.1642,
"step": 931
},
{
"epoch": 0.63420235614341,
"grad_norm": 0.04045705537799017,
"learning_rate": 7.808093299197466e-06,
"loss": 1.1682,
"step": 932
},
{
"epoch": 0.6348828307744652,
"grad_norm": 0.04170510551800386,
"learning_rate": 7.80362236661873e-06,
"loss": 1.1601,
"step": 933
},
{
"epoch": 0.6355633054055203,
"grad_norm": 0.04017895703551513,
"learning_rate": 7.799148161911013e-06,
"loss": 1.1291,
"step": 934
},
{
"epoch": 0.6362437800365756,
"grad_norm": 0.054022188115136816,
"learning_rate": 7.794670690296187e-06,
"loss": 1.1904,
"step": 935
},
{
"epoch": 0.6369242546676307,
"grad_norm": 0.05538165228123039,
"learning_rate": 7.790189956999945e-06,
"loss": 1.0752,
"step": 936
},
{
"epoch": 0.6376047292986858,
"grad_norm": 0.04367149070029591,
"learning_rate": 7.785705967251789e-06,
"loss": 1.1167,
"step": 937
},
{
"epoch": 0.638285203929741,
"grad_norm": 0.06605426061696518,
"learning_rate": 7.781218726285014e-06,
"loss": 1.1585,
"step": 938
},
{
"epoch": 0.6389656785607961,
"grad_norm": 0.03912000963283677,
"learning_rate": 7.776728239336714e-06,
"loss": 1.1371,
"step": 939
},
{
"epoch": 0.6396461531918514,
"grad_norm": 0.04273969229969486,
"learning_rate": 7.772234511647771e-06,
"loss": 1.1149,
"step": 940
},
{
"epoch": 0.6403266278229065,
"grad_norm": 0.0522448351136916,
"learning_rate": 7.767737548462849e-06,
"loss": 1.1133,
"step": 941
},
{
"epoch": 0.6410071024539616,
"grad_norm": 0.03809056120688157,
"learning_rate": 7.763237355030384e-06,
"loss": 1.0985,
"step": 942
},
{
"epoch": 0.6416875770850168,
"grad_norm": 0.04208113112324736,
"learning_rate": 7.758733936602591e-06,
"loss": 1.1542,
"step": 943
},
{
"epoch": 0.6423680517160719,
"grad_norm": 0.038232104737570445,
"learning_rate": 7.754227298435442e-06,
"loss": 1.1449,
"step": 944
},
{
"epoch": 0.6430485263471272,
"grad_norm": 0.07327134336204114,
"learning_rate": 7.749717445788667e-06,
"loss": 1.1229,
"step": 945
},
{
"epoch": 0.6437290009781823,
"grad_norm": 0.04302215159609038,
"learning_rate": 7.745204383925753e-06,
"loss": 1.2152,
"step": 946
},
{
"epoch": 0.6444094756092374,
"grad_norm": 0.04050395590672295,
"learning_rate": 7.740688118113926e-06,
"loss": 1.1187,
"step": 947
},
{
"epoch": 0.6450899502402926,
"grad_norm": 0.041047644523210476,
"learning_rate": 7.736168653624154e-06,
"loss": 1.1622,
"step": 948
},
{
"epoch": 0.6457704248713477,
"grad_norm": 0.042773261634885006,
"learning_rate": 7.73164599573114e-06,
"loss": 1.093,
"step": 949
},
{
"epoch": 0.646450899502403,
"grad_norm": 0.3472748679357863,
"learning_rate": 7.727120149713313e-06,
"loss": 1.1358,
"step": 950
},
{
"epoch": 0.6471313741334581,
"grad_norm": 0.05249313678647316,
"learning_rate": 7.722591120852821e-06,
"loss": 1.1488,
"step": 951
},
{
"epoch": 0.6478118487645133,
"grad_norm": 0.04857838907769981,
"learning_rate": 7.718058914435526e-06,
"loss": 1.1395,
"step": 952
},
{
"epoch": 0.6484923233955684,
"grad_norm": 0.06056048993060849,
"learning_rate": 7.713523535751003e-06,
"loss": 1.1554,
"step": 953
},
{
"epoch": 0.6491727980266235,
"grad_norm": 0.043417866931708946,
"learning_rate": 7.708984990092528e-06,
"loss": 1.1399,
"step": 954
},
{
"epoch": 0.6498532726576788,
"grad_norm": 0.040899332718403396,
"learning_rate": 7.70444328275707e-06,
"loss": 1.2209,
"step": 955
},
{
"epoch": 0.6505337472887339,
"grad_norm": 0.042139179785609125,
"learning_rate": 7.69989841904529e-06,
"loss": 1.1771,
"step": 956
},
{
"epoch": 0.6512142219197891,
"grad_norm": 0.04497423442726415,
"learning_rate": 7.695350404261535e-06,
"loss": 1.1852,
"step": 957
},
{
"epoch": 0.6518946965508442,
"grad_norm": 0.04299815900672657,
"learning_rate": 7.690799243713825e-06,
"loss": 1.1422,
"step": 958
},
{
"epoch": 0.6525751711818993,
"grad_norm": 0.057667778993934805,
"learning_rate": 7.686244942713856e-06,
"loss": 1.1643,
"step": 959
},
{
"epoch": 0.6532556458129546,
"grad_norm": 0.05055586475309829,
"learning_rate": 7.681687506576988e-06,
"loss": 1.1675,
"step": 960
},
{
"epoch": 0.6539361204440097,
"grad_norm": 0.04837690007921328,
"learning_rate": 7.677126940622237e-06,
"loss": 1.1458,
"step": 961
},
{
"epoch": 0.6546165950750649,
"grad_norm": 0.04746849990432907,
"learning_rate": 7.672563250172278e-06,
"loss": 1.1614,
"step": 962
},
{
"epoch": 0.65529706970612,
"grad_norm": 0.042700698425812394,
"learning_rate": 7.667996440553424e-06,
"loss": 1.1787,
"step": 963
},
{
"epoch": 0.6559775443371751,
"grad_norm": 0.04663433423637838,
"learning_rate": 7.663426517095637e-06,
"loss": 1.1899,
"step": 964
},
{
"epoch": 0.6566580189682304,
"grad_norm": 0.05089210963658719,
"learning_rate": 7.658853485132507e-06,
"loss": 1.1114,
"step": 965
},
{
"epoch": 0.6573384935992855,
"grad_norm": 0.04835691542508216,
"learning_rate": 7.654277350001255e-06,
"loss": 1.1472,
"step": 966
},
{
"epoch": 0.6580189682303407,
"grad_norm": 0.04286980226281031,
"learning_rate": 7.649698117042725e-06,
"loss": 1.1704,
"step": 967
},
{
"epoch": 0.6586994428613958,
"grad_norm": 0.0413107954780928,
"learning_rate": 7.645115791601371e-06,
"loss": 1.1628,
"step": 968
},
{
"epoch": 0.659379917492451,
"grad_norm": 0.06469008546831921,
"learning_rate": 7.640530379025265e-06,
"loss": 1.1286,
"step": 969
},
{
"epoch": 0.6600603921235062,
"grad_norm": 0.0929348477800054,
"learning_rate": 7.635941884666072e-06,
"loss": 1.0714,
"step": 970
},
{
"epoch": 0.6607408667545613,
"grad_norm": 0.04918686842612326,
"learning_rate": 7.631350313879061e-06,
"loss": 1.1324,
"step": 971
},
{
"epoch": 0.6614213413856165,
"grad_norm": 0.04516364692323498,
"learning_rate": 7.626755672023087e-06,
"loss": 1.1094,
"step": 972
},
{
"epoch": 0.6621018160166716,
"grad_norm": 0.040036511801984,
"learning_rate": 7.6221579644605945e-06,
"loss": 1.1856,
"step": 973
},
{
"epoch": 0.6627822906477268,
"grad_norm": 0.04221452365231738,
"learning_rate": 7.617557196557601e-06,
"loss": 1.1605,
"step": 974
},
{
"epoch": 0.6634627652787819,
"grad_norm": 0.04969957841604738,
"learning_rate": 7.6129533736837e-06,
"loss": 1.1842,
"step": 975
},
{
"epoch": 0.6641432399098371,
"grad_norm": 0.03868376326252715,
"learning_rate": 7.608346501212045e-06,
"loss": 1.1228,
"step": 976
},
{
"epoch": 0.6648237145408923,
"grad_norm": 0.03621072764785827,
"learning_rate": 7.603736584519354e-06,
"loss": 1.094,
"step": 977
},
{
"epoch": 0.6655041891719474,
"grad_norm": 0.050499205066227806,
"learning_rate": 7.599123628985894e-06,
"loss": 1.189,
"step": 978
},
{
"epoch": 0.6661846638030026,
"grad_norm": 0.04076810819723509,
"learning_rate": 7.594507639995483e-06,
"loss": 1.1811,
"step": 979
},
{
"epoch": 0.6668651384340577,
"grad_norm": 0.04391491864766983,
"learning_rate": 7.5898886229354754e-06,
"loss": 1.2108,
"step": 980
},
{
"epoch": 0.667545613065113,
"grad_norm": 0.04859358265845105,
"learning_rate": 7.585266583196761e-06,
"loss": 1.1159,
"step": 981
},
{
"epoch": 0.6682260876961681,
"grad_norm": 0.0405482507873175,
"learning_rate": 7.580641526173758e-06,
"loss": 1.1659,
"step": 982
},
{
"epoch": 0.6689065623272232,
"grad_norm": 0.04257613233842775,
"learning_rate": 7.576013457264406e-06,
"loss": 1.1459,
"step": 983
},
{
"epoch": 0.6695870369582784,
"grad_norm": 0.0393176411971146,
"learning_rate": 7.571382381870157e-06,
"loss": 1.1448,
"step": 984
},
{
"epoch": 0.6702675115893335,
"grad_norm": 0.08112350996396892,
"learning_rate": 7.566748305395979e-06,
"loss": 1.1257,
"step": 985
},
{
"epoch": 0.6709479862203888,
"grad_norm": 0.041738073653021145,
"learning_rate": 7.5621112332503325e-06,
"loss": 1.1379,
"step": 986
},
{
"epoch": 0.6716284608514439,
"grad_norm": 0.06458657107739593,
"learning_rate": 7.557471170845183e-06,
"loss": 1.2003,
"step": 987
},
{
"epoch": 0.672308935482499,
"grad_norm": 0.038958895409908184,
"learning_rate": 7.552828123595981e-06,
"loss": 1.1231,
"step": 988
},
{
"epoch": 0.6729894101135542,
"grad_norm": 0.042481360817148296,
"learning_rate": 7.548182096921665e-06,
"loss": 1.0685,
"step": 989
},
{
"epoch": 0.6736698847446093,
"grad_norm": 0.041831775106497764,
"learning_rate": 7.543533096244644e-06,
"loss": 1.1738,
"step": 990
},
{
"epoch": 0.6743503593756646,
"grad_norm": 0.15815365507476095,
"learning_rate": 7.538881126990807e-06,
"loss": 1.1475,
"step": 991
},
{
"epoch": 0.6750308340067197,
"grad_norm": 0.04437893080698295,
"learning_rate": 7.534226194589498e-06,
"loss": 1.0809,
"step": 992
},
{
"epoch": 0.6757113086377748,
"grad_norm": 0.04365643846646936,
"learning_rate": 7.529568304473525e-06,
"loss": 1.1849,
"step": 993
},
{
"epoch": 0.67639178326883,
"grad_norm": 0.04471078501455037,
"learning_rate": 7.524907462079149e-06,
"loss": 1.1777,
"step": 994
},
{
"epoch": 0.6770722578998851,
"grad_norm": 0.041659935941889464,
"learning_rate": 7.5202436728460746e-06,
"loss": 1.2081,
"step": 995
},
{
"epoch": 0.6777527325309404,
"grad_norm": 0.04288201099681714,
"learning_rate": 7.5155769422174445e-06,
"loss": 1.1962,
"step": 996
},
{
"epoch": 0.6784332071619955,
"grad_norm": 0.04368684136906247,
"learning_rate": 7.510907275639832e-06,
"loss": 1.1922,
"step": 997
},
{
"epoch": 0.6791136817930506,
"grad_norm": 0.050476486903071514,
"learning_rate": 7.506234678563248e-06,
"loss": 1.0598,
"step": 998
},
{
"epoch": 0.6797941564241058,
"grad_norm": 0.046810799877727786,
"learning_rate": 7.501559156441109e-06,
"loss": 1.1796,
"step": 999
},
{
"epoch": 0.6804746310551609,
"grad_norm": 0.047812912893326426,
"learning_rate": 7.496880714730259e-06,
"loss": 1.1428,
"step": 1000
},
{
"epoch": 0.6811551056862162,
"grad_norm": 0.04257189142775785,
"learning_rate": 7.492199358890937e-06,
"loss": 1.1715,
"step": 1001
},
{
"epoch": 0.6818355803172713,
"grad_norm": 0.06136667792321527,
"learning_rate": 7.487515094386792e-06,
"loss": 1.1517,
"step": 1002
},
{
"epoch": 0.6825160549483265,
"grad_norm": 0.055646896518561265,
"learning_rate": 7.482827926684864e-06,
"loss": 1.168,
"step": 1003
},
{
"epoch": 0.6831965295793816,
"grad_norm": 0.04766220568836885,
"learning_rate": 7.478137861255583e-06,
"loss": 1.2104,
"step": 1004
},
{
"epoch": 0.6838770042104367,
"grad_norm": 0.04290035924469161,
"learning_rate": 7.473444903572757e-06,
"loss": 1.1736,
"step": 1005
},
{
"epoch": 0.684557478841492,
"grad_norm": 0.041199593080418005,
"learning_rate": 7.468749059113578e-06,
"loss": 1.1848,
"step": 1006
},
{
"epoch": 0.6852379534725471,
"grad_norm": 0.042266818870596565,
"learning_rate": 7.464050333358597e-06,
"loss": 1.178,
"step": 1007
},
{
"epoch": 0.6859184281036023,
"grad_norm": 0.15107754375835664,
"learning_rate": 7.459348731791733e-06,
"loss": 1.1783,
"step": 1008
},
{
"epoch": 0.6865989027346574,
"grad_norm": 0.04106006697859163,
"learning_rate": 7.454644259900263e-06,
"loss": 1.14,
"step": 1009
},
{
"epoch": 0.6872793773657125,
"grad_norm": 0.04333746328735867,
"learning_rate": 7.449936923174813e-06,
"loss": 1.1426,
"step": 1010
},
{
"epoch": 0.6879598519967678,
"grad_norm": 0.07004970303311875,
"learning_rate": 7.445226727109347e-06,
"loss": 1.0842,
"step": 1011
},
{
"epoch": 0.6886403266278229,
"grad_norm": 0.039638832880001425,
"learning_rate": 7.440513677201175e-06,
"loss": 1.1659,
"step": 1012
},
{
"epoch": 0.6893208012588781,
"grad_norm": 0.03910289252801824,
"learning_rate": 7.43579777895093e-06,
"loss": 1.1611,
"step": 1013
},
{
"epoch": 0.6900012758899332,
"grad_norm": 0.07027920752871429,
"learning_rate": 7.431079037862575e-06,
"loss": 1.0465,
"step": 1014
},
{
"epoch": 0.6906817505209883,
"grad_norm": 0.04094509224077025,
"learning_rate": 7.426357459443388e-06,
"loss": 1.1603,
"step": 1015
},
{
"epoch": 0.6913622251520436,
"grad_norm": 0.10598504072232526,
"learning_rate": 7.42163304920396e-06,
"loss": 1.1153,
"step": 1016
},
{
"epoch": 0.6920426997830987,
"grad_norm": 0.041160683871352216,
"learning_rate": 7.416905812658183e-06,
"loss": 1.1559,
"step": 1017
},
{
"epoch": 0.6927231744141539,
"grad_norm": 0.05473785616653893,
"learning_rate": 7.412175755323254e-06,
"loss": 1.125,
"step": 1018
},
{
"epoch": 0.693403649045209,
"grad_norm": 0.04770187460065998,
"learning_rate": 7.407442882719658e-06,
"loss": 1.1188,
"step": 1019
},
{
"epoch": 0.6940841236762642,
"grad_norm": 0.04381659899522137,
"learning_rate": 7.402707200371165e-06,
"loss": 1.1356,
"step": 1020
},
{
"epoch": 0.6947645983073194,
"grad_norm": 0.04051251755932694,
"learning_rate": 7.397968713804828e-06,
"loss": 1.1174,
"step": 1021
},
{
"epoch": 0.6954450729383745,
"grad_norm": 0.044516396502584286,
"learning_rate": 7.39322742855097e-06,
"loss": 1.1777,
"step": 1022
},
{
"epoch": 0.6961255475694297,
"grad_norm": 0.04117597756872166,
"learning_rate": 7.38848335014318e-06,
"loss": 1.1387,
"step": 1023
},
{
"epoch": 0.6968060222004848,
"grad_norm": 0.04296180208703936,
"learning_rate": 7.383736484118311e-06,
"loss": 1.1206,
"step": 1024
},
{
"epoch": 0.69748649683154,
"grad_norm": 0.050477126239160564,
"learning_rate": 7.378986836016462e-06,
"loss": 1.1443,
"step": 1025
},
{
"epoch": 0.6981669714625952,
"grad_norm": 0.06939371082738612,
"learning_rate": 7.374234411380987e-06,
"loss": 1.1321,
"step": 1026
},
{
"epoch": 0.6988474460936503,
"grad_norm": 0.05320770806066322,
"learning_rate": 7.369479215758476e-06,
"loss": 1.218,
"step": 1027
},
{
"epoch": 0.6995279207247055,
"grad_norm": 0.04193911746958924,
"learning_rate": 7.364721254698752e-06,
"loss": 1.1404,
"step": 1028
},
{
"epoch": 0.7002083953557606,
"grad_norm": 0.05140219246531558,
"learning_rate": 7.359960533754872e-06,
"loss": 1.1954,
"step": 1029
},
{
"epoch": 0.7008888699868158,
"grad_norm": 0.043307053037638495,
"learning_rate": 7.355197058483103e-06,
"loss": 1.1012,
"step": 1030
},
{
"epoch": 0.701569344617871,
"grad_norm": 0.04976551632482346,
"learning_rate": 7.350430834442941e-06,
"loss": 1.1041,
"step": 1031
},
{
"epoch": 0.7022498192489262,
"grad_norm": 0.047734146085657715,
"learning_rate": 7.345661867197076e-06,
"loss": 1.1625,
"step": 1032
},
{
"epoch": 0.7029302938799813,
"grad_norm": 0.04720201465179049,
"learning_rate": 7.340890162311411e-06,
"loss": 1.1787,
"step": 1033
},
{
"epoch": 0.7036107685110364,
"grad_norm": 0.04247509490814496,
"learning_rate": 7.336115725355033e-06,
"loss": 1.146,
"step": 1034
},
{
"epoch": 0.7042912431420916,
"grad_norm": 0.21669885262049804,
"learning_rate": 7.33133856190023e-06,
"loss": 1.1569,
"step": 1035
},
{
"epoch": 0.7049717177731468,
"grad_norm": 0.0437961700372451,
"learning_rate": 7.3265586775224595e-06,
"loss": 1.1802,
"step": 1036
},
{
"epoch": 0.705652192404202,
"grad_norm": 0.03735584537487382,
"learning_rate": 7.321776077800367e-06,
"loss": 1.125,
"step": 1037
},
{
"epoch": 0.7063326670352571,
"grad_norm": 0.054935897560418445,
"learning_rate": 7.316990768315757e-06,
"loss": 1.0937,
"step": 1038
},
{
"epoch": 0.7070131416663122,
"grad_norm": 0.053625956317227996,
"learning_rate": 7.3122027546536025e-06,
"loss": 1.1672,
"step": 1039
},
{
"epoch": 0.7076936162973674,
"grad_norm": 0.04328762046243775,
"learning_rate": 7.307412042402029e-06,
"loss": 1.1584,
"step": 1040
},
{
"epoch": 0.7083740909284226,
"grad_norm": 0.042645405158309085,
"learning_rate": 7.302618637152316e-06,
"loss": 1.2284,
"step": 1041
},
{
"epoch": 0.7090545655594778,
"grad_norm": 0.043540366546701605,
"learning_rate": 7.297822544498879e-06,
"loss": 1.1746,
"step": 1042
},
{
"epoch": 0.7097350401905329,
"grad_norm": 0.09388727485260404,
"learning_rate": 7.293023770039279e-06,
"loss": 1.106,
"step": 1043
},
{
"epoch": 0.710415514821588,
"grad_norm": 0.045183692217649854,
"learning_rate": 7.288222319374199e-06,
"loss": 1.1561,
"step": 1044
},
{
"epoch": 0.7110959894526432,
"grad_norm": 0.04419303783873857,
"learning_rate": 7.283418198107449e-06,
"loss": 1.1907,
"step": 1045
},
{
"epoch": 0.7117764640836984,
"grad_norm": 0.039431406730855954,
"learning_rate": 7.2786114118459564e-06,
"loss": 1.1463,
"step": 1046
},
{
"epoch": 0.7124569387147536,
"grad_norm": 0.08697265203912855,
"learning_rate": 7.273801966199756e-06,
"loss": 1.1178,
"step": 1047
},
{
"epoch": 0.7131374133458087,
"grad_norm": 0.046605391918640805,
"learning_rate": 7.2689898667819915e-06,
"loss": 1.1882,
"step": 1048
},
{
"epoch": 0.7138178879768639,
"grad_norm": 0.04535224059552782,
"learning_rate": 7.2641751192088986e-06,
"loss": 1.1587,
"step": 1049
},
{
"epoch": 0.714498362607919,
"grad_norm": 0.04609371369265137,
"learning_rate": 7.259357729099805e-06,
"loss": 1.1624,
"step": 1050
},
{
"epoch": 0.7151788372389741,
"grad_norm": 0.04837539578364566,
"learning_rate": 7.254537702077127e-06,
"loss": 1.1592,
"step": 1051
},
{
"epoch": 0.7158593118700294,
"grad_norm": 0.0415706447025837,
"learning_rate": 7.2497150437663495e-06,
"loss": 1.1592,
"step": 1052
},
{
"epoch": 0.7165397865010845,
"grad_norm": 0.0466672846650458,
"learning_rate": 7.244889759796039e-06,
"loss": 1.1646,
"step": 1053
},
{
"epoch": 0.7172202611321397,
"grad_norm": 0.041322845555446144,
"learning_rate": 7.240061855797818e-06,
"loss": 1.0669,
"step": 1054
},
{
"epoch": 0.7179007357631948,
"grad_norm": 0.047295050030634445,
"learning_rate": 7.23523133740637e-06,
"loss": 1.1195,
"step": 1055
},
{
"epoch": 0.7185812103942499,
"grad_norm": 0.041282823374409,
"learning_rate": 7.230398210259431e-06,
"loss": 1.2188,
"step": 1056
},
{
"epoch": 0.7192616850253052,
"grad_norm": 0.0673203156095105,
"learning_rate": 7.225562479997781e-06,
"loss": 1.1166,
"step": 1057
},
{
"epoch": 0.7199421596563603,
"grad_norm": 0.059975488393622454,
"learning_rate": 7.220724152265234e-06,
"loss": 1.2001,
"step": 1058
},
{
"epoch": 0.7206226342874155,
"grad_norm": 0.05733949411047725,
"learning_rate": 7.215883232708642e-06,
"loss": 1.1777,
"step": 1059
},
{
"epoch": 0.7213031089184706,
"grad_norm": 0.038815327874534795,
"learning_rate": 7.21103972697788e-06,
"loss": 1.1425,
"step": 1060
},
{
"epoch": 0.7219835835495257,
"grad_norm": 0.04180710340753343,
"learning_rate": 7.206193640725838e-06,
"loss": 1.1311,
"step": 1061
},
{
"epoch": 0.722664058180581,
"grad_norm": 0.049442802384658265,
"learning_rate": 7.201344979608423e-06,
"loss": 1.0722,
"step": 1062
},
{
"epoch": 0.7233445328116361,
"grad_norm": 0.03691998702875071,
"learning_rate": 7.1964937492845376e-06,
"loss": 1.2127,
"step": 1063
},
{
"epoch": 0.7240250074426913,
"grad_norm": 0.04551488458030917,
"learning_rate": 7.191639955416097e-06,
"loss": 1.1855,
"step": 1064
},
{
"epoch": 0.7247054820737464,
"grad_norm": 0.05620018748524324,
"learning_rate": 7.1867836036679984e-06,
"loss": 1.1669,
"step": 1065
},
{
"epoch": 0.7253859567048015,
"grad_norm": 0.03770804716932438,
"learning_rate": 7.181924699708127e-06,
"loss": 1.1324,
"step": 1066
},
{
"epoch": 0.7260664313358568,
"grad_norm": 0.043007638479637705,
"learning_rate": 7.1770632492073455e-06,
"loss": 1.188,
"step": 1067
},
{
"epoch": 0.7267469059669119,
"grad_norm": 0.04846905600603435,
"learning_rate": 7.172199257839492e-06,
"loss": 1.1112,
"step": 1068
},
{
"epoch": 0.7274273805979671,
"grad_norm": 0.06760771174774231,
"learning_rate": 7.167332731281363e-06,
"loss": 1.1739,
"step": 1069
},
{
"epoch": 0.7281078552290222,
"grad_norm": 0.04414778875172002,
"learning_rate": 7.162463675212726e-06,
"loss": 1.1207,
"step": 1070
},
{
"epoch": 0.7287883298600774,
"grad_norm": 0.044146176097507095,
"learning_rate": 7.157592095316286e-06,
"loss": 1.1655,
"step": 1071
},
{
"epoch": 0.7294688044911326,
"grad_norm": 0.04415213012398565,
"learning_rate": 7.152717997277706e-06,
"loss": 1.0841,
"step": 1072
},
{
"epoch": 0.7301492791221877,
"grad_norm": 0.04683320609148643,
"learning_rate": 7.14784138678558e-06,
"loss": 1.103,
"step": 1073
},
{
"epoch": 0.7308297537532429,
"grad_norm": 0.041857344288494074,
"learning_rate": 7.142962269531439e-06,
"loss": 1.1669,
"step": 1074
},
{
"epoch": 0.731510228384298,
"grad_norm": 0.042008130220727205,
"learning_rate": 7.138080651209738e-06,
"loss": 1.189,
"step": 1075
},
{
"epoch": 0.7321907030153532,
"grad_norm": 1.0221404712041966,
"learning_rate": 7.133196537517848e-06,
"loss": 1.1469,
"step": 1076
},
{
"epoch": 0.7328711776464084,
"grad_norm": 0.042322130073403956,
"learning_rate": 7.128309934156059e-06,
"loss": 1.1318,
"step": 1077
},
{
"epoch": 0.7335516522774636,
"grad_norm": 0.042057265771129046,
"learning_rate": 7.12342084682756e-06,
"loss": 1.1443,
"step": 1078
},
{
"epoch": 0.7342321269085187,
"grad_norm": 0.047450147809600454,
"learning_rate": 7.118529281238444e-06,
"loss": 1.1628,
"step": 1079
},
{
"epoch": 0.7349126015395738,
"grad_norm": 0.038929774779211966,
"learning_rate": 7.113635243097694e-06,
"loss": 1.1716,
"step": 1080
},
{
"epoch": 0.735593076170629,
"grad_norm": 0.09452731162534289,
"learning_rate": 7.108738738117178e-06,
"loss": 1.1475,
"step": 1081
},
{
"epoch": 0.7362735508016842,
"grad_norm": 0.04475327075474272,
"learning_rate": 7.1038397720116445e-06,
"loss": 1.1252,
"step": 1082
},
{
"epoch": 0.7369540254327394,
"grad_norm": 0.049494950615798054,
"learning_rate": 7.098938350498716e-06,
"loss": 1.1659,
"step": 1083
},
{
"epoch": 0.7376345000637945,
"grad_norm": 0.04152173426603925,
"learning_rate": 7.094034479298877e-06,
"loss": 1.0856,
"step": 1084
},
{
"epoch": 0.7383149746948496,
"grad_norm": 0.047822463422239274,
"learning_rate": 7.0891281641354725e-06,
"loss": 1.1294,
"step": 1085
},
{
"epoch": 0.7389954493259048,
"grad_norm": 0.04237806087000458,
"learning_rate": 7.084219410734701e-06,
"loss": 1.1766,
"step": 1086
},
{
"epoch": 0.73967592395696,
"grad_norm": 0.039594916551205596,
"learning_rate": 7.079308224825606e-06,
"loss": 1.196,
"step": 1087
},
{
"epoch": 0.7403563985880152,
"grad_norm": 0.04336458299557206,
"learning_rate": 7.0743946121400695e-06,
"loss": 1.1839,
"step": 1088
},
{
"epoch": 0.7410368732190703,
"grad_norm": 0.04375341715007609,
"learning_rate": 7.069478578412807e-06,
"loss": 1.1983,
"step": 1089
},
{
"epoch": 0.7417173478501254,
"grad_norm": 0.05157083535351173,
"learning_rate": 7.064560129381359e-06,
"loss": 1.1507,
"step": 1090
},
{
"epoch": 0.7423978224811806,
"grad_norm": 0.060666298376872534,
"learning_rate": 7.059639270786083e-06,
"loss": 1.2219,
"step": 1091
},
{
"epoch": 0.7430782971122358,
"grad_norm": 0.07282006900214394,
"learning_rate": 7.054716008370152e-06,
"loss": 1.1525,
"step": 1092
},
{
"epoch": 0.743758771743291,
"grad_norm": 0.04284510073276491,
"learning_rate": 7.049790347879543e-06,
"loss": 1.1965,
"step": 1093
},
{
"epoch": 0.7444392463743461,
"grad_norm": 0.042066556853576444,
"learning_rate": 7.0448622950630305e-06,
"loss": 1.1347,
"step": 1094
},
{
"epoch": 0.7451197210054012,
"grad_norm": 0.04330476577042554,
"learning_rate": 7.039931855672185e-06,
"loss": 1.1636,
"step": 1095
},
{
"epoch": 0.7458001956364564,
"grad_norm": 0.0656755371298587,
"learning_rate": 7.034999035461356e-06,
"loss": 1.0869,
"step": 1096
},
{
"epoch": 0.7464806702675116,
"grad_norm": 0.04884741709734115,
"learning_rate": 7.03006384018768e-06,
"loss": 1.1266,
"step": 1097
},
{
"epoch": 0.7471611448985668,
"grad_norm": 0.07686213616910825,
"learning_rate": 7.025126275611058e-06,
"loss": 1.0971,
"step": 1098
},
{
"epoch": 0.7478416195296219,
"grad_norm": 0.04391594900179587,
"learning_rate": 7.020186347494159e-06,
"loss": 1.1419,
"step": 1099
},
{
"epoch": 0.748522094160677,
"grad_norm": 0.04022498193871379,
"learning_rate": 7.0152440616024105e-06,
"loss": 1.1534,
"step": 1100
},
{
"epoch": 0.7492025687917322,
"grad_norm": 0.04069958195471514,
"learning_rate": 7.010299423703996e-06,
"loss": 1.123,
"step": 1101
},
{
"epoch": 0.7498830434227874,
"grad_norm": 0.04279031084289905,
"learning_rate": 7.0053524395698345e-06,
"loss": 1.1211,
"step": 1102
},
{
"epoch": 0.7505635180538426,
"grad_norm": 0.03907354097090339,
"learning_rate": 7.000403114973591e-06,
"loss": 1.1609,
"step": 1103
},
{
"epoch": 0.7512439926848977,
"grad_norm": 0.04384068220483148,
"learning_rate": 6.99545145569166e-06,
"loss": 1.0995,
"step": 1104
},
{
"epoch": 0.7519244673159529,
"grad_norm": 0.045265801938899335,
"learning_rate": 6.990497467503163e-06,
"loss": 1.1096,
"step": 1105
},
{
"epoch": 0.752604941947008,
"grad_norm": 0.09957810788867909,
"learning_rate": 6.985541156189932e-06,
"loss": 1.132,
"step": 1106
},
{
"epoch": 0.7532854165780632,
"grad_norm": 0.0429281647908113,
"learning_rate": 6.98058252753652e-06,
"loss": 1.1622,
"step": 1107
},
{
"epoch": 0.7539658912091184,
"grad_norm": 0.052743801016877874,
"learning_rate": 6.975621587330179e-06,
"loss": 1.2174,
"step": 1108
},
{
"epoch": 0.7546463658401735,
"grad_norm": 0.03881047949712702,
"learning_rate": 6.970658341360859e-06,
"loss": 1.1805,
"step": 1109
},
{
"epoch": 0.7553268404712287,
"grad_norm": 0.04225458626164949,
"learning_rate": 6.965692795421206e-06,
"loss": 1.1722,
"step": 1110
},
{
"epoch": 0.7560073151022838,
"grad_norm": 0.10543672989935722,
"learning_rate": 6.9607249553065405e-06,
"loss": 1.2019,
"step": 1111
},
{
"epoch": 0.7566877897333391,
"grad_norm": 0.0565342929312989,
"learning_rate": 6.955754826814871e-06,
"loss": 1.0947,
"step": 1112
},
{
"epoch": 0.7573682643643942,
"grad_norm": 0.03930712197341347,
"learning_rate": 6.950782415746869e-06,
"loss": 1.1346,
"step": 1113
},
{
"epoch": 0.7580487389954493,
"grad_norm": 0.03760938964092718,
"learning_rate": 6.945807727905876e-06,
"loss": 1.1054,
"step": 1114
},
{
"epoch": 0.7587292136265045,
"grad_norm": 0.0379152054111856,
"learning_rate": 6.940830769097886e-06,
"loss": 1.0448,
"step": 1115
},
{
"epoch": 0.7594096882575596,
"grad_norm": 0.06937325137974219,
"learning_rate": 6.935851545131549e-06,
"loss": 1.1911,
"step": 1116
},
{
"epoch": 0.7600901628886149,
"grad_norm": 0.06566055465040227,
"learning_rate": 6.9308700618181505e-06,
"loss": 1.1573,
"step": 1117
},
{
"epoch": 0.76077063751967,
"grad_norm": 0.04493419709913744,
"learning_rate": 6.925886324971619e-06,
"loss": 1.1628,
"step": 1118
},
{
"epoch": 0.7614511121507251,
"grad_norm": 0.0418786470658152,
"learning_rate": 6.920900340408513e-06,
"loss": 1.2156,
"step": 1119
},
{
"epoch": 0.7621315867817803,
"grad_norm": 0.038634168191586375,
"learning_rate": 6.915912113948013e-06,
"loss": 1.1652,
"step": 1120
},
{
"epoch": 0.7628120614128354,
"grad_norm": 0.06168446681123073,
"learning_rate": 6.910921651411915e-06,
"loss": 1.2015,
"step": 1121
},
{
"epoch": 0.7634925360438907,
"grad_norm": 0.04635986156126397,
"learning_rate": 6.905928958624627e-06,
"loss": 1.1536,
"step": 1122
},
{
"epoch": 0.7641730106749458,
"grad_norm": 0.041883937869239204,
"learning_rate": 6.900934041413157e-06,
"loss": 1.1758,
"step": 1123
},
{
"epoch": 0.764853485306001,
"grad_norm": 0.04598375681816885,
"learning_rate": 6.89593690560711e-06,
"loss": 1.1464,
"step": 1124
},
{
"epoch": 0.7655339599370561,
"grad_norm": 0.04369752473272923,
"learning_rate": 6.890937557038685e-06,
"loss": 1.061,
"step": 1125
},
{
"epoch": 0.7662144345681112,
"grad_norm": 0.8223791208274646,
"learning_rate": 6.885936001542658e-06,
"loss": 1.189,
"step": 1126
},
{
"epoch": 0.7668949091991664,
"grad_norm": 0.041870872372011377,
"learning_rate": 6.880932244956381e-06,
"loss": 1.1183,
"step": 1127
},
{
"epoch": 0.7675753838302216,
"grad_norm": 0.0749667174818459,
"learning_rate": 6.875926293119778e-06,
"loss": 1.0854,
"step": 1128
},
{
"epoch": 0.7682558584612768,
"grad_norm": 0.04392599229758133,
"learning_rate": 6.870918151875331e-06,
"loss": 1.1703,
"step": 1129
},
{
"epoch": 0.7689363330923319,
"grad_norm": 0.15512309240607164,
"learning_rate": 6.865907827068085e-06,
"loss": 1.1616,
"step": 1130
},
{
"epoch": 0.769616807723387,
"grad_norm": 0.04222809718225347,
"learning_rate": 6.8608953245456224e-06,
"loss": 1.1157,
"step": 1131
},
{
"epoch": 0.7702972823544422,
"grad_norm": 0.04781561093643504,
"learning_rate": 6.8558806501580764e-06,
"loss": 1.1492,
"step": 1132
},
{
"epoch": 0.7709777569854974,
"grad_norm": 0.15222417065814942,
"learning_rate": 6.85086380975811e-06,
"loss": 1.2147,
"step": 1133
},
{
"epoch": 0.7716582316165526,
"grad_norm": 0.05465946027165189,
"learning_rate": 6.845844809200918e-06,
"loss": 1.1875,
"step": 1134
},
{
"epoch": 0.7723387062476077,
"grad_norm": 0.043507474640149424,
"learning_rate": 6.840823654344211e-06,
"loss": 1.1466,
"step": 1135
},
{
"epoch": 0.7730191808786628,
"grad_norm": 0.047930995130181106,
"learning_rate": 6.835800351048218e-06,
"loss": 1.1868,
"step": 1136
},
{
"epoch": 0.773699655509718,
"grad_norm": 0.06266225882556943,
"learning_rate": 6.830774905175677e-06,
"loss": 1.156,
"step": 1137
},
{
"epoch": 0.7743801301407732,
"grad_norm": 0.04859618741694431,
"learning_rate": 6.82574732259182e-06,
"loss": 1.0918,
"step": 1138
},
{
"epoch": 0.7750606047718284,
"grad_norm": 0.04801810270287596,
"learning_rate": 6.82071760916438e-06,
"loss": 1.1477,
"step": 1139
},
{
"epoch": 0.7757410794028835,
"grad_norm": 0.0473223818411507,
"learning_rate": 6.815685770763573e-06,
"loss": 1.1164,
"step": 1140
},
{
"epoch": 0.7764215540339386,
"grad_norm": 0.043906942829315364,
"learning_rate": 6.810651813262096e-06,
"loss": 1.1267,
"step": 1141
},
{
"epoch": 0.7771020286649938,
"grad_norm": 0.04601911308760648,
"learning_rate": 6.805615742535117e-06,
"loss": 1.1288,
"step": 1142
},
{
"epoch": 0.777782503296049,
"grad_norm": 0.047303938482842395,
"learning_rate": 6.800577564460275e-06,
"loss": 1.179,
"step": 1143
},
{
"epoch": 0.7784629779271042,
"grad_norm": 0.04929207282941646,
"learning_rate": 6.795537284917666e-06,
"loss": 1.1631,
"step": 1144
},
{
"epoch": 0.7791434525581593,
"grad_norm": 0.0451527655223312,
"learning_rate": 6.7904949097898376e-06,
"loss": 1.1429,
"step": 1145
},
{
"epoch": 0.7798239271892144,
"grad_norm": 0.0488569182696569,
"learning_rate": 6.785450444961783e-06,
"loss": 1.1077,
"step": 1146
},
{
"epoch": 0.7805044018202696,
"grad_norm": 0.04586542464915239,
"learning_rate": 6.780403896320938e-06,
"loss": 1.1599,
"step": 1147
},
{
"epoch": 0.7811848764513248,
"grad_norm": 0.042563003759626114,
"learning_rate": 6.775355269757166e-06,
"loss": 1.1288,
"step": 1148
},
{
"epoch": 0.78186535108238,
"grad_norm": 0.04709233196206943,
"learning_rate": 6.770304571162759e-06,
"loss": 1.1172,
"step": 1149
},
{
"epoch": 0.7825458257134351,
"grad_norm": 0.0534028573034039,
"learning_rate": 6.765251806432423e-06,
"loss": 1.1766,
"step": 1150
},
{
"epoch": 0.7832263003444903,
"grad_norm": 0.037200717084029736,
"learning_rate": 6.7601969814632805e-06,
"loss": 1.1748,
"step": 1151
},
{
"epoch": 0.7839067749755454,
"grad_norm": 0.039839632104403555,
"learning_rate": 6.755140102154855e-06,
"loss": 1.0733,
"step": 1152
},
{
"epoch": 0.7845872496066006,
"grad_norm": 0.04106865746747149,
"learning_rate": 6.7500811744090725e-06,
"loss": 1.1776,
"step": 1153
},
{
"epoch": 0.7852677242376558,
"grad_norm": 0.039882001703510736,
"learning_rate": 6.7450202041302404e-06,
"loss": 1.1651,
"step": 1154
},
{
"epoch": 0.7859481988687109,
"grad_norm": 0.0398379046814066,
"learning_rate": 6.739957197225059e-06,
"loss": 1.1796,
"step": 1155
},
{
"epoch": 0.7866286734997661,
"grad_norm": 0.04132540884480281,
"learning_rate": 6.734892159602601e-06,
"loss": 1.1913,
"step": 1156
},
{
"epoch": 0.7873091481308212,
"grad_norm": 0.1403333681186676,
"learning_rate": 6.7298250971743094e-06,
"loss": 1.1167,
"step": 1157
},
{
"epoch": 0.7879896227618765,
"grad_norm": 0.04052355930252153,
"learning_rate": 6.724756015853994e-06,
"loss": 1.1029,
"step": 1158
},
{
"epoch": 0.7886700973929316,
"grad_norm": 0.06712051352806869,
"learning_rate": 6.719684921557816e-06,
"loss": 1.0857,
"step": 1159
},
{
"epoch": 0.7893505720239867,
"grad_norm": 0.040511857114010424,
"learning_rate": 6.71461182020429e-06,
"loss": 1.1407,
"step": 1160
},
{
"epoch": 0.7900310466550419,
"grad_norm": 0.03893191026718295,
"learning_rate": 6.709536717714269e-06,
"loss": 1.0647,
"step": 1161
},
{
"epoch": 0.790711521286097,
"grad_norm": 0.038741558519808664,
"learning_rate": 6.704459620010945e-06,
"loss": 1.1284,
"step": 1162
},
{
"epoch": 0.7913919959171523,
"grad_norm": 0.037503955643134496,
"learning_rate": 6.699380533019836e-06,
"loss": 1.1487,
"step": 1163
},
{
"epoch": 0.7920724705482074,
"grad_norm": 0.03868788023709744,
"learning_rate": 6.694299462668785e-06,
"loss": 1.194,
"step": 1164
},
{
"epoch": 0.7927529451792625,
"grad_norm": 0.05175370915744133,
"learning_rate": 6.689216414887947e-06,
"loss": 1.1888,
"step": 1165
},
{
"epoch": 0.7934334198103177,
"grad_norm": 0.04187198494977118,
"learning_rate": 6.684131395609784e-06,
"loss": 1.1652,
"step": 1166
},
{
"epoch": 0.7941138944413728,
"grad_norm": 0.06991835814940914,
"learning_rate": 6.679044410769064e-06,
"loss": 1.078,
"step": 1167
},
{
"epoch": 0.7947943690724281,
"grad_norm": 0.03928030662322917,
"learning_rate": 6.673955466302844e-06,
"loss": 1.1347,
"step": 1168
},
{
"epoch": 0.7954748437034832,
"grad_norm": 0.06556567246463971,
"learning_rate": 6.668864568150469e-06,
"loss": 1.1564,
"step": 1169
},
{
"epoch": 0.7961553183345383,
"grad_norm": 0.051672923587688206,
"learning_rate": 6.663771722253567e-06,
"loss": 1.1552,
"step": 1170
},
{
"epoch": 0.7968357929655935,
"grad_norm": 0.045778478618114,
"learning_rate": 6.658676934556035e-06,
"loss": 1.1613,
"step": 1171
},
{
"epoch": 0.7975162675966486,
"grad_norm": 0.037569343554903545,
"learning_rate": 6.653580211004039e-06,
"loss": 1.1837,
"step": 1172
},
{
"epoch": 0.7981967422277039,
"grad_norm": 0.042785536903053735,
"learning_rate": 6.648481557546002e-06,
"loss": 1.126,
"step": 1173
},
{
"epoch": 0.798877216858759,
"grad_norm": 0.03986940166665901,
"learning_rate": 6.643380980132608e-06,
"loss": 1.18,
"step": 1174
},
{
"epoch": 0.7995576914898141,
"grad_norm": 0.042633298364431076,
"learning_rate": 6.638278484716771e-06,
"loss": 1.1298,
"step": 1175
},
{
"epoch": 0.8002381661208693,
"grad_norm": 0.04041013356548493,
"learning_rate": 6.63317407725366e-06,
"loss": 1.1684,
"step": 1176
},
{
"epoch": 0.8009186407519244,
"grad_norm": 0.03925791424798017,
"learning_rate": 6.628067763700662e-06,
"loss": 1.1714,
"step": 1177
},
{
"epoch": 0.8015991153829797,
"grad_norm": 0.060428415986807496,
"learning_rate": 6.622959550017397e-06,
"loss": 1.1556,
"step": 1178
},
{
"epoch": 0.8022795900140348,
"grad_norm": 0.03968717411698841,
"learning_rate": 6.617849442165698e-06,
"loss": 1.1696,
"step": 1179
},
{
"epoch": 0.80296006464509,
"grad_norm": 0.04918573413004946,
"learning_rate": 6.612737446109614e-06,
"loss": 1.1542,
"step": 1180
},
{
"epoch": 0.8036405392761451,
"grad_norm": 0.03632300431935989,
"learning_rate": 6.607623567815391e-06,
"loss": 1.1728,
"step": 1181
},
{
"epoch": 0.8043210139072002,
"grad_norm": 0.03868462872196307,
"learning_rate": 6.602507813251478e-06,
"loss": 1.12,
"step": 1182
},
{
"epoch": 0.8050014885382555,
"grad_norm": 0.04326647088999474,
"learning_rate": 6.59739018838851e-06,
"loss": 1.1703,
"step": 1183
},
{
"epoch": 0.8056819631693106,
"grad_norm": 0.042993618675953436,
"learning_rate": 6.592270699199306e-06,
"loss": 1.1058,
"step": 1184
},
{
"epoch": 0.8063624378003658,
"grad_norm": 0.037234437193195045,
"learning_rate": 6.5871493516588595e-06,
"loss": 1.1532,
"step": 1185
},
{
"epoch": 0.8070429124314209,
"grad_norm": 0.05344055731907998,
"learning_rate": 6.5820261517443365e-06,
"loss": 1.1535,
"step": 1186
},
{
"epoch": 0.807723387062476,
"grad_norm": 0.04214235502413425,
"learning_rate": 6.5769011054350604e-06,
"loss": 1.1818,
"step": 1187
},
{
"epoch": 0.8084038616935313,
"grad_norm": 0.04404793500124364,
"learning_rate": 6.5717742187125146e-06,
"loss": 1.1332,
"step": 1188
},
{
"epoch": 0.8090843363245864,
"grad_norm": 0.03832530671679624,
"learning_rate": 6.5666454975603234e-06,
"loss": 1.0854,
"step": 1189
},
{
"epoch": 0.8097648109556416,
"grad_norm": 0.03968072705931161,
"learning_rate": 6.561514947964258e-06,
"loss": 1.0979,
"step": 1190
},
{
"epoch": 0.8104452855866967,
"grad_norm": 0.044024012396331864,
"learning_rate": 6.556382575912225e-06,
"loss": 1.15,
"step": 1191
},
{
"epoch": 0.8111257602177518,
"grad_norm": 0.046743667143049464,
"learning_rate": 6.551248387394251e-06,
"loss": 1.1359,
"step": 1192
},
{
"epoch": 0.8118062348488071,
"grad_norm": 0.07108309726781734,
"learning_rate": 6.546112388402487e-06,
"loss": 1.1255,
"step": 1193
},
{
"epoch": 0.8124867094798622,
"grad_norm": 0.04243584220518146,
"learning_rate": 6.540974584931199e-06,
"loss": 1.1633,
"step": 1194
},
{
"epoch": 0.8131671841109174,
"grad_norm": 0.04017199870986986,
"learning_rate": 6.535834982976752e-06,
"loss": 1.1856,
"step": 1195
},
{
"epoch": 0.8138476587419725,
"grad_norm": 0.03833103072994475,
"learning_rate": 6.530693588537619e-06,
"loss": 1.1441,
"step": 1196
},
{
"epoch": 0.8145281333730277,
"grad_norm": 0.04297394216426952,
"learning_rate": 6.525550407614358e-06,
"loss": 1.1445,
"step": 1197
},
{
"epoch": 0.8152086080040829,
"grad_norm": 0.07188862913546087,
"learning_rate": 6.520405446209615e-06,
"loss": 1.1799,
"step": 1198
},
{
"epoch": 0.815889082635138,
"grad_norm": 0.03987215118118984,
"learning_rate": 6.5152587103281165e-06,
"loss": 1.1144,
"step": 1199
},
{
"epoch": 0.8165695572661932,
"grad_norm": 0.03817132524293037,
"learning_rate": 6.510110205976652e-06,
"loss": 1.0878,
"step": 1200
},
{
"epoch": 0.8172500318972483,
"grad_norm": 0.04141078067635132,
"learning_rate": 6.504959939164081e-06,
"loss": 1.1538,
"step": 1201
},
{
"epoch": 0.8179305065283035,
"grad_norm": 0.03973666336401996,
"learning_rate": 6.4998079159013236e-06,
"loss": 1.1297,
"step": 1202
},
{
"epoch": 0.8186109811593586,
"grad_norm": 0.06287711186363208,
"learning_rate": 6.494654142201342e-06,
"loss": 1.0933,
"step": 1203
},
{
"epoch": 0.8192914557904138,
"grad_norm": 0.03691243912728769,
"learning_rate": 6.489498624079144e-06,
"loss": 1.0645,
"step": 1204
},
{
"epoch": 0.819971930421469,
"grad_norm": 0.03868123428432716,
"learning_rate": 6.4843413675517765e-06,
"loss": 1.1448,
"step": 1205
},
{
"epoch": 0.8206524050525241,
"grad_norm": 0.04592978741828481,
"learning_rate": 6.479182378638308e-06,
"loss": 1.0595,
"step": 1206
},
{
"epoch": 0.8213328796835793,
"grad_norm": 0.045057504118504885,
"learning_rate": 6.47402166335984e-06,
"loss": 1.1042,
"step": 1207
},
{
"epoch": 0.8220133543146344,
"grad_norm": 0.0463959955709015,
"learning_rate": 6.468859227739479e-06,
"loss": 1.2134,
"step": 1208
},
{
"epoch": 0.8226938289456897,
"grad_norm": 0.04393391375867404,
"learning_rate": 6.463695077802343e-06,
"loss": 1.159,
"step": 1209
},
{
"epoch": 0.8233743035767448,
"grad_norm": 0.03825599467131142,
"learning_rate": 6.458529219575551e-06,
"loss": 1.1733,
"step": 1210
},
{
"epoch": 0.8240547782077999,
"grad_norm": 0.037767227267221204,
"learning_rate": 6.453361659088217e-06,
"loss": 1.1174,
"step": 1211
},
{
"epoch": 0.8247352528388551,
"grad_norm": 0.0370763078234574,
"learning_rate": 6.448192402371436e-06,
"loss": 1.1563,
"step": 1212
},
{
"epoch": 0.8254157274699102,
"grad_norm": 0.036711175610672164,
"learning_rate": 6.443021455458292e-06,
"loss": 1.106,
"step": 1213
},
{
"epoch": 0.8260962021009655,
"grad_norm": 0.03738930371806809,
"learning_rate": 6.437848824383832e-06,
"loss": 1.1606,
"step": 1214
},
{
"epoch": 0.8267766767320206,
"grad_norm": 0.05164041123554863,
"learning_rate": 6.432674515185077e-06,
"loss": 1.1542,
"step": 1215
},
{
"epoch": 0.8274571513630757,
"grad_norm": 0.03676194118397665,
"learning_rate": 6.427498533900999e-06,
"loss": 1.1235,
"step": 1216
},
{
"epoch": 0.8281376259941309,
"grad_norm": 0.03989164985813374,
"learning_rate": 6.422320886572527e-06,
"loss": 1.1466,
"step": 1217
},
{
"epoch": 0.828818100625186,
"grad_norm": 0.04356300216442829,
"learning_rate": 6.417141579242532e-06,
"loss": 1.0876,
"step": 1218
},
{
"epoch": 0.8294985752562413,
"grad_norm": 0.03948663487235072,
"learning_rate": 6.4119606179558245e-06,
"loss": 1.0584,
"step": 1219
},
{
"epoch": 0.8301790498872964,
"grad_norm": 0.039683834391457604,
"learning_rate": 6.4067780087591415e-06,
"loss": 1.1104,
"step": 1220
},
{
"epoch": 0.8308595245183515,
"grad_norm": 0.039649808293137266,
"learning_rate": 6.4015937577011476e-06,
"loss": 1.1355,
"step": 1221
},
{
"epoch": 0.8315399991494067,
"grad_norm": 0.040785245487367505,
"learning_rate": 6.396407870832419e-06,
"loss": 1.2331,
"step": 1222
},
{
"epoch": 0.8322204737804618,
"grad_norm": 0.03981988996626304,
"learning_rate": 6.391220354205448e-06,
"loss": 1.1216,
"step": 1223
},
{
"epoch": 0.8329009484115171,
"grad_norm": 0.044016546550143455,
"learning_rate": 6.386031213874622e-06,
"loss": 1.1495,
"step": 1224
},
{
"epoch": 0.8335814230425722,
"grad_norm": 0.03896895263836741,
"learning_rate": 6.380840455896224e-06,
"loss": 1.1613,
"step": 1225
},
{
"epoch": 0.8342618976736274,
"grad_norm": 0.06832828163246732,
"learning_rate": 6.375648086328431e-06,
"loss": 1.1212,
"step": 1226
},
{
"epoch": 0.8349423723046825,
"grad_norm": 0.25245491435815004,
"learning_rate": 6.3704541112312945e-06,
"loss": 1.174,
"step": 1227
},
{
"epoch": 0.8356228469357376,
"grad_norm": 0.043274664497043036,
"learning_rate": 6.365258536666743e-06,
"loss": 1.1925,
"step": 1228
},
{
"epoch": 0.8363033215667929,
"grad_norm": 0.04474702661275679,
"learning_rate": 6.3600613686985726e-06,
"loss": 1.1827,
"step": 1229
},
{
"epoch": 0.836983796197848,
"grad_norm": 0.04088242470620767,
"learning_rate": 6.354862613392436e-06,
"loss": 1.1585,
"step": 1230
},
{
"epoch": 0.8376642708289032,
"grad_norm": 0.03934094541159658,
"learning_rate": 6.349662276815839e-06,
"loss": 1.0986,
"step": 1231
},
{
"epoch": 0.8383447454599583,
"grad_norm": 0.03836873137443747,
"learning_rate": 6.344460365038138e-06,
"loss": 1.0824,
"step": 1232
},
{
"epoch": 0.8390252200910134,
"grad_norm": 0.04423182388938506,
"learning_rate": 6.339256884130518e-06,
"loss": 1.1102,
"step": 1233
},
{
"epoch": 0.8397056947220687,
"grad_norm": 0.03929692065282594,
"learning_rate": 6.334051840166006e-06,
"loss": 1.1949,
"step": 1234
},
{
"epoch": 0.8403861693531238,
"grad_norm": 0.03897705730851779,
"learning_rate": 6.328845239219446e-06,
"loss": 1.1435,
"step": 1235
},
{
"epoch": 0.841066643984179,
"grad_norm": 0.04127103596120887,
"learning_rate": 6.3236370873675025e-06,
"loss": 1.0863,
"step": 1236
},
{
"epoch": 0.8417471186152341,
"grad_norm": 0.04896894494341286,
"learning_rate": 6.318427390688649e-06,
"loss": 1.1311,
"step": 1237
},
{
"epoch": 0.8424275932462892,
"grad_norm": 0.03891171272385797,
"learning_rate": 6.313216155263161e-06,
"loss": 1.1414,
"step": 1238
},
{
"epoch": 0.8431080678773445,
"grad_norm": 0.039783788194363214,
"learning_rate": 6.308003387173112e-06,
"loss": 1.1852,
"step": 1239
},
{
"epoch": 0.8437885425083996,
"grad_norm": 0.038793732155587374,
"learning_rate": 6.302789092502364e-06,
"loss": 1.1957,
"step": 1240
},
{
"epoch": 0.8444690171394548,
"grad_norm": 0.03833731039389526,
"learning_rate": 6.2975732773365575e-06,
"loss": 1.1059,
"step": 1241
},
{
"epoch": 0.8451494917705099,
"grad_norm": 0.038076988501699965,
"learning_rate": 6.292355947763114e-06,
"loss": 1.1573,
"step": 1242
},
{
"epoch": 0.845829966401565,
"grad_norm": 0.04952437152204103,
"learning_rate": 6.287137109871214e-06,
"loss": 1.1712,
"step": 1243
},
{
"epoch": 0.8465104410326203,
"grad_norm": 0.045910265315598234,
"learning_rate": 6.281916769751808e-06,
"loss": 1.1327,
"step": 1244
},
{
"epoch": 0.8471909156636754,
"grad_norm": 0.0393005068086636,
"learning_rate": 6.276694933497588e-06,
"loss": 1.0923,
"step": 1245
},
{
"epoch": 0.8478713902947306,
"grad_norm": 0.037291589698702415,
"learning_rate": 6.271471607203006e-06,
"loss": 1.1396,
"step": 1246
},
{
"epoch": 0.8485518649257857,
"grad_norm": 0.035658033542247156,
"learning_rate": 6.26624679696424e-06,
"loss": 1.2,
"step": 1247
},
{
"epoch": 0.8492323395568409,
"grad_norm": 0.037711808055210426,
"learning_rate": 6.26102050887921e-06,
"loss": 1.1482,
"step": 1248
},
{
"epoch": 0.8499128141878961,
"grad_norm": 0.04141554608842721,
"learning_rate": 6.2557927490475534e-06,
"loss": 1.0702,
"step": 1249
},
{
"epoch": 0.8505932888189512,
"grad_norm": 0.037101277030303734,
"learning_rate": 6.25056352357063e-06,
"loss": 1.1072,
"step": 1250
},
{
"epoch": 0.8512737634500064,
"grad_norm": 0.04107851103743336,
"learning_rate": 6.245332838551509e-06,
"loss": 1.1945,
"step": 1251
},
{
"epoch": 0.8519542380810615,
"grad_norm": 0.04491065841553481,
"learning_rate": 6.240100700094961e-06,
"loss": 1.1496,
"step": 1252
},
{
"epoch": 0.8526347127121167,
"grad_norm": 0.2556338887019016,
"learning_rate": 6.234867114307457e-06,
"loss": 1.1469,
"step": 1253
},
{
"epoch": 0.8533151873431719,
"grad_norm": 0.037445212352137484,
"learning_rate": 6.2296320872971515e-06,
"loss": 1.13,
"step": 1254
},
{
"epoch": 0.853995661974227,
"grad_norm": 0.04572930585644979,
"learning_rate": 6.224395625173887e-06,
"loss": 1.0707,
"step": 1255
},
{
"epoch": 0.8546761366052822,
"grad_norm": 0.03849805262900298,
"learning_rate": 6.219157734049179e-06,
"loss": 1.1179,
"step": 1256
},
{
"epoch": 0.8553566112363373,
"grad_norm": 0.03695730067560196,
"learning_rate": 6.213918420036207e-06,
"loss": 1.1548,
"step": 1257
},
{
"epoch": 0.8560370858673925,
"grad_norm": 0.046154897762123656,
"learning_rate": 6.208677689249816e-06,
"loss": 1.1393,
"step": 1258
},
{
"epoch": 0.8567175604984477,
"grad_norm": 0.049440430017302896,
"learning_rate": 6.203435547806503e-06,
"loss": 1.1343,
"step": 1259
},
{
"epoch": 0.8573980351295029,
"grad_norm": 0.0539384604758099,
"learning_rate": 6.19819200182441e-06,
"loss": 1.1647,
"step": 1260
},
{
"epoch": 0.858078509760558,
"grad_norm": 0.04721963897092052,
"learning_rate": 6.192947057423321e-06,
"loss": 1.1083,
"step": 1261
},
{
"epoch": 0.8587589843916131,
"grad_norm": 0.03671674182385264,
"learning_rate": 6.187700720724648e-06,
"loss": 1.1443,
"step": 1262
},
{
"epoch": 0.8594394590226683,
"grad_norm": 0.045014036344351804,
"learning_rate": 6.1824529978514335e-06,
"loss": 1.1997,
"step": 1263
},
{
"epoch": 0.8601199336537235,
"grad_norm": 0.03930216568782988,
"learning_rate": 6.177203894928333e-06,
"loss": 1.1547,
"step": 1264
},
{
"epoch": 0.8608004082847787,
"grad_norm": 0.03624644947156358,
"learning_rate": 6.171953418081616e-06,
"loss": 1.0756,
"step": 1265
},
{
"epoch": 0.8614808829158338,
"grad_norm": 0.04076660236087528,
"learning_rate": 6.16670157343915e-06,
"loss": 1.1932,
"step": 1266
},
{
"epoch": 0.8621613575468889,
"grad_norm": 0.03657911312435845,
"learning_rate": 6.161448367130407e-06,
"loss": 1.0542,
"step": 1267
},
{
"epoch": 0.8628418321779441,
"grad_norm": 0.04600666735918846,
"learning_rate": 6.156193805286442e-06,
"loss": 1.0834,
"step": 1268
},
{
"epoch": 0.8635223068089993,
"grad_norm": 0.03891338180640547,
"learning_rate": 6.1509378940398955e-06,
"loss": 1.1896,
"step": 1269
},
{
"epoch": 0.8642027814400545,
"grad_norm": 0.03821249523986339,
"learning_rate": 6.14568063952498e-06,
"loss": 1.1259,
"step": 1270
},
{
"epoch": 0.8648832560711096,
"grad_norm": 0.04080432984613173,
"learning_rate": 6.140422047877479e-06,
"loss": 1.1357,
"step": 1271
},
{
"epoch": 0.8655637307021647,
"grad_norm": 0.03944846842629839,
"learning_rate": 6.1351621252347305e-06,
"loss": 1.1267,
"step": 1272
},
{
"epoch": 0.8662442053332199,
"grad_norm": 0.05088129824160262,
"learning_rate": 6.129900877735636e-06,
"loss": 1.0795,
"step": 1273
},
{
"epoch": 0.8669246799642751,
"grad_norm": 0.06282343181233374,
"learning_rate": 6.124638311520634e-06,
"loss": 1.1804,
"step": 1274
},
{
"epoch": 0.8676051545953303,
"grad_norm": 0.04264232701626964,
"learning_rate": 6.1193744327317075e-06,
"loss": 1.0991,
"step": 1275
},
{
"epoch": 0.8682856292263854,
"grad_norm": 0.03433889810718825,
"learning_rate": 6.1141092475123675e-06,
"loss": 1.0482,
"step": 1276
},
{
"epoch": 0.8689661038574406,
"grad_norm": 0.04012493907691257,
"learning_rate": 6.108842762007653e-06,
"loss": 1.1447,
"step": 1277
},
{
"epoch": 0.8696465784884957,
"grad_norm": 0.040379105180865495,
"learning_rate": 6.103574982364118e-06,
"loss": 1.1536,
"step": 1278
},
{
"epoch": 0.8703270531195509,
"grad_norm": 0.04068287917649762,
"learning_rate": 6.098305914729829e-06,
"loss": 1.0713,
"step": 1279
},
{
"epoch": 0.8710075277506061,
"grad_norm": 0.04412785485245827,
"learning_rate": 6.093035565254356e-06,
"loss": 1.1282,
"step": 1280
},
{
"epoch": 0.8716880023816612,
"grad_norm": 0.042002433554191435,
"learning_rate": 6.087763940088761e-06,
"loss": 1.1684,
"step": 1281
},
{
"epoch": 0.8723684770127164,
"grad_norm": 0.050676037683770894,
"learning_rate": 6.082491045385601e-06,
"loss": 1.1357,
"step": 1282
},
{
"epoch": 0.8730489516437715,
"grad_norm": 0.05907674164713621,
"learning_rate": 6.0772168872989094e-06,
"loss": 1.0541,
"step": 1283
},
{
"epoch": 0.8737294262748266,
"grad_norm": 0.0502306633678257,
"learning_rate": 6.0719414719841985e-06,
"loss": 1.2116,
"step": 1284
},
{
"epoch": 0.8744099009058819,
"grad_norm": 0.04096965723691513,
"learning_rate": 6.066664805598442e-06,
"loss": 1.1772,
"step": 1285
},
{
"epoch": 0.875090375536937,
"grad_norm": 0.044691170839062655,
"learning_rate": 6.061386894300082e-06,
"loss": 1.0655,
"step": 1286
},
{
"epoch": 0.8757708501679922,
"grad_norm": 0.07041783986510274,
"learning_rate": 6.0561077442490045e-06,
"loss": 1.1424,
"step": 1287
},
{
"epoch": 0.8764513247990473,
"grad_norm": 0.03981795746048054,
"learning_rate": 6.050827361606549e-06,
"loss": 1.0984,
"step": 1288
},
{
"epoch": 0.8771317994301024,
"grad_norm": 0.03913148996349027,
"learning_rate": 6.0455457525354865e-06,
"loss": 1.0865,
"step": 1289
},
{
"epoch": 0.8778122740611577,
"grad_norm": 0.036648731691854126,
"learning_rate": 6.0402629232000275e-06,
"loss": 1.16,
"step": 1290
},
{
"epoch": 0.8784927486922128,
"grad_norm": 0.03810812228799579,
"learning_rate": 6.034978879765798e-06,
"loss": 1.1141,
"step": 1291
},
{
"epoch": 0.879173223323268,
"grad_norm": 0.03912326860714995,
"learning_rate": 6.029693628399851e-06,
"loss": 1.0799,
"step": 1292
},
{
"epoch": 0.8798536979543231,
"grad_norm": 0.24643907520489966,
"learning_rate": 6.024407175270637e-06,
"loss": 1.1197,
"step": 1293
},
{
"epoch": 0.8805341725853782,
"grad_norm": 0.03564071624903765,
"learning_rate": 6.01911952654802e-06,
"loss": 1.1021,
"step": 1294
},
{
"epoch": 0.8812146472164335,
"grad_norm": 0.03760501790930282,
"learning_rate": 6.013830688403252e-06,
"loss": 1.1147,
"step": 1295
},
{
"epoch": 0.8818951218474886,
"grad_norm": 0.03809395915758253,
"learning_rate": 6.008540667008981e-06,
"loss": 1.194,
"step": 1296
},
{
"epoch": 0.8825755964785438,
"grad_norm": 0.039825805300777097,
"learning_rate": 6.003249468539226e-06,
"loss": 1.1169,
"step": 1297
},
{
"epoch": 0.8832560711095989,
"grad_norm": 0.03810123090259154,
"learning_rate": 5.997957099169388e-06,
"loss": 1.0684,
"step": 1298
},
{
"epoch": 0.8839365457406541,
"grad_norm": 0.03686386460521089,
"learning_rate": 5.99266356507623e-06,
"loss": 1.0948,
"step": 1299
},
{
"epoch": 0.8846170203717093,
"grad_norm": 0.041215161228331305,
"learning_rate": 5.9873688724378764e-06,
"loss": 1.1038,
"step": 1300
},
{
"epoch": 0.8852974950027644,
"grad_norm": 0.0404316678257641,
"learning_rate": 5.982073027433803e-06,
"loss": 1.0868,
"step": 1301
},
{
"epoch": 0.8859779696338196,
"grad_norm": 0.04485496009556138,
"learning_rate": 5.976776036244833e-06,
"loss": 1.1781,
"step": 1302
},
{
"epoch": 0.8866584442648747,
"grad_norm": 0.039768048320405154,
"learning_rate": 5.971477905053121e-06,
"loss": 1.1799,
"step": 1303
},
{
"epoch": 0.8873389188959299,
"grad_norm": 0.038106129270751916,
"learning_rate": 5.96617864004216e-06,
"loss": 1.1092,
"step": 1304
},
{
"epoch": 0.8880193935269851,
"grad_norm": 0.04124245178507421,
"learning_rate": 5.960878247396761e-06,
"loss": 1.1269,
"step": 1305
},
{
"epoch": 0.8886998681580403,
"grad_norm": 0.0407894735351479,
"learning_rate": 5.955576733303053e-06,
"loss": 1.0696,
"step": 1306
},
{
"epoch": 0.8893803427890954,
"grad_norm": 0.040065948868845826,
"learning_rate": 5.9502741039484704e-06,
"loss": 1.1434,
"step": 1307
},
{
"epoch": 0.8900608174201505,
"grad_norm": 0.045054485388447335,
"learning_rate": 5.944970365521757e-06,
"loss": 1.167,
"step": 1308
},
{
"epoch": 0.8907412920512057,
"grad_norm": 0.06286146195704309,
"learning_rate": 5.939665524212943e-06,
"loss": 1.145,
"step": 1309
},
{
"epoch": 0.8914217666822609,
"grad_norm": 0.04950147084807547,
"learning_rate": 5.9343595862133515e-06,
"loss": 1.1118,
"step": 1310
},
{
"epoch": 0.8921022413133161,
"grad_norm": 0.04048402991149793,
"learning_rate": 5.92905255771558e-06,
"loss": 1.0208,
"step": 1311
},
{
"epoch": 0.8927827159443712,
"grad_norm": 0.049178537817983335,
"learning_rate": 5.923744444913504e-06,
"loss": 1.179,
"step": 1312
},
{
"epoch": 0.8934631905754263,
"grad_norm": 0.039002225058750825,
"learning_rate": 5.918435254002262e-06,
"loss": 1.1072,
"step": 1313
},
{
"epoch": 0.8941436652064815,
"grad_norm": 0.03673171425347061,
"learning_rate": 5.91312499117825e-06,
"loss": 1.1457,
"step": 1314
},
{
"epoch": 0.8948241398375367,
"grad_norm": 0.03643856865852586,
"learning_rate": 5.907813662639119e-06,
"loss": 1.1448,
"step": 1315
},
{
"epoch": 0.8955046144685919,
"grad_norm": 0.0371870126740186,
"learning_rate": 5.902501274583757e-06,
"loss": 1.0616,
"step": 1316
},
{
"epoch": 0.896185089099647,
"grad_norm": 0.04363839355813366,
"learning_rate": 5.897187833212295e-06,
"loss": 1.0726,
"step": 1317
},
{
"epoch": 0.8968655637307021,
"grad_norm": 0.03861512314791017,
"learning_rate": 5.891873344726089e-06,
"loss": 1.1552,
"step": 1318
},
{
"epoch": 0.8975460383617573,
"grad_norm": 0.04715528251371471,
"learning_rate": 5.886557815327723e-06,
"loss": 1.1466,
"step": 1319
},
{
"epoch": 0.8982265129928125,
"grad_norm": 0.03987438529084703,
"learning_rate": 5.881241251220986e-06,
"loss": 1.066,
"step": 1320
},
{
"epoch": 0.8989069876238677,
"grad_norm": 0.03838599343411921,
"learning_rate": 5.875923658610886e-06,
"loss": 1.1084,
"step": 1321
},
{
"epoch": 0.8995874622549228,
"grad_norm": 0.04794769005912478,
"learning_rate": 5.87060504370362e-06,
"loss": 1.1769,
"step": 1322
},
{
"epoch": 0.900267936885978,
"grad_norm": 0.04193282437336126,
"learning_rate": 5.865285412706589e-06,
"loss": 1.1094,
"step": 1323
},
{
"epoch": 0.9009484115170331,
"grad_norm": 0.04100519165196842,
"learning_rate": 5.859964771828373e-06,
"loss": 1.1342,
"step": 1324
},
{
"epoch": 0.9016288861480883,
"grad_norm": 0.0414869677010185,
"learning_rate": 5.8546431272787315e-06,
"loss": 1.1219,
"step": 1325
},
{
"epoch": 0.9023093607791435,
"grad_norm": 0.041992393473424115,
"learning_rate": 5.849320485268597e-06,
"loss": 1.0561,
"step": 1326
},
{
"epoch": 0.9029898354101986,
"grad_norm": 0.038869212364382434,
"learning_rate": 5.843996852010067e-06,
"loss": 1.1571,
"step": 1327
},
{
"epoch": 0.9036703100412538,
"grad_norm": 0.036608143403602626,
"learning_rate": 5.83867223371639e-06,
"loss": 1.1347,
"step": 1328
},
{
"epoch": 0.9043507846723089,
"grad_norm": 0.036981921488967735,
"learning_rate": 5.833346636601974e-06,
"loss": 1.1286,
"step": 1329
},
{
"epoch": 0.9050312593033641,
"grad_norm": 0.03900628795925875,
"learning_rate": 5.828020066882361e-06,
"loss": 1.1043,
"step": 1330
},
{
"epoch": 0.9057117339344193,
"grad_norm": 0.04582352143693707,
"learning_rate": 5.822692530774231e-06,
"loss": 1.0899,
"step": 1331
},
{
"epoch": 0.9063922085654744,
"grad_norm": 0.03613498624225157,
"learning_rate": 5.817364034495392e-06,
"loss": 1.0869,
"step": 1332
},
{
"epoch": 0.9070726831965296,
"grad_norm": 0.03579108523961496,
"learning_rate": 5.812034584264772e-06,
"loss": 1.0919,
"step": 1333
},
{
"epoch": 0.9077531578275847,
"grad_norm": 0.04619823633581344,
"learning_rate": 5.806704186302413e-06,
"loss": 1.0912,
"step": 1334
},
{
"epoch": 0.90843363245864,
"grad_norm": 0.037258090650687144,
"learning_rate": 5.801372846829466e-06,
"loss": 1.1635,
"step": 1335
},
{
"epoch": 0.9091141070896951,
"grad_norm": 0.0365801539988293,
"learning_rate": 5.796040572068175e-06,
"loss": 1.1301,
"step": 1336
},
{
"epoch": 0.9097945817207502,
"grad_norm": 0.05106804125404341,
"learning_rate": 5.790707368241878e-06,
"loss": 1.1021,
"step": 1337
},
{
"epoch": 0.9104750563518054,
"grad_norm": 0.03710062230657684,
"learning_rate": 5.7853732415749985e-06,
"loss": 1.1301,
"step": 1338
},
{
"epoch": 0.9111555309828605,
"grad_norm": 0.045332251441410246,
"learning_rate": 5.7800381982930366e-06,
"loss": 1.1265,
"step": 1339
},
{
"epoch": 0.9118360056139158,
"grad_norm": 0.045452205093223154,
"learning_rate": 5.774702244622563e-06,
"loss": 1.1631,
"step": 1340
},
{
"epoch": 0.9125164802449709,
"grad_norm": 0.04685102810770462,
"learning_rate": 5.769365386791207e-06,
"loss": 1.1557,
"step": 1341
},
{
"epoch": 0.913196954876026,
"grad_norm": 0.038270566458910894,
"learning_rate": 5.764027631027659e-06,
"loss": 1.0619,
"step": 1342
},
{
"epoch": 0.9138774295070812,
"grad_norm": 0.04116978111328107,
"learning_rate": 5.7586889835616514e-06,
"loss": 1.1851,
"step": 1343
},
{
"epoch": 0.9145579041381363,
"grad_norm": 0.09016617477511366,
"learning_rate": 5.753349450623961e-06,
"loss": 1.1841,
"step": 1344
},
{
"epoch": 0.9152383787691916,
"grad_norm": 0.11860215671725502,
"learning_rate": 5.748009038446398e-06,
"loss": 1.0897,
"step": 1345
},
{
"epoch": 0.9159188534002467,
"grad_norm": 0.03962594938674042,
"learning_rate": 5.7426677532618e-06,
"loss": 1.1321,
"step": 1346
},
{
"epoch": 0.9165993280313018,
"grad_norm": 0.04517562669664499,
"learning_rate": 5.737325601304019e-06,
"loss": 1.0935,
"step": 1347
},
{
"epoch": 0.917279802662357,
"grad_norm": 0.040219962082552206,
"learning_rate": 5.7319825888079215e-06,
"loss": 1.1468,
"step": 1348
},
{
"epoch": 0.9179602772934121,
"grad_norm": 0.03875848018693194,
"learning_rate": 5.7266387220093775e-06,
"loss": 1.1082,
"step": 1349
},
{
"epoch": 0.9186407519244674,
"grad_norm": 0.04142151680913673,
"learning_rate": 5.721294007145256e-06,
"loss": 1.2033,
"step": 1350
},
{
"epoch": 0.9193212265555225,
"grad_norm": 0.03777933370189356,
"learning_rate": 5.715948450453413e-06,
"loss": 1.1057,
"step": 1351
},
{
"epoch": 0.9200017011865776,
"grad_norm": 0.04734712225650399,
"learning_rate": 5.710602058172691e-06,
"loss": 1.1423,
"step": 1352
},
{
"epoch": 0.9206821758176328,
"grad_norm": 0.04535076469762656,
"learning_rate": 5.705254836542902e-06,
"loss": 1.1911,
"step": 1353
},
{
"epoch": 0.9213626504486879,
"grad_norm": 0.03684009862310686,
"learning_rate": 5.69990679180483e-06,
"loss": 1.0754,
"step": 1354
},
{
"epoch": 0.9220431250797432,
"grad_norm": 0.04289378110103612,
"learning_rate": 5.6945579302002176e-06,
"loss": 1.2281,
"step": 1355
},
{
"epoch": 0.9227235997107983,
"grad_norm": 0.037830261395348126,
"learning_rate": 5.689208257971766e-06,
"loss": 1.0832,
"step": 1356
},
{
"epoch": 0.9234040743418535,
"grad_norm": 2.090392898581139,
"learning_rate": 5.683857781363114e-06,
"loss": 1.0782,
"step": 1357
},
{
"epoch": 0.9240845489729086,
"grad_norm": 0.042266058791723134,
"learning_rate": 5.678506506618845e-06,
"loss": 1.1484,
"step": 1358
},
{
"epoch": 0.9247650236039637,
"grad_norm": 0.03718158141145417,
"learning_rate": 5.673154439984471e-06,
"loss": 1.0984,
"step": 1359
},
{
"epoch": 0.9254454982350189,
"grad_norm": 0.04138340821876582,
"learning_rate": 5.667801587706434e-06,
"loss": 1.1733,
"step": 1360
},
{
"epoch": 0.9261259728660741,
"grad_norm": 0.037699073237977145,
"learning_rate": 5.662447956032083e-06,
"loss": 1.1329,
"step": 1361
},
{
"epoch": 0.9268064474971293,
"grad_norm": 0.03988237002748851,
"learning_rate": 5.657093551209687e-06,
"loss": 1.1106,
"step": 1362
},
{
"epoch": 0.9274869221281844,
"grad_norm": 0.040582020227302805,
"learning_rate": 5.651738379488409e-06,
"loss": 1.1831,
"step": 1363
},
{
"epoch": 0.9281673967592395,
"grad_norm": 0.037750046648622575,
"learning_rate": 5.646382447118315e-06,
"loss": 1.1402,
"step": 1364
},
{
"epoch": 0.9288478713902947,
"grad_norm": 0.04005012095284457,
"learning_rate": 5.641025760350348e-06,
"loss": 1.1422,
"step": 1365
},
{
"epoch": 0.9295283460213499,
"grad_norm": 0.03670304338073242,
"learning_rate": 5.635668325436343e-06,
"loss": 1.1749,
"step": 1366
},
{
"epoch": 0.9302088206524051,
"grad_norm": 0.042699791501834956,
"learning_rate": 5.6303101486290025e-06,
"loss": 1.1627,
"step": 1367
},
{
"epoch": 0.9308892952834602,
"grad_norm": 0.038454435482173276,
"learning_rate": 5.624951236181893e-06,
"loss": 1.1084,
"step": 1368
},
{
"epoch": 0.9315697699145153,
"grad_norm": 0.042276070833888346,
"learning_rate": 5.619591594349443e-06,
"loss": 1.1576,
"step": 1369
},
{
"epoch": 0.9322502445455705,
"grad_norm": 0.04573517286117657,
"learning_rate": 5.614231229386933e-06,
"loss": 1.1648,
"step": 1370
},
{
"epoch": 0.9329307191766257,
"grad_norm": 0.042286294375058105,
"learning_rate": 5.608870147550483e-06,
"loss": 1.1192,
"step": 1371
},
{
"epoch": 0.9336111938076809,
"grad_norm": 0.04213665462727033,
"learning_rate": 5.603508355097054e-06,
"loss": 1.1248,
"step": 1372
},
{
"epoch": 0.934291668438736,
"grad_norm": 0.0874870290332434,
"learning_rate": 5.598145858284436e-06,
"loss": 1.1522,
"step": 1373
},
{
"epoch": 0.9349721430697912,
"grad_norm": 0.05352039006439997,
"learning_rate": 5.592782663371237e-06,
"loss": 1.1555,
"step": 1374
},
{
"epoch": 0.9356526177008463,
"grad_norm": 0.05227157008033531,
"learning_rate": 5.587418776616884e-06,
"loss": 1.0767,
"step": 1375
},
{
"epoch": 0.9363330923319015,
"grad_norm": 0.037992861991384505,
"learning_rate": 5.582054204281609e-06,
"loss": 1.1053,
"step": 1376
},
{
"epoch": 0.9370135669629567,
"grad_norm": 0.03978769517638952,
"learning_rate": 5.576688952626445e-06,
"loss": 1.1536,
"step": 1377
},
{
"epoch": 0.9376940415940118,
"grad_norm": 0.03770655553602028,
"learning_rate": 5.571323027913221e-06,
"loss": 1.1481,
"step": 1378
},
{
"epoch": 0.938374516225067,
"grad_norm": 0.03974451793347783,
"learning_rate": 5.565956436404547e-06,
"loss": 1.1494,
"step": 1379
},
{
"epoch": 0.9390549908561221,
"grad_norm": 0.04198108346201592,
"learning_rate": 5.56058918436381e-06,
"loss": 1.1255,
"step": 1380
},
{
"epoch": 0.9397354654871773,
"grad_norm": 0.04227156170307245,
"learning_rate": 5.555221278055175e-06,
"loss": 1.1527,
"step": 1381
},
{
"epoch": 0.9404159401182325,
"grad_norm": 0.0499261928413723,
"learning_rate": 5.549852723743564e-06,
"loss": 1.119,
"step": 1382
},
{
"epoch": 0.9410964147492876,
"grad_norm": 0.08956697734623169,
"learning_rate": 5.544483527694656e-06,
"loss": 1.1032,
"step": 1383
},
{
"epoch": 0.9417768893803428,
"grad_norm": 0.04523933390609141,
"learning_rate": 5.539113696174884e-06,
"loss": 1.155,
"step": 1384
},
{
"epoch": 0.9424573640113979,
"grad_norm": 0.03890032216180598,
"learning_rate": 5.533743235451417e-06,
"loss": 1.1675,
"step": 1385
},
{
"epoch": 0.9431378386424532,
"grad_norm": 0.040665145364503094,
"learning_rate": 5.528372151792161e-06,
"loss": 1.0709,
"step": 1386
},
{
"epoch": 0.9438183132735083,
"grad_norm": 0.037657332316222335,
"learning_rate": 5.52300045146575e-06,
"loss": 1.1867,
"step": 1387
},
{
"epoch": 0.9444987879045634,
"grad_norm": 0.06000112693459993,
"learning_rate": 5.517628140741532e-06,
"loss": 1.1546,
"step": 1388
},
{
"epoch": 0.9451792625356186,
"grad_norm": 0.038525920528912905,
"learning_rate": 5.512255225889578e-06,
"loss": 1.1404,
"step": 1389
},
{
"epoch": 0.9458597371666737,
"grad_norm": 0.03770977448092822,
"learning_rate": 5.506881713180652e-06,
"loss": 1.1335,
"step": 1390
},
{
"epoch": 0.946540211797729,
"grad_norm": 0.05719346106237229,
"learning_rate": 5.501507608886225e-06,
"loss": 1.0739,
"step": 1391
},
{
"epoch": 0.9472206864287841,
"grad_norm": 0.06138885021775949,
"learning_rate": 5.496132919278454e-06,
"loss": 1.1118,
"step": 1392
},
{
"epoch": 0.9479011610598392,
"grad_norm": 0.036329347316896306,
"learning_rate": 5.490757650630181e-06,
"loss": 1.1345,
"step": 1393
},
{
"epoch": 0.9485816356908944,
"grad_norm": 0.044635102687606,
"learning_rate": 5.485381809214921e-06,
"loss": 1.097,
"step": 1394
},
{
"epoch": 0.9492621103219495,
"grad_norm": 0.041074287992577035,
"learning_rate": 5.480005401306859e-06,
"loss": 1.1724,
"step": 1395
},
{
"epoch": 0.9499425849530048,
"grad_norm": 0.040562189183978595,
"learning_rate": 5.474628433180844e-06,
"loss": 1.1502,
"step": 1396
},
{
"epoch": 0.9506230595840599,
"grad_norm": 0.03527787660089924,
"learning_rate": 5.469250911112377e-06,
"loss": 1.1372,
"step": 1397
},
{
"epoch": 0.951303534215115,
"grad_norm": 0.03771528448444702,
"learning_rate": 5.463872841377601e-06,
"loss": 1.1362,
"step": 1398
},
{
"epoch": 0.9519840088461702,
"grad_norm": 0.03839056214692817,
"learning_rate": 5.458494230253305e-06,
"loss": 1.1983,
"step": 1399
},
{
"epoch": 0.9526644834772253,
"grad_norm": 0.03931771962376804,
"learning_rate": 5.453115084016908e-06,
"loss": 1.132,
"step": 1400
},
{
"epoch": 0.9533449581082806,
"grad_norm": 0.0382340673892103,
"learning_rate": 5.4477354089464484e-06,
"loss": 1.0805,
"step": 1401
},
{
"epoch": 0.9540254327393357,
"grad_norm": 0.03832456586318755,
"learning_rate": 5.44235521132059e-06,
"loss": 1.1774,
"step": 1402
},
{
"epoch": 0.9547059073703908,
"grad_norm": 0.0386574664813739,
"learning_rate": 5.436974497418599e-06,
"loss": 1.0919,
"step": 1403
},
{
"epoch": 0.955386382001446,
"grad_norm": 0.037913574314079615,
"learning_rate": 5.43159327352035e-06,
"loss": 1.1638,
"step": 1404
},
{
"epoch": 0.9560668566325011,
"grad_norm": 0.061544162650649714,
"learning_rate": 5.426211545906308e-06,
"loss": 1.1622,
"step": 1405
},
{
"epoch": 0.9567473312635564,
"grad_norm": 0.04048621678766252,
"learning_rate": 5.420829320857532e-06,
"loss": 1.1138,
"step": 1406
},
{
"epoch": 0.9574278058946115,
"grad_norm": 0.06916520224483899,
"learning_rate": 5.415446604655654e-06,
"loss": 1.1071,
"step": 1407
},
{
"epoch": 0.9581082805256667,
"grad_norm": 0.04432249632586175,
"learning_rate": 5.410063403582886e-06,
"loss": 1.0735,
"step": 1408
},
{
"epoch": 0.9587887551567218,
"grad_norm": 0.0457617630211233,
"learning_rate": 5.404679723921999e-06,
"loss": 1.0876,
"step": 1409
},
{
"epoch": 0.9594692297877769,
"grad_norm": 0.046665510793855995,
"learning_rate": 5.39929557195633e-06,
"loss": 1.1707,
"step": 1410
},
{
"epoch": 0.9601497044188322,
"grad_norm": 0.03832739526432612,
"learning_rate": 5.3939109539697625e-06,
"loss": 1.1537,
"step": 1411
},
{
"epoch": 0.9608301790498873,
"grad_norm": 0.03944706799007417,
"learning_rate": 5.388525876246726e-06,
"loss": 1.1296,
"step": 1412
},
{
"epoch": 0.9615106536809425,
"grad_norm": 0.08813887345188155,
"learning_rate": 5.383140345072183e-06,
"loss": 1.1174,
"step": 1413
},
{
"epoch": 0.9621911283119976,
"grad_norm": 0.040135008945791584,
"learning_rate": 5.377754366731633e-06,
"loss": 1.1248,
"step": 1414
},
{
"epoch": 0.9628716029430527,
"grad_norm": 0.0386390489055341,
"learning_rate": 5.372367947511086e-06,
"loss": 1.1562,
"step": 1415
},
{
"epoch": 0.963552077574108,
"grad_norm": 0.039348686351082796,
"learning_rate": 5.3669810936970755e-06,
"loss": 1.1387,
"step": 1416
},
{
"epoch": 0.9642325522051631,
"grad_norm": 0.041790651810475817,
"learning_rate": 5.361593811576641e-06,
"loss": 1.1156,
"step": 1417
},
{
"epoch": 0.9649130268362183,
"grad_norm": 0.036848062207846095,
"learning_rate": 5.35620610743732e-06,
"loss": 1.143,
"step": 1418
},
{
"epoch": 0.9655935014672734,
"grad_norm": 0.037952363281026545,
"learning_rate": 5.350817987567141e-06,
"loss": 1.0856,
"step": 1419
},
{
"epoch": 0.9662739760983285,
"grad_norm": 0.14653864959295063,
"learning_rate": 5.345429458254622e-06,
"loss": 1.1963,
"step": 1420
},
{
"epoch": 0.9669544507293838,
"grad_norm": 0.03952385494760658,
"learning_rate": 5.340040525788755e-06,
"loss": 1.165,
"step": 1421
},
{
"epoch": 0.9676349253604389,
"grad_norm": 0.040922869335162954,
"learning_rate": 5.334651196459003e-06,
"loss": 1.0993,
"step": 1422
},
{
"epoch": 0.9683153999914941,
"grad_norm": 0.05271089254134115,
"learning_rate": 5.329261476555295e-06,
"loss": 1.099,
"step": 1423
},
{
"epoch": 0.9689958746225492,
"grad_norm": 0.05761662333166488,
"learning_rate": 5.323871372368017e-06,
"loss": 1.1506,
"step": 1424
},
{
"epoch": 0.9696763492536044,
"grad_norm": 0.03840418283045493,
"learning_rate": 5.318480890187995e-06,
"loss": 1.2209,
"step": 1425
},
{
"epoch": 0.9703568238846596,
"grad_norm": 0.044583352681142795,
"learning_rate": 5.3130900363065055e-06,
"loss": 1.1219,
"step": 1426
},
{
"epoch": 0.9710372985157147,
"grad_norm": 0.039247478926898276,
"learning_rate": 5.307698817015252e-06,
"loss": 1.123,
"step": 1427
},
{
"epoch": 0.9717177731467699,
"grad_norm": 0.04708105196017248,
"learning_rate": 5.30230723860637e-06,
"loss": 1.1874,
"step": 1428
},
{
"epoch": 0.972398247777825,
"grad_norm": 0.03912227172746955,
"learning_rate": 5.296915307372411e-06,
"loss": 1.1852,
"step": 1429
},
{
"epoch": 0.9730787224088802,
"grad_norm": 0.040029452451936,
"learning_rate": 5.291523029606339e-06,
"loss": 1.187,
"step": 1430
},
{
"epoch": 0.9737591970399354,
"grad_norm": 0.0418960722715767,
"learning_rate": 5.286130411601523e-06,
"loss": 1.148,
"step": 1431
},
{
"epoch": 0.9744396716709905,
"grad_norm": 0.03828988551024802,
"learning_rate": 5.2807374596517255e-06,
"loss": 1.1438,
"step": 1432
},
{
"epoch": 0.9751201463020457,
"grad_norm": 0.045285823510491964,
"learning_rate": 5.2753441800511065e-06,
"loss": 1.1514,
"step": 1433
},
{
"epoch": 0.9758006209331008,
"grad_norm": 0.04234680826692546,
"learning_rate": 5.269950579094199e-06,
"loss": 1.0763,
"step": 1434
},
{
"epoch": 0.976481095564156,
"grad_norm": 0.03866507267333306,
"learning_rate": 5.26455666307592e-06,
"loss": 1.1106,
"step": 1435
},
{
"epoch": 0.9771615701952111,
"grad_norm": 0.04187314728313469,
"learning_rate": 5.259162438291546e-06,
"loss": 1.1464,
"step": 1436
},
{
"epoch": 0.9778420448262664,
"grad_norm": 0.0410785349208627,
"learning_rate": 5.253767911036721e-06,
"loss": 1.1317,
"step": 1437
},
{
"epoch": 0.9785225194573215,
"grad_norm": 0.03711776033657422,
"learning_rate": 5.248373087607434e-06,
"loss": 1.0356,
"step": 1438
},
{
"epoch": 0.9792029940883766,
"grad_norm": 0.03787434005715655,
"learning_rate": 5.242977974300032e-06,
"loss": 1.1309,
"step": 1439
},
{
"epoch": 0.9798834687194318,
"grad_norm": 0.04055478162982645,
"learning_rate": 5.2375825774111865e-06,
"loss": 1.1183,
"step": 1440
},
{
"epoch": 0.9805639433504869,
"grad_norm": 0.0389445480851957,
"learning_rate": 5.232186903237909e-06,
"loss": 1.143,
"step": 1441
},
{
"epoch": 0.9812444179815422,
"grad_norm": 0.03609316149289306,
"learning_rate": 5.22679095807753e-06,
"loss": 1.0921,
"step": 1442
},
{
"epoch": 0.9819248926125973,
"grad_norm": 0.041188917003126546,
"learning_rate": 5.221394748227698e-06,
"loss": 1.1189,
"step": 1443
},
{
"epoch": 0.9826053672436524,
"grad_norm": 0.04193981287457778,
"learning_rate": 5.215998279986374e-06,
"loss": 1.0826,
"step": 1444
},
{
"epoch": 0.9832858418747076,
"grad_norm": 0.039570258904263994,
"learning_rate": 5.210601559651815e-06,
"loss": 1.1551,
"step": 1445
},
{
"epoch": 0.9839663165057627,
"grad_norm": 0.059679492238651086,
"learning_rate": 5.2052045935225725e-06,
"loss": 1.1729,
"step": 1446
},
{
"epoch": 0.984646791136818,
"grad_norm": 0.036826388540006895,
"learning_rate": 5.199807387897491e-06,
"loss": 1.1271,
"step": 1447
},
{
"epoch": 0.9853272657678731,
"grad_norm": 0.08793949662097836,
"learning_rate": 5.194409949075685e-06,
"loss": 1.0448,
"step": 1448
},
{
"epoch": 0.9860077403989282,
"grad_norm": 0.05855974483521235,
"learning_rate": 5.18901228335655e-06,
"loss": 1.1708,
"step": 1449
},
{
"epoch": 0.9866882150299834,
"grad_norm": 0.05010892409986885,
"learning_rate": 5.183614397039741e-06,
"loss": 1.089,
"step": 1450
},
{
"epoch": 0.9873686896610385,
"grad_norm": 0.03988118351430687,
"learning_rate": 5.178216296425175e-06,
"loss": 1.1308,
"step": 1451
},
{
"epoch": 0.9880491642920938,
"grad_norm": 0.03919638208961494,
"learning_rate": 5.172817987813013e-06,
"loss": 1.1184,
"step": 1452
},
{
"epoch": 0.9887296389231489,
"grad_norm": 0.043036257620388624,
"learning_rate": 5.167419477503664e-06,
"loss": 1.145,
"step": 1453
},
{
"epoch": 0.989410113554204,
"grad_norm": 0.039876449306626455,
"learning_rate": 5.162020771797768e-06,
"loss": 1.0945,
"step": 1454
},
{
"epoch": 0.9900905881852592,
"grad_norm": 0.03657462252776725,
"learning_rate": 5.156621876996197e-06,
"loss": 1.0527,
"step": 1455
},
{
"epoch": 0.9907710628163143,
"grad_norm": 0.04025378889075889,
"learning_rate": 5.1512227994000445e-06,
"loss": 1.0751,
"step": 1456
},
{
"epoch": 0.9914515374473696,
"grad_norm": 0.03847147746123726,
"learning_rate": 5.145823545310611e-06,
"loss": 1.1502,
"step": 1457
},
{
"epoch": 0.9921320120784247,
"grad_norm": 0.06386324802577721,
"learning_rate": 5.1404241210294095e-06,
"loss": 1.1378,
"step": 1458
},
{
"epoch": 0.9928124867094799,
"grad_norm": 0.040216014642994895,
"learning_rate": 5.135024532858149e-06,
"loss": 1.1142,
"step": 1459
},
{
"epoch": 0.993492961340535,
"grad_norm": 0.04520156556129337,
"learning_rate": 5.1296247870987295e-06,
"loss": 1.1541,
"step": 1460
},
{
"epoch": 0.9941734359715901,
"grad_norm": 0.054314888027714454,
"learning_rate": 5.124224890053235e-06,
"loss": 1.1552,
"step": 1461
},
{
"epoch": 0.9948539106026454,
"grad_norm": 0.04058797212640824,
"learning_rate": 5.118824848023926e-06,
"loss": 1.116,
"step": 1462
},
{
"epoch": 0.9955343852337005,
"grad_norm": 0.039620054276532035,
"learning_rate": 5.1134246673132335e-06,
"loss": 1.1666,
"step": 1463
},
{
"epoch": 0.9962148598647557,
"grad_norm": 0.09428129868211108,
"learning_rate": 5.10802435422375e-06,
"loss": 1.1405,
"step": 1464
},
{
"epoch": 0.9968953344958108,
"grad_norm": 0.03562254613548753,
"learning_rate": 5.102623915058219e-06,
"loss": 1.1264,
"step": 1465
},
{
"epoch": 0.9975758091268659,
"grad_norm": 0.0394178711051225,
"learning_rate": 5.097223356119538e-06,
"loss": 1.152,
"step": 1466
},
{
"epoch": 0.9982562837579212,
"grad_norm": 0.067470107294966,
"learning_rate": 5.091822683710739e-06,
"loss": 1.0979,
"step": 1467
},
{
"epoch": 0.9989367583889763,
"grad_norm": 0.04241799597468662,
"learning_rate": 5.086421904134988e-06,
"loss": 1.1988,
"step": 1468
},
{
"epoch": 0.9996172330200315,
"grad_norm": 0.038200475889552825,
"learning_rate": 5.081021023695575e-06,
"loss": 1.1171,
"step": 1469
},
{
"epoch": 1.0002126483222047,
"grad_norm": 0.04995661906093174,
"learning_rate": 5.07562004869591e-06,
"loss": 1.3522,
"step": 1470
}
],
"logging_steps": 1,
"max_steps": 2938,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 735,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6098680151015424.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}