gyung's picture
Add epoch 1 checkpoint
b3c7de7 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 734,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0013623978201634877,
"grad_norm": 27.647886276245117,
"learning_rate": 0.0,
"loss": 2.7866311073303223,
"step": 1
},
{
"epoch": 0.0027247956403269754,
"grad_norm": 28.430374145507812,
"learning_rate": 4.444444444444445e-07,
"loss": 2.9265763759613037,
"step": 2
},
{
"epoch": 0.004087193460490463,
"grad_norm": 24.750080108642578,
"learning_rate": 8.88888888888889e-07,
"loss": 2.650007724761963,
"step": 3
},
{
"epoch": 0.005449591280653951,
"grad_norm": 26.000843048095703,
"learning_rate": 1.3333333333333334e-06,
"loss": 3.0528907775878906,
"step": 4
},
{
"epoch": 0.006811989100817439,
"grad_norm": 27.27891731262207,
"learning_rate": 1.777777777777778e-06,
"loss": 2.7619106769561768,
"step": 5
},
{
"epoch": 0.008174386920980926,
"grad_norm": 21.593921661376953,
"learning_rate": 2.222222222222222e-06,
"loss": 2.5617551803588867,
"step": 6
},
{
"epoch": 0.009536784741144414,
"grad_norm": 19.521177291870117,
"learning_rate": 2.666666666666667e-06,
"loss": 2.3857626914978027,
"step": 7
},
{
"epoch": 0.010899182561307902,
"grad_norm": 13.534676551818848,
"learning_rate": 3.1111111111111116e-06,
"loss": 2.1002044677734375,
"step": 8
},
{
"epoch": 0.01226158038147139,
"grad_norm": 9.906437873840332,
"learning_rate": 3.555555555555556e-06,
"loss": 1.8867732286453247,
"step": 9
},
{
"epoch": 0.013623978201634877,
"grad_norm": 10.24316692352295,
"learning_rate": 4.000000000000001e-06,
"loss": 1.7133018970489502,
"step": 10
},
{
"epoch": 0.014986376021798364,
"grad_norm": 5.109798908233643,
"learning_rate": 4.444444444444444e-06,
"loss": 1.3970978260040283,
"step": 11
},
{
"epoch": 0.01634877384196185,
"grad_norm": 4.742298126220703,
"learning_rate": 4.888888888888889e-06,
"loss": 1.3010644912719727,
"step": 12
},
{
"epoch": 0.017711171662125342,
"grad_norm": 8.546895027160645,
"learning_rate": 5.333333333333334e-06,
"loss": 1.1571484804153442,
"step": 13
},
{
"epoch": 0.01907356948228883,
"grad_norm": 2.846215009689331,
"learning_rate": 5.777777777777778e-06,
"loss": 1.193354845046997,
"step": 14
},
{
"epoch": 0.020435967302452316,
"grad_norm": 2.637321710586548,
"learning_rate": 6.222222222222223e-06,
"loss": 1.077557921409607,
"step": 15
},
{
"epoch": 0.021798365122615803,
"grad_norm": 1.948442816734314,
"learning_rate": 6.666666666666667e-06,
"loss": 0.8891923427581787,
"step": 16
},
{
"epoch": 0.02316076294277929,
"grad_norm": 2.195993423461914,
"learning_rate": 7.111111111111112e-06,
"loss": 0.917742908000946,
"step": 17
},
{
"epoch": 0.02452316076294278,
"grad_norm": 1.420324683189392,
"learning_rate": 7.555555555555556e-06,
"loss": 1.0057953596115112,
"step": 18
},
{
"epoch": 0.025885558583106268,
"grad_norm": 1.1720658540725708,
"learning_rate": 8.000000000000001e-06,
"loss": 0.7948991060256958,
"step": 19
},
{
"epoch": 0.027247956403269755,
"grad_norm": 2.2465505599975586,
"learning_rate": 8.444444444444446e-06,
"loss": 0.9116527438163757,
"step": 20
},
{
"epoch": 0.02861035422343324,
"grad_norm": 1.0430631637573242,
"learning_rate": 8.888888888888888e-06,
"loss": 0.8433143496513367,
"step": 21
},
{
"epoch": 0.02997275204359673,
"grad_norm": 0.9517339468002319,
"learning_rate": 9.333333333333334e-06,
"loss": 0.7266635894775391,
"step": 22
},
{
"epoch": 0.031335149863760216,
"grad_norm": 0.9054233431816101,
"learning_rate": 9.777777777777779e-06,
"loss": 0.6896510124206543,
"step": 23
},
{
"epoch": 0.0326975476839237,
"grad_norm": 0.8445485830307007,
"learning_rate": 1.0222222222222223e-05,
"loss": 0.7166739106178284,
"step": 24
},
{
"epoch": 0.0340599455040872,
"grad_norm": 0.799867570400238,
"learning_rate": 1.0666666666666667e-05,
"loss": 0.70599365234375,
"step": 25
},
{
"epoch": 0.035422343324250684,
"grad_norm": 0.8114046454429626,
"learning_rate": 1.1111111111111113e-05,
"loss": 0.68497633934021,
"step": 26
},
{
"epoch": 0.03678474114441417,
"grad_norm": 0.717369556427002,
"learning_rate": 1.1555555555555556e-05,
"loss": 0.4579915404319763,
"step": 27
},
{
"epoch": 0.03814713896457766,
"grad_norm": 0.8005459904670715,
"learning_rate": 1.2e-05,
"loss": 0.6453820466995239,
"step": 28
},
{
"epoch": 0.039509536784741145,
"grad_norm": 0.7655712366104126,
"learning_rate": 1.2444444444444446e-05,
"loss": 0.5622988939285278,
"step": 29
},
{
"epoch": 0.04087193460490463,
"grad_norm": 0.6727678775787354,
"learning_rate": 1.288888888888889e-05,
"loss": 0.5062695741653442,
"step": 30
},
{
"epoch": 0.04223433242506812,
"grad_norm": 0.7242804169654846,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.5595001578330994,
"step": 31
},
{
"epoch": 0.043596730245231606,
"grad_norm": 0.7660211324691772,
"learning_rate": 1.377777777777778e-05,
"loss": 0.5437361598014832,
"step": 32
},
{
"epoch": 0.04495912806539509,
"grad_norm": 0.7480101585388184,
"learning_rate": 1.4222222222222224e-05,
"loss": 0.5963411927223206,
"step": 33
},
{
"epoch": 0.04632152588555858,
"grad_norm": 0.7058649659156799,
"learning_rate": 1.4666666666666666e-05,
"loss": 0.5767841339111328,
"step": 34
},
{
"epoch": 0.047683923705722074,
"grad_norm": 0.6593936681747437,
"learning_rate": 1.5111111111111112e-05,
"loss": 0.4501464366912842,
"step": 35
},
{
"epoch": 0.04904632152588556,
"grad_norm": 0.6758292317390442,
"learning_rate": 1.555555555555556e-05,
"loss": 0.5993712544441223,
"step": 36
},
{
"epoch": 0.05040871934604905,
"grad_norm": 0.7313498258590698,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.5267407894134521,
"step": 37
},
{
"epoch": 0.051771117166212535,
"grad_norm": 0.6532080173492432,
"learning_rate": 1.6444444444444444e-05,
"loss": 0.5639113783836365,
"step": 38
},
{
"epoch": 0.05313351498637602,
"grad_norm": 0.6379766464233398,
"learning_rate": 1.688888888888889e-05,
"loss": 0.5104179382324219,
"step": 39
},
{
"epoch": 0.05449591280653951,
"grad_norm": 0.6598156690597534,
"learning_rate": 1.7333333333333336e-05,
"loss": 0.5445395112037659,
"step": 40
},
{
"epoch": 0.055858310626702996,
"grad_norm": 0.6215161085128784,
"learning_rate": 1.7777777777777777e-05,
"loss": 0.5025588870048523,
"step": 41
},
{
"epoch": 0.05722070844686648,
"grad_norm": 0.5954686403274536,
"learning_rate": 1.8222222222222224e-05,
"loss": 0.46108683943748474,
"step": 42
},
{
"epoch": 0.05858310626702997,
"grad_norm": 0.6308771967887878,
"learning_rate": 1.866666666666667e-05,
"loss": 0.5122817158699036,
"step": 43
},
{
"epoch": 0.05994550408719346,
"grad_norm": 0.6523401141166687,
"learning_rate": 1.9111111111111113e-05,
"loss": 0.5358462333679199,
"step": 44
},
{
"epoch": 0.06130790190735695,
"grad_norm": 0.6872179508209229,
"learning_rate": 1.9555555555555557e-05,
"loss": 0.5521703958511353,
"step": 45
},
{
"epoch": 0.06267029972752043,
"grad_norm": 0.5889444947242737,
"learning_rate": 2e-05,
"loss": 0.43677768111228943,
"step": 46
},
{
"epoch": 0.06403269754768393,
"grad_norm": 0.6145616769790649,
"learning_rate": 1.9999975629761854e-05,
"loss": 0.581091046333313,
"step": 47
},
{
"epoch": 0.0653950953678474,
"grad_norm": 0.570745587348938,
"learning_rate": 1.9999902519166192e-05,
"loss": 0.5111463069915771,
"step": 48
},
{
"epoch": 0.0667574931880109,
"grad_norm": 0.6257374286651611,
"learning_rate": 1.9999780668569363e-05,
"loss": 0.5142855644226074,
"step": 49
},
{
"epoch": 0.0681198910081744,
"grad_norm": 0.5916558504104614,
"learning_rate": 1.9999610078565272e-05,
"loss": 0.4986756443977356,
"step": 50
},
{
"epoch": 0.06948228882833787,
"grad_norm": 0.517594575881958,
"learning_rate": 1.999939074998538e-05,
"loss": 0.40387624502182007,
"step": 51
},
{
"epoch": 0.07084468664850137,
"grad_norm": 0.5758000612258911,
"learning_rate": 1.9999122683898708e-05,
"loss": 0.4982905387878418,
"step": 52
},
{
"epoch": 0.07220708446866485,
"grad_norm": 0.5720963478088379,
"learning_rate": 1.9998805881611816e-05,
"loss": 0.4793824553489685,
"step": 53
},
{
"epoch": 0.07356948228882834,
"grad_norm": 0.5784019231796265,
"learning_rate": 1.9998440344668827e-05,
"loss": 0.5332019329071045,
"step": 54
},
{
"epoch": 0.07493188010899182,
"grad_norm": 0.5833513736724854,
"learning_rate": 1.999802607485137e-05,
"loss": 0.5038864016532898,
"step": 55
},
{
"epoch": 0.07629427792915532,
"grad_norm": 0.5767697095870972,
"learning_rate": 1.999756307417863e-05,
"loss": 0.5325872898101807,
"step": 56
},
{
"epoch": 0.0776566757493188,
"grad_norm": 0.5682862401008606,
"learning_rate": 1.9997051344907284e-05,
"loss": 0.5185012817382812,
"step": 57
},
{
"epoch": 0.07901907356948229,
"grad_norm": 0.614080011844635,
"learning_rate": 1.9996490889531528e-05,
"loss": 0.49285584688186646,
"step": 58
},
{
"epoch": 0.08038147138964577,
"grad_norm": 0.5887868404388428,
"learning_rate": 1.999588171078305e-05,
"loss": 0.5096205472946167,
"step": 59
},
{
"epoch": 0.08174386920980926,
"grad_norm": 0.556518018245697,
"learning_rate": 1.9995223811631016e-05,
"loss": 0.43357372283935547,
"step": 60
},
{
"epoch": 0.08310626702997276,
"grad_norm": 0.6359555125236511,
"learning_rate": 1.9994517195282053e-05,
"loss": 0.5556465983390808,
"step": 61
},
{
"epoch": 0.08446866485013624,
"grad_norm": 0.5386205911636353,
"learning_rate": 1.999376186518025e-05,
"loss": 0.46860817074775696,
"step": 62
},
{
"epoch": 0.08583106267029973,
"grad_norm": 0.6243954300880432,
"learning_rate": 1.9992957825007115e-05,
"loss": 0.5260002613067627,
"step": 63
},
{
"epoch": 0.08719346049046321,
"grad_norm": 0.5575344562530518,
"learning_rate": 1.9992105078681587e-05,
"loss": 0.48442351818084717,
"step": 64
},
{
"epoch": 0.0885558583106267,
"grad_norm": 0.5960455536842346,
"learning_rate": 1.999120363035998e-05,
"loss": 0.529167890548706,
"step": 65
},
{
"epoch": 0.08991825613079019,
"grad_norm": 0.49278023838996887,
"learning_rate": 1.9990253484436004e-05,
"loss": 0.4226565361022949,
"step": 66
},
{
"epoch": 0.09128065395095368,
"grad_norm": 0.64710932970047,
"learning_rate": 1.9989254645540715e-05,
"loss": 0.5989794731140137,
"step": 67
},
{
"epoch": 0.09264305177111716,
"grad_norm": 0.5093353390693665,
"learning_rate": 1.9988207118542504e-05,
"loss": 0.4339316487312317,
"step": 68
},
{
"epoch": 0.09400544959128065,
"grad_norm": 0.5457233786582947,
"learning_rate": 1.998711090854706e-05,
"loss": 0.46932798624038696,
"step": 69
},
{
"epoch": 0.09536784741144415,
"grad_norm": 0.6200721859931946,
"learning_rate": 1.998596602089737e-05,
"loss": 0.5474086999893188,
"step": 70
},
{
"epoch": 0.09673024523160763,
"grad_norm": 0.4950924217700958,
"learning_rate": 1.9984772461173663e-05,
"loss": 0.40740966796875,
"step": 71
},
{
"epoch": 0.09809264305177112,
"grad_norm": 0.5403825640678406,
"learning_rate": 1.998353023519341e-05,
"loss": 0.4256601929664612,
"step": 72
},
{
"epoch": 0.0994550408719346,
"grad_norm": 0.5570508241653442,
"learning_rate": 1.9982239349011286e-05,
"loss": 0.5229888558387756,
"step": 73
},
{
"epoch": 0.1008174386920981,
"grad_norm": 0.5410299897193909,
"learning_rate": 1.9980899808919122e-05,
"loss": 0.4738315939903259,
"step": 74
},
{
"epoch": 0.10217983651226158,
"grad_norm": 0.5304137468338013,
"learning_rate": 1.9979511621445902e-05,
"loss": 0.453370064496994,
"step": 75
},
{
"epoch": 0.10354223433242507,
"grad_norm": 0.5148070454597473,
"learning_rate": 1.9978074793357726e-05,
"loss": 0.4637362062931061,
"step": 76
},
{
"epoch": 0.10490463215258855,
"grad_norm": 0.4985616207122803,
"learning_rate": 1.9976589331657754e-05,
"loss": 0.41210031509399414,
"step": 77
},
{
"epoch": 0.10626702997275204,
"grad_norm": 0.5819671154022217,
"learning_rate": 1.99750552435862e-05,
"loss": 0.49417591094970703,
"step": 78
},
{
"epoch": 0.10762942779291552,
"grad_norm": 0.5260801315307617,
"learning_rate": 1.997347253662028e-05,
"loss": 0.46970927715301514,
"step": 79
},
{
"epoch": 0.10899182561307902,
"grad_norm": 0.7598191499710083,
"learning_rate": 1.9971841218474184e-05,
"loss": 0.5050591230392456,
"step": 80
},
{
"epoch": 0.11035422343324251,
"grad_norm": 0.5178552865982056,
"learning_rate": 1.997016129709904e-05,
"loss": 0.48321259021759033,
"step": 81
},
{
"epoch": 0.11171662125340599,
"grad_norm": 0.588383674621582,
"learning_rate": 1.9968432780682855e-05,
"loss": 0.5230928659439087,
"step": 82
},
{
"epoch": 0.11307901907356949,
"grad_norm": 0.48950737714767456,
"learning_rate": 1.9966655677650512e-05,
"loss": 0.41843950748443604,
"step": 83
},
{
"epoch": 0.11444141689373297,
"grad_norm": 0.5205618739128113,
"learning_rate": 1.9964829996663684e-05,
"loss": 0.4415491819381714,
"step": 84
},
{
"epoch": 0.11580381471389646,
"grad_norm": 0.5614944100379944,
"learning_rate": 1.9962955746620832e-05,
"loss": 0.5067013502120972,
"step": 85
},
{
"epoch": 0.11716621253405994,
"grad_norm": 0.5298795104026794,
"learning_rate": 1.9961032936657143e-05,
"loss": 0.43599075078964233,
"step": 86
},
{
"epoch": 0.11852861035422343,
"grad_norm": 0.5152458548545837,
"learning_rate": 1.9959061576144482e-05,
"loss": 0.4749916195869446,
"step": 87
},
{
"epoch": 0.11989100817438691,
"grad_norm": 0.4422807991504669,
"learning_rate": 1.9957041674691356e-05,
"loss": 0.3546351194381714,
"step": 88
},
{
"epoch": 0.12125340599455041,
"grad_norm": 0.5386228561401367,
"learning_rate": 1.995497324214285e-05,
"loss": 0.488656610250473,
"step": 89
},
{
"epoch": 0.1226158038147139,
"grad_norm": 0.530255913734436,
"learning_rate": 1.995285628858062e-05,
"loss": 0.4679569602012634,
"step": 90
},
{
"epoch": 0.12397820163487738,
"grad_norm": 0.5118647217750549,
"learning_rate": 1.995069082432279e-05,
"loss": 0.4375801682472229,
"step": 91
},
{
"epoch": 0.12534059945504086,
"grad_norm": 0.5084353685379028,
"learning_rate": 1.994847685992393e-05,
"loss": 0.46044063568115234,
"step": 92
},
{
"epoch": 0.12670299727520437,
"grad_norm": 0.49600714445114136,
"learning_rate": 1.9946214406175016e-05,
"loss": 0.4706187844276428,
"step": 93
},
{
"epoch": 0.12806539509536785,
"grad_norm": 0.5475752949714661,
"learning_rate": 1.9943903474103354e-05,
"loss": 0.47216880321502686,
"step": 94
},
{
"epoch": 0.12942779291553133,
"grad_norm": 0.4829142093658447,
"learning_rate": 1.994154407497254e-05,
"loss": 0.43380069732666016,
"step": 95
},
{
"epoch": 0.1307901907356948,
"grad_norm": 0.4949623942375183,
"learning_rate": 1.993913622028239e-05,
"loss": 0.45172300934791565,
"step": 96
},
{
"epoch": 0.13215258855585832,
"grad_norm": 0.5360006093978882,
"learning_rate": 1.9936679921768905e-05,
"loss": 0.43014198541641235,
"step": 97
},
{
"epoch": 0.1335149863760218,
"grad_norm": 0.4718469977378845,
"learning_rate": 1.9934175191404202e-05,
"loss": 0.42612555623054504,
"step": 98
},
{
"epoch": 0.13487738419618528,
"grad_norm": 0.5138616561889648,
"learning_rate": 1.9931622041396456e-05,
"loss": 0.4884234666824341,
"step": 99
},
{
"epoch": 0.1362397820163488,
"grad_norm": 0.4901588559150696,
"learning_rate": 1.9929020484189843e-05,
"loss": 0.4561987519264221,
"step": 100
},
{
"epoch": 0.13760217983651227,
"grad_norm": 0.46556147933006287,
"learning_rate": 1.992637053246448e-05,
"loss": 0.3546220660209656,
"step": 101
},
{
"epoch": 0.13896457765667575,
"grad_norm": 0.5201694965362549,
"learning_rate": 1.992367219913635e-05,
"loss": 0.4550625681877136,
"step": 102
},
{
"epoch": 0.14032697547683923,
"grad_norm": 0.5415034294128418,
"learning_rate": 1.9920925497357265e-05,
"loss": 0.43804478645324707,
"step": 103
},
{
"epoch": 0.14168937329700274,
"grad_norm": 0.475267231464386,
"learning_rate": 1.9918130440514775e-05,
"loss": 0.37618038058280945,
"step": 104
},
{
"epoch": 0.14305177111716622,
"grad_norm": 0.47954168915748596,
"learning_rate": 1.9915287042232117e-05,
"loss": 0.40945565700531006,
"step": 105
},
{
"epoch": 0.1444141689373297,
"grad_norm": 0.4963965117931366,
"learning_rate": 1.9912395316368163e-05,
"loss": 0.39603012800216675,
"step": 106
},
{
"epoch": 0.14577656675749318,
"grad_norm": 0.5018640756607056,
"learning_rate": 1.990945527701731e-05,
"loss": 0.42917919158935547,
"step": 107
},
{
"epoch": 0.14713896457765668,
"grad_norm": 0.49071332812309265,
"learning_rate": 1.9906466938509456e-05,
"loss": 0.41273534297943115,
"step": 108
},
{
"epoch": 0.14850136239782016,
"grad_norm": 0.4668489992618561,
"learning_rate": 1.9903430315409908e-05,
"loss": 0.37738558650016785,
"step": 109
},
{
"epoch": 0.14986376021798364,
"grad_norm": 0.5912647843360901,
"learning_rate": 1.9900345422519302e-05,
"loss": 0.537000298500061,
"step": 110
},
{
"epoch": 0.15122615803814715,
"grad_norm": 0.501377522945404,
"learning_rate": 1.9897212274873558e-05,
"loss": 0.45420363545417786,
"step": 111
},
{
"epoch": 0.15258855585831063,
"grad_norm": 0.47566258907318115,
"learning_rate": 1.989403088774379e-05,
"loss": 0.4192931652069092,
"step": 112
},
{
"epoch": 0.1539509536784741,
"grad_norm": 0.545165479183197,
"learning_rate": 1.9890801276636226e-05,
"loss": 0.5133235454559326,
"step": 113
},
{
"epoch": 0.1553133514986376,
"grad_norm": 0.4873102307319641,
"learning_rate": 1.9887523457292145e-05,
"loss": 0.45365816354751587,
"step": 114
},
{
"epoch": 0.1566757493188011,
"grad_norm": 0.45992809534072876,
"learning_rate": 1.9884197445687795e-05,
"loss": 0.4125695824623108,
"step": 115
},
{
"epoch": 0.15803814713896458,
"grad_norm": 0.489008367061615,
"learning_rate": 1.9880823258034317e-05,
"loss": 0.45539143681526184,
"step": 116
},
{
"epoch": 0.15940054495912806,
"grad_norm": 0.45899710059165955,
"learning_rate": 1.987740091077766e-05,
"loss": 0.3859185576438904,
"step": 117
},
{
"epoch": 0.16076294277929154,
"grad_norm": 0.48963943123817444,
"learning_rate": 1.9873930420598508e-05,
"loss": 0.4098602831363678,
"step": 118
},
{
"epoch": 0.16212534059945505,
"grad_norm": 0.5609935522079468,
"learning_rate": 1.9870411804412196e-05,
"loss": 0.541596531867981,
"step": 119
},
{
"epoch": 0.16348773841961853,
"grad_norm": 0.451092392206192,
"learning_rate": 1.9866845079368628e-05,
"loss": 0.369413822889328,
"step": 120
},
{
"epoch": 0.164850136239782,
"grad_norm": 0.5160583257675171,
"learning_rate": 1.9863230262852188e-05,
"loss": 0.48153048753738403,
"step": 121
},
{
"epoch": 0.16621253405994552,
"grad_norm": 0.5104784369468689,
"learning_rate": 1.9859567372481666e-05,
"loss": 0.463962197303772,
"step": 122
},
{
"epoch": 0.167574931880109,
"grad_norm": 0.4819332957267761,
"learning_rate": 1.9855856426110163e-05,
"loss": 0.4457293152809143,
"step": 123
},
{
"epoch": 0.16893732970027248,
"grad_norm": 0.545768678188324,
"learning_rate": 1.9852097441825017e-05,
"loss": 0.4510464072227478,
"step": 124
},
{
"epoch": 0.17029972752043596,
"grad_norm": 0.46205708384513855,
"learning_rate": 1.9848290437947683e-05,
"loss": 0.4033975303173065,
"step": 125
},
{
"epoch": 0.17166212534059946,
"grad_norm": 0.4812323749065399,
"learning_rate": 1.9844435433033687e-05,
"loss": 0.42786744236946106,
"step": 126
},
{
"epoch": 0.17302452316076294,
"grad_norm": 0.5063855051994324,
"learning_rate": 1.9840532445872504e-05,
"loss": 0.439453125,
"step": 127
},
{
"epoch": 0.17438692098092642,
"grad_norm": 0.42103609442710876,
"learning_rate": 1.983658149548748e-05,
"loss": 0.372048556804657,
"step": 128
},
{
"epoch": 0.17574931880108993,
"grad_norm": 0.4785323739051819,
"learning_rate": 1.9832582601135737e-05,
"loss": 0.4088958501815796,
"step": 129
},
{
"epoch": 0.1771117166212534,
"grad_norm": 0.4796772301197052,
"learning_rate": 1.9828535782308074e-05,
"loss": 0.4392489492893219,
"step": 130
},
{
"epoch": 0.1784741144414169,
"grad_norm": 0.5020930767059326,
"learning_rate": 1.9824441058728882e-05,
"loss": 0.4506283402442932,
"step": 131
},
{
"epoch": 0.17983651226158037,
"grad_norm": 0.44487935304641724,
"learning_rate": 1.9820298450356036e-05,
"loss": 0.37223148345947266,
"step": 132
},
{
"epoch": 0.18119891008174388,
"grad_norm": 0.48866400122642517,
"learning_rate": 1.9816107977380805e-05,
"loss": 0.4460309147834778,
"step": 133
},
{
"epoch": 0.18256130790190736,
"grad_norm": 0.4672461450099945,
"learning_rate": 1.9811869660227757e-05,
"loss": 0.4269269108772278,
"step": 134
},
{
"epoch": 0.18392370572207084,
"grad_norm": 0.49336063861846924,
"learning_rate": 1.980758351955465e-05,
"loss": 0.4544104039669037,
"step": 135
},
{
"epoch": 0.18528610354223432,
"grad_norm": 0.45952633023262024,
"learning_rate": 1.9803249576252338e-05,
"loss": 0.38991525769233704,
"step": 136
},
{
"epoch": 0.18664850136239783,
"grad_norm": 0.48544129729270935,
"learning_rate": 1.979886785144467e-05,
"loss": 0.42076367139816284,
"step": 137
},
{
"epoch": 0.1880108991825613,
"grad_norm": 0.38183560967445374,
"learning_rate": 1.9794438366488377e-05,
"loss": 0.2765650153160095,
"step": 138
},
{
"epoch": 0.1893732970027248,
"grad_norm": 0.45844128727912903,
"learning_rate": 1.9789961142972983e-05,
"loss": 0.3620038628578186,
"step": 139
},
{
"epoch": 0.1907356948228883,
"grad_norm": 0.4732954502105713,
"learning_rate": 1.9785436202720687e-05,
"loss": 0.40507155656814575,
"step": 140
},
{
"epoch": 0.19209809264305178,
"grad_norm": 0.4901241362094879,
"learning_rate": 1.978086356778626e-05,
"loss": 0.4239882826805115,
"step": 141
},
{
"epoch": 0.19346049046321526,
"grad_norm": 0.4907649755477905,
"learning_rate": 1.9776243260456953e-05,
"loss": 0.4591466188430786,
"step": 142
},
{
"epoch": 0.19482288828337874,
"grad_norm": 0.4430483877658844,
"learning_rate": 1.977157530325235e-05,
"loss": 0.4003378450870514,
"step": 143
},
{
"epoch": 0.19618528610354224,
"grad_norm": 0.4529455006122589,
"learning_rate": 1.976685971892431e-05,
"loss": 0.4060153663158417,
"step": 144
},
{
"epoch": 0.19754768392370572,
"grad_norm": 0.500708281993866,
"learning_rate": 1.9762096530456803e-05,
"loss": 0.4507729411125183,
"step": 145
},
{
"epoch": 0.1989100817438692,
"grad_norm": 0.4838204085826874,
"learning_rate": 1.9757285761065846e-05,
"loss": 0.4445388913154602,
"step": 146
},
{
"epoch": 0.20027247956403268,
"grad_norm": 0.47918063402175903,
"learning_rate": 1.9752427434199356e-05,
"loss": 0.4199233651161194,
"step": 147
},
{
"epoch": 0.2016348773841962,
"grad_norm": 0.4424859881401062,
"learning_rate": 1.9747521573537048e-05,
"loss": 0.3845542073249817,
"step": 148
},
{
"epoch": 0.20299727520435967,
"grad_norm": 0.5027382373809814,
"learning_rate": 1.974256820299032e-05,
"loss": 0.4608474373817444,
"step": 149
},
{
"epoch": 0.20435967302452315,
"grad_norm": 0.4017482399940491,
"learning_rate": 1.9737567346702137e-05,
"loss": 0.35635966062545776,
"step": 150
},
{
"epoch": 0.20572207084468666,
"grad_norm": 0.4682810604572296,
"learning_rate": 1.973251902904691e-05,
"loss": 0.3920828104019165,
"step": 151
},
{
"epoch": 0.20708446866485014,
"grad_norm": 0.4764021337032318,
"learning_rate": 1.9727423274630385e-05,
"loss": 0.43593406677246094,
"step": 152
},
{
"epoch": 0.20844686648501362,
"grad_norm": 0.5115736126899719,
"learning_rate": 1.97222801082895e-05,
"loss": 0.4738002419471741,
"step": 153
},
{
"epoch": 0.2098092643051771,
"grad_norm": 0.4896221458911896,
"learning_rate": 1.9717089555092306e-05,
"loss": 0.4509856104850769,
"step": 154
},
{
"epoch": 0.2111716621253406,
"grad_norm": 0.4766773581504822,
"learning_rate": 1.97118516403378e-05,
"loss": 0.44380685687065125,
"step": 155
},
{
"epoch": 0.2125340599455041,
"grad_norm": 0.5250641107559204,
"learning_rate": 1.9706566389555825e-05,
"loss": 0.5936962962150574,
"step": 156
},
{
"epoch": 0.21389645776566757,
"grad_norm": 0.4729316830635071,
"learning_rate": 1.970123382850695e-05,
"loss": 0.4068388342857361,
"step": 157
},
{
"epoch": 0.21525885558583105,
"grad_norm": 0.4348066449165344,
"learning_rate": 1.969585398318233e-05,
"loss": 0.3906315267086029,
"step": 158
},
{
"epoch": 0.21662125340599456,
"grad_norm": 0.4758957624435425,
"learning_rate": 1.969042687980359e-05,
"loss": 0.4242008924484253,
"step": 159
},
{
"epoch": 0.21798365122615804,
"grad_norm": 0.49115386605262756,
"learning_rate": 1.9684952544822685e-05,
"loss": 0.44710224866867065,
"step": 160
},
{
"epoch": 0.21934604904632152,
"grad_norm": 0.506231963634491,
"learning_rate": 1.9679431004921788e-05,
"loss": 0.49800950288772583,
"step": 161
},
{
"epoch": 0.22070844686648503,
"grad_norm": 0.4652118682861328,
"learning_rate": 1.9673862287013144e-05,
"loss": 0.397954523563385,
"step": 162
},
{
"epoch": 0.2220708446866485,
"grad_norm": 0.4163675010204315,
"learning_rate": 1.9668246418238955e-05,
"loss": 0.33537113666534424,
"step": 163
},
{
"epoch": 0.22343324250681199,
"grad_norm": 0.46636438369750977,
"learning_rate": 1.9662583425971227e-05,
"loss": 0.4491173028945923,
"step": 164
},
{
"epoch": 0.22479564032697547,
"grad_norm": 0.471574068069458,
"learning_rate": 1.9656873337811658e-05,
"loss": 0.4455568790435791,
"step": 165
},
{
"epoch": 0.22615803814713897,
"grad_norm": 0.42547208070755005,
"learning_rate": 1.9651116181591493e-05,
"loss": 0.33267468214035034,
"step": 166
},
{
"epoch": 0.22752043596730245,
"grad_norm": 0.46092942357063293,
"learning_rate": 1.9645311985371374e-05,
"loss": 0.38794922828674316,
"step": 167
},
{
"epoch": 0.22888283378746593,
"grad_norm": 0.4635142683982849,
"learning_rate": 1.9639460777441243e-05,
"loss": 0.41988471150398254,
"step": 168
},
{
"epoch": 0.23024523160762944,
"grad_norm": 0.5089847445487976,
"learning_rate": 1.9633562586320157e-05,
"loss": 0.4452226758003235,
"step": 169
},
{
"epoch": 0.23160762942779292,
"grad_norm": 0.42120999097824097,
"learning_rate": 1.962761744075618e-05,
"loss": 0.34265103936195374,
"step": 170
},
{
"epoch": 0.2329700272479564,
"grad_norm": 0.37199750542640686,
"learning_rate": 1.9621625369726246e-05,
"loss": 0.2743265926837921,
"step": 171
},
{
"epoch": 0.23433242506811988,
"grad_norm": 0.45613083243370056,
"learning_rate": 1.961558640243598e-05,
"loss": 0.3892587423324585,
"step": 172
},
{
"epoch": 0.2356948228882834,
"grad_norm": 0.5023411512374878,
"learning_rate": 1.9609500568319605e-05,
"loss": 0.4512037932872772,
"step": 173
},
{
"epoch": 0.23705722070844687,
"grad_norm": 0.4689593017101288,
"learning_rate": 1.960336789703977e-05,
"loss": 0.4525565207004547,
"step": 174
},
{
"epoch": 0.23841961852861035,
"grad_norm": 0.4769952595233917,
"learning_rate": 1.9597188418487395e-05,
"loss": 0.4317702651023865,
"step": 175
},
{
"epoch": 0.23978201634877383,
"grad_norm": 0.44649723172187805,
"learning_rate": 1.959096216278156e-05,
"loss": 0.4062075912952423,
"step": 176
},
{
"epoch": 0.24114441416893734,
"grad_norm": 0.4167250096797943,
"learning_rate": 1.958468916026933e-05,
"loss": 0.3360878825187683,
"step": 177
},
{
"epoch": 0.24250681198910082,
"grad_norm": 0.501832127571106,
"learning_rate": 1.957836944152562e-05,
"loss": 0.4890025854110718,
"step": 178
},
{
"epoch": 0.2438692098092643,
"grad_norm": 0.4422992169857025,
"learning_rate": 1.957200303735304e-05,
"loss": 0.3915623128414154,
"step": 179
},
{
"epoch": 0.2452316076294278,
"grad_norm": 0.43513938784599304,
"learning_rate": 1.9565589978781747e-05,
"loss": 0.3817967474460602,
"step": 180
},
{
"epoch": 0.24659400544959129,
"grad_norm": 0.5025864839553833,
"learning_rate": 1.955913029706929e-05,
"loss": 0.47003644704818726,
"step": 181
},
{
"epoch": 0.24795640326975477,
"grad_norm": 0.481099009513855,
"learning_rate": 1.9552624023700472e-05,
"loss": 0.4433462917804718,
"step": 182
},
{
"epoch": 0.24931880108991825,
"grad_norm": 0.5063067078590393,
"learning_rate": 1.9546071190387175e-05,
"loss": 0.41345930099487305,
"step": 183
},
{
"epoch": 0.2506811989100817,
"grad_norm": 0.46571362018585205,
"learning_rate": 1.953947182906822e-05,
"loss": 0.4099092483520508,
"step": 184
},
{
"epoch": 0.25204359673024523,
"grad_norm": 0.43692854046821594,
"learning_rate": 1.953282597190921e-05,
"loss": 0.34167295694351196,
"step": 185
},
{
"epoch": 0.25340599455040874,
"grad_norm": 0.46433547139167786,
"learning_rate": 1.9526133651302372e-05,
"loss": 0.4195370674133301,
"step": 186
},
{
"epoch": 0.2547683923705722,
"grad_norm": 0.4526609778404236,
"learning_rate": 1.951939489986639e-05,
"loss": 0.4328376352787018,
"step": 187
},
{
"epoch": 0.2561307901907357,
"grad_norm": 0.4832986891269684,
"learning_rate": 1.951260975044626e-05,
"loss": 0.42811205983161926,
"step": 188
},
{
"epoch": 0.2574931880108992,
"grad_norm": 0.46561217308044434,
"learning_rate": 1.950577823611313e-05,
"loss": 0.4136439561843872,
"step": 189
},
{
"epoch": 0.25885558583106266,
"grad_norm": 0.7555133700370789,
"learning_rate": 1.9498900390164118e-05,
"loss": 0.44394075870513916,
"step": 190
},
{
"epoch": 0.26021798365122617,
"grad_norm": 0.49045872688293457,
"learning_rate": 1.949197624612218e-05,
"loss": 0.47819724678993225,
"step": 191
},
{
"epoch": 0.2615803814713896,
"grad_norm": 0.4706166386604309,
"learning_rate": 1.9485005837735918e-05,
"loss": 0.4488511383533478,
"step": 192
},
{
"epoch": 0.26294277929155313,
"grad_norm": 0.4559481143951416,
"learning_rate": 1.947798919897944e-05,
"loss": 0.35875898599624634,
"step": 193
},
{
"epoch": 0.26430517711171664,
"grad_norm": 0.459917277097702,
"learning_rate": 1.947092636405217e-05,
"loss": 0.43740493059158325,
"step": 194
},
{
"epoch": 0.2656675749318801,
"grad_norm": 0.48496246337890625,
"learning_rate": 1.946381736737871e-05,
"loss": 0.4613405466079712,
"step": 195
},
{
"epoch": 0.2670299727520436,
"grad_norm": 0.5405692458152771,
"learning_rate": 1.9456662243608643e-05,
"loss": 0.5320178270339966,
"step": 196
},
{
"epoch": 0.2683923705722071,
"grad_norm": 0.4415741264820099,
"learning_rate": 1.9449461027616382e-05,
"loss": 0.376873642206192,
"step": 197
},
{
"epoch": 0.26975476839237056,
"grad_norm": 0.4813419282436371,
"learning_rate": 1.9442213754501002e-05,
"loss": 0.4509425759315491,
"step": 198
},
{
"epoch": 0.27111716621253407,
"grad_norm": 0.511968731880188,
"learning_rate": 1.9434920459586054e-05,
"loss": 0.5076147317886353,
"step": 199
},
{
"epoch": 0.2724795640326976,
"grad_norm": 0.4836997091770172,
"learning_rate": 1.9427581178419408e-05,
"loss": 0.44785064458847046,
"step": 200
},
{
"epoch": 0.273841961852861,
"grad_norm": 0.4130904972553253,
"learning_rate": 1.9420195946773063e-05,
"loss": 0.36012643575668335,
"step": 201
},
{
"epoch": 0.27520435967302453,
"grad_norm": 0.4587012827396393,
"learning_rate": 1.9412764800643e-05,
"loss": 0.45600613951683044,
"step": 202
},
{
"epoch": 0.276566757493188,
"grad_norm": 0.4676440954208374,
"learning_rate": 1.940528777624897e-05,
"loss": 0.4534785747528076,
"step": 203
},
{
"epoch": 0.2779291553133515,
"grad_norm": 0.5145980715751648,
"learning_rate": 1.939776491003435e-05,
"loss": 0.48491308093070984,
"step": 204
},
{
"epoch": 0.279291553133515,
"grad_norm": 0.41034233570098877,
"learning_rate": 1.9390196238665944e-05,
"loss": 0.34817013144493103,
"step": 205
},
{
"epoch": 0.28065395095367845,
"grad_norm": 0.5155647397041321,
"learning_rate": 1.9382581799033824e-05,
"loss": 0.5011834502220154,
"step": 206
},
{
"epoch": 0.28201634877384196,
"grad_norm": 0.4596708416938782,
"learning_rate": 1.9374921628251127e-05,
"loss": 0.39814120531082153,
"step": 207
},
{
"epoch": 0.28337874659400547,
"grad_norm": 0.4686714708805084,
"learning_rate": 1.936721576365389e-05,
"loss": 0.48343995213508606,
"step": 208
},
{
"epoch": 0.2847411444141689,
"grad_norm": 0.47685396671295166,
"learning_rate": 1.935946424280087e-05,
"loss": 0.3693901002407074,
"step": 209
},
{
"epoch": 0.28610354223433243,
"grad_norm": 0.46070703864097595,
"learning_rate": 1.935166710347334e-05,
"loss": 0.40199288725852966,
"step": 210
},
{
"epoch": 0.28746594005449594,
"grad_norm": 0.42680197954177856,
"learning_rate": 1.9343824383674936e-05,
"loss": 0.39812421798706055,
"step": 211
},
{
"epoch": 0.2888283378746594,
"grad_norm": 0.44704172015190125,
"learning_rate": 1.9335936121631442e-05,
"loss": 0.36939626932144165,
"step": 212
},
{
"epoch": 0.2901907356948229,
"grad_norm": 0.39125990867614746,
"learning_rate": 1.9328002355790624e-05,
"loss": 0.332170695066452,
"step": 213
},
{
"epoch": 0.29155313351498635,
"grad_norm": 0.4364356994628906,
"learning_rate": 1.9320023124822035e-05,
"loss": 0.3795939087867737,
"step": 214
},
{
"epoch": 0.29291553133514986,
"grad_norm": 0.427299827337265,
"learning_rate": 1.931199846761683e-05,
"loss": 0.3539160192012787,
"step": 215
},
{
"epoch": 0.29427792915531337,
"grad_norm": 0.3824451267719269,
"learning_rate": 1.9303928423287568e-05,
"loss": 0.33199343085289,
"step": 216
},
{
"epoch": 0.2956403269754768,
"grad_norm": 0.4601697027683258,
"learning_rate": 1.929581303116803e-05,
"loss": 0.4479762017726898,
"step": 217
},
{
"epoch": 0.2970027247956403,
"grad_norm": 0.4551186263561249,
"learning_rate": 1.9287652330813024e-05,
"loss": 0.4017976224422455,
"step": 218
},
{
"epoch": 0.29836512261580383,
"grad_norm": 0.4389873445034027,
"learning_rate": 1.9279446361998188e-05,
"loss": 0.3540688157081604,
"step": 219
},
{
"epoch": 0.2997275204359673,
"grad_norm": 0.45103585720062256,
"learning_rate": 1.927119516471981e-05,
"loss": 0.4199802875518799,
"step": 220
},
{
"epoch": 0.3010899182561308,
"grad_norm": 0.4221908450126648,
"learning_rate": 1.9262898779194613e-05,
"loss": 0.36206182837486267,
"step": 221
},
{
"epoch": 0.3024523160762943,
"grad_norm": 0.46009913086891174,
"learning_rate": 1.9254557245859583e-05,
"loss": 0.4081338047981262,
"step": 222
},
{
"epoch": 0.30381471389645776,
"grad_norm": 0.46696937084198,
"learning_rate": 1.924617060537175e-05,
"loss": 0.36600178480148315,
"step": 223
},
{
"epoch": 0.30517711171662126,
"grad_norm": 0.44915148615837097,
"learning_rate": 1.9237738898607992e-05,
"loss": 0.3773924708366394,
"step": 224
},
{
"epoch": 0.3065395095367847,
"grad_norm": 0.40148746967315674,
"learning_rate": 1.9229262166664854e-05,
"loss": 0.29502028226852417,
"step": 225
},
{
"epoch": 0.3079019073569482,
"grad_norm": 0.3726682960987091,
"learning_rate": 1.9220740450858328e-05,
"loss": 0.3030723035335541,
"step": 226
},
{
"epoch": 0.30926430517711173,
"grad_norm": 0.5065223574638367,
"learning_rate": 1.921217379272367e-05,
"loss": 0.4941141605377197,
"step": 227
},
{
"epoch": 0.3106267029972752,
"grad_norm": 0.4267047941684723,
"learning_rate": 1.9203562234015172e-05,
"loss": 0.35858261585235596,
"step": 228
},
{
"epoch": 0.3119891008174387,
"grad_norm": 0.38982680439949036,
"learning_rate": 1.9194905816705988e-05,
"loss": 0.3197462856769562,
"step": 229
},
{
"epoch": 0.3133514986376022,
"grad_norm": 0.44453105330467224,
"learning_rate": 1.91862045829879e-05,
"loss": 0.40640050172805786,
"step": 230
},
{
"epoch": 0.31471389645776565,
"grad_norm": 0.492948442697525,
"learning_rate": 1.9177458575271143e-05,
"loss": 0.4595552682876587,
"step": 231
},
{
"epoch": 0.31607629427792916,
"grad_norm": 0.4012284278869629,
"learning_rate": 1.916866783618417e-05,
"loss": 0.3264189660549164,
"step": 232
},
{
"epoch": 0.31743869209809267,
"grad_norm": 0.4652048647403717,
"learning_rate": 1.9159832408573467e-05,
"loss": 0.4094054698944092,
"step": 233
},
{
"epoch": 0.3188010899182561,
"grad_norm": 0.46388110518455505,
"learning_rate": 1.9150952335503325e-05,
"loss": 0.4324240982532501,
"step": 234
},
{
"epoch": 0.3201634877384196,
"grad_norm": 0.44346991181373596,
"learning_rate": 1.9142027660255645e-05,
"loss": 0.369159996509552,
"step": 235
},
{
"epoch": 0.3215258855585831,
"grad_norm": 0.42788419127464294,
"learning_rate": 1.9133058426329717e-05,
"loss": 0.37542596459388733,
"step": 236
},
{
"epoch": 0.3228882833787466,
"grad_norm": 0.41329479217529297,
"learning_rate": 1.912404467744202e-05,
"loss": 0.36349016427993774,
"step": 237
},
{
"epoch": 0.3242506811989101,
"grad_norm": 0.47673869132995605,
"learning_rate": 1.911498645752599e-05,
"loss": 0.4231317937374115,
"step": 238
},
{
"epoch": 0.32561307901907355,
"grad_norm": 0.45236167311668396,
"learning_rate": 1.9105883810731822e-05,
"loss": 0.41860949993133545,
"step": 239
},
{
"epoch": 0.32697547683923706,
"grad_norm": 0.47766655683517456,
"learning_rate": 1.9096736781426252e-05,
"loss": 0.45107653737068176,
"step": 240
},
{
"epoch": 0.32833787465940056,
"grad_norm": 0.4172971546649933,
"learning_rate": 1.9087545414192338e-05,
"loss": 0.36643242835998535,
"step": 241
},
{
"epoch": 0.329700272479564,
"grad_norm": 0.4231413006782532,
"learning_rate": 1.907830975382924e-05,
"loss": 0.3520575165748596,
"step": 242
},
{
"epoch": 0.3310626702997275,
"grad_norm": 0.4126909375190735,
"learning_rate": 1.9069029845352006e-05,
"loss": 0.36405885219573975,
"step": 243
},
{
"epoch": 0.33242506811989103,
"grad_norm": 0.42022719979286194,
"learning_rate": 1.9059705733991352e-05,
"loss": 0.3583207130432129,
"step": 244
},
{
"epoch": 0.3337874659400545,
"grad_norm": 0.4531959593296051,
"learning_rate": 1.9050337465193443e-05,
"loss": 0.38180652260780334,
"step": 245
},
{
"epoch": 0.335149863760218,
"grad_norm": 0.42306748032569885,
"learning_rate": 1.9040925084619663e-05,
"loss": 0.3619072437286377,
"step": 246
},
{
"epoch": 0.33651226158038144,
"grad_norm": 0.46138980984687805,
"learning_rate": 1.9031468638146408e-05,
"loss": 0.38578206300735474,
"step": 247
},
{
"epoch": 0.33787465940054495,
"grad_norm": 0.43929317593574524,
"learning_rate": 1.9021968171864843e-05,
"loss": 0.4235476851463318,
"step": 248
},
{
"epoch": 0.33923705722070846,
"grad_norm": 0.401759535074234,
"learning_rate": 1.90124237320807e-05,
"loss": 0.34792521595954895,
"step": 249
},
{
"epoch": 0.3405994550408719,
"grad_norm": 0.478180468082428,
"learning_rate": 1.900283536531403e-05,
"loss": 0.4711982011795044,
"step": 250
},
{
"epoch": 0.3419618528610354,
"grad_norm": 0.43466177582740784,
"learning_rate": 1.8993203118298988e-05,
"loss": 0.36447733640670776,
"step": 251
},
{
"epoch": 0.34332425068119893,
"grad_norm": 0.44407346844673157,
"learning_rate": 1.8983527037983606e-05,
"loss": 0.38788866996765137,
"step": 252
},
{
"epoch": 0.3446866485013624,
"grad_norm": 0.4327251613140106,
"learning_rate": 1.8973807171529556e-05,
"loss": 0.38622599840164185,
"step": 253
},
{
"epoch": 0.3460490463215259,
"grad_norm": 0.44564682245254517,
"learning_rate": 1.8964043566311942e-05,
"loss": 0.3841802477836609,
"step": 254
},
{
"epoch": 0.3474114441416894,
"grad_norm": 0.4468725621700287,
"learning_rate": 1.8954236269919026e-05,
"loss": 0.3644585609436035,
"step": 255
},
{
"epoch": 0.34877384196185285,
"grad_norm": 0.4858334958553314,
"learning_rate": 1.8944385330152047e-05,
"loss": 0.4247015118598938,
"step": 256
},
{
"epoch": 0.35013623978201636,
"grad_norm": 0.4670877456665039,
"learning_rate": 1.893449079502495e-05,
"loss": 0.41649121046066284,
"step": 257
},
{
"epoch": 0.35149863760217986,
"grad_norm": 0.42309293150901794,
"learning_rate": 1.892455271276418e-05,
"loss": 0.3775060772895813,
"step": 258
},
{
"epoch": 0.3528610354223433,
"grad_norm": 0.42874789237976074,
"learning_rate": 1.8914571131808407e-05,
"loss": 0.40018612146377563,
"step": 259
},
{
"epoch": 0.3542234332425068,
"grad_norm": 0.45877915620803833,
"learning_rate": 1.8904546100808346e-05,
"loss": 0.4454330801963806,
"step": 260
},
{
"epoch": 0.3555858310626703,
"grad_norm": 0.4848128855228424,
"learning_rate": 1.889447766862647e-05,
"loss": 0.4608079791069031,
"step": 261
},
{
"epoch": 0.3569482288828338,
"grad_norm": 0.38830137252807617,
"learning_rate": 1.8884365884336796e-05,
"loss": 0.34983474016189575,
"step": 262
},
{
"epoch": 0.3583106267029973,
"grad_norm": 0.45347243547439575,
"learning_rate": 1.8874210797224646e-05,
"loss": 0.37533241510391235,
"step": 263
},
{
"epoch": 0.35967302452316074,
"grad_norm": 0.4399562478065491,
"learning_rate": 1.8864012456786397e-05,
"loss": 0.3630625903606415,
"step": 264
},
{
"epoch": 0.36103542234332425,
"grad_norm": 0.4255392849445343,
"learning_rate": 1.8853770912729243e-05,
"loss": 0.37027978897094727,
"step": 265
},
{
"epoch": 0.36239782016348776,
"grad_norm": 0.423592746257782,
"learning_rate": 1.884348621497096e-05,
"loss": 0.37999391555786133,
"step": 266
},
{
"epoch": 0.3637602179836512,
"grad_norm": 0.4627998173236847,
"learning_rate": 1.8833158413639656e-05,
"loss": 0.43481695652008057,
"step": 267
},
{
"epoch": 0.3651226158038147,
"grad_norm": 0.4129141867160797,
"learning_rate": 1.8822787559073522e-05,
"loss": 0.35913515090942383,
"step": 268
},
{
"epoch": 0.36648501362397823,
"grad_norm": 0.45795056223869324,
"learning_rate": 1.8812373701820603e-05,
"loss": 0.3965306878089905,
"step": 269
},
{
"epoch": 0.3678474114441417,
"grad_norm": 0.40726739168167114,
"learning_rate": 1.8801916892638533e-05,
"loss": 0.3417432904243469,
"step": 270
},
{
"epoch": 0.3692098092643052,
"grad_norm": 0.4479921758174896,
"learning_rate": 1.8791417182494296e-05,
"loss": 0.4090738594532013,
"step": 271
},
{
"epoch": 0.37057220708446864,
"grad_norm": 0.380698561668396,
"learning_rate": 1.878087462256398e-05,
"loss": 0.31110888719558716,
"step": 272
},
{
"epoch": 0.37193460490463215,
"grad_norm": 0.4521123468875885,
"learning_rate": 1.8770289264232526e-05,
"loss": 0.414950966835022,
"step": 273
},
{
"epoch": 0.37329700272479566,
"grad_norm": 0.4697805643081665,
"learning_rate": 1.875966115909347e-05,
"loss": 0.42033064365386963,
"step": 274
},
{
"epoch": 0.3746594005449591,
"grad_norm": 0.4483690857887268,
"learning_rate": 1.8748990358948713e-05,
"loss": 0.3845226466655731,
"step": 275
},
{
"epoch": 0.3760217983651226,
"grad_norm": 0.4191807210445404,
"learning_rate": 1.8738276915808232e-05,
"loss": 0.3214520514011383,
"step": 276
},
{
"epoch": 0.3773841961852861,
"grad_norm": 0.42148375511169434,
"learning_rate": 1.8727520881889865e-05,
"loss": 0.3698553442955017,
"step": 277
},
{
"epoch": 0.3787465940054496,
"grad_norm": 0.42890867590904236,
"learning_rate": 1.8716722309619033e-05,
"loss": 0.35935938358306885,
"step": 278
},
{
"epoch": 0.3801089918256131,
"grad_norm": 0.4575975835323334,
"learning_rate": 1.870588125162849e-05,
"loss": 0.3697071075439453,
"step": 279
},
{
"epoch": 0.3814713896457766,
"grad_norm": 0.46115341782569885,
"learning_rate": 1.8694997760758073e-05,
"loss": 0.3968576192855835,
"step": 280
},
{
"epoch": 0.38283378746594005,
"grad_norm": 0.494211882352829,
"learning_rate": 1.8684071890054425e-05,
"loss": 0.44661998748779297,
"step": 281
},
{
"epoch": 0.38419618528610355,
"grad_norm": 0.4422100782394409,
"learning_rate": 1.8673103692770772e-05,
"loss": 0.39138174057006836,
"step": 282
},
{
"epoch": 0.385558583106267,
"grad_norm": 0.39777207374572754,
"learning_rate": 1.8662093222366623e-05,
"loss": 0.34542185068130493,
"step": 283
},
{
"epoch": 0.3869209809264305,
"grad_norm": 0.38417258858680725,
"learning_rate": 1.8651040532507538e-05,
"loss": 0.3356142044067383,
"step": 284
},
{
"epoch": 0.388283378746594,
"grad_norm": 0.46102169156074524,
"learning_rate": 1.863994567706485e-05,
"loss": 0.430128276348114,
"step": 285
},
{
"epoch": 0.3896457765667575,
"grad_norm": 0.4118143618106842,
"learning_rate": 1.8628808710115417e-05,
"loss": 0.34772276878356934,
"step": 286
},
{
"epoch": 0.391008174386921,
"grad_norm": 0.39578622579574585,
"learning_rate": 1.861762968594135e-05,
"loss": 0.32895392179489136,
"step": 287
},
{
"epoch": 0.3923705722070845,
"grad_norm": 0.3824600875377655,
"learning_rate": 1.8606408659029736e-05,
"loss": 0.32151734828948975,
"step": 288
},
{
"epoch": 0.39373297002724794,
"grad_norm": 0.4184086322784424,
"learning_rate": 1.8595145684072398e-05,
"loss": 0.3639921247959137,
"step": 289
},
{
"epoch": 0.39509536784741145,
"grad_norm": 0.45187273621559143,
"learning_rate": 1.8583840815965614e-05,
"loss": 0.38925087451934814,
"step": 290
},
{
"epoch": 0.39645776566757496,
"grad_norm": 0.39254751801490784,
"learning_rate": 1.8572494109809852e-05,
"loss": 0.3342083692550659,
"step": 291
},
{
"epoch": 0.3978201634877384,
"grad_norm": 0.43775174021720886,
"learning_rate": 1.856110562090949e-05,
"loss": 0.41103285551071167,
"step": 292
},
{
"epoch": 0.3991825613079019,
"grad_norm": 0.4114571213722229,
"learning_rate": 1.8549675404772574e-05,
"loss": 0.36463573575019836,
"step": 293
},
{
"epoch": 0.40054495912806537,
"grad_norm": 0.4514394998550415,
"learning_rate": 1.853820351711052e-05,
"loss": 0.40414565801620483,
"step": 294
},
{
"epoch": 0.4019073569482289,
"grad_norm": 0.4267936944961548,
"learning_rate": 1.852669001383785e-05,
"loss": 0.39556679129600525,
"step": 295
},
{
"epoch": 0.4032697547683924,
"grad_norm": 0.3571447432041168,
"learning_rate": 1.8515134951071932e-05,
"loss": 0.27661603689193726,
"step": 296
},
{
"epoch": 0.40463215258855584,
"grad_norm": 0.4328432083129883,
"learning_rate": 1.8503538385132692e-05,
"loss": 0.3701832890510559,
"step": 297
},
{
"epoch": 0.40599455040871935,
"grad_norm": 0.3956933915615082,
"learning_rate": 1.849190037254234e-05,
"loss": 0.31413373351097107,
"step": 298
},
{
"epoch": 0.40735694822888285,
"grad_norm": 0.4569341540336609,
"learning_rate": 1.8480220970025114e-05,
"loss": 0.46876251697540283,
"step": 299
},
{
"epoch": 0.4087193460490463,
"grad_norm": 0.4352339208126068,
"learning_rate": 1.8468500234506965e-05,
"loss": 0.34557855129241943,
"step": 300
},
{
"epoch": 0.4100817438692098,
"grad_norm": 0.47778064012527466,
"learning_rate": 1.8456738223115325e-05,
"loss": 0.4523351490497589,
"step": 301
},
{
"epoch": 0.4114441416893733,
"grad_norm": 0.3924112021923065,
"learning_rate": 1.8444934993178796e-05,
"loss": 0.32918280363082886,
"step": 302
},
{
"epoch": 0.4128065395095368,
"grad_norm": 0.4721715748310089,
"learning_rate": 1.843309060222688e-05,
"loss": 0.4108656346797943,
"step": 303
},
{
"epoch": 0.4141689373297003,
"grad_norm": 0.4324953556060791,
"learning_rate": 1.8421205107989707e-05,
"loss": 0.39233145117759705,
"step": 304
},
{
"epoch": 0.41553133514986373,
"grad_norm": 0.4237790107727051,
"learning_rate": 1.8409278568397742e-05,
"loss": 0.33894914388656616,
"step": 305
},
{
"epoch": 0.41689373297002724,
"grad_norm": 0.3769596815109253,
"learning_rate": 1.83973110415815e-05,
"loss": 0.28818315267562866,
"step": 306
},
{
"epoch": 0.41825613079019075,
"grad_norm": 0.39678072929382324,
"learning_rate": 1.8385302585871284e-05,
"loss": 0.3248705565929413,
"step": 307
},
{
"epoch": 0.4196185286103542,
"grad_norm": 0.4702602028846741,
"learning_rate": 1.8373253259796877e-05,
"loss": 0.44057464599609375,
"step": 308
},
{
"epoch": 0.4209809264305177,
"grad_norm": 0.43663206696510315,
"learning_rate": 1.8361163122087265e-05,
"loss": 0.41709601879119873,
"step": 309
},
{
"epoch": 0.4223433242506812,
"grad_norm": 0.41904065012931824,
"learning_rate": 1.8349032231670363e-05,
"loss": 0.3891496956348419,
"step": 310
},
{
"epoch": 0.42370572207084467,
"grad_norm": 0.42954081296920776,
"learning_rate": 1.8336860647672702e-05,
"loss": 0.38407522439956665,
"step": 311
},
{
"epoch": 0.4250681198910082,
"grad_norm": 0.40387260913848877,
"learning_rate": 1.8324648429419164e-05,
"loss": 0.35146600008010864,
"step": 312
},
{
"epoch": 0.4264305177111717,
"grad_norm": 0.3923007845878601,
"learning_rate": 1.831239563643268e-05,
"loss": 0.3610236942768097,
"step": 313
},
{
"epoch": 0.42779291553133514,
"grad_norm": 0.8759288191795349,
"learning_rate": 1.8300102328433952e-05,
"loss": 0.4138031601905823,
"step": 314
},
{
"epoch": 0.42915531335149865,
"grad_norm": 0.4077308475971222,
"learning_rate": 1.8287768565341143e-05,
"loss": 0.3436448574066162,
"step": 315
},
{
"epoch": 0.4305177111716621,
"grad_norm": 0.4676551818847656,
"learning_rate": 1.82753944072696e-05,
"loss": 0.4458121061325073,
"step": 316
},
{
"epoch": 0.4318801089918256,
"grad_norm": 0.3823956847190857,
"learning_rate": 1.826297991453157e-05,
"loss": 0.32054227590560913,
"step": 317
},
{
"epoch": 0.4332425068119891,
"grad_norm": 0.42467930912971497,
"learning_rate": 1.8250525147635873e-05,
"loss": 0.3760542869567871,
"step": 318
},
{
"epoch": 0.43460490463215257,
"grad_norm": 0.4084574580192566,
"learning_rate": 1.8238030167287638e-05,
"loss": 0.36126622557640076,
"step": 319
},
{
"epoch": 0.4359673024523161,
"grad_norm": 0.3886258602142334,
"learning_rate": 1.8225495034387996e-05,
"loss": 0.324174702167511,
"step": 320
},
{
"epoch": 0.4373297002724796,
"grad_norm": 0.3744279146194458,
"learning_rate": 1.8212919810033777e-05,
"loss": 0.3609943985939026,
"step": 321
},
{
"epoch": 0.43869209809264303,
"grad_norm": 0.39002716541290283,
"learning_rate": 1.820030455551723e-05,
"loss": 0.33594828844070435,
"step": 322
},
{
"epoch": 0.44005449591280654,
"grad_norm": 0.4567135274410248,
"learning_rate": 1.8187649332325702e-05,
"loss": 0.4041770100593567,
"step": 323
},
{
"epoch": 0.44141689373297005,
"grad_norm": 0.40135565400123596,
"learning_rate": 1.8174954202141352e-05,
"loss": 0.3575393855571747,
"step": 324
},
{
"epoch": 0.4427792915531335,
"grad_norm": 0.45827990770339966,
"learning_rate": 1.8162219226840857e-05,
"loss": 0.414590448141098,
"step": 325
},
{
"epoch": 0.444141689373297,
"grad_norm": 0.49530652165412903,
"learning_rate": 1.814944446849508e-05,
"loss": 0.45791754126548767,
"step": 326
},
{
"epoch": 0.44550408719346046,
"grad_norm": 0.4364672601222992,
"learning_rate": 1.8136629989368815e-05,
"loss": 0.3820730447769165,
"step": 327
},
{
"epoch": 0.44686648501362397,
"grad_norm": 0.46344640851020813,
"learning_rate": 1.8123775851920438e-05,
"loss": 0.42946088314056396,
"step": 328
},
{
"epoch": 0.4482288828337875,
"grad_norm": 0.3821968138217926,
"learning_rate": 1.8110882118801633e-05,
"loss": 0.3248857855796814,
"step": 329
},
{
"epoch": 0.44959128065395093,
"grad_norm": 0.39740368723869324,
"learning_rate": 1.8097948852857054e-05,
"loss": 0.34824830293655396,
"step": 330
},
{
"epoch": 0.45095367847411444,
"grad_norm": 0.4666673541069031,
"learning_rate": 1.8084976117124072e-05,
"loss": 0.39515918493270874,
"step": 331
},
{
"epoch": 0.45231607629427795,
"grad_norm": 0.36016690731048584,
"learning_rate": 1.807196397483241e-05,
"loss": 0.2601340711116791,
"step": 332
},
{
"epoch": 0.4536784741144414,
"grad_norm": 0.3608089089393616,
"learning_rate": 1.8058912489403867e-05,
"loss": 0.2999919056892395,
"step": 333
},
{
"epoch": 0.4550408719346049,
"grad_norm": 0.42653337121009827,
"learning_rate": 1.804582172445201e-05,
"loss": 0.39884787797927856,
"step": 334
},
{
"epoch": 0.4564032697547684,
"grad_norm": 0.42624911665916443,
"learning_rate": 1.8032691743781853e-05,
"loss": 0.39197784662246704,
"step": 335
},
{
"epoch": 0.45776566757493187,
"grad_norm": 0.513020396232605,
"learning_rate": 1.8019522611389543e-05,
"loss": 0.4664883315563202,
"step": 336
},
{
"epoch": 0.4591280653950954,
"grad_norm": 0.3827018141746521,
"learning_rate": 1.8006314391462056e-05,
"loss": 0.3538336157798767,
"step": 337
},
{
"epoch": 0.4604904632152589,
"grad_norm": 0.3974727988243103,
"learning_rate": 1.799306714837689e-05,
"loss": 0.3404923379421234,
"step": 338
},
{
"epoch": 0.46185286103542234,
"grad_norm": 0.390813410282135,
"learning_rate": 1.7979780946701737e-05,
"loss": 0.3352108895778656,
"step": 339
},
{
"epoch": 0.46321525885558584,
"grad_norm": 0.395134299993515,
"learning_rate": 1.7966455851194178e-05,
"loss": 0.35127317905426025,
"step": 340
},
{
"epoch": 0.4645776566757493,
"grad_norm": 0.3822191059589386,
"learning_rate": 1.795309192680136e-05,
"loss": 0.3215380311012268,
"step": 341
},
{
"epoch": 0.4659400544959128,
"grad_norm": 0.38805821537971497,
"learning_rate": 1.7939689238659692e-05,
"loss": 0.3339554965496063,
"step": 342
},
{
"epoch": 0.4673024523160763,
"grad_norm": 0.4180435538291931,
"learning_rate": 1.792624785209451e-05,
"loss": 0.37479549646377563,
"step": 343
},
{
"epoch": 0.46866485013623976,
"grad_norm": 0.3718315362930298,
"learning_rate": 1.7912767832619776e-05,
"loss": 0.2911319136619568,
"step": 344
},
{
"epoch": 0.47002724795640327,
"grad_norm": 0.43831902742385864,
"learning_rate": 1.789924924593774e-05,
"loss": 0.388246089220047,
"step": 345
},
{
"epoch": 0.4713896457765668,
"grad_norm": 0.40909937024116516,
"learning_rate": 1.7885692157938646e-05,
"loss": 0.3607439398765564,
"step": 346
},
{
"epoch": 0.47275204359673023,
"grad_norm": 0.4436415433883667,
"learning_rate": 1.787209663470038e-05,
"loss": 0.4487878382205963,
"step": 347
},
{
"epoch": 0.47411444141689374,
"grad_norm": 0.4295803904533386,
"learning_rate": 1.7858462742488175e-05,
"loss": 0.4205361008644104,
"step": 348
},
{
"epoch": 0.47547683923705725,
"grad_norm": 0.3416333794593811,
"learning_rate": 1.7844790547754264e-05,
"loss": 0.2564454674720764,
"step": 349
},
{
"epoch": 0.4768392370572207,
"grad_norm": 0.3937414884567261,
"learning_rate": 1.7831080117137584e-05,
"loss": 0.3362025022506714,
"step": 350
},
{
"epoch": 0.4782016348773842,
"grad_norm": 0.44493257999420166,
"learning_rate": 1.781733151746342e-05,
"loss": 0.41151055693626404,
"step": 351
},
{
"epoch": 0.47956403269754766,
"grad_norm": 0.4470524489879608,
"learning_rate": 1.7803544815743107e-05,
"loss": 0.4086991548538208,
"step": 352
},
{
"epoch": 0.48092643051771117,
"grad_norm": 0.4161277115345001,
"learning_rate": 1.7789720079173682e-05,
"loss": 0.3814135193824768,
"step": 353
},
{
"epoch": 0.4822888283378747,
"grad_norm": 0.4259527027606964,
"learning_rate": 1.777585737513757e-05,
"loss": 0.39335864782333374,
"step": 354
},
{
"epoch": 0.48365122615803813,
"grad_norm": 0.44917598366737366,
"learning_rate": 1.7761956771202255e-05,
"loss": 0.41355639696121216,
"step": 355
},
{
"epoch": 0.48501362397820164,
"grad_norm": 0.45215553045272827,
"learning_rate": 1.7748018335119935e-05,
"loss": 0.42670729756355286,
"step": 356
},
{
"epoch": 0.48637602179836514,
"grad_norm": 0.43753278255462646,
"learning_rate": 1.7734042134827216e-05,
"loss": 0.39761465787887573,
"step": 357
},
{
"epoch": 0.4877384196185286,
"grad_norm": 0.42144203186035156,
"learning_rate": 1.772002823844476e-05,
"loss": 0.3738403916358948,
"step": 358
},
{
"epoch": 0.4891008174386921,
"grad_norm": 0.4108579456806183,
"learning_rate": 1.7705976714276976e-05,
"loss": 0.3864634037017822,
"step": 359
},
{
"epoch": 0.4904632152588556,
"grad_norm": 0.44009944796562195,
"learning_rate": 1.7691887630811653e-05,
"loss": 0.387514591217041,
"step": 360
},
{
"epoch": 0.49182561307901906,
"grad_norm": 0.42644309997558594,
"learning_rate": 1.7677761056719652e-05,
"loss": 0.38349243998527527,
"step": 361
},
{
"epoch": 0.49318801089918257,
"grad_norm": 0.4462713301181793,
"learning_rate": 1.7663597060854577e-05,
"loss": 0.43910130858421326,
"step": 362
},
{
"epoch": 0.494550408719346,
"grad_norm": 0.4547002911567688,
"learning_rate": 1.764939571225241e-05,
"loss": 0.4280800223350525,
"step": 363
},
{
"epoch": 0.49591280653950953,
"grad_norm": 0.3978780210018158,
"learning_rate": 1.763515708013121e-05,
"loss": 0.3507936894893646,
"step": 364
},
{
"epoch": 0.49727520435967304,
"grad_norm": 0.3963427245616913,
"learning_rate": 1.762088123389074e-05,
"loss": 0.37120676040649414,
"step": 365
},
{
"epoch": 0.4986376021798365,
"grad_norm": 0.4243077337741852,
"learning_rate": 1.760656824311216e-05,
"loss": 0.36520522832870483,
"step": 366
},
{
"epoch": 0.5,
"grad_norm": 0.44183287024497986,
"learning_rate": 1.7592218177557662e-05,
"loss": 0.42573392391204834,
"step": 367
},
{
"epoch": 0.5013623978201635,
"grad_norm": 0.4238261282444,
"learning_rate": 1.7577831107170157e-05,
"loss": 0.3345707952976227,
"step": 368
},
{
"epoch": 0.502724795640327,
"grad_norm": 0.4145262539386749,
"learning_rate": 1.7563407102072902e-05,
"loss": 0.3537534475326538,
"step": 369
},
{
"epoch": 0.5040871934604905,
"grad_norm": 0.4781807065010071,
"learning_rate": 1.7548946232569196e-05,
"loss": 0.4613100588321686,
"step": 370
},
{
"epoch": 0.5054495912806539,
"grad_norm": 0.383511483669281,
"learning_rate": 1.7534448569141997e-05,
"loss": 0.33608487248420715,
"step": 371
},
{
"epoch": 0.5068119891008175,
"grad_norm": 0.4402075409889221,
"learning_rate": 1.751991418245361e-05,
"loss": 0.4129033088684082,
"step": 372
},
{
"epoch": 0.5081743869209809,
"grad_norm": 0.5239101052284241,
"learning_rate": 1.7505343143345328e-05,
"loss": 0.45621195435523987,
"step": 373
},
{
"epoch": 0.5095367847411444,
"grad_norm": 0.4115491807460785,
"learning_rate": 1.749073552283709e-05,
"loss": 0.338983952999115,
"step": 374
},
{
"epoch": 0.510899182561308,
"grad_norm": 0.4104604423046112,
"learning_rate": 1.7476091392127132e-05,
"loss": 0.34245091676712036,
"step": 375
},
{
"epoch": 0.5122615803814714,
"grad_norm": 0.43850037455558777,
"learning_rate": 1.746141082259165e-05,
"loss": 0.40123671293258667,
"step": 376
},
{
"epoch": 0.5136239782016349,
"grad_norm": 0.41533970832824707,
"learning_rate": 1.7446693885784435e-05,
"loss": 0.34971946477890015,
"step": 377
},
{
"epoch": 0.5149863760217984,
"grad_norm": 0.36809873580932617,
"learning_rate": 1.7431940653436538e-05,
"loss": 0.3055441379547119,
"step": 378
},
{
"epoch": 0.5163487738419619,
"grad_norm": 0.4054659605026245,
"learning_rate": 1.7417151197455915e-05,
"loss": 0.35166579484939575,
"step": 379
},
{
"epoch": 0.5177111716621253,
"grad_norm": 0.435969740152359,
"learning_rate": 1.740232558992708e-05,
"loss": 0.3930160403251648,
"step": 380
},
{
"epoch": 0.5190735694822888,
"grad_norm": 0.4018082916736603,
"learning_rate": 1.738746390311075e-05,
"loss": 0.3543049693107605,
"step": 381
},
{
"epoch": 0.5204359673024523,
"grad_norm": 0.4188288450241089,
"learning_rate": 1.7372566209443496e-05,
"loss": 0.37953218817710876,
"step": 382
},
{
"epoch": 0.5217983651226158,
"grad_norm": 0.4601037800312042,
"learning_rate": 1.735763258153739e-05,
"loss": 0.4313342571258545,
"step": 383
},
{
"epoch": 0.5231607629427792,
"grad_norm": 0.43152326345443726,
"learning_rate": 1.7342663092179636e-05,
"loss": 0.41218316555023193,
"step": 384
},
{
"epoch": 0.5245231607629428,
"grad_norm": 0.4087159037590027,
"learning_rate": 1.7327657814332247e-05,
"loss": 0.3378143906593323,
"step": 385
},
{
"epoch": 0.5258855585831063,
"grad_norm": 0.39226964116096497,
"learning_rate": 1.7312616821131657e-05,
"loss": 0.3294611871242523,
"step": 386
},
{
"epoch": 0.5272479564032697,
"grad_norm": 0.41971486806869507,
"learning_rate": 1.729754018588838e-05,
"loss": 0.37797778844833374,
"step": 387
},
{
"epoch": 0.5286103542234333,
"grad_norm": 0.4497551918029785,
"learning_rate": 1.728242798208666e-05,
"loss": 0.38471484184265137,
"step": 388
},
{
"epoch": 0.5299727520435967,
"grad_norm": 0.41503041982650757,
"learning_rate": 1.7267280283384104e-05,
"loss": 0.38631588220596313,
"step": 389
},
{
"epoch": 0.5313351498637602,
"grad_norm": 0.4341152310371399,
"learning_rate": 1.7252097163611304e-05,
"loss": 0.43717920780181885,
"step": 390
},
{
"epoch": 0.5326975476839237,
"grad_norm": 0.41421955823898315,
"learning_rate": 1.723687869677152e-05,
"loss": 0.3743841052055359,
"step": 391
},
{
"epoch": 0.5340599455040872,
"grad_norm": 0.39457762241363525,
"learning_rate": 1.7221624957040274e-05,
"loss": 0.3561673164367676,
"step": 392
},
{
"epoch": 0.5354223433242506,
"grad_norm": 0.38131183385849,
"learning_rate": 1.7206336018765026e-05,
"loss": 0.3137727975845337,
"step": 393
},
{
"epoch": 0.5367847411444142,
"grad_norm": 0.3871758282184601,
"learning_rate": 1.7191011956464788e-05,
"loss": 0.35666629672050476,
"step": 394
},
{
"epoch": 0.5381471389645777,
"grad_norm": 0.43537119030952454,
"learning_rate": 1.717565284482977e-05,
"loss": 0.42949140071868896,
"step": 395
},
{
"epoch": 0.5395095367847411,
"grad_norm": 0.4189457893371582,
"learning_rate": 1.7160258758721015e-05,
"loss": 0.396271288394928,
"step": 396
},
{
"epoch": 0.5408719346049047,
"grad_norm": 0.41827261447906494,
"learning_rate": 1.714482977317003e-05,
"loss": 0.4051450490951538,
"step": 397
},
{
"epoch": 0.5422343324250681,
"grad_norm": 0.38875138759613037,
"learning_rate": 1.7129365963378428e-05,
"loss": 0.3301708698272705,
"step": 398
},
{
"epoch": 0.5435967302452316,
"grad_norm": 0.42139869928359985,
"learning_rate": 1.711386740471755e-05,
"loss": 0.3770272433757782,
"step": 399
},
{
"epoch": 0.5449591280653951,
"grad_norm": 0.4148419499397278,
"learning_rate": 1.7098334172728112e-05,
"loss": 0.37180018424987793,
"step": 400
},
{
"epoch": 0.5463215258855586,
"grad_norm": 0.42452266812324524,
"learning_rate": 1.7082766343119822e-05,
"loss": 0.37390637397766113,
"step": 401
},
{
"epoch": 0.547683923705722,
"grad_norm": 0.4145396053791046,
"learning_rate": 1.706716399177103e-05,
"loss": 0.3574928939342499,
"step": 402
},
{
"epoch": 0.5490463215258855,
"grad_norm": 0.404379665851593,
"learning_rate": 1.7051527194728343e-05,
"loss": 0.3393360674381256,
"step": 403
},
{
"epoch": 0.5504087193460491,
"grad_norm": 0.4394095242023468,
"learning_rate": 1.703585602820624e-05,
"loss": 0.38446563482284546,
"step": 404
},
{
"epoch": 0.5517711171662125,
"grad_norm": 0.4012243449687958,
"learning_rate": 1.7020150568586743e-05,
"loss": 0.34150344133377075,
"step": 405
},
{
"epoch": 0.553133514986376,
"grad_norm": 0.43355002999305725,
"learning_rate": 1.7004410892419012e-05,
"loss": 0.3841056227684021,
"step": 406
},
{
"epoch": 0.5544959128065395,
"grad_norm": 0.4600158631801605,
"learning_rate": 1.698863707641897e-05,
"loss": 0.39545172452926636,
"step": 407
},
{
"epoch": 0.555858310626703,
"grad_norm": 0.473522424697876,
"learning_rate": 1.6972829197468958e-05,
"loss": 0.4410251975059509,
"step": 408
},
{
"epoch": 0.5572207084468664,
"grad_norm": 0.44125762581825256,
"learning_rate": 1.695698733261732e-05,
"loss": 0.3858538269996643,
"step": 409
},
{
"epoch": 0.55858310626703,
"grad_norm": 0.5052700042724609,
"learning_rate": 1.694111155907807e-05,
"loss": 0.505725622177124,
"step": 410
},
{
"epoch": 0.5599455040871935,
"grad_norm": 0.39854127168655396,
"learning_rate": 1.6925201954230474e-05,
"loss": 0.3284291625022888,
"step": 411
},
{
"epoch": 0.5613079019073569,
"grad_norm": 0.4676287770271301,
"learning_rate": 1.690925859561871e-05,
"loss": 0.42946767807006836,
"step": 412
},
{
"epoch": 0.5626702997275205,
"grad_norm": 0.4244855046272278,
"learning_rate": 1.689328156095147e-05,
"loss": 0.3870871663093567,
"step": 413
},
{
"epoch": 0.5640326975476839,
"grad_norm": 0.41849035024642944,
"learning_rate": 1.6877270928101573e-05,
"loss": 0.37404653429985046,
"step": 414
},
{
"epoch": 0.5653950953678474,
"grad_norm": 0.4176686406135559,
"learning_rate": 1.6861226775105618e-05,
"loss": 0.38222536444664,
"step": 415
},
{
"epoch": 0.5667574931880109,
"grad_norm": 0.42130082845687866,
"learning_rate": 1.684514918016356e-05,
"loss": 0.3801380395889282,
"step": 416
},
{
"epoch": 0.5681198910081744,
"grad_norm": 0.4545654058456421,
"learning_rate": 1.6829038221638366e-05,
"loss": 0.42598506808280945,
"step": 417
},
{
"epoch": 0.5694822888283378,
"grad_norm": 0.4204128682613373,
"learning_rate": 1.681289397805562e-05,
"loss": 0.3880673944950104,
"step": 418
},
{
"epoch": 0.5708446866485014,
"grad_norm": 0.40949374437332153,
"learning_rate": 1.6796716528103127e-05,
"loss": 0.3792712092399597,
"step": 419
},
{
"epoch": 0.5722070844686649,
"grad_norm": 0.42425790429115295,
"learning_rate": 1.6780505950630552e-05,
"loss": 0.40029221773147583,
"step": 420
},
{
"epoch": 0.5735694822888283,
"grad_norm": 0.449004203081131,
"learning_rate": 1.6764262324649024e-05,
"loss": 0.4227592647075653,
"step": 421
},
{
"epoch": 0.5749318801089919,
"grad_norm": 0.3931463956832886,
"learning_rate": 1.674798572933075e-05,
"loss": 0.3561609387397766,
"step": 422
},
{
"epoch": 0.5762942779291553,
"grad_norm": 0.4123283624649048,
"learning_rate": 1.6731676244008622e-05,
"loss": 0.3775140643119812,
"step": 423
},
{
"epoch": 0.5776566757493188,
"grad_norm": 0.413714736700058,
"learning_rate": 1.6715333948175857e-05,
"loss": 0.3620632290840149,
"step": 424
},
{
"epoch": 0.5790190735694822,
"grad_norm": 0.36583277583122253,
"learning_rate": 1.6698958921485577e-05,
"loss": 0.30589473247528076,
"step": 425
},
{
"epoch": 0.5803814713896458,
"grad_norm": 0.4184879958629608,
"learning_rate": 1.668255124375045e-05,
"loss": 0.37757375836372375,
"step": 426
},
{
"epoch": 0.5817438692098093,
"grad_norm": 0.4325942397117615,
"learning_rate": 1.6666110994942274e-05,
"loss": 0.3947750926017761,
"step": 427
},
{
"epoch": 0.5831062670299727,
"grad_norm": 0.41197946667671204,
"learning_rate": 1.6649638255191604e-05,
"loss": 0.36544039845466614,
"step": 428
},
{
"epoch": 0.5844686648501363,
"grad_norm": 0.4328736662864685,
"learning_rate": 1.663313310478736e-05,
"loss": 0.3924716114997864,
"step": 429
},
{
"epoch": 0.5858310626702997,
"grad_norm": 0.4509877562522888,
"learning_rate": 1.661659562417643e-05,
"loss": 0.4020155370235443,
"step": 430
},
{
"epoch": 0.5871934604904632,
"grad_norm": 0.4538882076740265,
"learning_rate": 1.660002589396328e-05,
"loss": 0.42874789237976074,
"step": 431
},
{
"epoch": 0.5885558583106267,
"grad_norm": 0.4030280113220215,
"learning_rate": 1.6583423994909573e-05,
"loss": 0.36683404445648193,
"step": 432
},
{
"epoch": 0.5899182561307902,
"grad_norm": 0.4076124131679535,
"learning_rate": 1.6566790007933746e-05,
"loss": 0.3436656594276428,
"step": 433
},
{
"epoch": 0.5912806539509536,
"grad_norm": 0.454622358083725,
"learning_rate": 1.6550124014110646e-05,
"loss": 0.4208211302757263,
"step": 434
},
{
"epoch": 0.5926430517711172,
"grad_norm": 0.422625869512558,
"learning_rate": 1.6533426094671125e-05,
"loss": 0.3721390664577484,
"step": 435
},
{
"epoch": 0.5940054495912807,
"grad_norm": 0.35872289538383484,
"learning_rate": 1.651669633100163e-05,
"loss": 0.29117974638938904,
"step": 436
},
{
"epoch": 0.5953678474114441,
"grad_norm": 0.36502501368522644,
"learning_rate": 1.6499934804643838e-05,
"loss": 0.2901703119277954,
"step": 437
},
{
"epoch": 0.5967302452316077,
"grad_norm": 0.412866473197937,
"learning_rate": 1.6483141597294214e-05,
"loss": 0.31686800718307495,
"step": 438
},
{
"epoch": 0.5980926430517711,
"grad_norm": 0.37948358058929443,
"learning_rate": 1.646631679080366e-05,
"loss": 0.3297507166862488,
"step": 439
},
{
"epoch": 0.5994550408719346,
"grad_norm": 0.4186059832572937,
"learning_rate": 1.6449460467177078e-05,
"loss": 0.37851136922836304,
"step": 440
},
{
"epoch": 0.6008174386920981,
"grad_norm": 0.3846619427204132,
"learning_rate": 1.6432572708572997e-05,
"loss": 0.3128829002380371,
"step": 441
},
{
"epoch": 0.6021798365122616,
"grad_norm": 0.4310852289199829,
"learning_rate": 1.641565359730315e-05,
"loss": 0.39028769731521606,
"step": 442
},
{
"epoch": 0.603542234332425,
"grad_norm": 0.4645153880119324,
"learning_rate": 1.6398703215832097e-05,
"loss": 0.428106427192688,
"step": 443
},
{
"epoch": 0.6049046321525886,
"grad_norm": 0.4289141595363617,
"learning_rate": 1.6381721646776805e-05,
"loss": 0.3838496208190918,
"step": 444
},
{
"epoch": 0.6062670299727521,
"grad_norm": 0.381273478269577,
"learning_rate": 1.6364708972906246e-05,
"loss": 0.3133726119995117,
"step": 445
},
{
"epoch": 0.6076294277929155,
"grad_norm": 0.4178448021411896,
"learning_rate": 1.6347665277141005e-05,
"loss": 0.37862086296081543,
"step": 446
},
{
"epoch": 0.6089918256130791,
"grad_norm": 0.42201322317123413,
"learning_rate": 1.6330590642552867e-05,
"loss": 0.39746665954589844,
"step": 447
},
{
"epoch": 0.6103542234332425,
"grad_norm": 0.43660175800323486,
"learning_rate": 1.6313485152364417e-05,
"loss": 0.3727680444717407,
"step": 448
},
{
"epoch": 0.611716621253406,
"grad_norm": 0.4348539710044861,
"learning_rate": 1.6296348889948627e-05,
"loss": 0.386578232049942,
"step": 449
},
{
"epoch": 0.6130790190735694,
"grad_norm": 0.4908278286457062,
"learning_rate": 1.627918193882845e-05,
"loss": 0.45541319251060486,
"step": 450
},
{
"epoch": 0.614441416893733,
"grad_norm": 0.46335548162460327,
"learning_rate": 1.6261984382676432e-05,
"loss": 0.4110366702079773,
"step": 451
},
{
"epoch": 0.6158038147138964,
"grad_norm": 0.4220646917819977,
"learning_rate": 1.624475630531428e-05,
"loss": 0.35985416173934937,
"step": 452
},
{
"epoch": 0.6171662125340599,
"grad_norm": 0.4713914096355438,
"learning_rate": 1.6227497790712458e-05,
"loss": 0.4303567111492157,
"step": 453
},
{
"epoch": 0.6185286103542235,
"grad_norm": 0.4228816628456116,
"learning_rate": 1.621020892298979e-05,
"loss": 0.35945630073547363,
"step": 454
},
{
"epoch": 0.6198910081743869,
"grad_norm": 0.3780403435230255,
"learning_rate": 1.6192889786413048e-05,
"loss": 0.32220372557640076,
"step": 455
},
{
"epoch": 0.6212534059945504,
"grad_norm": 0.4096536338329315,
"learning_rate": 1.617554046539652e-05,
"loss": 0.3749344050884247,
"step": 456
},
{
"epoch": 0.6226158038147139,
"grad_norm": 0.4119769334793091,
"learning_rate": 1.6158161044501624e-05,
"loss": 0.38287466764450073,
"step": 457
},
{
"epoch": 0.6239782016348774,
"grad_norm": 0.39846253395080566,
"learning_rate": 1.6140751608436487e-05,
"loss": 0.3293435275554657,
"step": 458
},
{
"epoch": 0.6253405994550408,
"grad_norm": 0.4839153289794922,
"learning_rate": 1.6123312242055533e-05,
"loss": 0.49237170815467834,
"step": 459
},
{
"epoch": 0.6267029972752044,
"grad_norm": 0.345336377620697,
"learning_rate": 1.6105843030359055e-05,
"loss": 0.2600591778755188,
"step": 460
},
{
"epoch": 0.6280653950953679,
"grad_norm": 0.36076706647872925,
"learning_rate": 1.6088344058492836e-05,
"loss": 0.3101092576980591,
"step": 461
},
{
"epoch": 0.6294277929155313,
"grad_norm": 0.4160480499267578,
"learning_rate": 1.6070815411747686e-05,
"loss": 0.38243746757507324,
"step": 462
},
{
"epoch": 0.6307901907356949,
"grad_norm": 0.35517603158950806,
"learning_rate": 1.6053257175559074e-05,
"loss": 0.28862473368644714,
"step": 463
},
{
"epoch": 0.6321525885558583,
"grad_norm": 0.3878868520259857,
"learning_rate": 1.6035669435506674e-05,
"loss": 0.3289714455604553,
"step": 464
},
{
"epoch": 0.6335149863760218,
"grad_norm": 0.42289090156555176,
"learning_rate": 1.6018052277313966e-05,
"loss": 0.3916146755218506,
"step": 465
},
{
"epoch": 0.6348773841961853,
"grad_norm": 0.40691617131233215,
"learning_rate": 1.600040578684782e-05,
"loss": 0.38257652521133423,
"step": 466
},
{
"epoch": 0.6362397820163488,
"grad_norm": 0.3280545771121979,
"learning_rate": 1.598273005011808e-05,
"loss": 0.26138222217559814,
"step": 467
},
{
"epoch": 0.6376021798365122,
"grad_norm": 0.4141409695148468,
"learning_rate": 1.5965025153277112e-05,
"loss": 0.35480794310569763,
"step": 468
},
{
"epoch": 0.6389645776566758,
"grad_norm": 0.4675044119358063,
"learning_rate": 1.5947291182619444e-05,
"loss": 0.4690595865249634,
"step": 469
},
{
"epoch": 0.6403269754768393,
"grad_norm": 0.3967062830924988,
"learning_rate": 1.5929528224581283e-05,
"loss": 0.34801554679870605,
"step": 470
},
{
"epoch": 0.6416893732970027,
"grad_norm": 0.43718570470809937,
"learning_rate": 1.5911736365740133e-05,
"loss": 0.3724061846733093,
"step": 471
},
{
"epoch": 0.6430517711171662,
"grad_norm": 0.45367127656936646,
"learning_rate": 1.5893915692814365e-05,
"loss": 0.4154523015022278,
"step": 472
},
{
"epoch": 0.6444141689373297,
"grad_norm": 0.41580531001091003,
"learning_rate": 1.5876066292662784e-05,
"loss": 0.3685305714607239,
"step": 473
},
{
"epoch": 0.6457765667574932,
"grad_norm": 0.3875036835670471,
"learning_rate": 1.585818825228422e-05,
"loss": 0.33631476759910583,
"step": 474
},
{
"epoch": 0.6471389645776566,
"grad_norm": 0.3880947232246399,
"learning_rate": 1.5840281658817093e-05,
"loss": 0.3135243058204651,
"step": 475
},
{
"epoch": 0.6485013623978202,
"grad_norm": 0.4160878658294678,
"learning_rate": 1.582234659953899e-05,
"loss": 0.3561175763607025,
"step": 476
},
{
"epoch": 0.6498637602179836,
"grad_norm": 0.427616149187088,
"learning_rate": 1.5804383161866245e-05,
"loss": 0.39607974886894226,
"step": 477
},
{
"epoch": 0.6512261580381471,
"grad_norm": 0.4082714319229126,
"learning_rate": 1.5786391433353508e-05,
"loss": 0.35210445523262024,
"step": 478
},
{
"epoch": 0.6525885558583107,
"grad_norm": 0.41783925890922546,
"learning_rate": 1.5768371501693326e-05,
"loss": 0.376731276512146,
"step": 479
},
{
"epoch": 0.6539509536784741,
"grad_norm": 0.37993308901786804,
"learning_rate": 1.5750323454715696e-05,
"loss": 0.322902649641037,
"step": 480
},
{
"epoch": 0.6553133514986376,
"grad_norm": 0.4466570317745209,
"learning_rate": 1.5732247380387664e-05,
"loss": 0.39785605669021606,
"step": 481
},
{
"epoch": 0.6566757493188011,
"grad_norm": 0.40761619806289673,
"learning_rate": 1.5714143366812876e-05,
"loss": 0.3618497848510742,
"step": 482
},
{
"epoch": 0.6580381471389646,
"grad_norm": 0.3778402507305145,
"learning_rate": 1.5696011502231158e-05,
"loss": 0.339969664812088,
"step": 483
},
{
"epoch": 0.659400544959128,
"grad_norm": 0.4464191496372223,
"learning_rate": 1.5677851875018076e-05,
"loss": 0.43336886167526245,
"step": 484
},
{
"epoch": 0.6607629427792916,
"grad_norm": 0.3769300878047943,
"learning_rate": 1.565966457368453e-05,
"loss": 0.31399497389793396,
"step": 485
},
{
"epoch": 0.662125340599455,
"grad_norm": 0.39209413528442383,
"learning_rate": 1.564144968687628e-05,
"loss": 0.34925514459609985,
"step": 486
},
{
"epoch": 0.6634877384196185,
"grad_norm": 0.44385579228401184,
"learning_rate": 1.5623207303373553e-05,
"loss": 0.4195161461830139,
"step": 487
},
{
"epoch": 0.6648501362397821,
"grad_norm": 0.44335752725601196,
"learning_rate": 1.5604937512090602e-05,
"loss": 0.394603431224823,
"step": 488
},
{
"epoch": 0.6662125340599455,
"grad_norm": 0.41506636142730713,
"learning_rate": 1.5586640402075258e-05,
"loss": 0.39139121770858765,
"step": 489
},
{
"epoch": 0.667574931880109,
"grad_norm": 0.4141143262386322,
"learning_rate": 1.5568316062508502e-05,
"loss": 0.3790861666202545,
"step": 490
},
{
"epoch": 0.6689373297002725,
"grad_norm": 0.4253380298614502,
"learning_rate": 1.5549964582704044e-05,
"loss": 0.38978779315948486,
"step": 491
},
{
"epoch": 0.670299727520436,
"grad_norm": 0.38304126262664795,
"learning_rate": 1.5531586052107868e-05,
"loss": 0.3342415690422058,
"step": 492
},
{
"epoch": 0.6716621253405994,
"grad_norm": 0.4576060175895691,
"learning_rate": 1.5513180560297808e-05,
"loss": 0.42615556716918945,
"step": 493
},
{
"epoch": 0.6730245231607629,
"grad_norm": 0.4253360331058502,
"learning_rate": 1.5494748196983106e-05,
"loss": 0.4119224548339844,
"step": 494
},
{
"epoch": 0.6743869209809265,
"grad_norm": 0.4528999626636505,
"learning_rate": 1.547628905200398e-05,
"loss": 0.49085426330566406,
"step": 495
},
{
"epoch": 0.6757493188010899,
"grad_norm": 0.4398576617240906,
"learning_rate": 1.5457803215331182e-05,
"loss": 0.40197885036468506,
"step": 496
},
{
"epoch": 0.6771117166212534,
"grad_norm": 0.4078048765659332,
"learning_rate": 1.5439290777065558e-05,
"loss": 0.35455724596977234,
"step": 497
},
{
"epoch": 0.6784741144414169,
"grad_norm": 0.41575735807418823,
"learning_rate": 1.542075182743762e-05,
"loss": 0.3819142282009125,
"step": 498
},
{
"epoch": 0.6798365122615804,
"grad_norm": 0.3810281753540039,
"learning_rate": 1.5402186456807086e-05,
"loss": 0.330873966217041,
"step": 499
},
{
"epoch": 0.6811989100817438,
"grad_norm": 0.3949718773365021,
"learning_rate": 1.5383594755662453e-05,
"loss": 0.34446001052856445,
"step": 500
},
{
"epoch": 0.6825613079019074,
"grad_norm": 0.3613286316394806,
"learning_rate": 1.5364976814620568e-05,
"loss": 0.3226144313812256,
"step": 501
},
{
"epoch": 0.6839237057220708,
"grad_norm": 0.3470703661441803,
"learning_rate": 1.5346332724426155e-05,
"loss": 0.29430314898490906,
"step": 502
},
{
"epoch": 0.6852861035422343,
"grad_norm": 0.3816620111465454,
"learning_rate": 1.5327662575951404e-05,
"loss": 0.3283197283744812,
"step": 503
},
{
"epoch": 0.6866485013623979,
"grad_norm": 0.391053169965744,
"learning_rate": 1.5308966460195503e-05,
"loss": 0.34893274307250977,
"step": 504
},
{
"epoch": 0.6880108991825613,
"grad_norm": 0.4378596544265747,
"learning_rate": 1.5290244468284206e-05,
"loss": 0.40730422735214233,
"step": 505
},
{
"epoch": 0.6893732970027248,
"grad_norm": 0.4042869210243225,
"learning_rate": 1.5271496691469404e-05,
"loss": 0.3062353730201721,
"step": 506
},
{
"epoch": 0.6907356948228883,
"grad_norm": 0.42129504680633545,
"learning_rate": 1.525272322112865e-05,
"loss": 0.39751139283180237,
"step": 507
},
{
"epoch": 0.6920980926430518,
"grad_norm": 0.42566022276878357,
"learning_rate": 1.5233924148764727e-05,
"loss": 0.3925180435180664,
"step": 508
},
{
"epoch": 0.6934604904632152,
"grad_norm": 0.3715537488460541,
"learning_rate": 1.5215099566005217e-05,
"loss": 0.31566864252090454,
"step": 509
},
{
"epoch": 0.6948228882833788,
"grad_norm": 0.45559632778167725,
"learning_rate": 1.519624956460203e-05,
"loss": 0.4103913903236389,
"step": 510
},
{
"epoch": 0.6961852861035422,
"grad_norm": 0.34860438108444214,
"learning_rate": 1.517737423643097e-05,
"loss": 0.2865496277809143,
"step": 511
},
{
"epoch": 0.6975476839237057,
"grad_norm": 0.44315922260284424,
"learning_rate": 1.5158473673491285e-05,
"loss": 0.4178670048713684,
"step": 512
},
{
"epoch": 0.6989100817438693,
"grad_norm": 0.4062572419643402,
"learning_rate": 1.5139547967905221e-05,
"loss": 0.34824541211128235,
"step": 513
},
{
"epoch": 0.7002724795640327,
"grad_norm": 0.42972540855407715,
"learning_rate": 1.5120597211917564e-05,
"loss": 0.40658995509147644,
"step": 514
},
{
"epoch": 0.7016348773841962,
"grad_norm": 0.4179152846336365,
"learning_rate": 1.510162149789521e-05,
"loss": 0.38474875688552856,
"step": 515
},
{
"epoch": 0.7029972752043597,
"grad_norm": 0.41898468136787415,
"learning_rate": 1.5082620918326685e-05,
"loss": 0.3936446011066437,
"step": 516
},
{
"epoch": 0.7043596730245232,
"grad_norm": 0.4191248416900635,
"learning_rate": 1.5063595565821721e-05,
"loss": 0.3933945298194885,
"step": 517
},
{
"epoch": 0.7057220708446866,
"grad_norm": 0.45913466811180115,
"learning_rate": 1.5044545533110793e-05,
"loss": 0.38266128301620483,
"step": 518
},
{
"epoch": 0.7070844686648501,
"grad_norm": 0.4441032409667969,
"learning_rate": 1.5025470913044666e-05,
"loss": 0.4108320474624634,
"step": 519
},
{
"epoch": 0.7084468664850136,
"grad_norm": 0.42410925030708313,
"learning_rate": 1.5006371798593948e-05,
"loss": 0.4113953709602356,
"step": 520
},
{
"epoch": 0.7098092643051771,
"grad_norm": 0.4308149218559265,
"learning_rate": 1.4987248282848637e-05,
"loss": 0.3980264961719513,
"step": 521
},
{
"epoch": 0.7111716621253406,
"grad_norm": 0.44944408535957336,
"learning_rate": 1.4968100459017652e-05,
"loss": 0.36318397521972656,
"step": 522
},
{
"epoch": 0.7125340599455041,
"grad_norm": 0.39412856101989746,
"learning_rate": 1.4948928420428403e-05,
"loss": 0.3426111042499542,
"step": 523
},
{
"epoch": 0.7138964577656676,
"grad_norm": 0.35763245820999146,
"learning_rate": 1.4929732260526318e-05,
"loss": 0.28396356105804443,
"step": 524
},
{
"epoch": 0.715258855585831,
"grad_norm": 0.4681552052497864,
"learning_rate": 1.4910512072874395e-05,
"loss": 0.4816880226135254,
"step": 525
},
{
"epoch": 0.7166212534059946,
"grad_norm": 0.38465479016304016,
"learning_rate": 1.489126795115274e-05,
"loss": 0.31631630659103394,
"step": 526
},
{
"epoch": 0.717983651226158,
"grad_norm": 0.700157880783081,
"learning_rate": 1.4871999989158123e-05,
"loss": 0.3877072334289551,
"step": 527
},
{
"epoch": 0.7193460490463215,
"grad_norm": 0.36922982335090637,
"learning_rate": 1.4852708280803512e-05,
"loss": 0.33346259593963623,
"step": 528
},
{
"epoch": 0.720708446866485,
"grad_norm": 0.4037865698337555,
"learning_rate": 1.4833392920117607e-05,
"loss": 0.3595266342163086,
"step": 529
},
{
"epoch": 0.7220708446866485,
"grad_norm": 0.38142573833465576,
"learning_rate": 1.4814054001244395e-05,
"loss": 0.31460440158843994,
"step": 530
},
{
"epoch": 0.723433242506812,
"grad_norm": 0.3785574436187744,
"learning_rate": 1.4794691618442691e-05,
"loss": 0.30783623456954956,
"step": 531
},
{
"epoch": 0.7247956403269755,
"grad_norm": 0.44186413288116455,
"learning_rate": 1.477530586608567e-05,
"loss": 0.41240638494491577,
"step": 532
},
{
"epoch": 0.726158038147139,
"grad_norm": 0.3977665603160858,
"learning_rate": 1.4755896838660412e-05,
"loss": 0.32479894161224365,
"step": 533
},
{
"epoch": 0.7275204359673024,
"grad_norm": 0.40463075041770935,
"learning_rate": 1.4736464630767442e-05,
"loss": 0.3699343502521515,
"step": 534
},
{
"epoch": 0.728882833787466,
"grad_norm": 0.41050946712493896,
"learning_rate": 1.4717009337120268e-05,
"loss": 0.3829270005226135,
"step": 535
},
{
"epoch": 0.7302452316076294,
"grad_norm": 0.4132955074310303,
"learning_rate": 1.4697531052544914e-05,
"loss": 0.34740668535232544,
"step": 536
},
{
"epoch": 0.7316076294277929,
"grad_norm": 0.43629321455955505,
"learning_rate": 1.4678029871979469e-05,
"loss": 0.3730025589466095,
"step": 537
},
{
"epoch": 0.7329700272479565,
"grad_norm": 0.44061553478240967,
"learning_rate": 1.4658505890473615e-05,
"loss": 0.40129199624061584,
"step": 538
},
{
"epoch": 0.7343324250681199,
"grad_norm": 0.4008716642856598,
"learning_rate": 1.463895920318817e-05,
"loss": 0.3489900827407837,
"step": 539
},
{
"epoch": 0.7356948228882834,
"grad_norm": 0.3535612225532532,
"learning_rate": 1.4619389905394616e-05,
"loss": 0.2988109290599823,
"step": 540
},
{
"epoch": 0.7370572207084468,
"grad_norm": 0.4381372630596161,
"learning_rate": 1.4599798092474646e-05,
"loss": 0.3608015775680542,
"step": 541
},
{
"epoch": 0.7384196185286104,
"grad_norm": 0.3945034444332123,
"learning_rate": 1.4580183859919686e-05,
"loss": 0.32994401454925537,
"step": 542
},
{
"epoch": 0.7397820163487738,
"grad_norm": 0.41232776641845703,
"learning_rate": 1.4560547303330441e-05,
"loss": 0.38900381326675415,
"step": 543
},
{
"epoch": 0.7411444141689373,
"grad_norm": 0.38951003551483154,
"learning_rate": 1.4540888518416423e-05,
"loss": 0.3017880320549011,
"step": 544
},
{
"epoch": 0.7425068119891008,
"grad_norm": 0.3949369490146637,
"learning_rate": 1.4521207600995487e-05,
"loss": 0.32867276668548584,
"step": 545
},
{
"epoch": 0.7438692098092643,
"grad_norm": 0.3731197416782379,
"learning_rate": 1.4501504646993358e-05,
"loss": 0.32244962453842163,
"step": 546
},
{
"epoch": 0.7452316076294278,
"grad_norm": 0.42670756578445435,
"learning_rate": 1.4481779752443177e-05,
"loss": 0.34767279028892517,
"step": 547
},
{
"epoch": 0.7465940054495913,
"grad_norm": 0.4462198317050934,
"learning_rate": 1.446203301348502e-05,
"loss": 0.40442246198654175,
"step": 548
},
{
"epoch": 0.7479564032697548,
"grad_norm": 0.43297913670539856,
"learning_rate": 1.4442264526365425e-05,
"loss": 0.39974820613861084,
"step": 549
},
{
"epoch": 0.7493188010899182,
"grad_norm": 0.38846778869628906,
"learning_rate": 1.4422474387436951e-05,
"loss": 0.3305945098400116,
"step": 550
},
{
"epoch": 0.7506811989100818,
"grad_norm": 0.3659208118915558,
"learning_rate": 1.4402662693157672e-05,
"loss": 0.28628021478652954,
"step": 551
},
{
"epoch": 0.7520435967302452,
"grad_norm": 0.4232732653617859,
"learning_rate": 1.4382829540090728e-05,
"loss": 0.39836883544921875,
"step": 552
},
{
"epoch": 0.7534059945504087,
"grad_norm": 0.3981814682483673,
"learning_rate": 1.4362975024903854e-05,
"loss": 0.3883022964000702,
"step": 553
},
{
"epoch": 0.7547683923705722,
"grad_norm": 0.37059178948402405,
"learning_rate": 1.43430992443689e-05,
"loss": 0.31552067399024963,
"step": 554
},
{
"epoch": 0.7561307901907357,
"grad_norm": 0.5106791257858276,
"learning_rate": 1.4323202295361375e-05,
"loss": 0.3364841043949127,
"step": 555
},
{
"epoch": 0.7574931880108992,
"grad_norm": 0.3709793984889984,
"learning_rate": 1.4303284274859947e-05,
"loss": 0.3160533010959625,
"step": 556
},
{
"epoch": 0.7588555858310627,
"grad_norm": 0.38688817620277405,
"learning_rate": 1.4283345279946e-05,
"loss": 0.3377053439617157,
"step": 557
},
{
"epoch": 0.7602179836512262,
"grad_norm": 0.3955825865268707,
"learning_rate": 1.4263385407803147e-05,
"loss": 0.367174357175827,
"step": 558
},
{
"epoch": 0.7615803814713896,
"grad_norm": 0.387660413980484,
"learning_rate": 1.424340475571675e-05,
"loss": 0.355742484331131,
"step": 559
},
{
"epoch": 0.7629427792915532,
"grad_norm": 0.4117288589477539,
"learning_rate": 1.4223403421073465e-05,
"loss": 0.3632027506828308,
"step": 560
},
{
"epoch": 0.7643051771117166,
"grad_norm": 0.3913569748401642,
"learning_rate": 1.4203381501360746e-05,
"loss": 0.34550565481185913,
"step": 561
},
{
"epoch": 0.7656675749318801,
"grad_norm": 0.3794730305671692,
"learning_rate": 1.4183339094166386e-05,
"loss": 0.32455602288246155,
"step": 562
},
{
"epoch": 0.7670299727520435,
"grad_norm": 0.4123244881629944,
"learning_rate": 1.416327629717803e-05,
"loss": 0.33050400018692017,
"step": 563
},
{
"epoch": 0.7683923705722071,
"grad_norm": 0.5458968877792358,
"learning_rate": 1.4143193208182705e-05,
"loss": 0.33752286434173584,
"step": 564
},
{
"epoch": 0.7697547683923706,
"grad_norm": 0.3934307396411896,
"learning_rate": 1.4123089925066347e-05,
"loss": 0.31893715262413025,
"step": 565
},
{
"epoch": 0.771117166212534,
"grad_norm": 0.4280702471733093,
"learning_rate": 1.4102966545813312e-05,
"loss": 0.38816317915916443,
"step": 566
},
{
"epoch": 0.7724795640326976,
"grad_norm": 0.38977575302124023,
"learning_rate": 1.4082823168505912e-05,
"loss": 0.3048614263534546,
"step": 567
},
{
"epoch": 0.773841961852861,
"grad_norm": 0.4607933461666107,
"learning_rate": 1.4062659891323927e-05,
"loss": 0.3881381154060364,
"step": 568
},
{
"epoch": 0.7752043596730245,
"grad_norm": 0.4132702350616455,
"learning_rate": 1.4042476812544128e-05,
"loss": 0.367891788482666,
"step": 569
},
{
"epoch": 0.776566757493188,
"grad_norm": 0.36173874139785767,
"learning_rate": 1.4022274030539802e-05,
"loss": 0.2838096618652344,
"step": 570
},
{
"epoch": 0.7779291553133515,
"grad_norm": 0.3847193121910095,
"learning_rate": 1.4002051643780275e-05,
"loss": 0.31407347321510315,
"step": 571
},
{
"epoch": 0.779291553133515,
"grad_norm": 0.3895076811313629,
"learning_rate": 1.398180975083042e-05,
"loss": 0.3542616367340088,
"step": 572
},
{
"epoch": 0.7806539509536785,
"grad_norm": 0.4060671329498291,
"learning_rate": 1.3961548450350184e-05,
"loss": 0.3690754175186157,
"step": 573
},
{
"epoch": 0.782016348773842,
"grad_norm": 0.3732289671897888,
"learning_rate": 1.3941267841094118e-05,
"loss": 0.3208252191543579,
"step": 574
},
{
"epoch": 0.7833787465940054,
"grad_norm": 0.4188457429409027,
"learning_rate": 1.3920968021910872e-05,
"loss": 0.3940437436103821,
"step": 575
},
{
"epoch": 0.784741144414169,
"grad_norm": 0.3824058175086975,
"learning_rate": 1.3900649091742734e-05,
"loss": 0.3468947410583496,
"step": 576
},
{
"epoch": 0.7861035422343324,
"grad_norm": 0.36254164576530457,
"learning_rate": 1.3880311149625141e-05,
"loss": 0.3099260926246643,
"step": 577
},
{
"epoch": 0.7874659400544959,
"grad_norm": 0.34511882066726685,
"learning_rate": 1.3859954294686185e-05,
"loss": 0.2614838778972626,
"step": 578
},
{
"epoch": 0.7888283378746594,
"grad_norm": 0.42470481991767883,
"learning_rate": 1.3839578626146143e-05,
"loss": 0.36970824003219604,
"step": 579
},
{
"epoch": 0.7901907356948229,
"grad_norm": 0.43448764085769653,
"learning_rate": 1.3819184243317008e-05,
"loss": 0.38234907388687134,
"step": 580
},
{
"epoch": 0.7915531335149864,
"grad_norm": 0.41992104053497314,
"learning_rate": 1.3798771245601961e-05,
"loss": 0.3760548233985901,
"step": 581
},
{
"epoch": 0.7929155313351499,
"grad_norm": 0.3761197328567505,
"learning_rate": 1.3778339732494933e-05,
"loss": 0.31548407673835754,
"step": 582
},
{
"epoch": 0.7942779291553134,
"grad_norm": 0.40058112144470215,
"learning_rate": 1.3757889803580085e-05,
"loss": 0.3305339515209198,
"step": 583
},
{
"epoch": 0.7956403269754768,
"grad_norm": 0.4397111237049103,
"learning_rate": 1.373742155853135e-05,
"loss": 0.39684200286865234,
"step": 584
},
{
"epoch": 0.7970027247956403,
"grad_norm": 0.3730669319629669,
"learning_rate": 1.3716935097111926e-05,
"loss": 0.30914586782455444,
"step": 585
},
{
"epoch": 0.7983651226158038,
"grad_norm": 0.3845134377479553,
"learning_rate": 1.3696430519173802e-05,
"loss": 0.3142814040184021,
"step": 586
},
{
"epoch": 0.7997275204359673,
"grad_norm": 0.4052923619747162,
"learning_rate": 1.367590792465727e-05,
"loss": 0.33971768617630005,
"step": 587
},
{
"epoch": 0.8010899182561307,
"grad_norm": 0.35842764377593994,
"learning_rate": 1.3655367413590433e-05,
"loss": 0.3152307868003845,
"step": 588
},
{
"epoch": 0.8024523160762943,
"grad_norm": 0.41765162348747253,
"learning_rate": 1.3634809086088715e-05,
"loss": 0.36689841747283936,
"step": 589
},
{
"epoch": 0.8038147138964578,
"grad_norm": 0.3411005735397339,
"learning_rate": 1.361423304235439e-05,
"loss": 0.2985839545726776,
"step": 590
},
{
"epoch": 0.8051771117166212,
"grad_norm": 0.38796186447143555,
"learning_rate": 1.359363938267607e-05,
"loss": 0.31117764115333557,
"step": 591
},
{
"epoch": 0.8065395095367848,
"grad_norm": 0.4477858543395996,
"learning_rate": 1.3573028207428239e-05,
"loss": 0.3759269714355469,
"step": 592
},
{
"epoch": 0.8079019073569482,
"grad_norm": 0.3874566853046417,
"learning_rate": 1.3552399617070742e-05,
"loss": 0.35220903158187866,
"step": 593
},
{
"epoch": 0.8092643051771117,
"grad_norm": 0.3966315686702728,
"learning_rate": 1.3531753712148312e-05,
"loss": 0.327017605304718,
"step": 594
},
{
"epoch": 0.8106267029972752,
"grad_norm": 0.38722142577171326,
"learning_rate": 1.3511090593290073e-05,
"loss": 0.3531439006328583,
"step": 595
},
{
"epoch": 0.8119891008174387,
"grad_norm": 0.35546261072158813,
"learning_rate": 1.3490410361209051e-05,
"loss": 0.29371026158332825,
"step": 596
},
{
"epoch": 0.8133514986376021,
"grad_norm": 0.4084646701812744,
"learning_rate": 1.3469713116701683e-05,
"loss": 0.3657349944114685,
"step": 597
},
{
"epoch": 0.8147138964577657,
"grad_norm": 0.3828872740268707,
"learning_rate": 1.3448998960647324e-05,
"loss": 0.27304738759994507,
"step": 598
},
{
"epoch": 0.8160762942779292,
"grad_norm": 0.40250927209854126,
"learning_rate": 1.3428267994007756e-05,
"loss": 0.36223679780960083,
"step": 599
},
{
"epoch": 0.8174386920980926,
"grad_norm": 0.38634181022644043,
"learning_rate": 1.3407520317826697e-05,
"loss": 0.32051095366477966,
"step": 600
},
{
"epoch": 0.8188010899182562,
"grad_norm": 0.3875936269760132,
"learning_rate": 1.3386756033229314e-05,
"loss": 0.31921273469924927,
"step": 601
},
{
"epoch": 0.8201634877384196,
"grad_norm": 0.4128807783126831,
"learning_rate": 1.3365975241421712e-05,
"loss": 0.35329893231391907,
"step": 602
},
{
"epoch": 0.8215258855585831,
"grad_norm": 0.3846687376499176,
"learning_rate": 1.3345178043690463e-05,
"loss": 0.3306158185005188,
"step": 603
},
{
"epoch": 0.8228882833787466,
"grad_norm": 0.37342211604118347,
"learning_rate": 1.3324364541402102e-05,
"loss": 0.29243338108062744,
"step": 604
},
{
"epoch": 0.8242506811989101,
"grad_norm": 0.4314493238925934,
"learning_rate": 1.3303534836002629e-05,
"loss": 0.3962687849998474,
"step": 605
},
{
"epoch": 0.8256130790190735,
"grad_norm": 0.41795799136161804,
"learning_rate": 1.328268902901702e-05,
"loss": 0.3593180775642395,
"step": 606
},
{
"epoch": 0.8269754768392371,
"grad_norm": 0.41061070561408997,
"learning_rate": 1.326182722204873e-05,
"loss": 0.3891078233718872,
"step": 607
},
{
"epoch": 0.8283378746594006,
"grad_norm": 0.3990163803100586,
"learning_rate": 1.32409495167792e-05,
"loss": 0.3603130578994751,
"step": 608
},
{
"epoch": 0.829700272479564,
"grad_norm": 0.37110787630081177,
"learning_rate": 1.3220056014967359e-05,
"loss": 0.3168402910232544,
"step": 609
},
{
"epoch": 0.8310626702997275,
"grad_norm": 0.42547863721847534,
"learning_rate": 1.3199146818449134e-05,
"loss": 0.4026668667793274,
"step": 610
},
{
"epoch": 0.832425068119891,
"grad_norm": 0.47038719058036804,
"learning_rate": 1.317822202913694e-05,
"loss": 0.44461554288864136,
"step": 611
},
{
"epoch": 0.8337874659400545,
"grad_norm": 0.38207799196243286,
"learning_rate": 1.3157281749019199e-05,
"loss": 0.3126547336578369,
"step": 612
},
{
"epoch": 0.8351498637602179,
"grad_norm": 0.3733403980731964,
"learning_rate": 1.3136326080159836e-05,
"loss": 0.31984227895736694,
"step": 613
},
{
"epoch": 0.8365122615803815,
"grad_norm": 0.37421712279319763,
"learning_rate": 1.3115355124697775e-05,
"loss": 0.3349495232105255,
"step": 614
},
{
"epoch": 0.837874659400545,
"grad_norm": 0.4028952121734619,
"learning_rate": 1.3094368984846453e-05,
"loss": 0.3623151183128357,
"step": 615
},
{
"epoch": 0.8392370572207084,
"grad_norm": 0.3345983028411865,
"learning_rate": 1.3073367762893316e-05,
"loss": 0.25584784150123596,
"step": 616
},
{
"epoch": 0.840599455040872,
"grad_norm": 0.4131789803504944,
"learning_rate": 1.3052351561199321e-05,
"loss": 0.36740627884864807,
"step": 617
},
{
"epoch": 0.8419618528610354,
"grad_norm": 0.38671812415122986,
"learning_rate": 1.3031320482198433e-05,
"loss": 0.32881784439086914,
"step": 618
},
{
"epoch": 0.8433242506811989,
"grad_norm": 0.43707364797592163,
"learning_rate": 1.3010274628397137e-05,
"loss": 0.4240247905254364,
"step": 619
},
{
"epoch": 0.8446866485013624,
"grad_norm": 0.37011024355888367,
"learning_rate": 1.298921410237392e-05,
"loss": 0.3215616047382355,
"step": 620
},
{
"epoch": 0.8460490463215259,
"grad_norm": 0.40730512142181396,
"learning_rate": 1.2968139006778797e-05,
"loss": 0.3768579661846161,
"step": 621
},
{
"epoch": 0.8474114441416893,
"grad_norm": 0.43396347761154175,
"learning_rate": 1.2947049444332782e-05,
"loss": 0.41485506296157837,
"step": 622
},
{
"epoch": 0.8487738419618529,
"grad_norm": 0.43405383825302124,
"learning_rate": 1.292594551782741e-05,
"loss": 0.40304839611053467,
"step": 623
},
{
"epoch": 0.8501362397820164,
"grad_norm": 0.44254007935523987,
"learning_rate": 1.2904827330124223e-05,
"loss": 0.39402496814727783,
"step": 624
},
{
"epoch": 0.8514986376021798,
"grad_norm": 0.36872732639312744,
"learning_rate": 1.2883694984154273e-05,
"loss": 0.3053838312625885,
"step": 625
},
{
"epoch": 0.8528610354223434,
"grad_norm": 0.4020664095878601,
"learning_rate": 1.2862548582917622e-05,
"loss": 0.34385716915130615,
"step": 626
},
{
"epoch": 0.8542234332425068,
"grad_norm": 0.37539398670196533,
"learning_rate": 1.2841388229482834e-05,
"loss": 0.32522130012512207,
"step": 627
},
{
"epoch": 0.8555858310626703,
"grad_norm": 0.4901396632194519,
"learning_rate": 1.2820214026986481e-05,
"loss": 0.4514284133911133,
"step": 628
},
{
"epoch": 0.8569482288828338,
"grad_norm": 0.42575520277023315,
"learning_rate": 1.2799026078632638e-05,
"loss": 0.3785257935523987,
"step": 629
},
{
"epoch": 0.8583106267029973,
"grad_norm": 0.4292484223842621,
"learning_rate": 1.2777824487692373e-05,
"loss": 0.37280866503715515,
"step": 630
},
{
"epoch": 0.8596730245231607,
"grad_norm": 0.38012853264808655,
"learning_rate": 1.2756609357503248e-05,
"loss": 0.3435444235801697,
"step": 631
},
{
"epoch": 0.8610354223433242,
"grad_norm": 0.37119728326797485,
"learning_rate": 1.2735380791468814e-05,
"loss": 0.315696656703949,
"step": 632
},
{
"epoch": 0.8623978201634878,
"grad_norm": 0.4296729564666748,
"learning_rate": 1.271413889305812e-05,
"loss": 0.3677264451980591,
"step": 633
},
{
"epoch": 0.8637602179836512,
"grad_norm": 0.43633759021759033,
"learning_rate": 1.2692883765805188e-05,
"loss": 0.42795848846435547,
"step": 634
},
{
"epoch": 0.8651226158038147,
"grad_norm": 0.35888829827308655,
"learning_rate": 1.2671615513308524e-05,
"loss": 0.307987779378891,
"step": 635
},
{
"epoch": 0.8664850136239782,
"grad_norm": 0.39719316363334656,
"learning_rate": 1.2650334239230598e-05,
"loss": 0.34023380279541016,
"step": 636
},
{
"epoch": 0.8678474114441417,
"grad_norm": 0.39609214663505554,
"learning_rate": 1.2629040047297356e-05,
"loss": 0.3457816243171692,
"step": 637
},
{
"epoch": 0.8692098092643051,
"grad_norm": 0.3761802911758423,
"learning_rate": 1.2607733041297703e-05,
"loss": 0.3295412063598633,
"step": 638
},
{
"epoch": 0.8705722070844687,
"grad_norm": 0.4121301472187042,
"learning_rate": 1.2586413325083e-05,
"loss": 0.3802829086780548,
"step": 639
},
{
"epoch": 0.8719346049046321,
"grad_norm": 0.42629581689834595,
"learning_rate": 1.2565081002566563e-05,
"loss": 0.41711223125457764,
"step": 640
},
{
"epoch": 0.8732970027247956,
"grad_norm": 0.4152551591396332,
"learning_rate": 1.2543736177723147e-05,
"loss": 0.3565431833267212,
"step": 641
},
{
"epoch": 0.8746594005449592,
"grad_norm": 0.3676934838294983,
"learning_rate": 1.2522378954588443e-05,
"loss": 0.3011196553707123,
"step": 642
},
{
"epoch": 0.8760217983651226,
"grad_norm": 0.3511006534099579,
"learning_rate": 1.2501009437258576e-05,
"loss": 0.27889248728752136,
"step": 643
},
{
"epoch": 0.8773841961852861,
"grad_norm": 0.33939114212989807,
"learning_rate": 1.2479627729889587e-05,
"loss": 0.2643985152244568,
"step": 644
},
{
"epoch": 0.8787465940054496,
"grad_norm": 0.46138131618499756,
"learning_rate": 1.245823393669694e-05,
"loss": 0.431702196598053,
"step": 645
},
{
"epoch": 0.8801089918256131,
"grad_norm": 0.39123064279556274,
"learning_rate": 1.2436828161955004e-05,
"loss": 0.35548466444015503,
"step": 646
},
{
"epoch": 0.8814713896457765,
"grad_norm": 0.37139931321144104,
"learning_rate": 1.2415410509996537e-05,
"loss": 0.3073146939277649,
"step": 647
},
{
"epoch": 0.8828337874659401,
"grad_norm": 0.39180418848991394,
"learning_rate": 1.2393981085212204e-05,
"loss": 0.3435180187225342,
"step": 648
},
{
"epoch": 0.8841961852861036,
"grad_norm": 0.3841586410999298,
"learning_rate": 1.2372539992050037e-05,
"loss": 0.3478638529777527,
"step": 649
},
{
"epoch": 0.885558583106267,
"grad_norm": 0.4314171373844147,
"learning_rate": 1.2351087335014945e-05,
"loss": 0.34792008996009827,
"step": 650
},
{
"epoch": 0.8869209809264306,
"grad_norm": 0.36530905961990356,
"learning_rate": 1.2329623218668197e-05,
"loss": 0.31735897064208984,
"step": 651
},
{
"epoch": 0.888283378746594,
"grad_norm": 0.4034612476825714,
"learning_rate": 1.2308147747626926e-05,
"loss": 0.3590067923069,
"step": 652
},
{
"epoch": 0.8896457765667575,
"grad_norm": 0.3697197437286377,
"learning_rate": 1.2286661026563597e-05,
"loss": 0.30383849143981934,
"step": 653
},
{
"epoch": 0.8910081743869209,
"grad_norm": 0.36092400550842285,
"learning_rate": 1.2265163160205514e-05,
"loss": 0.2998080551624298,
"step": 654
},
{
"epoch": 0.8923705722070845,
"grad_norm": 0.41580548882484436,
"learning_rate": 1.2243654253334299e-05,
"loss": 0.39810556173324585,
"step": 655
},
{
"epoch": 0.8937329700272479,
"grad_norm": 0.43990781903266907,
"learning_rate": 1.2222134410785386e-05,
"loss": 0.37231457233428955,
"step": 656
},
{
"epoch": 0.8950953678474114,
"grad_norm": 0.3491821587085724,
"learning_rate": 1.2200603737447515e-05,
"loss": 0.27115774154663086,
"step": 657
},
{
"epoch": 0.896457765667575,
"grad_norm": 0.41555097699165344,
"learning_rate": 1.2179062338262217e-05,
"loss": 0.3704499900341034,
"step": 658
},
{
"epoch": 0.8978201634877384,
"grad_norm": 0.41340363025665283,
"learning_rate": 1.2157510318223296e-05,
"loss": 0.3653028905391693,
"step": 659
},
{
"epoch": 0.8991825613079019,
"grad_norm": 0.42316463589668274,
"learning_rate": 1.2135947782376322e-05,
"loss": 0.37947019934654236,
"step": 660
},
{
"epoch": 0.9005449591280654,
"grad_norm": 0.4614158570766449,
"learning_rate": 1.2114374835818122e-05,
"loss": 0.4423688054084778,
"step": 661
},
{
"epoch": 0.9019073569482289,
"grad_norm": 0.3793640434741974,
"learning_rate": 1.2092791583696266e-05,
"loss": 0.3140786290168762,
"step": 662
},
{
"epoch": 0.9032697547683923,
"grad_norm": 0.41017946600914,
"learning_rate": 1.207119813120855e-05,
"loss": 0.3691558241844177,
"step": 663
},
{
"epoch": 0.9046321525885559,
"grad_norm": 0.42569923400878906,
"learning_rate": 1.2049594583602495e-05,
"loss": 0.39558589458465576,
"step": 664
},
{
"epoch": 0.9059945504087193,
"grad_norm": 0.38926219940185547,
"learning_rate": 1.2027981046174817e-05,
"loss": 0.36622583866119385,
"step": 665
},
{
"epoch": 0.9073569482288828,
"grad_norm": 0.3998047709465027,
"learning_rate": 1.2006357624270927e-05,
"loss": 0.30081361532211304,
"step": 666
},
{
"epoch": 0.9087193460490464,
"grad_norm": 0.3933253884315491,
"learning_rate": 1.198472442328442e-05,
"loss": 0.3435978293418884,
"step": 667
},
{
"epoch": 0.9100817438692098,
"grad_norm": 0.3495563864707947,
"learning_rate": 1.1963081548656539e-05,
"loss": 0.29989689588546753,
"step": 668
},
{
"epoch": 0.9114441416893733,
"grad_norm": 0.4060697853565216,
"learning_rate": 1.1941429105875686e-05,
"loss": 0.3763880431652069,
"step": 669
},
{
"epoch": 0.9128065395095368,
"grad_norm": 0.33757802844047546,
"learning_rate": 1.1919767200476904e-05,
"loss": 0.2711639106273651,
"step": 670
},
{
"epoch": 0.9141689373297003,
"grad_norm": 0.3644583523273468,
"learning_rate": 1.1898095938041352e-05,
"loss": 0.2867870330810547,
"step": 671
},
{
"epoch": 0.9155313351498637,
"grad_norm": 0.4242314100265503,
"learning_rate": 1.187641542419579e-05,
"loss": 0.3521907925605774,
"step": 672
},
{
"epoch": 0.9168937329700273,
"grad_norm": 0.4531702995300293,
"learning_rate": 1.1854725764612078e-05,
"loss": 0.40591543912887573,
"step": 673
},
{
"epoch": 0.9182561307901907,
"grad_norm": 0.4164139926433563,
"learning_rate": 1.183302706500665e-05,
"loss": 0.3550049662590027,
"step": 674
},
{
"epoch": 0.9196185286103542,
"grad_norm": 0.37005481123924255,
"learning_rate": 1.181131943114e-05,
"loss": 0.30720236897468567,
"step": 675
},
{
"epoch": 0.9209809264305178,
"grad_norm": 0.3972548246383667,
"learning_rate": 1.1789602968816172e-05,
"loss": 0.318911075592041,
"step": 676
},
{
"epoch": 0.9223433242506812,
"grad_norm": 0.37663185596466064,
"learning_rate": 1.1767877783882235e-05,
"loss": 0.31429654359817505,
"step": 677
},
{
"epoch": 0.9237057220708447,
"grad_norm": 0.373142808675766,
"learning_rate": 1.1746143982227778e-05,
"loss": 0.30172014236450195,
"step": 678
},
{
"epoch": 0.9250681198910081,
"grad_norm": 0.42605262994766235,
"learning_rate": 1.1724401669784385e-05,
"loss": 0.37753045558929443,
"step": 679
},
{
"epoch": 0.9264305177111717,
"grad_norm": 0.41822153329849243,
"learning_rate": 1.1702650952525116e-05,
"loss": 0.39182257652282715,
"step": 680
},
{
"epoch": 0.9277929155313351,
"grad_norm": 0.39568570256233215,
"learning_rate": 1.168089193646401e-05,
"loss": 0.33675575256347656,
"step": 681
},
{
"epoch": 0.9291553133514986,
"grad_norm": 0.3839928209781647,
"learning_rate": 1.1659124727655546e-05,
"loss": 0.34991830587387085,
"step": 682
},
{
"epoch": 0.9305177111716622,
"grad_norm": 0.4019002914428711,
"learning_rate": 1.1637349432194137e-05,
"loss": 0.3185323476791382,
"step": 683
},
{
"epoch": 0.9318801089918256,
"grad_norm": 0.3919433057308197,
"learning_rate": 1.1615566156213609e-05,
"loss": 0.31005561351776123,
"step": 684
},
{
"epoch": 0.9332425068119891,
"grad_norm": 0.3654928207397461,
"learning_rate": 1.1593775005886687e-05,
"loss": 0.3170800507068634,
"step": 685
},
{
"epoch": 0.9346049046321526,
"grad_norm": 0.4315038025379181,
"learning_rate": 1.1571976087424478e-05,
"loss": 0.4050094485282898,
"step": 686
},
{
"epoch": 0.9359673024523161,
"grad_norm": 0.3850279450416565,
"learning_rate": 1.1550169507075939e-05,
"loss": 0.3527379631996155,
"step": 687
},
{
"epoch": 0.9373297002724795,
"grad_norm": 0.39894241094589233,
"learning_rate": 1.1528355371127396e-05,
"loss": 0.35128986835479736,
"step": 688
},
{
"epoch": 0.9386920980926431,
"grad_norm": 0.39317700266838074,
"learning_rate": 1.1506533785901977e-05,
"loss": 0.32606303691864014,
"step": 689
},
{
"epoch": 0.9400544959128065,
"grad_norm": 0.33535271883010864,
"learning_rate": 1.148470485775913e-05,
"loss": 0.26627829670906067,
"step": 690
},
{
"epoch": 0.94141689373297,
"grad_norm": 0.38109129667282104,
"learning_rate": 1.146286869309409e-05,
"loss": 0.3295374810695648,
"step": 691
},
{
"epoch": 0.9427792915531336,
"grad_norm": 0.3924390375614166,
"learning_rate": 1.1441025398337365e-05,
"loss": 0.3435341417789459,
"step": 692
},
{
"epoch": 0.944141689373297,
"grad_norm": 0.42672890424728394,
"learning_rate": 1.141917507995421e-05,
"loss": 0.36071181297302246,
"step": 693
},
{
"epoch": 0.9455040871934605,
"grad_norm": 0.417208194732666,
"learning_rate": 1.1397317844444125e-05,
"loss": 0.38000190258026123,
"step": 694
},
{
"epoch": 0.946866485013624,
"grad_norm": 0.38172468543052673,
"learning_rate": 1.137545379834031e-05,
"loss": 0.30421239137649536,
"step": 695
},
{
"epoch": 0.9482288828337875,
"grad_norm": 0.32868707180023193,
"learning_rate": 1.1353583048209171e-05,
"loss": 0.2640570104122162,
"step": 696
},
{
"epoch": 0.9495912806539509,
"grad_norm": 0.39148321747779846,
"learning_rate": 1.1331705700649786e-05,
"loss": 0.32436051964759827,
"step": 697
},
{
"epoch": 0.9509536784741145,
"grad_norm": 0.4759628176689148,
"learning_rate": 1.1309821862293385e-05,
"loss": 0.41697192192077637,
"step": 698
},
{
"epoch": 0.952316076294278,
"grad_norm": 0.38073548674583435,
"learning_rate": 1.128793163980284e-05,
"loss": 0.323738157749176,
"step": 699
},
{
"epoch": 0.9536784741144414,
"grad_norm": 0.3676002025604248,
"learning_rate": 1.1266035139872142e-05,
"loss": 0.288091778755188,
"step": 700
},
{
"epoch": 0.9550408719346049,
"grad_norm": 0.3463435173034668,
"learning_rate": 1.1244132469225872e-05,
"loss": 0.29015398025512695,
"step": 701
},
{
"epoch": 0.9564032697547684,
"grad_norm": 0.3797283470630646,
"learning_rate": 1.1222223734618689e-05,
"loss": 0.34152185916900635,
"step": 702
},
{
"epoch": 0.9577656675749319,
"grad_norm": 0.3729090094566345,
"learning_rate": 1.120030904283481e-05,
"loss": 0.2951827943325043,
"step": 703
},
{
"epoch": 0.9591280653950953,
"grad_norm": 0.37600177526474,
"learning_rate": 1.1178388500687482e-05,
"loss": 0.3445882797241211,
"step": 704
},
{
"epoch": 0.9604904632152589,
"grad_norm": 0.3924142122268677,
"learning_rate": 1.115646221501848e-05,
"loss": 0.34941422939300537,
"step": 705
},
{
"epoch": 0.9618528610354223,
"grad_norm": 0.3414173424243927,
"learning_rate": 1.1134530292697558e-05,
"loss": 0.2858058214187622,
"step": 706
},
{
"epoch": 0.9632152588555858,
"grad_norm": 0.35240837931632996,
"learning_rate": 1.1112592840621954e-05,
"loss": 0.29714637994766235,
"step": 707
},
{
"epoch": 0.9645776566757494,
"grad_norm": 0.4237878620624542,
"learning_rate": 1.1090649965715852e-05,
"loss": 0.3578713536262512,
"step": 708
},
{
"epoch": 0.9659400544959128,
"grad_norm": 0.33575090765953064,
"learning_rate": 1.1068701774929868e-05,
"loss": 0.2601892352104187,
"step": 709
},
{
"epoch": 0.9673024523160763,
"grad_norm": 0.4071582555770874,
"learning_rate": 1.1046748375240532e-05,
"loss": 0.3512653410434723,
"step": 710
},
{
"epoch": 0.9686648501362398,
"grad_norm": 0.3828555941581726,
"learning_rate": 1.1024789873649761e-05,
"loss": 0.3424757719039917,
"step": 711
},
{
"epoch": 0.9700272479564033,
"grad_norm": 0.3985345661640167,
"learning_rate": 1.1002826377184334e-05,
"loss": 0.3405340313911438,
"step": 712
},
{
"epoch": 0.9713896457765667,
"grad_norm": 0.3681023120880127,
"learning_rate": 1.0980857992895381e-05,
"loss": 0.30252936482429504,
"step": 713
},
{
"epoch": 0.9727520435967303,
"grad_norm": 0.4133037328720093,
"learning_rate": 1.0958884827857853e-05,
"loss": 0.39566880464553833,
"step": 714
},
{
"epoch": 0.9741144414168937,
"grad_norm": 0.3889504671096802,
"learning_rate": 1.0936906989170004e-05,
"loss": 0.3164287805557251,
"step": 715
},
{
"epoch": 0.9754768392370572,
"grad_norm": 0.42561522126197815,
"learning_rate": 1.0914924583952864e-05,
"loss": 0.39496541023254395,
"step": 716
},
{
"epoch": 0.9768392370572208,
"grad_norm": 0.4179486632347107,
"learning_rate": 1.0892937719349723e-05,
"loss": 0.3516708016395569,
"step": 717
},
{
"epoch": 0.9782016348773842,
"grad_norm": 0.4315149188041687,
"learning_rate": 1.087094650252561e-05,
"loss": 0.40251898765563965,
"step": 718
},
{
"epoch": 0.9795640326975477,
"grad_norm": 0.4050588309764862,
"learning_rate": 1.0848951040666762e-05,
"loss": 0.3581554591655731,
"step": 719
},
{
"epoch": 0.9809264305177112,
"grad_norm": 0.4035855829715729,
"learning_rate": 1.0826951440980105e-05,
"loss": 0.3374115526676178,
"step": 720
},
{
"epoch": 0.9822888283378747,
"grad_norm": 0.3701610565185547,
"learning_rate": 1.0804947810692736e-05,
"loss": 0.3004315495491028,
"step": 721
},
{
"epoch": 0.9836512261580381,
"grad_norm": 0.4019452631473541,
"learning_rate": 1.07829402570514e-05,
"loss": 0.3550060987472534,
"step": 722
},
{
"epoch": 0.9850136239782016,
"grad_norm": 0.4384351968765259,
"learning_rate": 1.076092888732196e-05,
"loss": 0.4013964831829071,
"step": 723
},
{
"epoch": 0.9863760217983651,
"grad_norm": 0.41561856865882874,
"learning_rate": 1.073891380878888e-05,
"loss": 0.3638450801372528,
"step": 724
},
{
"epoch": 0.9877384196185286,
"grad_norm": 0.4159381687641144,
"learning_rate": 1.0716895128754704e-05,
"loss": 0.3947365880012512,
"step": 725
},
{
"epoch": 0.989100817438692,
"grad_norm": 0.3710257112979889,
"learning_rate": 1.069487295453952e-05,
"loss": 0.32494619488716125,
"step": 726
},
{
"epoch": 0.9904632152588556,
"grad_norm": 0.3763940930366516,
"learning_rate": 1.0672847393480466e-05,
"loss": 0.32123690843582153,
"step": 727
},
{
"epoch": 0.9918256130790191,
"grad_norm": 0.4262928366661072,
"learning_rate": 1.0650818552931162e-05,
"loss": 0.344281405210495,
"step": 728
},
{
"epoch": 0.9931880108991825,
"grad_norm": 0.4106824994087219,
"learning_rate": 1.0628786540261235e-05,
"loss": 0.3853453993797302,
"step": 729
},
{
"epoch": 0.9945504087193461,
"grad_norm": 0.4608916640281677,
"learning_rate": 1.0606751462855764e-05,
"loss": 0.3871040940284729,
"step": 730
},
{
"epoch": 0.9959128065395095,
"grad_norm": 0.41636621952056885,
"learning_rate": 1.0584713428114764e-05,
"loss": 0.34777921438217163,
"step": 731
},
{
"epoch": 0.997275204359673,
"grad_norm": 0.41956406831741333,
"learning_rate": 1.0562672543452666e-05,
"loss": 0.38779330253601074,
"step": 732
},
{
"epoch": 0.9986376021798365,
"grad_norm": 0.35188162326812744,
"learning_rate": 1.0540628916297791e-05,
"loss": 0.2766571640968323,
"step": 733
},
{
"epoch": 1.0,
"grad_norm": 0.3943222165107727,
"learning_rate": 1.0518582654091824e-05,
"loss": 0.33342719078063965,
"step": 734
}
],
"logging_steps": 1,
"max_steps": 1468,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.300068371614925e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}