{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 849,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0035335689045936395,
"grad_norm": 60.915947218212864,
"learning_rate": 0.0,
"loss": 10.9992,
"step": 1
},
{
"epoch": 0.007067137809187279,
"grad_norm": 64.10318663909388,
"learning_rate": 5.882352941176471e-07,
"loss": 10.9171,
"step": 2
},
{
"epoch": 0.01060070671378092,
"grad_norm": 60.47482396290011,
"learning_rate": 1.1764705882352942e-06,
"loss": 11.0562,
"step": 3
},
{
"epoch": 0.014134275618374558,
"grad_norm": 63.71352051567491,
"learning_rate": 1.7647058823529412e-06,
"loss": 10.8496,
"step": 4
},
{
"epoch": 0.0176678445229682,
"grad_norm": 69.613514959522,
"learning_rate": 2.3529411764705885e-06,
"loss": 10.767,
"step": 5
},
{
"epoch": 0.02120141342756184,
"grad_norm": 78.8599942990314,
"learning_rate": 2.9411764705882355e-06,
"loss": 10.4573,
"step": 6
},
{
"epoch": 0.024734982332155476,
"grad_norm": 110.83568335175693,
"learning_rate": 3.5294117647058825e-06,
"loss": 9.0411,
"step": 7
},
{
"epoch": 0.028268551236749116,
"grad_norm": 128.0506477179604,
"learning_rate": 4.11764705882353e-06,
"loss": 8.3947,
"step": 8
},
{
"epoch": 0.03180212014134275,
"grad_norm": 57.84424544715983,
"learning_rate": 4.705882352941177e-06,
"loss": 3.1653,
"step": 9
},
{
"epoch": 0.0353356890459364,
"grad_norm": 47.43758353410931,
"learning_rate": 5.294117647058824e-06,
"loss": 2.8269,
"step": 10
},
{
"epoch": 0.038869257950530034,
"grad_norm": 30.47579424893921,
"learning_rate": 5.882352941176471e-06,
"loss": 2.1518,
"step": 11
},
{
"epoch": 0.04240282685512368,
"grad_norm": 6.6064124401289055,
"learning_rate": 6.470588235294119e-06,
"loss": 1.2543,
"step": 12
},
{
"epoch": 0.045936395759717315,
"grad_norm": 4.213460425424173,
"learning_rate": 7.058823529411765e-06,
"loss": 1.1488,
"step": 13
},
{
"epoch": 0.04946996466431095,
"grad_norm": 3.1561702981393362,
"learning_rate": 7.647058823529413e-06,
"loss": 1.0632,
"step": 14
},
{
"epoch": 0.053003533568904596,
"grad_norm": 2.3023241595554103,
"learning_rate": 8.23529411764706e-06,
"loss": 0.999,
"step": 15
},
{
"epoch": 0.05653710247349823,
"grad_norm": 1.5732319581114818,
"learning_rate": 8.823529411764707e-06,
"loss": 0.9123,
"step": 16
},
{
"epoch": 0.06007067137809187,
"grad_norm": 151.88689001835468,
"learning_rate": 9.411764705882354e-06,
"loss": 0.9755,
"step": 17
},
{
"epoch": 0.0636042402826855,
"grad_norm": 11.280512510145474,
"learning_rate": 1e-05,
"loss": 0.7992,
"step": 18
},
{
"epoch": 0.06713780918727916,
"grad_norm": 1.5318352996869098,
"learning_rate": 1.0588235294117648e-05,
"loss": 0.8071,
"step": 19
},
{
"epoch": 0.0706713780918728,
"grad_norm": 1.038654499930616,
"learning_rate": 1.1176470588235295e-05,
"loss": 0.7669,
"step": 20
},
{
"epoch": 0.07420494699646643,
"grad_norm": 0.7700972154611778,
"learning_rate": 1.1764705882352942e-05,
"loss": 0.7321,
"step": 21
},
{
"epoch": 0.07773851590106007,
"grad_norm": 0.6807121821831401,
"learning_rate": 1.2352941176470589e-05,
"loss": 0.6724,
"step": 22
},
{
"epoch": 0.0812720848056537,
"grad_norm": 0.8753468251115443,
"learning_rate": 1.2941176470588238e-05,
"loss": 0.6904,
"step": 23
},
{
"epoch": 0.08480565371024736,
"grad_norm": 0.8471768043784549,
"learning_rate": 1.3529411764705883e-05,
"loss": 0.6802,
"step": 24
},
{
"epoch": 0.08833922261484099,
"grad_norm": 0.6825164333199342,
"learning_rate": 1.411764705882353e-05,
"loss": 0.6598,
"step": 25
},
{
"epoch": 0.09187279151943463,
"grad_norm": 0.5228167319555727,
"learning_rate": 1.4705882352941177e-05,
"loss": 0.6326,
"step": 26
},
{
"epoch": 0.09540636042402827,
"grad_norm": 0.6983766074123856,
"learning_rate": 1.5294117647058826e-05,
"loss": 0.6154,
"step": 27
},
{
"epoch": 0.0989399293286219,
"grad_norm": 0.7116754206593494,
"learning_rate": 1.588235294117647e-05,
"loss": 0.6269,
"step": 28
},
{
"epoch": 0.10247349823321555,
"grad_norm": 0.5169699652022289,
"learning_rate": 1.647058823529412e-05,
"loss": 0.6016,
"step": 29
},
{
"epoch": 0.10600706713780919,
"grad_norm": 0.5371566354732616,
"learning_rate": 1.7058823529411767e-05,
"loss": 0.5748,
"step": 30
},
{
"epoch": 0.10954063604240283,
"grad_norm": 0.5224543330968032,
"learning_rate": 1.7647058823529414e-05,
"loss": 0.5898,
"step": 31
},
{
"epoch": 0.11307420494699646,
"grad_norm": 0.47454483722545887,
"learning_rate": 1.8235294117647057e-05,
"loss": 0.5519,
"step": 32
},
{
"epoch": 0.1166077738515901,
"grad_norm": 0.5232460300562881,
"learning_rate": 1.8823529411764708e-05,
"loss": 0.5832,
"step": 33
},
{
"epoch": 0.12014134275618374,
"grad_norm": 0.3835383325612776,
"learning_rate": 1.9411764705882355e-05,
"loss": 0.5602,
"step": 34
},
{
"epoch": 0.12367491166077739,
"grad_norm": 0.3817117716241729,
"learning_rate": 2e-05,
"loss": 0.5491,
"step": 35
},
{
"epoch": 0.127208480565371,
"grad_norm": 0.457775894654442,
"learning_rate": 2.058823529411765e-05,
"loss": 0.5476,
"step": 36
},
{
"epoch": 0.13074204946996468,
"grad_norm": 0.4325617493536125,
"learning_rate": 2.1176470588235296e-05,
"loss": 0.549,
"step": 37
},
{
"epoch": 0.13427561837455831,
"grad_norm": 0.38383135270733976,
"learning_rate": 2.1764705882352943e-05,
"loss": 0.5528,
"step": 38
},
{
"epoch": 0.13780918727915195,
"grad_norm": 0.2992854423323367,
"learning_rate": 2.235294117647059e-05,
"loss": 0.518,
"step": 39
},
{
"epoch": 0.1413427561837456,
"grad_norm": 0.38740229663140635,
"learning_rate": 2.2941176470588237e-05,
"loss": 0.558,
"step": 40
},
{
"epoch": 0.14487632508833923,
"grad_norm": 0.43891017636377233,
"learning_rate": 2.3529411764705884e-05,
"loss": 0.5446,
"step": 41
},
{
"epoch": 0.14840989399293286,
"grad_norm": 0.29844952078426507,
"learning_rate": 2.411764705882353e-05,
"loss": 0.5159,
"step": 42
},
{
"epoch": 0.1519434628975265,
"grad_norm": 0.31588233441676267,
"learning_rate": 2.4705882352941178e-05,
"loss": 0.5092,
"step": 43
},
{
"epoch": 0.15547703180212014,
"grad_norm": 0.3799226945090326,
"learning_rate": 2.5294117647058825e-05,
"loss": 0.5105,
"step": 44
},
{
"epoch": 0.15901060070671377,
"grad_norm": 0.29880981827963293,
"learning_rate": 2.5882352941176475e-05,
"loss": 0.504,
"step": 45
},
{
"epoch": 0.1625441696113074,
"grad_norm": 0.31834513835229894,
"learning_rate": 2.647058823529412e-05,
"loss": 0.5292,
"step": 46
},
{
"epoch": 0.16607773851590105,
"grad_norm": 0.29955818409743495,
"learning_rate": 2.7058823529411766e-05,
"loss": 0.5112,
"step": 47
},
{
"epoch": 0.1696113074204947,
"grad_norm": 0.3130731713365594,
"learning_rate": 2.7647058823529416e-05,
"loss": 0.5201,
"step": 48
},
{
"epoch": 0.17314487632508835,
"grad_norm": 0.28339290286566854,
"learning_rate": 2.823529411764706e-05,
"loss": 0.5032,
"step": 49
},
{
"epoch": 0.17667844522968199,
"grad_norm": 0.31859381886848664,
"learning_rate": 2.8823529411764703e-05,
"loss": 0.5129,
"step": 50
},
{
"epoch": 0.18021201413427562,
"grad_norm": 0.25020833754192034,
"learning_rate": 2.9411764705882354e-05,
"loss": 0.4656,
"step": 51
},
{
"epoch": 0.18374558303886926,
"grad_norm": 0.2595743456229774,
"learning_rate": 3e-05,
"loss": 0.4893,
"step": 52
},
{
"epoch": 0.1872791519434629,
"grad_norm": 0.31856066330423727,
"learning_rate": 3.058823529411765e-05,
"loss": 0.4889,
"step": 53
},
{
"epoch": 0.19081272084805653,
"grad_norm": 0.24160265118906338,
"learning_rate": 3.11764705882353e-05,
"loss": 0.498,
"step": 54
},
{
"epoch": 0.19434628975265017,
"grad_norm": 0.23920247172619422,
"learning_rate": 3.176470588235294e-05,
"loss": 0.4658,
"step": 55
},
{
"epoch": 0.1978798586572438,
"grad_norm": 0.3273257149122676,
"learning_rate": 3.235294117647059e-05,
"loss": 0.4883,
"step": 56
},
{
"epoch": 0.20141342756183744,
"grad_norm": 0.2317315478684021,
"learning_rate": 3.294117647058824e-05,
"loss": 0.4803,
"step": 57
},
{
"epoch": 0.2049469964664311,
"grad_norm": 0.24363506760884004,
"learning_rate": 3.352941176470588e-05,
"loss": 0.4759,
"step": 58
},
{
"epoch": 0.20848056537102475,
"grad_norm": 0.27309799511326494,
"learning_rate": 3.411764705882353e-05,
"loss": 0.4585,
"step": 59
},
{
"epoch": 0.21201413427561838,
"grad_norm": 0.25020575455338995,
"learning_rate": 3.470588235294118e-05,
"loss": 0.4722,
"step": 60
},
{
"epoch": 0.21554770318021202,
"grad_norm": 0.25843518698071305,
"learning_rate": 3.529411764705883e-05,
"loss": 0.4677,
"step": 61
},
{
"epoch": 0.21908127208480566,
"grad_norm": 0.4059319789736498,
"learning_rate": 3.5882352941176474e-05,
"loss": 0.4737,
"step": 62
},
{
"epoch": 0.2226148409893993,
"grad_norm": 0.24816463707665684,
"learning_rate": 3.6470588235294114e-05,
"loss": 0.4674,
"step": 63
},
{
"epoch": 0.22614840989399293,
"grad_norm": 0.24767924295244156,
"learning_rate": 3.705882352941177e-05,
"loss": 0.4725,
"step": 64
},
{
"epoch": 0.22968197879858657,
"grad_norm": 0.728699744020425,
"learning_rate": 3.7647058823529415e-05,
"loss": 0.4741,
"step": 65
},
{
"epoch": 0.2332155477031802,
"grad_norm": 0.23756283262158423,
"learning_rate": 3.8235294117647055e-05,
"loss": 0.4916,
"step": 66
},
{
"epoch": 0.23674911660777384,
"grad_norm": 0.23144574303148777,
"learning_rate": 3.882352941176471e-05,
"loss": 0.4547,
"step": 67
},
{
"epoch": 0.24028268551236748,
"grad_norm": 0.25750431816308317,
"learning_rate": 3.9411764705882356e-05,
"loss": 0.4716,
"step": 68
},
{
"epoch": 0.24381625441696114,
"grad_norm": 0.23956478649970786,
"learning_rate": 4e-05,
"loss": 0.457,
"step": 69
},
{
"epoch": 0.24734982332155478,
"grad_norm": 0.25522979169861726,
"learning_rate": 4.058823529411765e-05,
"loss": 0.4763,
"step": 70
},
{
"epoch": 0.2508833922261484,
"grad_norm": 0.29313104519920113,
"learning_rate": 4.11764705882353e-05,
"loss": 0.5063,
"step": 71
},
{
"epoch": 0.254416961130742,
"grad_norm": 0.2507020422011976,
"learning_rate": 4.1764705882352944e-05,
"loss": 0.4551,
"step": 72
},
{
"epoch": 0.2579505300353357,
"grad_norm": 0.24953845436359837,
"learning_rate": 4.235294117647059e-05,
"loss": 0.4665,
"step": 73
},
{
"epoch": 0.26148409893992935,
"grad_norm": 0.23233676257775024,
"learning_rate": 4.294117647058823e-05,
"loss": 0.4422,
"step": 74
},
{
"epoch": 0.26501766784452296,
"grad_norm": 0.25771719082595745,
"learning_rate": 4.3529411764705885e-05,
"loss": 0.4603,
"step": 75
},
{
"epoch": 0.26855123674911663,
"grad_norm": 0.2617472693024197,
"learning_rate": 4.411764705882353e-05,
"loss": 0.457,
"step": 76
},
{
"epoch": 0.27208480565371024,
"grad_norm": 0.2415237796263424,
"learning_rate": 4.470588235294118e-05,
"loss": 0.4382,
"step": 77
},
{
"epoch": 0.2756183745583039,
"grad_norm": 0.29177363461613093,
"learning_rate": 4.5294117647058826e-05,
"loss": 0.4361,
"step": 78
},
{
"epoch": 0.2791519434628975,
"grad_norm": 0.24526961915466144,
"learning_rate": 4.588235294117647e-05,
"loss": 0.4453,
"step": 79
},
{
"epoch": 0.2826855123674912,
"grad_norm": 0.3235167042689137,
"learning_rate": 4.647058823529412e-05,
"loss": 0.4627,
"step": 80
},
{
"epoch": 0.2862190812720848,
"grad_norm": 0.2774491786206036,
"learning_rate": 4.705882352941177e-05,
"loss": 0.4467,
"step": 81
},
{
"epoch": 0.28975265017667845,
"grad_norm": 0.26399667907923735,
"learning_rate": 4.7647058823529414e-05,
"loss": 0.469,
"step": 82
},
{
"epoch": 0.29328621908127206,
"grad_norm": 0.2694824011788622,
"learning_rate": 4.823529411764706e-05,
"loss": 0.4308,
"step": 83
},
{
"epoch": 0.2968197879858657,
"grad_norm": 0.24844917007871703,
"learning_rate": 4.882352941176471e-05,
"loss": 0.4357,
"step": 84
},
{
"epoch": 0.3003533568904594,
"grad_norm": 0.27120853528650785,
"learning_rate": 4.9411764705882355e-05,
"loss": 0.4466,
"step": 85
},
{
"epoch": 0.303886925795053,
"grad_norm": 0.23599440032214467,
"learning_rate": 5e-05,
"loss": 0.4325,
"step": 86
},
{
"epoch": 0.30742049469964666,
"grad_norm": 0.274289539977184,
"learning_rate": 4.993455497382199e-05,
"loss": 0.4576,
"step": 87
},
{
"epoch": 0.31095406360424027,
"grad_norm": 0.30836653075762893,
"learning_rate": 4.986910994764398e-05,
"loss": 0.4485,
"step": 88
},
{
"epoch": 0.31448763250883394,
"grad_norm": 0.29597436853005576,
"learning_rate": 4.980366492146597e-05,
"loss": 0.4392,
"step": 89
},
{
"epoch": 0.31802120141342755,
"grad_norm": 0.28288300746868683,
"learning_rate": 4.973821989528796e-05,
"loss": 0.4534,
"step": 90
},
{
"epoch": 0.3215547703180212,
"grad_norm": 0.3186543956261268,
"learning_rate": 4.967277486910995e-05,
"loss": 0.4429,
"step": 91
},
{
"epoch": 0.3250883392226148,
"grad_norm": 0.3104322720447184,
"learning_rate": 4.960732984293194e-05,
"loss": 0.4629,
"step": 92
},
{
"epoch": 0.3286219081272085,
"grad_norm": 0.26445337516046435,
"learning_rate": 4.954188481675393e-05,
"loss": 0.4438,
"step": 93
},
{
"epoch": 0.3321554770318021,
"grad_norm": 0.2852481000594912,
"learning_rate": 4.947643979057592e-05,
"loss": 0.4365,
"step": 94
},
{
"epoch": 0.33568904593639576,
"grad_norm": 0.2617926769128037,
"learning_rate": 4.9410994764397906e-05,
"loss": 0.4422,
"step": 95
},
{
"epoch": 0.3392226148409894,
"grad_norm": 0.25703766888160945,
"learning_rate": 4.93455497382199e-05,
"loss": 0.4211,
"step": 96
},
{
"epoch": 0.34275618374558303,
"grad_norm": 0.23065991913082493,
"learning_rate": 4.928010471204188e-05,
"loss": 0.4552,
"step": 97
},
{
"epoch": 0.3462897526501767,
"grad_norm": 0.2557457296483132,
"learning_rate": 4.921465968586388e-05,
"loss": 0.4366,
"step": 98
},
{
"epoch": 0.3498233215547703,
"grad_norm": 0.21943442364699545,
"learning_rate": 4.9149214659685867e-05,
"loss": 0.4311,
"step": 99
},
{
"epoch": 0.35335689045936397,
"grad_norm": 0.2711259301133924,
"learning_rate": 4.9083769633507855e-05,
"loss": 0.4525,
"step": 100
},
{
"epoch": 0.3568904593639576,
"grad_norm": 0.24693662456422702,
"learning_rate": 4.9018324607329844e-05,
"loss": 0.424,
"step": 101
},
{
"epoch": 0.36042402826855124,
"grad_norm": 0.2319620097290189,
"learning_rate": 4.895287958115183e-05,
"loss": 0.4498,
"step": 102
},
{
"epoch": 0.36395759717314485,
"grad_norm": 0.2779603142533072,
"learning_rate": 4.888743455497383e-05,
"loss": 0.4297,
"step": 103
},
{
"epoch": 0.3674911660777385,
"grad_norm": 0.2318080007022888,
"learning_rate": 4.8821989528795816e-05,
"loss": 0.4385,
"step": 104
},
{
"epoch": 0.3710247349823322,
"grad_norm": 0.27447088657845,
"learning_rate": 4.8756544502617804e-05,
"loss": 0.463,
"step": 105
},
{
"epoch": 0.3745583038869258,
"grad_norm": 0.24680872376982393,
"learning_rate": 4.869109947643979e-05,
"loss": 0.437,
"step": 106
},
{
"epoch": 0.37809187279151946,
"grad_norm": 0.23258452949469693,
"learning_rate": 4.862565445026178e-05,
"loss": 0.4296,
"step": 107
},
{
"epoch": 0.38162544169611307,
"grad_norm": 0.2621876825453743,
"learning_rate": 4.856020942408378e-05,
"loss": 0.4477,
"step": 108
},
{
"epoch": 0.38515901060070673,
"grad_norm": 0.22058245519119254,
"learning_rate": 4.8494764397905765e-05,
"loss": 0.4335,
"step": 109
},
{
"epoch": 0.38869257950530034,
"grad_norm": 0.27596660525429606,
"learning_rate": 4.842931937172775e-05,
"loss": 0.4445,
"step": 110
},
{
"epoch": 0.392226148409894,
"grad_norm": 0.21513578503231365,
"learning_rate": 4.836387434554974e-05,
"loss": 0.4307,
"step": 111
},
{
"epoch": 0.3957597173144876,
"grad_norm": 0.2629720105758817,
"learning_rate": 4.829842931937173e-05,
"loss": 0.4356,
"step": 112
},
{
"epoch": 0.3992932862190813,
"grad_norm": 0.27909247278464644,
"learning_rate": 4.823298429319372e-05,
"loss": 0.448,
"step": 113
},
{
"epoch": 0.4028268551236749,
"grad_norm": 0.22664868240455938,
"learning_rate": 4.816753926701571e-05,
"loss": 0.4241,
"step": 114
},
{
"epoch": 0.40636042402826855,
"grad_norm": 0.278364471495712,
"learning_rate": 4.8102094240837696e-05,
"loss": 0.4363,
"step": 115
},
{
"epoch": 0.4098939929328622,
"grad_norm": 0.25681897453774133,
"learning_rate": 4.803664921465969e-05,
"loss": 0.4341,
"step": 116
},
{
"epoch": 0.4134275618374558,
"grad_norm": 0.2502847675719995,
"learning_rate": 4.797120418848168e-05,
"loss": 0.4207,
"step": 117
},
{
"epoch": 0.4169611307420495,
"grad_norm": 0.2834770194171487,
"learning_rate": 4.790575916230366e-05,
"loss": 0.4304,
"step": 118
},
{
"epoch": 0.4204946996466431,
"grad_norm": 0.26093436241203904,
"learning_rate": 4.784031413612566e-05,
"loss": 0.4218,
"step": 119
},
{
"epoch": 0.42402826855123676,
"grad_norm": 0.25891273155901756,
"learning_rate": 4.7774869109947645e-05,
"loss": 0.4174,
"step": 120
},
{
"epoch": 0.4275618374558304,
"grad_norm": 0.2312860577869842,
"learning_rate": 4.770942408376964e-05,
"loss": 0.4261,
"step": 121
},
{
"epoch": 0.43109540636042404,
"grad_norm": 0.2917974512874463,
"learning_rate": 4.764397905759162e-05,
"loss": 0.4184,
"step": 122
},
{
"epoch": 0.43462897526501765,
"grad_norm": 0.23287738339123082,
"learning_rate": 4.757853403141361e-05,
"loss": 0.4104,
"step": 123
},
{
"epoch": 0.4381625441696113,
"grad_norm": 0.2664844496788489,
"learning_rate": 4.7513089005235606e-05,
"loss": 0.4322,
"step": 124
},
{
"epoch": 0.4416961130742049,
"grad_norm": 0.2810454345063088,
"learning_rate": 4.7447643979057595e-05,
"loss": 0.422,
"step": 125
},
{
"epoch": 0.4452296819787986,
"grad_norm": 0.24068841225391116,
"learning_rate": 4.738219895287958e-05,
"loss": 0.4155,
"step": 126
},
{
"epoch": 0.44876325088339225,
"grad_norm": 0.3241712958364669,
"learning_rate": 4.731675392670157e-05,
"loss": 0.4184,
"step": 127
},
{
"epoch": 0.45229681978798586,
"grad_norm": 0.25468659396687165,
"learning_rate": 4.725130890052356e-05,
"loss": 0.41,
"step": 128
},
{
"epoch": 0.4558303886925795,
"grad_norm": 0.27482798933347197,
"learning_rate": 4.7185863874345556e-05,
"loss": 0.3919,
"step": 129
},
{
"epoch": 0.45936395759717313,
"grad_norm": 0.26764592282185334,
"learning_rate": 4.7120418848167544e-05,
"loss": 0.4143,
"step": 130
},
{
"epoch": 0.4628975265017668,
"grad_norm": 0.2544397009243222,
"learning_rate": 4.7054973821989526e-05,
"loss": 0.4177,
"step": 131
},
{
"epoch": 0.4664310954063604,
"grad_norm": 0.25128503633111415,
"learning_rate": 4.698952879581152e-05,
"loss": 0.4179,
"step": 132
},
{
"epoch": 0.46996466431095407,
"grad_norm": 0.30565859002250456,
"learning_rate": 4.692408376963351e-05,
"loss": 0.4083,
"step": 133
},
{
"epoch": 0.4734982332155477,
"grad_norm": 0.2892878501054137,
"learning_rate": 4.6858638743455505e-05,
"loss": 0.435,
"step": 134
},
{
"epoch": 0.47703180212014135,
"grad_norm": 0.28263079058280177,
"learning_rate": 4.6793193717277487e-05,
"loss": 0.4173,
"step": 135
},
{
"epoch": 0.48056537102473496,
"grad_norm": 0.2818521474852096,
"learning_rate": 4.6727748691099475e-05,
"loss": 0.4279,
"step": 136
},
{
"epoch": 0.4840989399293286,
"grad_norm": 0.2706194531255763,
"learning_rate": 4.666230366492147e-05,
"loss": 0.4232,
"step": 137
},
{
"epoch": 0.4876325088339223,
"grad_norm": 0.27264415935427744,
"learning_rate": 4.659685863874346e-05,
"loss": 0.4316,
"step": 138
},
{
"epoch": 0.4911660777385159,
"grad_norm": 0.2606117321700416,
"learning_rate": 4.653141361256545e-05,
"loss": 0.4175,
"step": 139
},
{
"epoch": 0.49469964664310956,
"grad_norm": 0.30893212845204077,
"learning_rate": 4.6465968586387436e-05,
"loss": 0.4279,
"step": 140
},
{
"epoch": 0.49823321554770317,
"grad_norm": 0.316191147636726,
"learning_rate": 4.6400523560209424e-05,
"loss": 0.4296,
"step": 141
},
{
"epoch": 0.5017667844522968,
"grad_norm": 0.44619384103405474,
"learning_rate": 4.633507853403142e-05,
"loss": 0.4214,
"step": 142
},
{
"epoch": 0.5053003533568905,
"grad_norm": 0.36123106925919196,
"learning_rate": 4.626963350785341e-05,
"loss": 0.4129,
"step": 143
},
{
"epoch": 0.508833922261484,
"grad_norm": 0.2827273823787523,
"learning_rate": 4.620418848167539e-05,
"loss": 0.4426,
"step": 144
},
{
"epoch": 0.5123674911660777,
"grad_norm": 0.3271577698217905,
"learning_rate": 4.6138743455497385e-05,
"loss": 0.4114,
"step": 145
},
{
"epoch": 0.5159010600706714,
"grad_norm": 0.3449116596285634,
"learning_rate": 4.6073298429319374e-05,
"loss": 0.4249,
"step": 146
},
{
"epoch": 0.519434628975265,
"grad_norm": 0.35785099467531045,
"learning_rate": 4.600785340314136e-05,
"loss": 0.4219,
"step": 147
},
{
"epoch": 0.5229681978798587,
"grad_norm": 0.2859454863314326,
"learning_rate": 4.594240837696335e-05,
"loss": 0.4236,
"step": 148
},
{
"epoch": 0.5265017667844523,
"grad_norm": 0.31025555262515647,
"learning_rate": 4.587696335078534e-05,
"loss": 0.4171,
"step": 149
},
{
"epoch": 0.5300353356890459,
"grad_norm": 0.2968047216548135,
"learning_rate": 4.5811518324607335e-05,
"loss": 0.4163,
"step": 150
},
{
"epoch": 0.5335689045936396,
"grad_norm": 0.297324784894771,
"learning_rate": 4.574607329842932e-05,
"loss": 0.4303,
"step": 151
},
{
"epoch": 0.5371024734982333,
"grad_norm": 0.2937385244466456,
"learning_rate": 4.568062827225131e-05,
"loss": 0.4213,
"step": 152
},
{
"epoch": 0.5406360424028268,
"grad_norm": 0.32084633596161016,
"learning_rate": 4.56151832460733e-05,
"loss": 0.434,
"step": 153
},
{
"epoch": 0.5441696113074205,
"grad_norm": 0.28808014111390984,
"learning_rate": 4.554973821989529e-05,
"loss": 0.4192,
"step": 154
},
{
"epoch": 0.5477031802120141,
"grad_norm": 0.28043048275510013,
"learning_rate": 4.5484293193717284e-05,
"loss": 0.4356,
"step": 155
},
{
"epoch": 0.5512367491166078,
"grad_norm": 0.26722328496771375,
"learning_rate": 4.5418848167539266e-05,
"loss": 0.4097,
"step": 156
},
{
"epoch": 0.5547703180212014,
"grad_norm": 0.28761451608412936,
"learning_rate": 4.535340314136126e-05,
"loss": 0.4186,
"step": 157
},
{
"epoch": 0.558303886925795,
"grad_norm": 0.25997336915483626,
"learning_rate": 4.528795811518325e-05,
"loss": 0.4281,
"step": 158
},
{
"epoch": 0.5618374558303887,
"grad_norm": 0.26984807993403653,
"learning_rate": 4.522251308900524e-05,
"loss": 0.4155,
"step": 159
},
{
"epoch": 0.5653710247349824,
"grad_norm": 0.259428663430806,
"learning_rate": 4.5157068062827226e-05,
"loss": 0.4272,
"step": 160
},
{
"epoch": 0.568904593639576,
"grad_norm": 0.27018996202437295,
"learning_rate": 4.5091623036649215e-05,
"loss": 0.4107,
"step": 161
},
{
"epoch": 0.5724381625441696,
"grad_norm": 0.28533599879851723,
"learning_rate": 4.50261780104712e-05,
"loss": 0.3969,
"step": 162
},
{
"epoch": 0.5759717314487632,
"grad_norm": 0.24200050236306417,
"learning_rate": 4.49607329842932e-05,
"loss": 0.4096,
"step": 163
},
{
"epoch": 0.5795053003533569,
"grad_norm": 0.2994472215128913,
"learning_rate": 4.489528795811519e-05,
"loss": 0.4028,
"step": 164
},
{
"epoch": 0.5830388692579506,
"grad_norm": 0.2665264704805344,
"learning_rate": 4.4829842931937176e-05,
"loss": 0.4011,
"step": 165
},
{
"epoch": 0.5865724381625441,
"grad_norm": 0.28378189066791937,
"learning_rate": 4.4764397905759164e-05,
"loss": 0.4149,
"step": 166
},
{
"epoch": 0.5901060070671378,
"grad_norm": 0.24179155830908294,
"learning_rate": 4.469895287958115e-05,
"loss": 0.4091,
"step": 167
},
{
"epoch": 0.5936395759717314,
"grad_norm": 0.31574025669035954,
"learning_rate": 4.463350785340315e-05,
"loss": 0.4314,
"step": 168
},
{
"epoch": 0.5971731448763251,
"grad_norm": 0.23643166056263606,
"learning_rate": 4.456806282722513e-05,
"loss": 0.4111,
"step": 169
},
{
"epoch": 0.6007067137809188,
"grad_norm": 0.24855634010638014,
"learning_rate": 4.4502617801047125e-05,
"loss": 0.4127,
"step": 170
},
{
"epoch": 0.6042402826855123,
"grad_norm": 0.552889018363363,
"learning_rate": 4.4437172774869113e-05,
"loss": 0.434,
"step": 171
},
{
"epoch": 0.607773851590106,
"grad_norm": 0.24426049935632982,
"learning_rate": 4.43717277486911e-05,
"loss": 0.4169,
"step": 172
},
{
"epoch": 0.6113074204946997,
"grad_norm": 0.26560345384981926,
"learning_rate": 4.430628272251309e-05,
"loss": 0.4451,
"step": 173
},
{
"epoch": 0.6148409893992933,
"grad_norm": 0.24674453152732914,
"learning_rate": 4.424083769633508e-05,
"loss": 0.4124,
"step": 174
},
{
"epoch": 0.6183745583038869,
"grad_norm": 0.23653497598415876,
"learning_rate": 4.417539267015707e-05,
"loss": 0.439,
"step": 175
},
{
"epoch": 0.6219081272084805,
"grad_norm": 0.27816683995639657,
"learning_rate": 4.410994764397906e-05,
"loss": 0.4121,
"step": 176
},
{
"epoch": 0.6254416961130742,
"grad_norm": 0.23829492805608396,
"learning_rate": 4.4044502617801045e-05,
"loss": 0.4025,
"step": 177
},
{
"epoch": 0.6289752650176679,
"grad_norm": 0.23626070956443163,
"learning_rate": 4.397905759162304e-05,
"loss": 0.3949,
"step": 178
},
{
"epoch": 0.6325088339222615,
"grad_norm": 0.26913124550480905,
"learning_rate": 4.391361256544503e-05,
"loss": 0.4198,
"step": 179
},
{
"epoch": 0.6360424028268551,
"grad_norm": 0.26444792575098935,
"learning_rate": 4.384816753926702e-05,
"loss": 0.4152,
"step": 180
},
{
"epoch": 0.6395759717314488,
"grad_norm": 0.28736718825769164,
"learning_rate": 4.3782722513089005e-05,
"loss": 0.4184,
"step": 181
},
{
"epoch": 0.6431095406360424,
"grad_norm": 0.24916328448496777,
"learning_rate": 4.3717277486910994e-05,
"loss": 0.4372,
"step": 182
},
{
"epoch": 0.6466431095406361,
"grad_norm": 0.2580684547501532,
"learning_rate": 4.365183246073299e-05,
"loss": 0.4078,
"step": 183
},
{
"epoch": 0.6501766784452296,
"grad_norm": 0.21443876570470546,
"learning_rate": 4.358638743455498e-05,
"loss": 0.4112,
"step": 184
},
{
"epoch": 0.6537102473498233,
"grad_norm": 0.2585770749848044,
"learning_rate": 4.3520942408376966e-05,
"loss": 0.4275,
"step": 185
},
{
"epoch": 0.657243816254417,
"grad_norm": 0.2306904512126803,
"learning_rate": 4.3455497382198955e-05,
"loss": 0.4062,
"step": 186
},
{
"epoch": 0.6607773851590106,
"grad_norm": 0.22770367490745005,
"learning_rate": 4.339005235602094e-05,
"loss": 0.4169,
"step": 187
},
{
"epoch": 0.6643109540636042,
"grad_norm": 0.2160233520776506,
"learning_rate": 4.332460732984294e-05,
"loss": 0.3954,
"step": 188
},
{
"epoch": 0.6678445229681979,
"grad_norm": 0.22325050602227733,
"learning_rate": 4.325916230366493e-05,
"loss": 0.3919,
"step": 189
},
{
"epoch": 0.6713780918727915,
"grad_norm": 0.22586695676150254,
"learning_rate": 4.319371727748691e-05,
"loss": 0.4041,
"step": 190
},
{
"epoch": 0.6749116607773852,
"grad_norm": 0.19641141654473473,
"learning_rate": 4.3128272251308904e-05,
"loss": 0.39,
"step": 191
},
{
"epoch": 0.6784452296819788,
"grad_norm": 0.2514249590269792,
"learning_rate": 4.306282722513089e-05,
"loss": 0.4102,
"step": 192
},
{
"epoch": 0.6819787985865724,
"grad_norm": 0.21866986557153864,
"learning_rate": 4.299738219895288e-05,
"loss": 0.4035,
"step": 193
},
{
"epoch": 0.6855123674911661,
"grad_norm": 0.25496496117460954,
"learning_rate": 4.293193717277487e-05,
"loss": 0.4036,
"step": 194
},
{
"epoch": 0.6890459363957597,
"grad_norm": 0.2284669898013072,
"learning_rate": 4.286649214659686e-05,
"loss": 0.4036,
"step": 195
},
{
"epoch": 0.6925795053003534,
"grad_norm": 0.22109210174266103,
"learning_rate": 4.280104712041885e-05,
"loss": 0.4008,
"step": 196
},
{
"epoch": 0.696113074204947,
"grad_norm": 0.2550246646227659,
"learning_rate": 4.273560209424084e-05,
"loss": 0.4218,
"step": 197
},
{
"epoch": 0.6996466431095406,
"grad_norm": 0.24670707538924813,
"learning_rate": 4.267015706806283e-05,
"loss": 0.4022,
"step": 198
},
{
"epoch": 0.7031802120141343,
"grad_norm": 0.24528358706276326,
"learning_rate": 4.260471204188482e-05,
"loss": 0.4098,
"step": 199
},
{
"epoch": 0.7067137809187279,
"grad_norm": 0.2804256190765326,
"learning_rate": 4.253926701570681e-05,
"loss": 0.4092,
"step": 200
},
{
"epoch": 0.7102473498233216,
"grad_norm": 0.2575565314122766,
"learning_rate": 4.24738219895288e-05,
"loss": 0.4049,
"step": 201
},
{
"epoch": 0.7137809187279152,
"grad_norm": 0.2853344191193188,
"learning_rate": 4.240837696335079e-05,
"loss": 0.4202,
"step": 202
},
{
"epoch": 0.7173144876325088,
"grad_norm": 0.21537639057332267,
"learning_rate": 4.234293193717277e-05,
"loss": 0.4289,
"step": 203
},
{
"epoch": 0.7208480565371025,
"grad_norm": 0.22304542649399647,
"learning_rate": 4.227748691099477e-05,
"loss": 0.4104,
"step": 204
},
{
"epoch": 0.7243816254416962,
"grad_norm": 0.22935406527517788,
"learning_rate": 4.2212041884816757e-05,
"loss": 0.4,
"step": 205
},
{
"epoch": 0.7279151943462897,
"grad_norm": 0.23304248610855605,
"learning_rate": 4.2146596858638745e-05,
"loss": 0.4107,
"step": 206
},
{
"epoch": 0.7314487632508834,
"grad_norm": 0.23823942972540235,
"learning_rate": 4.2081151832460734e-05,
"loss": 0.432,
"step": 207
},
{
"epoch": 0.734982332155477,
"grad_norm": 0.2543799613946858,
"learning_rate": 4.201570680628272e-05,
"loss": 0.4084,
"step": 208
},
{
"epoch": 0.7385159010600707,
"grad_norm": 0.21095357814341076,
"learning_rate": 4.195026178010472e-05,
"loss": 0.4098,
"step": 209
},
{
"epoch": 0.7420494699646644,
"grad_norm": 0.20730300999287024,
"learning_rate": 4.1884816753926706e-05,
"loss": 0.3811,
"step": 210
},
{
"epoch": 0.7455830388692579,
"grad_norm": 0.2099000376737902,
"learning_rate": 4.181937172774869e-05,
"loss": 0.3894,
"step": 211
},
{
"epoch": 0.7491166077738516,
"grad_norm": 0.2137194170060052,
"learning_rate": 4.175392670157068e-05,
"loss": 0.4038,
"step": 212
},
{
"epoch": 0.7526501766784452,
"grad_norm": 0.22371893504573911,
"learning_rate": 4.168848167539267e-05,
"loss": 0.3907,
"step": 213
},
{
"epoch": 0.7561837455830389,
"grad_norm": 0.2370792283738368,
"learning_rate": 4.162303664921467e-05,
"loss": 0.412,
"step": 214
},
{
"epoch": 0.7597173144876325,
"grad_norm": 0.2236273921731486,
"learning_rate": 4.155759162303665e-05,
"loss": 0.4033,
"step": 215
},
{
"epoch": 0.7632508833922261,
"grad_norm": 0.23095430328041772,
"learning_rate": 4.149214659685864e-05,
"loss": 0.4061,
"step": 216
},
{
"epoch": 0.7667844522968198,
"grad_norm": 0.20554192856952402,
"learning_rate": 4.142670157068063e-05,
"loss": 0.3917,
"step": 217
},
{
"epoch": 0.7703180212014135,
"grad_norm": 0.24685357063684762,
"learning_rate": 4.136125654450262e-05,
"loss": 0.4114,
"step": 218
},
{
"epoch": 0.773851590106007,
"grad_norm": 0.22543283998138222,
"learning_rate": 4.129581151832461e-05,
"loss": 0.3807,
"step": 219
},
{
"epoch": 0.7773851590106007,
"grad_norm": 0.21434337705880507,
"learning_rate": 4.12303664921466e-05,
"loss": 0.3959,
"step": 220
},
{
"epoch": 0.7809187279151943,
"grad_norm": 0.23238999996068074,
"learning_rate": 4.1164921465968586e-05,
"loss": 0.4095,
"step": 221
},
{
"epoch": 0.784452296819788,
"grad_norm": 0.24052040121040505,
"learning_rate": 4.109947643979058e-05,
"loss": 0.3922,
"step": 222
},
{
"epoch": 0.7879858657243817,
"grad_norm": 0.2163145201137993,
"learning_rate": 4.103403141361257e-05,
"loss": 0.405,
"step": 223
},
{
"epoch": 0.7915194346289752,
"grad_norm": 0.20351642197790695,
"learning_rate": 4.096858638743455e-05,
"loss": 0.3867,
"step": 224
},
{
"epoch": 0.7950530035335689,
"grad_norm": 0.22202704181135127,
"learning_rate": 4.090314136125655e-05,
"loss": 0.4211,
"step": 225
},
{
"epoch": 0.7985865724381626,
"grad_norm": 0.25500241711903054,
"learning_rate": 4.0837696335078535e-05,
"loss": 0.3999,
"step": 226
},
{
"epoch": 0.8021201413427562,
"grad_norm": 0.2213815985701867,
"learning_rate": 4.077225130890053e-05,
"loss": 0.4097,
"step": 227
},
{
"epoch": 0.8056537102473498,
"grad_norm": 0.2090038017790055,
"learning_rate": 4.070680628272251e-05,
"loss": 0.3895,
"step": 228
},
{
"epoch": 0.8091872791519434,
"grad_norm": 0.21989502340862954,
"learning_rate": 4.06413612565445e-05,
"loss": 0.3973,
"step": 229
},
{
"epoch": 0.8127208480565371,
"grad_norm": 0.21154791324283787,
"learning_rate": 4.0575916230366496e-05,
"loss": 0.3893,
"step": 230
},
{
"epoch": 0.8162544169611308,
"grad_norm": 0.24907311172327215,
"learning_rate": 4.0510471204188485e-05,
"loss": 0.4085,
"step": 231
},
{
"epoch": 0.8197879858657244,
"grad_norm": 0.2335087351779645,
"learning_rate": 4.044502617801047e-05,
"loss": 0.397,
"step": 232
},
{
"epoch": 0.823321554770318,
"grad_norm": 0.22916339357572055,
"learning_rate": 4.037958115183246e-05,
"loss": 0.4189,
"step": 233
},
{
"epoch": 0.8268551236749117,
"grad_norm": 0.2345032103145984,
"learning_rate": 4.031413612565445e-05,
"loss": 0.4249,
"step": 234
},
{
"epoch": 0.8303886925795053,
"grad_norm": 0.25772495207119794,
"learning_rate": 4.0248691099476446e-05,
"loss": 0.4027,
"step": 235
},
{
"epoch": 0.833922261484099,
"grad_norm": 0.21670957287713344,
"learning_rate": 4.018324607329843e-05,
"loss": 0.3886,
"step": 236
},
{
"epoch": 0.8374558303886925,
"grad_norm": 0.2639563686803717,
"learning_rate": 4.011780104712042e-05,
"loss": 0.4046,
"step": 237
},
{
"epoch": 0.8409893992932862,
"grad_norm": 0.2358482603214206,
"learning_rate": 4.005235602094241e-05,
"loss": 0.3981,
"step": 238
},
{
"epoch": 0.8445229681978799,
"grad_norm": 0.26814903642864074,
"learning_rate": 3.99869109947644e-05,
"loss": 0.4162,
"step": 239
},
{
"epoch": 0.8480565371024735,
"grad_norm": 0.2305536154230553,
"learning_rate": 3.992146596858639e-05,
"loss": 0.3925,
"step": 240
},
{
"epoch": 0.8515901060070671,
"grad_norm": 0.24937470325344963,
"learning_rate": 3.985602094240838e-05,
"loss": 0.3956,
"step": 241
},
{
"epoch": 0.8551236749116607,
"grad_norm": 0.2342195715420916,
"learning_rate": 3.9790575916230365e-05,
"loss": 0.4059,
"step": 242
},
{
"epoch": 0.8586572438162544,
"grad_norm": 0.2566087023417787,
"learning_rate": 3.972513089005236e-05,
"loss": 0.4063,
"step": 243
},
{
"epoch": 0.8621908127208481,
"grad_norm": 0.23478926590717414,
"learning_rate": 3.965968586387435e-05,
"loss": 0.3867,
"step": 244
},
{
"epoch": 0.8657243816254417,
"grad_norm": 0.2798144378177403,
"learning_rate": 3.959424083769634e-05,
"loss": 0.3786,
"step": 245
},
{
"epoch": 0.8692579505300353,
"grad_norm": 0.26349054746710127,
"learning_rate": 3.9528795811518326e-05,
"loss": 0.4156,
"step": 246
},
{
"epoch": 0.872791519434629,
"grad_norm": 0.3117277920653392,
"learning_rate": 3.9463350785340314e-05,
"loss": 0.3995,
"step": 247
},
{
"epoch": 0.8763250883392226,
"grad_norm": 0.2837710772230277,
"learning_rate": 3.939790575916231e-05,
"loss": 0.4167,
"step": 248
},
{
"epoch": 0.8798586572438163,
"grad_norm": 0.26249163491377153,
"learning_rate": 3.933246073298429e-05,
"loss": 0.381,
"step": 249
},
{
"epoch": 0.8833922261484098,
"grad_norm": 0.30295185732757657,
"learning_rate": 3.926701570680629e-05,
"loss": 0.4182,
"step": 250
},
{
"epoch": 0.8869257950530035,
"grad_norm": 0.2741004156009664,
"learning_rate": 3.9201570680628275e-05,
"loss": 0.4019,
"step": 251
},
{
"epoch": 0.8904593639575972,
"grad_norm": 0.2557267625112447,
"learning_rate": 3.9136125654450264e-05,
"loss": 0.3933,
"step": 252
},
{
"epoch": 0.8939929328621908,
"grad_norm": 0.2370373480018411,
"learning_rate": 3.907068062827225e-05,
"loss": 0.3923,
"step": 253
},
{
"epoch": 0.8975265017667845,
"grad_norm": 0.26227996332830733,
"learning_rate": 3.900523560209424e-05,
"loss": 0.3931,
"step": 254
},
{
"epoch": 0.901060070671378,
"grad_norm": 0.21438294763142587,
"learning_rate": 3.893979057591623e-05,
"loss": 0.3984,
"step": 255
},
{
"epoch": 0.9045936395759717,
"grad_norm": 0.2335115276521748,
"learning_rate": 3.8874345549738225e-05,
"loss": 0.3932,
"step": 256
},
{
"epoch": 0.9081272084805654,
"grad_norm": 0.22454393322869942,
"learning_rate": 3.880890052356021e-05,
"loss": 0.3853,
"step": 257
},
{
"epoch": 0.911660777385159,
"grad_norm": 0.2596163843004442,
"learning_rate": 3.87434554973822e-05,
"loss": 0.3801,
"step": 258
},
{
"epoch": 0.9151943462897526,
"grad_norm": 0.19420345121894123,
"learning_rate": 3.867801047120419e-05,
"loss": 0.3828,
"step": 259
},
{
"epoch": 0.9187279151943463,
"grad_norm": 0.22683045863729945,
"learning_rate": 3.861256544502618e-05,
"loss": 0.3989,
"step": 260
},
{
"epoch": 0.9222614840989399,
"grad_norm": 0.22855042914336784,
"learning_rate": 3.8547120418848174e-05,
"loss": 0.3969,
"step": 261
},
{
"epoch": 0.9257950530035336,
"grad_norm": 0.21046655596251596,
"learning_rate": 3.8481675392670156e-05,
"loss": 0.3799,
"step": 262
},
{
"epoch": 0.9293286219081273,
"grad_norm": 0.23049478766989265,
"learning_rate": 3.841623036649215e-05,
"loss": 0.4028,
"step": 263
},
{
"epoch": 0.9328621908127208,
"grad_norm": 0.20902890642844427,
"learning_rate": 3.835078534031414e-05,
"loss": 0.3907,
"step": 264
},
{
"epoch": 0.9363957597173145,
"grad_norm": 0.20683520185197424,
"learning_rate": 3.828534031413613e-05,
"loss": 0.4048,
"step": 265
},
{
"epoch": 0.9399293286219081,
"grad_norm": 0.22321668513959014,
"learning_rate": 3.8219895287958116e-05,
"loss": 0.413,
"step": 266
},
{
"epoch": 0.9434628975265018,
"grad_norm": 0.2181734656436944,
"learning_rate": 3.8154450261780105e-05,
"loss": 0.3827,
"step": 267
},
{
"epoch": 0.9469964664310954,
"grad_norm": 0.2276911197710597,
"learning_rate": 3.80890052356021e-05,
"loss": 0.3936,
"step": 268
},
{
"epoch": 0.950530035335689,
"grad_norm": 0.23932874112389352,
"learning_rate": 3.802356020942409e-05,
"loss": 0.3877,
"step": 269
},
{
"epoch": 0.9540636042402827,
"grad_norm": 0.22251186496446881,
"learning_rate": 3.795811518324607e-05,
"loss": 0.402,
"step": 270
},
{
"epoch": 0.9575971731448764,
"grad_norm": 0.27973449578154436,
"learning_rate": 3.7892670157068066e-05,
"loss": 0.3827,
"step": 271
},
{
"epoch": 0.9611307420494699,
"grad_norm": 0.26523201054623285,
"learning_rate": 3.7827225130890054e-05,
"loss": 0.4056,
"step": 272
},
{
"epoch": 0.9646643109540636,
"grad_norm": 0.24156477293316247,
"learning_rate": 3.776178010471204e-05,
"loss": 0.3963,
"step": 273
},
{
"epoch": 0.9681978798586572,
"grad_norm": 0.247247933584926,
"learning_rate": 3.769633507853403e-05,
"loss": 0.3869,
"step": 274
},
{
"epoch": 0.9717314487632509,
"grad_norm": 0.24084252783826215,
"learning_rate": 3.763089005235602e-05,
"loss": 0.3891,
"step": 275
},
{
"epoch": 0.9752650176678446,
"grad_norm": 0.23139130330483365,
"learning_rate": 3.7565445026178015e-05,
"loss": 0.3904,
"step": 276
},
{
"epoch": 0.9787985865724381,
"grad_norm": 0.2158522253953061,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.4072,
"step": 277
},
{
"epoch": 0.9823321554770318,
"grad_norm": 0.2403314631839938,
"learning_rate": 3.743455497382199e-05,
"loss": 0.3966,
"step": 278
},
{
"epoch": 0.9858657243816255,
"grad_norm": 0.23259587006529284,
"learning_rate": 3.736910994764398e-05,
"loss": 0.4015,
"step": 279
},
{
"epoch": 0.9893992932862191,
"grad_norm": 0.23086223341274428,
"learning_rate": 3.730366492146597e-05,
"loss": 0.3795,
"step": 280
},
{
"epoch": 0.9929328621908127,
"grad_norm": 0.20230822074457405,
"learning_rate": 3.7238219895287964e-05,
"loss": 0.3815,
"step": 281
},
{
"epoch": 0.9964664310954063,
"grad_norm": 0.2280368150756851,
"learning_rate": 3.717277486910995e-05,
"loss": 0.3847,
"step": 282
},
{
"epoch": 1.0,
"grad_norm": 0.19545884067146066,
"learning_rate": 3.7107329842931935e-05,
"loss": 0.3875,
"step": 283
},
{
"epoch": 1.0035335689045937,
"grad_norm": 0.28644557133651055,
"learning_rate": 3.704188481675393e-05,
"loss": 0.3298,
"step": 284
},
{
"epoch": 1.0070671378091873,
"grad_norm": 0.246181441410164,
"learning_rate": 3.697643979057592e-05,
"loss": 0.3181,
"step": 285
},
{
"epoch": 1.010600706713781,
"grad_norm": 0.2381701924051878,
"learning_rate": 3.691099476439791e-05,
"loss": 0.3328,
"step": 286
},
{
"epoch": 1.0141342756183747,
"grad_norm": 0.24659950933351185,
"learning_rate": 3.6845549738219895e-05,
"loss": 0.3205,
"step": 287
},
{
"epoch": 1.017667844522968,
"grad_norm": 0.2914797980768853,
"learning_rate": 3.6780104712041884e-05,
"loss": 0.314,
"step": 288
},
{
"epoch": 1.0212014134275618,
"grad_norm": 0.24512265162041555,
"learning_rate": 3.671465968586388e-05,
"loss": 0.3162,
"step": 289
},
{
"epoch": 1.0247349823321554,
"grad_norm": 0.22593092660338965,
"learning_rate": 3.664921465968587e-05,
"loss": 0.3216,
"step": 290
},
{
"epoch": 1.028268551236749,
"grad_norm": 0.22946799578429655,
"learning_rate": 3.6583769633507856e-05,
"loss": 0.3166,
"step": 291
},
{
"epoch": 1.0318021201413428,
"grad_norm": 0.2960974173892056,
"learning_rate": 3.6518324607329845e-05,
"loss": 0.3316,
"step": 292
},
{
"epoch": 1.0353356890459364,
"grad_norm": 0.23831749027274296,
"learning_rate": 3.645287958115183e-05,
"loss": 0.3126,
"step": 293
},
{
"epoch": 1.03886925795053,
"grad_norm": 0.2296400192793851,
"learning_rate": 3.638743455497383e-05,
"loss": 0.3152,
"step": 294
},
{
"epoch": 1.0424028268551238,
"grad_norm": 0.2666013927167255,
"learning_rate": 3.632198952879581e-05,
"loss": 0.3211,
"step": 295
},
{
"epoch": 1.0459363957597174,
"grad_norm": 0.2017188626594801,
"learning_rate": 3.62565445026178e-05,
"loss": 0.2987,
"step": 296
},
{
"epoch": 1.0494699646643109,
"grad_norm": 0.24680327190368004,
"learning_rate": 3.6191099476439794e-05,
"loss": 0.3264,
"step": 297
},
{
"epoch": 1.0530035335689045,
"grad_norm": 0.23199791849929335,
"learning_rate": 3.612565445026178e-05,
"loss": 0.3132,
"step": 298
},
{
"epoch": 1.0565371024734982,
"grad_norm": 0.23869637562051588,
"learning_rate": 3.606020942408377e-05,
"loss": 0.3265,
"step": 299
},
{
"epoch": 1.0600706713780919,
"grad_norm": 0.22473195380527508,
"learning_rate": 3.599476439790576e-05,
"loss": 0.3015,
"step": 300
},
{
"epoch": 1.0636042402826855,
"grad_norm": 0.23493094331158693,
"learning_rate": 3.592931937172775e-05,
"loss": 0.2889,
"step": 301
},
{
"epoch": 1.0671378091872792,
"grad_norm": 0.22414265321405832,
"learning_rate": 3.586387434554974e-05,
"loss": 0.3268,
"step": 302
},
{
"epoch": 1.0706713780918728,
"grad_norm": 0.21481974366607212,
"learning_rate": 3.579842931937173e-05,
"loss": 0.3086,
"step": 303
},
{
"epoch": 1.0742049469964665,
"grad_norm": 0.20876393358670048,
"learning_rate": 3.5732984293193713e-05,
"loss": 0.3034,
"step": 304
},
{
"epoch": 1.0777385159010602,
"grad_norm": 0.19404425365880137,
"learning_rate": 3.566753926701571e-05,
"loss": 0.2976,
"step": 305
},
{
"epoch": 1.0812720848056536,
"grad_norm": 0.2330147569485359,
"learning_rate": 3.56020942408377e-05,
"loss": 0.3375,
"step": 306
},
{
"epoch": 1.0848056537102473,
"grad_norm": 0.22345787287571597,
"learning_rate": 3.553664921465969e-05,
"loss": 0.3293,
"step": 307
},
{
"epoch": 1.088339222614841,
"grad_norm": 0.20500037658761122,
"learning_rate": 3.5471204188481674e-05,
"loss": 0.3233,
"step": 308
},
{
"epoch": 1.0918727915194346,
"grad_norm": 0.2363661046470141,
"learning_rate": 3.540575916230366e-05,
"loss": 0.323,
"step": 309
},
{
"epoch": 1.0954063604240283,
"grad_norm": 0.2063708256259596,
"learning_rate": 3.534031413612566e-05,
"loss": 0.32,
"step": 310
},
{
"epoch": 1.098939929328622,
"grad_norm": 0.21445020610104967,
"learning_rate": 3.5274869109947647e-05,
"loss": 0.3105,
"step": 311
},
{
"epoch": 1.1024734982332156,
"grad_norm": 0.21848308675353983,
"learning_rate": 3.5209424083769635e-05,
"loss": 0.3057,
"step": 312
},
{
"epoch": 1.1060070671378093,
"grad_norm": 0.21785651614885643,
"learning_rate": 3.5143979057591624e-05,
"loss": 0.3193,
"step": 313
},
{
"epoch": 1.1095406360424027,
"grad_norm": 0.20921803593322577,
"learning_rate": 3.507853403141361e-05,
"loss": 0.3173,
"step": 314
},
{
"epoch": 1.1130742049469964,
"grad_norm": 0.2150900572683641,
"learning_rate": 3.501308900523561e-05,
"loss": 0.3201,
"step": 315
},
{
"epoch": 1.11660777385159,
"grad_norm": 0.23252037987280388,
"learning_rate": 3.4947643979057596e-05,
"loss": 0.3248,
"step": 316
},
{
"epoch": 1.1201413427561837,
"grad_norm": 0.24635081104273918,
"learning_rate": 3.488219895287958e-05,
"loss": 0.3393,
"step": 317
},
{
"epoch": 1.1236749116607774,
"grad_norm": 0.22540836469451786,
"learning_rate": 3.481675392670157e-05,
"loss": 0.3385,
"step": 318
},
{
"epoch": 1.127208480565371,
"grad_norm": 0.25983037388647073,
"learning_rate": 3.475130890052356e-05,
"loss": 0.3057,
"step": 319
},
{
"epoch": 1.1307420494699647,
"grad_norm": 0.23458400529827525,
"learning_rate": 3.468586387434556e-05,
"loss": 0.3219,
"step": 320
},
{
"epoch": 1.1342756183745584,
"grad_norm": 0.21194972760690098,
"learning_rate": 3.462041884816754e-05,
"loss": 0.3035,
"step": 321
},
{
"epoch": 1.137809187279152,
"grad_norm": 0.2359057746689865,
"learning_rate": 3.455497382198953e-05,
"loss": 0.3379,
"step": 322
},
{
"epoch": 1.1413427561837457,
"grad_norm": 0.183280417607474,
"learning_rate": 3.448952879581152e-05,
"loss": 0.2982,
"step": 323
},
{
"epoch": 1.1448763250883391,
"grad_norm": 0.22445482125910166,
"learning_rate": 3.442408376963351e-05,
"loss": 0.3181,
"step": 324
},
{
"epoch": 1.1484098939929328,
"grad_norm": 0.20709956522770923,
"learning_rate": 3.43586387434555e-05,
"loss": 0.3061,
"step": 325
},
{
"epoch": 1.1519434628975265,
"grad_norm": 0.1962996039331923,
"learning_rate": 3.429319371727749e-05,
"loss": 0.3081,
"step": 326
},
{
"epoch": 1.1554770318021201,
"grad_norm": 0.21772893532666654,
"learning_rate": 3.4227748691099476e-05,
"loss": 0.3156,
"step": 327
},
{
"epoch": 1.1590106007067138,
"grad_norm": 0.21856155526695173,
"learning_rate": 3.416230366492147e-05,
"loss": 0.3216,
"step": 328
},
{
"epoch": 1.1625441696113075,
"grad_norm": 0.216148615776384,
"learning_rate": 3.409685863874345e-05,
"loss": 0.3295,
"step": 329
},
{
"epoch": 1.1660777385159011,
"grad_norm": 0.2136688220358534,
"learning_rate": 3.403141361256545e-05,
"loss": 0.3162,
"step": 330
},
{
"epoch": 1.1696113074204948,
"grad_norm": 0.22193150606988146,
"learning_rate": 3.396596858638744e-05,
"loss": 0.3208,
"step": 331
},
{
"epoch": 1.1731448763250882,
"grad_norm": 0.24107500867079923,
"learning_rate": 3.3900523560209426e-05,
"loss": 0.3366,
"step": 332
},
{
"epoch": 1.176678445229682,
"grad_norm": 0.23280505336393884,
"learning_rate": 3.3835078534031414e-05,
"loss": 0.3084,
"step": 333
},
{
"epoch": 1.1802120141342756,
"grad_norm": 0.221142306220354,
"learning_rate": 3.37696335078534e-05,
"loss": 0.3153,
"step": 334
},
{
"epoch": 1.1837455830388692,
"grad_norm": 0.22624148882528342,
"learning_rate": 3.370418848167539e-05,
"loss": 0.3025,
"step": 335
},
{
"epoch": 1.187279151943463,
"grad_norm": 0.23726994702952095,
"learning_rate": 3.3638743455497386e-05,
"loss": 0.3059,
"step": 336
},
{
"epoch": 1.1908127208480566,
"grad_norm": 0.2004801698990464,
"learning_rate": 3.3573298429319375e-05,
"loss": 0.3016,
"step": 337
},
{
"epoch": 1.1943462897526502,
"grad_norm": 0.20917901510881576,
"learning_rate": 3.350785340314136e-05,
"loss": 0.2994,
"step": 338
},
{
"epoch": 1.197879858657244,
"grad_norm": 0.21463380716897973,
"learning_rate": 3.344240837696335e-05,
"loss": 0.3232,
"step": 339
},
{
"epoch": 1.2014134275618376,
"grad_norm": 0.2456594431324656,
"learning_rate": 3.337696335078534e-05,
"loss": 0.3189,
"step": 340
},
{
"epoch": 1.2049469964664312,
"grad_norm": 0.21293544353851196,
"learning_rate": 3.3311518324607336e-05,
"loss": 0.3011,
"step": 341
},
{
"epoch": 1.2084805653710247,
"grad_norm": 0.24068088729907228,
"learning_rate": 3.324607329842932e-05,
"loss": 0.316,
"step": 342
},
{
"epoch": 1.2120141342756183,
"grad_norm": 0.2525547604298334,
"learning_rate": 3.318062827225131e-05,
"loss": 0.3328,
"step": 343
},
{
"epoch": 1.215547703180212,
"grad_norm": 0.24957446532900615,
"learning_rate": 3.31151832460733e-05,
"loss": 0.3077,
"step": 344
},
{
"epoch": 1.2190812720848057,
"grad_norm": 0.21858717685320606,
"learning_rate": 3.304973821989529e-05,
"loss": 0.3341,
"step": 345
},
{
"epoch": 1.2226148409893993,
"grad_norm": 0.22845641979758724,
"learning_rate": 3.298429319371728e-05,
"loss": 0.3062,
"step": 346
},
{
"epoch": 1.226148409893993,
"grad_norm": 0.24355966443975988,
"learning_rate": 3.291884816753927e-05,
"loss": 0.3159,
"step": 347
},
{
"epoch": 1.2296819787985867,
"grad_norm": 0.2266617775003546,
"learning_rate": 3.285340314136126e-05,
"loss": 0.3208,
"step": 348
},
{
"epoch": 1.23321554770318,
"grad_norm": 0.24420569453468005,
"learning_rate": 3.278795811518325e-05,
"loss": 0.3245,
"step": 349
},
{
"epoch": 1.2367491166077738,
"grad_norm": 0.23505093613984024,
"learning_rate": 3.272251308900524e-05,
"loss": 0.3231,
"step": 350
},
{
"epoch": 1.2402826855123674,
"grad_norm": 0.2217615774782852,
"learning_rate": 3.265706806282723e-05,
"loss": 0.3221,
"step": 351
},
{
"epoch": 1.243816254416961,
"grad_norm": 0.22957026541481648,
"learning_rate": 3.2591623036649216e-05,
"loss": 0.3369,
"step": 352
},
{
"epoch": 1.2473498233215548,
"grad_norm": 0.24168709156073725,
"learning_rate": 3.2526178010471204e-05,
"loss": 0.3169,
"step": 353
},
{
"epoch": 1.2508833922261484,
"grad_norm": 0.23110088658805064,
"learning_rate": 3.246073298429319e-05,
"loss": 0.3242,
"step": 354
},
{
"epoch": 1.254416961130742,
"grad_norm": 0.23381613068833576,
"learning_rate": 3.239528795811518e-05,
"loss": 0.3117,
"step": 355
},
{
"epoch": 1.2579505300353357,
"grad_norm": 0.2061504277228851,
"learning_rate": 3.232984293193718e-05,
"loss": 0.2962,
"step": 356
},
{
"epoch": 1.2614840989399294,
"grad_norm": 0.23462705730231123,
"learning_rate": 3.2264397905759165e-05,
"loss": 0.319,
"step": 357
},
{
"epoch": 1.265017667844523,
"grad_norm": 0.2075871787114381,
"learning_rate": 3.2198952879581154e-05,
"loss": 0.3259,
"step": 358
},
{
"epoch": 1.2685512367491167,
"grad_norm": 0.22738824603983201,
"learning_rate": 3.213350785340314e-05,
"loss": 0.318,
"step": 359
},
{
"epoch": 1.2720848056537102,
"grad_norm": 0.2199300428485946,
"learning_rate": 3.206806282722513e-05,
"loss": 0.3153,
"step": 360
},
{
"epoch": 1.2756183745583038,
"grad_norm": 0.19528341860822246,
"learning_rate": 3.2002617801047126e-05,
"loss": 0.3085,
"step": 361
},
{
"epoch": 1.2791519434628975,
"grad_norm": 0.21163815765256305,
"learning_rate": 3.1937172774869115e-05,
"loss": 0.3136,
"step": 362
},
{
"epoch": 1.2826855123674912,
"grad_norm": 0.23441625947113304,
"learning_rate": 3.1871727748691096e-05,
"loss": 0.3134,
"step": 363
},
{
"epoch": 1.2862190812720848,
"grad_norm": 0.21724693353126123,
"learning_rate": 3.180628272251309e-05,
"loss": 0.3139,
"step": 364
},
{
"epoch": 1.2897526501766785,
"grad_norm": 0.21903014439227053,
"learning_rate": 3.174083769633508e-05,
"loss": 0.3323,
"step": 365
},
{
"epoch": 1.293286219081272,
"grad_norm": 0.21103120670805472,
"learning_rate": 3.167539267015707e-05,
"loss": 0.315,
"step": 366
},
{
"epoch": 1.2968197879858656,
"grad_norm": 0.2079844238655552,
"learning_rate": 3.160994764397906e-05,
"loss": 0.3146,
"step": 367
},
{
"epoch": 1.3003533568904593,
"grad_norm": 0.202149682188863,
"learning_rate": 3.1544502617801046e-05,
"loss": 0.3306,
"step": 368
},
{
"epoch": 1.303886925795053,
"grad_norm": 0.21926843901253198,
"learning_rate": 3.147905759162304e-05,
"loss": 0.3185,
"step": 369
},
{
"epoch": 1.3074204946996466,
"grad_norm": 0.22164687081665763,
"learning_rate": 3.141361256544503e-05,
"loss": 0.3252,
"step": 370
},
{
"epoch": 1.3109540636042403,
"grad_norm": 0.9782035902749687,
"learning_rate": 3.134816753926702e-05,
"loss": 0.3634,
"step": 371
},
{
"epoch": 1.314487632508834,
"grad_norm": 0.23250653630170853,
"learning_rate": 3.1282722513089006e-05,
"loss": 0.311,
"step": 372
},
{
"epoch": 1.3180212014134276,
"grad_norm": 0.25389810703134524,
"learning_rate": 3.1217277486910995e-05,
"loss": 0.3087,
"step": 373
},
{
"epoch": 1.3215547703180213,
"grad_norm": 0.22623506006425292,
"learning_rate": 3.115183246073299e-05,
"loss": 0.324,
"step": 374
},
{
"epoch": 1.325088339222615,
"grad_norm": 0.2743617357098891,
"learning_rate": 3.108638743455498e-05,
"loss": 0.327,
"step": 375
},
{
"epoch": 1.3286219081272086,
"grad_norm": 0.29209034732360883,
"learning_rate": 3.102094240837696e-05,
"loss": 0.3163,
"step": 376
},
{
"epoch": 1.332155477031802,
"grad_norm": 0.23817203148632873,
"learning_rate": 3.0955497382198956e-05,
"loss": 0.3183,
"step": 377
},
{
"epoch": 1.3356890459363957,
"grad_norm": 0.32389816433733043,
"learning_rate": 3.0890052356020944e-05,
"loss": 0.3229,
"step": 378
},
{
"epoch": 1.3392226148409894,
"grad_norm": 0.24767917607203235,
"learning_rate": 3.082460732984293e-05,
"loss": 0.3239,
"step": 379
},
{
"epoch": 1.342756183745583,
"grad_norm": 0.2320939039992941,
"learning_rate": 3.075916230366492e-05,
"loss": 0.3019,
"step": 380
},
{
"epoch": 1.3462897526501767,
"grad_norm": 0.2388489114044,
"learning_rate": 3.069371727748691e-05,
"loss": 0.3078,
"step": 381
},
{
"epoch": 1.3498233215547704,
"grad_norm": 0.30119510077575323,
"learning_rate": 3.0628272251308905e-05,
"loss": 0.3316,
"step": 382
},
{
"epoch": 1.353356890459364,
"grad_norm": 0.22616235614825414,
"learning_rate": 3.0562827225130894e-05,
"loss": 0.3036,
"step": 383
},
{
"epoch": 1.3568904593639575,
"grad_norm": 0.23507018582316008,
"learning_rate": 3.049738219895288e-05,
"loss": 0.3193,
"step": 384
},
{
"epoch": 1.3604240282685511,
"grad_norm": 0.2519000434406437,
"learning_rate": 3.043193717277487e-05,
"loss": 0.3331,
"step": 385
},
{
"epoch": 1.3639575971731448,
"grad_norm": 0.26788325147114805,
"learning_rate": 3.036649214659686e-05,
"loss": 0.3404,
"step": 386
},
{
"epoch": 1.3674911660777385,
"grad_norm": 0.25830034537540286,
"learning_rate": 3.030104712041885e-05,
"loss": 0.3139,
"step": 387
},
{
"epoch": 1.3710247349823321,
"grad_norm": 0.23139511563263715,
"learning_rate": 3.023560209424084e-05,
"loss": 0.332,
"step": 388
},
{
"epoch": 1.3745583038869258,
"grad_norm": 0.2912201936135298,
"learning_rate": 3.0170157068062828e-05,
"loss": 0.3143,
"step": 389
},
{
"epoch": 1.3780918727915195,
"grad_norm": 0.27309586065935926,
"learning_rate": 3.010471204188482e-05,
"loss": 0.3285,
"step": 390
},
{
"epoch": 1.3816254416961131,
"grad_norm": 0.21713669680403141,
"learning_rate": 3.003926701570681e-05,
"loss": 0.3283,
"step": 391
},
{
"epoch": 1.3851590106007068,
"grad_norm": 0.2288906557758973,
"learning_rate": 2.99738219895288e-05,
"loss": 0.3125,
"step": 392
},
{
"epoch": 1.3886925795053005,
"grad_norm": 0.23753503370731477,
"learning_rate": 2.990837696335079e-05,
"loss": 0.3242,
"step": 393
},
{
"epoch": 1.3922261484098941,
"grad_norm": 0.21945546136919755,
"learning_rate": 2.9842931937172774e-05,
"loss": 0.322,
"step": 394
},
{
"epoch": 1.3957597173144876,
"grad_norm": 0.2495289930003258,
"learning_rate": 2.9777486910994766e-05,
"loss": 0.319,
"step": 395
},
{
"epoch": 1.3992932862190812,
"grad_norm": 0.22659937821817347,
"learning_rate": 2.9712041884816754e-05,
"loss": 0.3028,
"step": 396
},
{
"epoch": 1.4028268551236749,
"grad_norm": 0.2395178080554209,
"learning_rate": 2.9646596858638743e-05,
"loss": 0.3164,
"step": 397
},
{
"epoch": 1.4063604240282686,
"grad_norm": 0.21347575083575157,
"learning_rate": 2.9581151832460735e-05,
"loss": 0.3041,
"step": 398
},
{
"epoch": 1.4098939929328622,
"grad_norm": 0.21120748700734265,
"learning_rate": 2.9515706806282723e-05,
"loss": 0.3113,
"step": 399
},
{
"epoch": 1.4134275618374559,
"grad_norm": 0.24795070510010908,
"learning_rate": 2.9450261780104715e-05,
"loss": 0.3271,
"step": 400
},
{
"epoch": 1.4169611307420495,
"grad_norm": 0.23965941456368967,
"learning_rate": 2.9384816753926704e-05,
"loss": 0.3206,
"step": 401
},
{
"epoch": 1.420494699646643,
"grad_norm": 0.21060776260375713,
"learning_rate": 2.931937172774869e-05,
"loss": 0.3166,
"step": 402
},
{
"epoch": 1.4240282685512367,
"grad_norm": 0.24839759981637807,
"learning_rate": 2.9253926701570684e-05,
"loss": 0.3284,
"step": 403
},
{
"epoch": 1.4275618374558303,
"grad_norm": 0.24513818206649013,
"learning_rate": 2.918848167539267e-05,
"loss": 0.3361,
"step": 404
},
{
"epoch": 1.431095406360424,
"grad_norm": 0.18931056380216793,
"learning_rate": 2.9123036649214664e-05,
"loss": 0.3219,
"step": 405
},
{
"epoch": 1.4346289752650176,
"grad_norm": 0.21200988060097672,
"learning_rate": 2.905759162303665e-05,
"loss": 0.3123,
"step": 406
},
{
"epoch": 1.4381625441696113,
"grad_norm": 0.27866816682638085,
"learning_rate": 2.8992146596858638e-05,
"loss": 0.3159,
"step": 407
},
{
"epoch": 1.441696113074205,
"grad_norm": 0.20600762260998587,
"learning_rate": 2.892670157068063e-05,
"loss": 0.3071,
"step": 408
},
{
"epoch": 1.4452296819787986,
"grad_norm": 0.20620471921197636,
"learning_rate": 2.886125654450262e-05,
"loss": 0.3061,
"step": 409
},
{
"epoch": 1.4487632508833923,
"grad_norm": 0.23076426157914004,
"learning_rate": 2.879581151832461e-05,
"loss": 0.304,
"step": 410
},
{
"epoch": 1.452296819787986,
"grad_norm": 0.18980108729237125,
"learning_rate": 2.87303664921466e-05,
"loss": 0.3069,
"step": 411
},
{
"epoch": 1.4558303886925796,
"grad_norm": 0.22666550835867313,
"learning_rate": 2.8664921465968587e-05,
"loss": 0.319,
"step": 412
},
{
"epoch": 1.459363957597173,
"grad_norm": 0.20764937544262888,
"learning_rate": 2.859947643979058e-05,
"loss": 0.3366,
"step": 413
},
{
"epoch": 1.4628975265017667,
"grad_norm": 0.22091715412307447,
"learning_rate": 2.8534031413612568e-05,
"loss": 0.3101,
"step": 414
},
{
"epoch": 1.4664310954063604,
"grad_norm": 0.23933939194557322,
"learning_rate": 2.8468586387434553e-05,
"loss": 0.3305,
"step": 415
},
{
"epoch": 1.469964664310954,
"grad_norm": 0.22250311089462133,
"learning_rate": 2.8403141361256548e-05,
"loss": 0.3078,
"step": 416
},
{
"epoch": 1.4734982332155477,
"grad_norm": 0.22045245227336588,
"learning_rate": 2.8337696335078533e-05,
"loss": 0.3314,
"step": 417
},
{
"epoch": 1.4770318021201414,
"grad_norm": 0.25442297017314897,
"learning_rate": 2.827225130890053e-05,
"loss": 0.3333,
"step": 418
},
{
"epoch": 1.4805653710247348,
"grad_norm": 0.200343730236033,
"learning_rate": 2.8206806282722514e-05,
"loss": 0.3286,
"step": 419
},
{
"epoch": 1.4840989399293285,
"grad_norm": 0.212055454666379,
"learning_rate": 2.8141361256544502e-05,
"loss": 0.3273,
"step": 420
},
{
"epoch": 1.4876325088339222,
"grad_norm": 0.2138899967281138,
"learning_rate": 2.8075916230366494e-05,
"loss": 0.3165,
"step": 421
},
{
"epoch": 1.4911660777385158,
"grad_norm": 0.24416582383918575,
"learning_rate": 2.8010471204188483e-05,
"loss": 0.3327,
"step": 422
},
{
"epoch": 1.4946996466431095,
"grad_norm": 0.22029827158669632,
"learning_rate": 2.7945026178010474e-05,
"loss": 0.3177,
"step": 423
},
{
"epoch": 1.4982332155477032,
"grad_norm": 0.20743446392912032,
"learning_rate": 2.7879581151832463e-05,
"loss": 0.307,
"step": 424
},
{
"epoch": 1.5017667844522968,
"grad_norm": 0.23675483485667703,
"learning_rate": 2.7814136125654448e-05,
"loss": 0.3379,
"step": 425
},
{
"epoch": 1.5053003533568905,
"grad_norm": 0.2083792188996813,
"learning_rate": 2.7748691099476443e-05,
"loss": 0.3226,
"step": 426
},
{
"epoch": 1.5088339222614842,
"grad_norm": 0.2218592327334032,
"learning_rate": 2.768324607329843e-05,
"loss": 0.3199,
"step": 427
},
{
"epoch": 1.5123674911660778,
"grad_norm": 0.22330001546620637,
"learning_rate": 2.7617801047120424e-05,
"loss": 0.3152,
"step": 428
},
{
"epoch": 1.5159010600706715,
"grad_norm": 0.1966026901982505,
"learning_rate": 2.755235602094241e-05,
"loss": 0.3288,
"step": 429
},
{
"epoch": 1.5194346289752652,
"grad_norm": 0.24667793099085591,
"learning_rate": 2.7486910994764397e-05,
"loss": 0.3237,
"step": 430
},
{
"epoch": 1.5229681978798588,
"grad_norm": 0.23002494889207303,
"learning_rate": 2.742146596858639e-05,
"loss": 0.319,
"step": 431
},
{
"epoch": 1.5265017667844523,
"grad_norm": 0.20315959163796374,
"learning_rate": 2.7356020942408378e-05,
"loss": 0.3045,
"step": 432
},
{
"epoch": 1.530035335689046,
"grad_norm": 0.21538543419736156,
"learning_rate": 2.7290575916230366e-05,
"loss": 0.3138,
"step": 433
},
{
"epoch": 1.5335689045936396,
"grad_norm": 0.2351128256983889,
"learning_rate": 2.7225130890052358e-05,
"loss": 0.3309,
"step": 434
},
{
"epoch": 1.5371024734982333,
"grad_norm": 0.2354957133661471,
"learning_rate": 2.7159685863874347e-05,
"loss": 0.3122,
"step": 435
},
{
"epoch": 1.5406360424028267,
"grad_norm": 0.19603463499189155,
"learning_rate": 2.709424083769634e-05,
"loss": 0.3382,
"step": 436
},
{
"epoch": 1.5441696113074204,
"grad_norm": 0.19880741223181117,
"learning_rate": 2.7028795811518327e-05,
"loss": 0.3051,
"step": 437
},
{
"epoch": 1.547703180212014,
"grad_norm": 0.22354607120032446,
"learning_rate": 2.6963350785340312e-05,
"loss": 0.3378,
"step": 438
},
{
"epoch": 1.5512367491166077,
"grad_norm": 0.22795594325375385,
"learning_rate": 2.6897905759162307e-05,
"loss": 0.3225,
"step": 439
},
{
"epoch": 1.5547703180212014,
"grad_norm": 0.20923020501951511,
"learning_rate": 2.6832460732984293e-05,
"loss": 0.3164,
"step": 440
},
{
"epoch": 1.558303886925795,
"grad_norm": 0.22486551399281854,
"learning_rate": 2.6767015706806288e-05,
"loss": 0.3193,
"step": 441
},
{
"epoch": 1.5618374558303887,
"grad_norm": 0.20333029691196666,
"learning_rate": 2.6701570680628273e-05,
"loss": 0.3068,
"step": 442
},
{
"epoch": 1.5653710247349824,
"grad_norm": 0.20775827774315345,
"learning_rate": 2.663612565445026e-05,
"loss": 0.3144,
"step": 443
},
{
"epoch": 1.568904593639576,
"grad_norm": 0.20641245106490558,
"learning_rate": 2.6570680628272253e-05,
"loss": 0.3228,
"step": 444
},
{
"epoch": 1.5724381625441697,
"grad_norm": 0.2130706466319999,
"learning_rate": 2.6505235602094242e-05,
"loss": 0.3116,
"step": 445
},
{
"epoch": 1.5759717314487633,
"grad_norm": 0.20144350552710516,
"learning_rate": 2.643979057591623e-05,
"loss": 0.3203,
"step": 446
},
{
"epoch": 1.579505300353357,
"grad_norm": 0.2092134786633196,
"learning_rate": 2.6374345549738222e-05,
"loss": 0.3085,
"step": 447
},
{
"epoch": 1.5830388692579507,
"grad_norm": 0.1930785139416977,
"learning_rate": 2.630890052356021e-05,
"loss": 0.3159,
"step": 448
},
{
"epoch": 1.5865724381625441,
"grad_norm": 0.2146699967939856,
"learning_rate": 2.6243455497382203e-05,
"loss": 0.3206,
"step": 449
},
{
"epoch": 1.5901060070671378,
"grad_norm": 0.2018962049124116,
"learning_rate": 2.617801047120419e-05,
"loss": 0.3045,
"step": 450
},
{
"epoch": 1.5936395759717314,
"grad_norm": 0.21031644459808338,
"learning_rate": 2.6112565445026176e-05,
"loss": 0.3193,
"step": 451
},
{
"epoch": 1.5971731448763251,
"grad_norm": 0.2372506460254518,
"learning_rate": 2.604712041884817e-05,
"loss": 0.323,
"step": 452
},
{
"epoch": 1.6007067137809188,
"grad_norm": 0.19385388994145367,
"learning_rate": 2.5981675392670157e-05,
"loss": 0.2993,
"step": 453
},
{
"epoch": 1.6042402826855122,
"grad_norm": 0.2164506284174296,
"learning_rate": 2.591623036649215e-05,
"loss": 0.3064,
"step": 454
},
{
"epoch": 1.6077738515901059,
"grad_norm": 0.2195154798641426,
"learning_rate": 2.5850785340314137e-05,
"loss": 0.3243,
"step": 455
},
{
"epoch": 1.6113074204946995,
"grad_norm": 0.2117465558333842,
"learning_rate": 2.5785340314136126e-05,
"loss": 0.3067,
"step": 456
},
{
"epoch": 1.6148409893992932,
"grad_norm": 0.20859602020466209,
"learning_rate": 2.5719895287958117e-05,
"loss": 0.3157,
"step": 457
},
{
"epoch": 1.6183745583038869,
"grad_norm": 0.19592885243669278,
"learning_rate": 2.5654450261780106e-05,
"loss": 0.3105,
"step": 458
},
{
"epoch": 1.6219081272084805,
"grad_norm": 0.19638941565988363,
"learning_rate": 2.5589005235602098e-05,
"loss": 0.3185,
"step": 459
},
{
"epoch": 1.6254416961130742,
"grad_norm": 0.20391037076634516,
"learning_rate": 2.5523560209424086e-05,
"loss": 0.3287,
"step": 460
},
{
"epoch": 1.6289752650176679,
"grad_norm": 0.20081028542514334,
"learning_rate": 2.545811518324607e-05,
"loss": 0.3107,
"step": 461
},
{
"epoch": 1.6325088339222615,
"grad_norm": 0.19173588582622914,
"learning_rate": 2.5392670157068067e-05,
"loss": 0.3105,
"step": 462
},
{
"epoch": 1.6360424028268552,
"grad_norm": 0.19133828961954372,
"learning_rate": 2.5327225130890052e-05,
"loss": 0.3287,
"step": 463
},
{
"epoch": 1.6395759717314489,
"grad_norm": 0.19111946349997552,
"learning_rate": 2.526178010471204e-05,
"loss": 0.2979,
"step": 464
},
{
"epoch": 1.6431095406360425,
"grad_norm": 0.21928030042880425,
"learning_rate": 2.5196335078534032e-05,
"loss": 0.3112,
"step": 465
},
{
"epoch": 1.6466431095406362,
"grad_norm": 0.22447973986153602,
"learning_rate": 2.513089005235602e-05,
"loss": 0.337,
"step": 466
},
{
"epoch": 1.6501766784452296,
"grad_norm": 0.21440461141548056,
"learning_rate": 2.5065445026178013e-05,
"loss": 0.3215,
"step": 467
},
{
"epoch": 1.6537102473498233,
"grad_norm": 0.18702923016680167,
"learning_rate": 2.5e-05,
"loss": 0.3174,
"step": 468
},
{
"epoch": 1.657243816254417,
"grad_norm": 0.21809081303573621,
"learning_rate": 2.493455497382199e-05,
"loss": 0.3131,
"step": 469
},
{
"epoch": 1.6607773851590106,
"grad_norm": 0.1968328350289131,
"learning_rate": 2.486910994764398e-05,
"loss": 0.3109,
"step": 470
},
{
"epoch": 1.664310954063604,
"grad_norm": 0.1902063355779412,
"learning_rate": 2.480366492146597e-05,
"loss": 0.2976,
"step": 471
},
{
"epoch": 1.6678445229681977,
"grad_norm": 0.20269450051172483,
"learning_rate": 2.473821989528796e-05,
"loss": 0.3043,
"step": 472
},
{
"epoch": 1.6713780918727914,
"grad_norm": 0.21428832800173347,
"learning_rate": 2.467277486910995e-05,
"loss": 0.3277,
"step": 473
},
{
"epoch": 1.674911660777385,
"grad_norm": 0.21377519234547374,
"learning_rate": 2.460732984293194e-05,
"loss": 0.319,
"step": 474
},
{
"epoch": 1.6784452296819787,
"grad_norm": 0.19440628751928357,
"learning_rate": 2.4541884816753928e-05,
"loss": 0.3077,
"step": 475
},
{
"epoch": 1.6819787985865724,
"grad_norm": 0.19127187810347648,
"learning_rate": 2.4476439790575916e-05,
"loss": 0.3303,
"step": 476
},
{
"epoch": 1.685512367491166,
"grad_norm": 0.21494991088264948,
"learning_rate": 2.4410994764397908e-05,
"loss": 0.3247,
"step": 477
},
{
"epoch": 1.6890459363957597,
"grad_norm": 0.1936972864043452,
"learning_rate": 2.4345549738219896e-05,
"loss": 0.309,
"step": 478
},
{
"epoch": 1.6925795053003534,
"grad_norm": 0.20353253659786902,
"learning_rate": 2.428010471204189e-05,
"loss": 0.3021,
"step": 479
},
{
"epoch": 1.696113074204947,
"grad_norm": 0.2412529888359991,
"learning_rate": 2.4214659685863873e-05,
"loss": 0.3142,
"step": 480
},
{
"epoch": 1.6996466431095407,
"grad_norm": 0.17778697764538212,
"learning_rate": 2.4149214659685865e-05,
"loss": 0.3107,
"step": 481
},
{
"epoch": 1.7031802120141344,
"grad_norm": 0.21027443406507412,
"learning_rate": 2.4083769633507854e-05,
"loss": 0.3132,
"step": 482
},
{
"epoch": 1.706713780918728,
"grad_norm": 0.2044235608069531,
"learning_rate": 2.4018324607329846e-05,
"loss": 0.3199,
"step": 483
},
{
"epoch": 1.7102473498233217,
"grad_norm": 0.20611540623101018,
"learning_rate": 2.395287958115183e-05,
"loss": 0.3243,
"step": 484
},
{
"epoch": 1.7137809187279152,
"grad_norm": 0.20442147641128466,
"learning_rate": 2.3887434554973823e-05,
"loss": 0.3292,
"step": 485
},
{
"epoch": 1.7173144876325088,
"grad_norm": 0.19260726164139877,
"learning_rate": 2.382198952879581e-05,
"loss": 0.3021,
"step": 486
},
{
"epoch": 1.7208480565371025,
"grad_norm": 0.22186245167216126,
"learning_rate": 2.3756544502617803e-05,
"loss": 0.3459,
"step": 487
},
{
"epoch": 1.7243816254416962,
"grad_norm": 0.19732505859816749,
"learning_rate": 2.369109947643979e-05,
"loss": 0.3091,
"step": 488
},
{
"epoch": 1.7279151943462896,
"grad_norm": 0.19457870396367832,
"learning_rate": 2.362565445026178e-05,
"loss": 0.3398,
"step": 489
},
{
"epoch": 1.7314487632508833,
"grad_norm": 0.22069137792949856,
"learning_rate": 2.3560209424083772e-05,
"loss": 0.3136,
"step": 490
},
{
"epoch": 1.734982332155477,
"grad_norm": 0.1715833105368973,
"learning_rate": 2.349476439790576e-05,
"loss": 0.3119,
"step": 491
},
{
"epoch": 1.7385159010600706,
"grad_norm": 0.19700646101321603,
"learning_rate": 2.3429319371727752e-05,
"loss": 0.3113,
"step": 492
},
{
"epoch": 1.7420494699646643,
"grad_norm": 0.19071209653282534,
"learning_rate": 2.3363874345549738e-05,
"loss": 0.3136,
"step": 493
},
{
"epoch": 1.745583038869258,
"grad_norm": 0.19998540545050963,
"learning_rate": 2.329842931937173e-05,
"loss": 0.3261,
"step": 494
},
{
"epoch": 1.7491166077738516,
"grad_norm": 0.1952592766006168,
"learning_rate": 2.3232984293193718e-05,
"loss": 0.3255,
"step": 495
},
{
"epoch": 1.7526501766784452,
"grad_norm": 0.18574450596573983,
"learning_rate": 2.316753926701571e-05,
"loss": 0.3138,
"step": 496
},
{
"epoch": 1.756183745583039,
"grad_norm": 0.19941585251802152,
"learning_rate": 2.3102094240837695e-05,
"loss": 0.3163,
"step": 497
},
{
"epoch": 1.7597173144876326,
"grad_norm": 0.1982210953220068,
"learning_rate": 2.3036649214659687e-05,
"loss": 0.3286,
"step": 498
},
{
"epoch": 1.7632508833922262,
"grad_norm": 0.19345071371064035,
"learning_rate": 2.2971204188481675e-05,
"loss": 0.3169,
"step": 499
},
{
"epoch": 1.76678445229682,
"grad_norm": 0.19347763265980422,
"learning_rate": 2.2905759162303667e-05,
"loss": 0.3156,
"step": 500
},
{
"epoch": 1.7703180212014136,
"grad_norm": 0.1795166346986815,
"learning_rate": 2.2840314136125656e-05,
"loss": 0.3019,
"step": 501
},
{
"epoch": 1.773851590106007,
"grad_norm": 0.18811911681077517,
"learning_rate": 2.2774869109947644e-05,
"loss": 0.3009,
"step": 502
},
{
"epoch": 1.7773851590106007,
"grad_norm": 0.19217045164466942,
"learning_rate": 2.2709424083769633e-05,
"loss": 0.3151,
"step": 503
},
{
"epoch": 1.7809187279151943,
"grad_norm": 0.20556451270057643,
"learning_rate": 2.2643979057591625e-05,
"loss": 0.3181,
"step": 504
},
{
"epoch": 1.784452296819788,
"grad_norm": 0.19551378506661773,
"learning_rate": 2.2578534031413613e-05,
"loss": 0.3158,
"step": 505
},
{
"epoch": 1.7879858657243817,
"grad_norm": 0.19010323240559324,
"learning_rate": 2.25130890052356e-05,
"loss": 0.3414,
"step": 506
},
{
"epoch": 1.7915194346289751,
"grad_norm": 0.20125042061374748,
"learning_rate": 2.2447643979057594e-05,
"loss": 0.337,
"step": 507
},
{
"epoch": 1.7950530035335688,
"grad_norm": 0.2084730528380873,
"learning_rate": 2.2382198952879582e-05,
"loss": 0.3134,
"step": 508
},
{
"epoch": 1.7985865724381624,
"grad_norm": 0.1824333361488012,
"learning_rate": 2.2316753926701574e-05,
"loss": 0.3103,
"step": 509
},
{
"epoch": 1.802120141342756,
"grad_norm": 0.20422512785235175,
"learning_rate": 2.2251308900523562e-05,
"loss": 0.3255,
"step": 510
},
{
"epoch": 1.8056537102473498,
"grad_norm": 0.20113058219023555,
"learning_rate": 2.218586387434555e-05,
"loss": 0.3342,
"step": 511
},
{
"epoch": 1.8091872791519434,
"grad_norm": 0.19513047124964958,
"learning_rate": 2.212041884816754e-05,
"loss": 0.3224,
"step": 512
},
{
"epoch": 1.812720848056537,
"grad_norm": 0.20107628182054482,
"learning_rate": 2.205497382198953e-05,
"loss": 0.2909,
"step": 513
},
{
"epoch": 1.8162544169611308,
"grad_norm": 0.19700193466920085,
"learning_rate": 2.198952879581152e-05,
"loss": 0.3212,
"step": 514
},
{
"epoch": 1.8197879858657244,
"grad_norm": 0.17906218841204288,
"learning_rate": 2.192408376963351e-05,
"loss": 0.3019,
"step": 515
},
{
"epoch": 1.823321554770318,
"grad_norm": 0.19303884390774878,
"learning_rate": 2.1858638743455497e-05,
"loss": 0.3014,
"step": 516
},
{
"epoch": 1.8268551236749118,
"grad_norm": 0.20685382901702712,
"learning_rate": 2.179319371727749e-05,
"loss": 0.3083,
"step": 517
},
{
"epoch": 1.8303886925795054,
"grad_norm": 0.2037285790331814,
"learning_rate": 2.1727748691099477e-05,
"loss": 0.3167,
"step": 518
},
{
"epoch": 1.833922261484099,
"grad_norm": 0.19717863824703727,
"learning_rate": 2.166230366492147e-05,
"loss": 0.3186,
"step": 519
},
{
"epoch": 1.8374558303886925,
"grad_norm": 0.19800316974742535,
"learning_rate": 2.1596858638743454e-05,
"loss": 0.3146,
"step": 520
},
{
"epoch": 1.8409893992932862,
"grad_norm": 0.19130662465402037,
"learning_rate": 2.1531413612565446e-05,
"loss": 0.3235,
"step": 521
},
{
"epoch": 1.8445229681978799,
"grad_norm": 0.20052194433839013,
"learning_rate": 2.1465968586387435e-05,
"loss": 0.3081,
"step": 522
},
{
"epoch": 1.8480565371024735,
"grad_norm": 0.18663073692719714,
"learning_rate": 2.1400523560209427e-05,
"loss": 0.3166,
"step": 523
},
{
"epoch": 1.851590106007067,
"grad_norm": 0.27034126023862604,
"learning_rate": 2.1335078534031415e-05,
"loss": 0.3255,
"step": 524
},
{
"epoch": 1.8551236749116606,
"grad_norm": 0.1808488868839241,
"learning_rate": 2.1269633507853404e-05,
"loss": 0.3149,
"step": 525
},
{
"epoch": 1.8586572438162543,
"grad_norm": 0.19737138257270082,
"learning_rate": 2.1204188481675396e-05,
"loss": 0.2994,
"step": 526
},
{
"epoch": 1.862190812720848,
"grad_norm": 0.18843751133683795,
"learning_rate": 2.1138743455497384e-05,
"loss": 0.3206,
"step": 527
},
{
"epoch": 1.8657243816254416,
"grad_norm": 0.1823366087764783,
"learning_rate": 2.1073298429319373e-05,
"loss": 0.3055,
"step": 528
},
{
"epoch": 1.8692579505300353,
"grad_norm": 0.19472687255409105,
"learning_rate": 2.100785340314136e-05,
"loss": 0.3328,
"step": 529
},
{
"epoch": 1.872791519434629,
"grad_norm": 0.19385068890270665,
"learning_rate": 2.0942408376963353e-05,
"loss": 0.3223,
"step": 530
},
{
"epoch": 1.8763250883392226,
"grad_norm": 0.19019718027424132,
"learning_rate": 2.087696335078534e-05,
"loss": 0.318,
"step": 531
},
{
"epoch": 1.8798586572438163,
"grad_norm": 0.1949727412878405,
"learning_rate": 2.0811518324607333e-05,
"loss": 0.3106,
"step": 532
},
{
"epoch": 1.88339222614841,
"grad_norm": 0.19912542893995466,
"learning_rate": 2.074607329842932e-05,
"loss": 0.3241,
"step": 533
},
{
"epoch": 1.8869257950530036,
"grad_norm": 0.1947607900752559,
"learning_rate": 2.068062827225131e-05,
"loss": 0.3235,
"step": 534
},
{
"epoch": 1.8904593639575973,
"grad_norm": 0.21458554144569084,
"learning_rate": 2.06151832460733e-05,
"loss": 0.3241,
"step": 535
},
{
"epoch": 1.893992932862191,
"grad_norm": 0.20969585965810955,
"learning_rate": 2.054973821989529e-05,
"loss": 0.3173,
"step": 536
},
{
"epoch": 1.8975265017667846,
"grad_norm": 0.19221586628416598,
"learning_rate": 2.0484293193717276e-05,
"loss": 0.3122,
"step": 537
},
{
"epoch": 1.901060070671378,
"grad_norm": 0.20588113019726068,
"learning_rate": 2.0418848167539268e-05,
"loss": 0.3331,
"step": 538
},
{
"epoch": 1.9045936395759717,
"grad_norm": 0.2035058754626739,
"learning_rate": 2.0353403141361256e-05,
"loss": 0.3282,
"step": 539
},
{
"epoch": 1.9081272084805654,
"grad_norm": 0.17404926415563193,
"learning_rate": 2.0287958115183248e-05,
"loss": 0.293,
"step": 540
},
{
"epoch": 1.911660777385159,
"grad_norm": 0.19240218509183796,
"learning_rate": 2.0222513089005237e-05,
"loss": 0.2988,
"step": 541
},
{
"epoch": 1.9151943462897525,
"grad_norm": 0.5101233762870835,
"learning_rate": 2.0157068062827225e-05,
"loss": 0.3101,
"step": 542
},
{
"epoch": 1.9187279151943462,
"grad_norm": 0.22919095274144033,
"learning_rate": 2.0091623036649214e-05,
"loss": 0.3162,
"step": 543
},
{
"epoch": 1.9222614840989398,
"grad_norm": 0.19615848878848,
"learning_rate": 2.0026178010471206e-05,
"loss": 0.2995,
"step": 544
},
{
"epoch": 1.9257950530035335,
"grad_norm": 0.19608325143937116,
"learning_rate": 1.9960732984293194e-05,
"loss": 0.325,
"step": 545
},
{
"epoch": 1.9293286219081272,
"grad_norm": 0.22214298377068412,
"learning_rate": 1.9895287958115183e-05,
"loss": 0.3159,
"step": 546
},
{
"epoch": 1.9328621908127208,
"grad_norm": 0.19295382679319809,
"learning_rate": 1.9829842931937174e-05,
"loss": 0.3196,
"step": 547
},
{
"epoch": 1.9363957597173145,
"grad_norm": 0.2184011206348645,
"learning_rate": 1.9764397905759163e-05,
"loss": 0.3315,
"step": 548
},
{
"epoch": 1.9399293286219081,
"grad_norm": 0.21389646143776325,
"learning_rate": 1.9698952879581155e-05,
"loss": 0.3127,
"step": 549
},
{
"epoch": 1.9434628975265018,
"grad_norm": 0.1916368756320203,
"learning_rate": 1.9633507853403143e-05,
"loss": 0.3301,
"step": 550
},
{
"epoch": 1.9469964664310955,
"grad_norm": 0.1884927940628731,
"learning_rate": 1.9568062827225132e-05,
"loss": 0.2931,
"step": 551
},
{
"epoch": 1.9505300353356891,
"grad_norm": 0.17630264137701024,
"learning_rate": 1.950261780104712e-05,
"loss": 0.2996,
"step": 552
},
{
"epoch": 1.9540636042402828,
"grad_norm": 0.19989195563537895,
"learning_rate": 1.9437172774869112e-05,
"loss": 0.3235,
"step": 553
},
{
"epoch": 1.9575971731448765,
"grad_norm": 0.1797274327054754,
"learning_rate": 1.93717277486911e-05,
"loss": 0.3097,
"step": 554
},
{
"epoch": 1.96113074204947,
"grad_norm": 0.19970896816976344,
"learning_rate": 1.930628272251309e-05,
"loss": 0.3004,
"step": 555
},
{
"epoch": 1.9646643109540636,
"grad_norm": 0.17978020640321726,
"learning_rate": 1.9240837696335078e-05,
"loss": 0.3171,
"step": 556
},
{
"epoch": 1.9681978798586572,
"grad_norm": 0.19404211831216917,
"learning_rate": 1.917539267015707e-05,
"loss": 0.325,
"step": 557
},
{
"epoch": 1.971731448763251,
"grad_norm": 0.173833588354134,
"learning_rate": 1.9109947643979058e-05,
"loss": 0.2885,
"step": 558
},
{
"epoch": 1.9752650176678446,
"grad_norm": 0.18854980718642558,
"learning_rate": 1.904450261780105e-05,
"loss": 0.3141,
"step": 559
},
{
"epoch": 1.978798586572438,
"grad_norm": 0.19314969132740878,
"learning_rate": 1.8979057591623035e-05,
"loss": 0.3304,
"step": 560
},
{
"epoch": 1.9823321554770317,
"grad_norm": 0.18063031390822812,
"learning_rate": 1.8913612565445027e-05,
"loss": 0.3074,
"step": 561
},
{
"epoch": 1.9858657243816253,
"grad_norm": 0.20192798258500452,
"learning_rate": 1.8848167539267016e-05,
"loss": 0.3081,
"step": 562
},
{
"epoch": 1.989399293286219,
"grad_norm": 0.19580021991081917,
"learning_rate": 1.8782722513089007e-05,
"loss": 0.3256,
"step": 563
},
{
"epoch": 1.9929328621908127,
"grad_norm": 0.19579536362591232,
"learning_rate": 1.8717277486910996e-05,
"loss": 0.3233,
"step": 564
},
{
"epoch": 1.9964664310954063,
"grad_norm": 0.18726167226984994,
"learning_rate": 1.8651832460732985e-05,
"loss": 0.3155,
"step": 565
},
{
"epoch": 2.0,
"grad_norm": 0.18526453771250875,
"learning_rate": 1.8586387434554976e-05,
"loss": 0.2916,
"step": 566
},
{
"epoch": 2.0035335689045937,
"grad_norm": 0.313733537703411,
"learning_rate": 1.8520942408376965e-05,
"loss": 0.2391,
"step": 567
},
{
"epoch": 2.0070671378091873,
"grad_norm": 0.2139886609268667,
"learning_rate": 1.8455497382198953e-05,
"loss": 0.2408,
"step": 568
},
{
"epoch": 2.010600706713781,
"grad_norm": 0.3397372785098326,
"learning_rate": 1.8390052356020942e-05,
"loss": 0.2388,
"step": 569
},
{
"epoch": 2.0141342756183747,
"grad_norm": 0.24893309754147236,
"learning_rate": 1.8324607329842934e-05,
"loss": 0.2284,
"step": 570
},
{
"epoch": 2.0176678445229683,
"grad_norm": 0.22896405684331084,
"learning_rate": 1.8259162303664922e-05,
"loss": 0.2283,
"step": 571
},
{
"epoch": 2.021201413427562,
"grad_norm": 0.27446558934945536,
"learning_rate": 1.8193717277486914e-05,
"loss": 0.2346,
"step": 572
},
{
"epoch": 2.0247349823321557,
"grad_norm": 0.24400885882704268,
"learning_rate": 1.81282722513089e-05,
"loss": 0.2394,
"step": 573
},
{
"epoch": 2.0282685512367493,
"grad_norm": 0.2174113094719782,
"learning_rate": 1.806282722513089e-05,
"loss": 0.2348,
"step": 574
},
{
"epoch": 2.0318021201413425,
"grad_norm": 0.2854848758819835,
"learning_rate": 1.799738219895288e-05,
"loss": 0.2463,
"step": 575
},
{
"epoch": 2.035335689045936,
"grad_norm": 0.2094161356419985,
"learning_rate": 1.793193717277487e-05,
"loss": 0.2339,
"step": 576
},
{
"epoch": 2.03886925795053,
"grad_norm": 0.19053773509439118,
"learning_rate": 1.7866492146596857e-05,
"loss": 0.2034,
"step": 577
},
{
"epoch": 2.0424028268551235,
"grad_norm": 0.20351237653846452,
"learning_rate": 1.780104712041885e-05,
"loss": 0.239,
"step": 578
},
{
"epoch": 2.045936395759717,
"grad_norm": 0.20121539454250265,
"learning_rate": 1.7735602094240837e-05,
"loss": 0.2271,
"step": 579
},
{
"epoch": 2.049469964664311,
"grad_norm": 0.18091723396025775,
"learning_rate": 1.767015706806283e-05,
"loss": 0.2315,
"step": 580
},
{
"epoch": 2.0530035335689045,
"grad_norm": 0.20119590031435344,
"learning_rate": 1.7604712041884818e-05,
"loss": 0.243,
"step": 581
},
{
"epoch": 2.056537102473498,
"grad_norm": 0.20049246498275813,
"learning_rate": 1.7539267015706806e-05,
"loss": 0.2255,
"step": 582
},
{
"epoch": 2.060070671378092,
"grad_norm": 0.22382362066147798,
"learning_rate": 1.7473821989528798e-05,
"loss": 0.2361,
"step": 583
},
{
"epoch": 2.0636042402826855,
"grad_norm": 0.2072847641545473,
"learning_rate": 1.7408376963350786e-05,
"loss": 0.228,
"step": 584
},
{
"epoch": 2.067137809187279,
"grad_norm": 0.20020391104334753,
"learning_rate": 1.734293193717278e-05,
"loss": 0.2279,
"step": 585
},
{
"epoch": 2.070671378091873,
"grad_norm": 0.22966176583480186,
"learning_rate": 1.7277486910994763e-05,
"loss": 0.2372,
"step": 586
},
{
"epoch": 2.0742049469964665,
"grad_norm": 0.1946035128234729,
"learning_rate": 1.7212041884816755e-05,
"loss": 0.2423,
"step": 587
},
{
"epoch": 2.07773851590106,
"grad_norm": 0.20982641712768157,
"learning_rate": 1.7146596858638744e-05,
"loss": 0.2317,
"step": 588
},
{
"epoch": 2.081272084805654,
"grad_norm": 0.192503749374612,
"learning_rate": 1.7081151832460736e-05,
"loss": 0.2289,
"step": 589
},
{
"epoch": 2.0848056537102475,
"grad_norm": 0.20491576934622158,
"learning_rate": 1.7015706806282724e-05,
"loss": 0.2303,
"step": 590
},
{
"epoch": 2.088339222614841,
"grad_norm": 0.24262885329402847,
"learning_rate": 1.6950261780104713e-05,
"loss": 0.2192,
"step": 591
},
{
"epoch": 2.091872791519435,
"grad_norm": 0.1977540654757339,
"learning_rate": 1.68848167539267e-05,
"loss": 0.2269,
"step": 592
},
{
"epoch": 2.095406360424028,
"grad_norm": 0.18498506813704485,
"learning_rate": 1.6819371727748693e-05,
"loss": 0.224,
"step": 593
},
{
"epoch": 2.0989399293286217,
"grad_norm": 0.19398268016059655,
"learning_rate": 1.675392670157068e-05,
"loss": 0.2305,
"step": 594
},
{
"epoch": 2.1024734982332154,
"grad_norm": 0.18393549375881862,
"learning_rate": 1.668848167539267e-05,
"loss": 0.2262,
"step": 595
},
{
"epoch": 2.106007067137809,
"grad_norm": 0.1860457835424858,
"learning_rate": 1.662303664921466e-05,
"loss": 0.2146,
"step": 596
},
{
"epoch": 2.1095406360424027,
"grad_norm": 0.17959065192225365,
"learning_rate": 1.655759162303665e-05,
"loss": 0.215,
"step": 597
},
{
"epoch": 2.1130742049469964,
"grad_norm": 0.19165574670101904,
"learning_rate": 1.649214659685864e-05,
"loss": 0.2266,
"step": 598
},
{
"epoch": 2.11660777385159,
"grad_norm": 0.18796312976815543,
"learning_rate": 1.642670157068063e-05,
"loss": 0.2425,
"step": 599
},
{
"epoch": 2.1201413427561837,
"grad_norm": 0.18161435413151616,
"learning_rate": 1.636125654450262e-05,
"loss": 0.2196,
"step": 600
},
{
"epoch": 2.1236749116607774,
"grad_norm": 0.19052701307742212,
"learning_rate": 1.6295811518324608e-05,
"loss": 0.2396,
"step": 601
},
{
"epoch": 2.127208480565371,
"grad_norm": 0.19344280266083863,
"learning_rate": 1.6230366492146596e-05,
"loss": 0.224,
"step": 602
},
{
"epoch": 2.1307420494699647,
"grad_norm": 0.18380515192094332,
"learning_rate": 1.616492146596859e-05,
"loss": 0.231,
"step": 603
},
{
"epoch": 2.1342756183745584,
"grad_norm": 0.17797217814434263,
"learning_rate": 1.6099476439790577e-05,
"loss": 0.2197,
"step": 604
},
{
"epoch": 2.137809187279152,
"grad_norm": 0.25342525798606186,
"learning_rate": 1.6034031413612565e-05,
"loss": 0.2528,
"step": 605
},
{
"epoch": 2.1413427561837457,
"grad_norm": 0.19065283936832148,
"learning_rate": 1.5968586387434557e-05,
"loss": 0.2415,
"step": 606
},
{
"epoch": 2.1448763250883394,
"grad_norm": 0.180423600567546,
"learning_rate": 1.5903141361256546e-05,
"loss": 0.2345,
"step": 607
},
{
"epoch": 2.148409893992933,
"grad_norm": 0.2077742867716798,
"learning_rate": 1.5837696335078534e-05,
"loss": 0.2385,
"step": 608
},
{
"epoch": 2.1519434628975267,
"grad_norm": 0.17850039637053272,
"learning_rate": 1.5772251308900523e-05,
"loss": 0.2228,
"step": 609
},
{
"epoch": 2.1554770318021204,
"grad_norm": 0.19116780656180973,
"learning_rate": 1.5706806282722515e-05,
"loss": 0.2364,
"step": 610
},
{
"epoch": 2.1590106007067136,
"grad_norm": 0.1992172627054553,
"learning_rate": 1.5641361256544503e-05,
"loss": 0.2286,
"step": 611
},
{
"epoch": 2.1625441696113072,
"grad_norm": 0.18067038836435922,
"learning_rate": 1.5575916230366495e-05,
"loss": 0.2284,
"step": 612
},
{
"epoch": 2.166077738515901,
"grad_norm": 0.202816197924796,
"learning_rate": 1.551047120418848e-05,
"loss": 0.2382,
"step": 613
},
{
"epoch": 2.1696113074204946,
"grad_norm": 0.21131145708087853,
"learning_rate": 1.5445026178010472e-05,
"loss": 0.2287,
"step": 614
},
{
"epoch": 2.1731448763250882,
"grad_norm": 0.18390779019125453,
"learning_rate": 1.537958115183246e-05,
"loss": 0.2325,
"step": 615
},
{
"epoch": 2.176678445229682,
"grad_norm": 0.19293312185267333,
"learning_rate": 1.5314136125654453e-05,
"loss": 0.2259,
"step": 616
},
{
"epoch": 2.1802120141342756,
"grad_norm": 0.19678219169161754,
"learning_rate": 1.524869109947644e-05,
"loss": 0.2275,
"step": 617
},
{
"epoch": 2.1837455830388692,
"grad_norm": 0.20433339697482053,
"learning_rate": 1.518324607329843e-05,
"loss": 0.2277,
"step": 618
},
{
"epoch": 2.187279151943463,
"grad_norm": 0.1788456565700629,
"learning_rate": 1.511780104712042e-05,
"loss": 0.232,
"step": 619
},
{
"epoch": 2.1908127208480566,
"grad_norm": 0.1923605032784442,
"learning_rate": 1.505235602094241e-05,
"loss": 0.2215,
"step": 620
},
{
"epoch": 2.1943462897526502,
"grad_norm": 0.21367282642858493,
"learning_rate": 1.49869109947644e-05,
"loss": 0.2398,
"step": 621
},
{
"epoch": 2.197879858657244,
"grad_norm": 0.1894069816682639,
"learning_rate": 1.4921465968586387e-05,
"loss": 0.2286,
"step": 622
},
{
"epoch": 2.2014134275618376,
"grad_norm": 0.18827504319918725,
"learning_rate": 1.4856020942408377e-05,
"loss": 0.2276,
"step": 623
},
{
"epoch": 2.204946996466431,
"grad_norm": 0.19330596608013897,
"learning_rate": 1.4790575916230367e-05,
"loss": 0.2414,
"step": 624
},
{
"epoch": 2.208480565371025,
"grad_norm": 0.19743784954842997,
"learning_rate": 1.4725130890052358e-05,
"loss": 0.2299,
"step": 625
},
{
"epoch": 2.2120141342756185,
"grad_norm": 0.19899372342505975,
"learning_rate": 1.4659685863874344e-05,
"loss": 0.2351,
"step": 626
},
{
"epoch": 2.215547703180212,
"grad_norm": 0.2003550099207659,
"learning_rate": 1.4594240837696335e-05,
"loss": 0.2433,
"step": 627
},
{
"epoch": 2.2190812720848054,
"grad_norm": 0.21196121659141753,
"learning_rate": 1.4528795811518325e-05,
"loss": 0.238,
"step": 628
},
{
"epoch": 2.222614840989399,
"grad_norm": 0.20036290181115837,
"learning_rate": 1.4463350785340315e-05,
"loss": 0.2427,
"step": 629
},
{
"epoch": 2.2261484098939928,
"grad_norm": 0.1845797709204667,
"learning_rate": 1.4397905759162305e-05,
"loss": 0.2311,
"step": 630
},
{
"epoch": 2.2296819787985864,
"grad_norm": 0.19618301614029116,
"learning_rate": 1.4332460732984294e-05,
"loss": 0.2284,
"step": 631
},
{
"epoch": 2.23321554770318,
"grad_norm": 0.18750926106082005,
"learning_rate": 1.4267015706806284e-05,
"loss": 0.2239,
"step": 632
},
{
"epoch": 2.2367491166077738,
"grad_norm": 0.21538713160466885,
"learning_rate": 1.4201570680628274e-05,
"loss": 0.2401,
"step": 633
},
{
"epoch": 2.2402826855123674,
"grad_norm": 0.18387363592426773,
"learning_rate": 1.4136125654450264e-05,
"loss": 0.2316,
"step": 634
},
{
"epoch": 2.243816254416961,
"grad_norm": 0.2008203684672948,
"learning_rate": 1.4070680628272251e-05,
"loss": 0.2408,
"step": 635
},
{
"epoch": 2.2473498233215548,
"grad_norm": 0.18266816775910638,
"learning_rate": 1.4005235602094241e-05,
"loss": 0.2263,
"step": 636
},
{
"epoch": 2.2508833922261484,
"grad_norm": 0.1946230011492327,
"learning_rate": 1.3939790575916231e-05,
"loss": 0.2329,
"step": 637
},
{
"epoch": 2.254416961130742,
"grad_norm": 0.1826823199445633,
"learning_rate": 1.3874345549738222e-05,
"loss": 0.2259,
"step": 638
},
{
"epoch": 2.2579505300353357,
"grad_norm": 0.1912824221351653,
"learning_rate": 1.3808900523560212e-05,
"loss": 0.2338,
"step": 639
},
{
"epoch": 2.2614840989399294,
"grad_norm": 0.188162715338196,
"learning_rate": 1.3743455497382199e-05,
"loss": 0.2449,
"step": 640
},
{
"epoch": 2.265017667844523,
"grad_norm": 0.19741781364992034,
"learning_rate": 1.3678010471204189e-05,
"loss": 0.2265,
"step": 641
},
{
"epoch": 2.2685512367491167,
"grad_norm": 0.1812585942078153,
"learning_rate": 1.3612565445026179e-05,
"loss": 0.2144,
"step": 642
},
{
"epoch": 2.2720848056537104,
"grad_norm": 0.19914861402094536,
"learning_rate": 1.354712041884817e-05,
"loss": 0.2365,
"step": 643
},
{
"epoch": 2.275618374558304,
"grad_norm": 0.1779009897331266,
"learning_rate": 1.3481675392670156e-05,
"loss": 0.2317,
"step": 644
},
{
"epoch": 2.2791519434628977,
"grad_norm": 0.19889730658880675,
"learning_rate": 1.3416230366492146e-05,
"loss": 0.2548,
"step": 645
},
{
"epoch": 2.2826855123674914,
"grad_norm": 0.19598745833554967,
"learning_rate": 1.3350785340314136e-05,
"loss": 0.2421,
"step": 646
},
{
"epoch": 2.2862190812720846,
"grad_norm": 0.19871280094849272,
"learning_rate": 1.3285340314136127e-05,
"loss": 0.2371,
"step": 647
},
{
"epoch": 2.2897526501766783,
"grad_norm": 0.19175825746617,
"learning_rate": 1.3219895287958115e-05,
"loss": 0.2293,
"step": 648
},
{
"epoch": 2.293286219081272,
"grad_norm": 0.1935669822734246,
"learning_rate": 1.3154450261780105e-05,
"loss": 0.2428,
"step": 649
},
{
"epoch": 2.2968197879858656,
"grad_norm": 0.1973155554278464,
"learning_rate": 1.3089005235602096e-05,
"loss": 0.2332,
"step": 650
},
{
"epoch": 2.3003533568904593,
"grad_norm": 0.1911591440697295,
"learning_rate": 1.3023560209424086e-05,
"loss": 0.2254,
"step": 651
},
{
"epoch": 2.303886925795053,
"grad_norm": 0.18247237307656608,
"learning_rate": 1.2958115183246074e-05,
"loss": 0.2312,
"step": 652
},
{
"epoch": 2.3074204946996466,
"grad_norm": 0.18788835057792652,
"learning_rate": 1.2892670157068063e-05,
"loss": 0.2261,
"step": 653
},
{
"epoch": 2.3109540636042403,
"grad_norm": 0.18168131811921545,
"learning_rate": 1.2827225130890053e-05,
"loss": 0.2177,
"step": 654
},
{
"epoch": 2.314487632508834,
"grad_norm": 0.184818724049329,
"learning_rate": 1.2761780104712043e-05,
"loss": 0.2471,
"step": 655
},
{
"epoch": 2.3180212014134276,
"grad_norm": 0.17844055130115527,
"learning_rate": 1.2696335078534033e-05,
"loss": 0.2202,
"step": 656
},
{
"epoch": 2.3215547703180213,
"grad_norm": 0.18974419342500715,
"learning_rate": 1.263089005235602e-05,
"loss": 0.2295,
"step": 657
},
{
"epoch": 2.325088339222615,
"grad_norm": 0.19485323878457486,
"learning_rate": 1.256544502617801e-05,
"loss": 0.2186,
"step": 658
},
{
"epoch": 2.3286219081272086,
"grad_norm": 0.17619682997040556,
"learning_rate": 1.25e-05,
"loss": 0.2254,
"step": 659
},
{
"epoch": 2.3321554770318023,
"grad_norm": 0.1817739083367298,
"learning_rate": 1.243455497382199e-05,
"loss": 0.2325,
"step": 660
},
{
"epoch": 2.335689045936396,
"grad_norm": 0.19736936039663003,
"learning_rate": 1.236910994764398e-05,
"loss": 0.2232,
"step": 661
},
{
"epoch": 2.3392226148409896,
"grad_norm": 0.18460908084162367,
"learning_rate": 1.230366492146597e-05,
"loss": 0.2187,
"step": 662
},
{
"epoch": 2.342756183745583,
"grad_norm": 0.1942968025537919,
"learning_rate": 1.2238219895287958e-05,
"loss": 0.2249,
"step": 663
},
{
"epoch": 2.3462897526501765,
"grad_norm": 0.19272797914097448,
"learning_rate": 1.2172774869109948e-05,
"loss": 0.2331,
"step": 664
},
{
"epoch": 2.34982332155477,
"grad_norm": 0.21641947675845358,
"learning_rate": 1.2107329842931937e-05,
"loss": 0.2398,
"step": 665
},
{
"epoch": 2.353356890459364,
"grad_norm": 0.1914653998501696,
"learning_rate": 1.2041884816753927e-05,
"loss": 0.2185,
"step": 666
},
{
"epoch": 2.3568904593639575,
"grad_norm": 0.1882330611472886,
"learning_rate": 1.1976439790575915e-05,
"loss": 0.2251,
"step": 667
},
{
"epoch": 2.360424028268551,
"grad_norm": 0.18944788598140125,
"learning_rate": 1.1910994764397906e-05,
"loss": 0.2345,
"step": 668
},
{
"epoch": 2.363957597173145,
"grad_norm": 0.17146301985462817,
"learning_rate": 1.1845549738219896e-05,
"loss": 0.2241,
"step": 669
},
{
"epoch": 2.3674911660777385,
"grad_norm": 0.17938705003950822,
"learning_rate": 1.1780104712041886e-05,
"loss": 0.2267,
"step": 670
},
{
"epoch": 2.371024734982332,
"grad_norm": 0.19522096030634284,
"learning_rate": 1.1714659685863876e-05,
"loss": 0.2277,
"step": 671
},
{
"epoch": 2.374558303886926,
"grad_norm": 0.1847631671094928,
"learning_rate": 1.1649214659685865e-05,
"loss": 0.2169,
"step": 672
},
{
"epoch": 2.3780918727915195,
"grad_norm": 0.22299936035274462,
"learning_rate": 1.1583769633507855e-05,
"loss": 0.2392,
"step": 673
},
{
"epoch": 2.381625441696113,
"grad_norm": 0.19191836724971217,
"learning_rate": 1.1518324607329843e-05,
"loss": 0.2329,
"step": 674
},
{
"epoch": 2.385159010600707,
"grad_norm": 0.1884963784495572,
"learning_rate": 1.1452879581151834e-05,
"loss": 0.2409,
"step": 675
},
{
"epoch": 2.3886925795053005,
"grad_norm": 0.19261491080881832,
"learning_rate": 1.1387434554973822e-05,
"loss": 0.2327,
"step": 676
},
{
"epoch": 2.392226148409894,
"grad_norm": 0.1867803201758491,
"learning_rate": 1.1321989528795812e-05,
"loss": 0.2359,
"step": 677
},
{
"epoch": 2.395759717314488,
"grad_norm": 0.18388375737414528,
"learning_rate": 1.12565445026178e-05,
"loss": 0.2135,
"step": 678
},
{
"epoch": 2.3992932862190814,
"grad_norm": 0.17837474863436859,
"learning_rate": 1.1191099476439791e-05,
"loss": 0.2236,
"step": 679
},
{
"epoch": 2.402826855123675,
"grad_norm": 0.1826718285578328,
"learning_rate": 1.1125654450261781e-05,
"loss": 0.2249,
"step": 680
},
{
"epoch": 2.4063604240282688,
"grad_norm": 0.17807930311495368,
"learning_rate": 1.106020942408377e-05,
"loss": 0.2167,
"step": 681
},
{
"epoch": 2.4098939929328624,
"grad_norm": 0.18137250950549594,
"learning_rate": 1.099476439790576e-05,
"loss": 0.2238,
"step": 682
},
{
"epoch": 2.4134275618374557,
"grad_norm": 0.19011762029554422,
"learning_rate": 1.0929319371727748e-05,
"loss": 0.2287,
"step": 683
},
{
"epoch": 2.4169611307420493,
"grad_norm": 0.17352735050612428,
"learning_rate": 1.0863874345549739e-05,
"loss": 0.2278,
"step": 684
},
{
"epoch": 2.420494699646643,
"grad_norm": 0.1791118189922586,
"learning_rate": 1.0798429319371727e-05,
"loss": 0.2294,
"step": 685
},
{
"epoch": 2.4240282685512367,
"grad_norm": 0.1757620855232099,
"learning_rate": 1.0732984293193717e-05,
"loss": 0.2317,
"step": 686
},
{
"epoch": 2.4275618374558303,
"grad_norm": 0.17876135905683324,
"learning_rate": 1.0667539267015708e-05,
"loss": 0.237,
"step": 687
},
{
"epoch": 2.431095406360424,
"grad_norm": 0.17822271833212014,
"learning_rate": 1.0602094240837698e-05,
"loss": 0.2304,
"step": 688
},
{
"epoch": 2.4346289752650176,
"grad_norm": 0.18462511082317737,
"learning_rate": 1.0536649214659686e-05,
"loss": 0.2269,
"step": 689
},
{
"epoch": 2.4381625441696113,
"grad_norm": 0.19812323846662644,
"learning_rate": 1.0471204188481676e-05,
"loss": 0.2347,
"step": 690
},
{
"epoch": 2.441696113074205,
"grad_norm": 0.180064527405259,
"learning_rate": 1.0405759162303667e-05,
"loss": 0.2206,
"step": 691
},
{
"epoch": 2.4452296819787986,
"grad_norm": 0.16794918557821822,
"learning_rate": 1.0340314136125655e-05,
"loss": 0.2259,
"step": 692
},
{
"epoch": 2.4487632508833923,
"grad_norm": 0.18508593030518672,
"learning_rate": 1.0274869109947645e-05,
"loss": 0.2306,
"step": 693
},
{
"epoch": 2.452296819787986,
"grad_norm": 0.18204207861169194,
"learning_rate": 1.0209424083769634e-05,
"loss": 0.2299,
"step": 694
},
{
"epoch": 2.4558303886925796,
"grad_norm": 0.19257618024613238,
"learning_rate": 1.0143979057591624e-05,
"loss": 0.2362,
"step": 695
},
{
"epoch": 2.4593639575971733,
"grad_norm": 0.18235087269820668,
"learning_rate": 1.0078534031413613e-05,
"loss": 0.2301,
"step": 696
},
{
"epoch": 2.462897526501767,
"grad_norm": 0.1771610352100014,
"learning_rate": 1.0013089005235603e-05,
"loss": 0.2284,
"step": 697
},
{
"epoch": 2.46643109540636,
"grad_norm": 0.16852508312931086,
"learning_rate": 9.947643979057591e-06,
"loss": 0.2263,
"step": 698
},
{
"epoch": 2.469964664310954,
"grad_norm": 0.17879754866031772,
"learning_rate": 9.882198952879581e-06,
"loss": 0.2179,
"step": 699
},
{
"epoch": 2.4734982332155475,
"grad_norm": 0.18975466708297106,
"learning_rate": 9.816753926701572e-06,
"loss": 0.2323,
"step": 700
},
{
"epoch": 2.477031802120141,
"grad_norm": 0.19030778371155557,
"learning_rate": 9.75130890052356e-06,
"loss": 0.2401,
"step": 701
},
{
"epoch": 2.480565371024735,
"grad_norm": 0.1721870322288873,
"learning_rate": 9.68586387434555e-06,
"loss": 0.2332,
"step": 702
},
{
"epoch": 2.4840989399293285,
"grad_norm": 0.17612949656387641,
"learning_rate": 9.620418848167539e-06,
"loss": 0.2251,
"step": 703
},
{
"epoch": 2.487632508833922,
"grad_norm": 0.19049574466158792,
"learning_rate": 9.554973821989529e-06,
"loss": 0.2233,
"step": 704
},
{
"epoch": 2.491166077738516,
"grad_norm": 0.19119113177069816,
"learning_rate": 9.489528795811518e-06,
"loss": 0.2388,
"step": 705
},
{
"epoch": 2.4946996466431095,
"grad_norm": 0.1763243903971157,
"learning_rate": 9.424083769633508e-06,
"loss": 0.237,
"step": 706
},
{
"epoch": 2.498233215547703,
"grad_norm": 0.17917172574266083,
"learning_rate": 9.358638743455498e-06,
"loss": 0.2146,
"step": 707
},
{
"epoch": 2.501766784452297,
"grad_norm": 0.18500720632480228,
"learning_rate": 9.293193717277488e-06,
"loss": 0.2175,
"step": 708
},
{
"epoch": 2.5053003533568905,
"grad_norm": 0.17076071175040308,
"learning_rate": 9.227748691099477e-06,
"loss": 0.2233,
"step": 709
},
{
"epoch": 2.508833922261484,
"grad_norm": 0.18703476130370938,
"learning_rate": 9.162303664921467e-06,
"loss": 0.2264,
"step": 710
},
{
"epoch": 2.512367491166078,
"grad_norm": 0.1807528307723562,
"learning_rate": 9.096858638743457e-06,
"loss": 0.2263,
"step": 711
},
{
"epoch": 2.5159010600706715,
"grad_norm": 0.1803942381596997,
"learning_rate": 9.031413612565446e-06,
"loss": 0.2309,
"step": 712
},
{
"epoch": 2.519434628975265,
"grad_norm": 0.1773556486849811,
"learning_rate": 8.965968586387436e-06,
"loss": 0.2441,
"step": 713
},
{
"epoch": 2.522968197879859,
"grad_norm": 0.18542163153506033,
"learning_rate": 8.900523560209424e-06,
"loss": 0.2414,
"step": 714
},
{
"epoch": 2.5265017667844525,
"grad_norm": 0.20626813232549362,
"learning_rate": 8.835078534031415e-06,
"loss": 0.2412,
"step": 715
},
{
"epoch": 2.530035335689046,
"grad_norm": 0.1855977128826217,
"learning_rate": 8.769633507853403e-06,
"loss": 0.2339,
"step": 716
},
{
"epoch": 2.53356890459364,
"grad_norm": 0.1813366114895612,
"learning_rate": 8.704188481675393e-06,
"loss": 0.2242,
"step": 717
},
{
"epoch": 2.5371024734982335,
"grad_norm": 0.19002571308005264,
"learning_rate": 8.638743455497382e-06,
"loss": 0.235,
"step": 718
},
{
"epoch": 2.5406360424028267,
"grad_norm": 0.18539272616962202,
"learning_rate": 8.573298429319372e-06,
"loss": 0.2284,
"step": 719
},
{
"epoch": 2.5441696113074204,
"grad_norm": 0.19414071448670545,
"learning_rate": 8.507853403141362e-06,
"loss": 0.2353,
"step": 720
},
{
"epoch": 2.547703180212014,
"grad_norm": 0.19004484882643627,
"learning_rate": 8.44240837696335e-06,
"loss": 0.2299,
"step": 721
},
{
"epoch": 2.5512367491166077,
"grad_norm": 0.17085293351660877,
"learning_rate": 8.37696335078534e-06,
"loss": 0.2194,
"step": 722
},
{
"epoch": 2.5547703180212014,
"grad_norm": 0.17870425668294465,
"learning_rate": 8.31151832460733e-06,
"loss": 0.2346,
"step": 723
},
{
"epoch": 2.558303886925795,
"grad_norm": 0.18466560846941446,
"learning_rate": 8.24607329842932e-06,
"loss": 0.2132,
"step": 724
},
{
"epoch": 2.5618374558303887,
"grad_norm": 0.1777921475368777,
"learning_rate": 8.18062827225131e-06,
"loss": 0.2235,
"step": 725
},
{
"epoch": 2.5653710247349824,
"grad_norm": 0.1710971165413661,
"learning_rate": 8.115183246073298e-06,
"loss": 0.2272,
"step": 726
},
{
"epoch": 2.568904593639576,
"grad_norm": 0.18128848778107537,
"learning_rate": 8.049738219895288e-06,
"loss": 0.2265,
"step": 727
},
{
"epoch": 2.5724381625441697,
"grad_norm": 0.17963027519739994,
"learning_rate": 7.984293193717279e-06,
"loss": 0.2252,
"step": 728
},
{
"epoch": 2.5759717314487633,
"grad_norm": 0.19091112476672958,
"learning_rate": 7.918848167539267e-06,
"loss": 0.2271,
"step": 729
},
{
"epoch": 2.579505300353357,
"grad_norm": 0.19825785904033563,
"learning_rate": 7.853403141361257e-06,
"loss": 0.236,
"step": 730
},
{
"epoch": 2.5830388692579507,
"grad_norm": 0.1878224722552544,
"learning_rate": 7.787958115183248e-06,
"loss": 0.2322,
"step": 731
},
{
"epoch": 2.586572438162544,
"grad_norm": 0.17309652280394397,
"learning_rate": 7.722513089005236e-06,
"loss": 0.2296,
"step": 732
},
{
"epoch": 2.5901060070671376,
"grad_norm": 0.17845422168913475,
"learning_rate": 7.657068062827226e-06,
"loss": 0.232,
"step": 733
},
{
"epoch": 2.5936395759717312,
"grad_norm": 0.17763690273063018,
"learning_rate": 7.591623036649215e-06,
"loss": 0.2368,
"step": 734
},
{
"epoch": 2.597173144876325,
"grad_norm": 0.19229308399713677,
"learning_rate": 7.526178010471205e-06,
"loss": 0.2255,
"step": 735
},
{
"epoch": 2.6007067137809186,
"grad_norm": 0.1743293912268192,
"learning_rate": 7.4607329842931935e-06,
"loss": 0.2133,
"step": 736
},
{
"epoch": 2.604240282685512,
"grad_norm": 0.17758210208630032,
"learning_rate": 7.395287958115184e-06,
"loss": 0.2339,
"step": 737
},
{
"epoch": 2.607773851590106,
"grad_norm": 0.1730809602186094,
"learning_rate": 7.329842931937172e-06,
"loss": 0.2305,
"step": 738
},
{
"epoch": 2.6113074204946995,
"grad_norm": 0.18476574920184824,
"learning_rate": 7.264397905759162e-06,
"loss": 0.234,
"step": 739
},
{
"epoch": 2.614840989399293,
"grad_norm": 0.1778533090087193,
"learning_rate": 7.1989528795811526e-06,
"loss": 0.2214,
"step": 740
},
{
"epoch": 2.618374558303887,
"grad_norm": 0.17217787306000468,
"learning_rate": 7.133507853403142e-06,
"loss": 0.2384,
"step": 741
},
{
"epoch": 2.6219081272084805,
"grad_norm": 0.18007925906519412,
"learning_rate": 7.068062827225132e-06,
"loss": 0.2251,
"step": 742
},
{
"epoch": 2.625441696113074,
"grad_norm": 0.17473238391733925,
"learning_rate": 7.002617801047121e-06,
"loss": 0.228,
"step": 743
},
{
"epoch": 2.628975265017668,
"grad_norm": 0.17479271644921315,
"learning_rate": 6.937172774869111e-06,
"loss": 0.2191,
"step": 744
},
{
"epoch": 2.6325088339222615,
"grad_norm": 0.16532580992442925,
"learning_rate": 6.871727748691099e-06,
"loss": 0.2171,
"step": 745
},
{
"epoch": 2.636042402826855,
"grad_norm": 0.17099488356759243,
"learning_rate": 6.8062827225130895e-06,
"loss": 0.2088,
"step": 746
},
{
"epoch": 2.639575971731449,
"grad_norm": 0.17352866725108512,
"learning_rate": 6.740837696335078e-06,
"loss": 0.226,
"step": 747
},
{
"epoch": 2.6431095406360425,
"grad_norm": 0.16739331587380185,
"learning_rate": 6.675392670157068e-06,
"loss": 0.2144,
"step": 748
},
{
"epoch": 2.646643109540636,
"grad_norm": 0.18099645515359558,
"learning_rate": 6.609947643979058e-06,
"loss": 0.2329,
"step": 749
},
{
"epoch": 2.65017667844523,
"grad_norm": 0.1844474315417496,
"learning_rate": 6.544502617801048e-06,
"loss": 0.234,
"step": 750
},
{
"epoch": 2.6537102473498235,
"grad_norm": 0.19320613894053068,
"learning_rate": 6.479057591623037e-06,
"loss": 0.2284,
"step": 751
},
{
"epoch": 2.657243816254417,
"grad_norm": 0.16568408210539942,
"learning_rate": 6.4136125654450265e-06,
"loss": 0.2225,
"step": 752
},
{
"epoch": 2.660777385159011,
"grad_norm": 0.16962059001372878,
"learning_rate": 6.348167539267017e-06,
"loss": 0.2298,
"step": 753
},
{
"epoch": 2.664310954063604,
"grad_norm": 0.1772538867004994,
"learning_rate": 6.282722513089005e-06,
"loss": 0.2359,
"step": 754
},
{
"epoch": 2.6678445229681977,
"grad_norm": 0.208458494365575,
"learning_rate": 6.217277486910995e-06,
"loss": 0.2391,
"step": 755
},
{
"epoch": 2.6713780918727914,
"grad_norm": 0.18295771780617642,
"learning_rate": 6.151832460732985e-06,
"loss": 0.2208,
"step": 756
},
{
"epoch": 2.674911660777385,
"grad_norm": 0.18781939728781455,
"learning_rate": 6.086387434554974e-06,
"loss": 0.2283,
"step": 757
},
{
"epoch": 2.6784452296819787,
"grad_norm": 0.1898580024071079,
"learning_rate": 6.0209424083769635e-06,
"loss": 0.2424,
"step": 758
},
{
"epoch": 2.6819787985865724,
"grad_norm": 0.17961541614640156,
"learning_rate": 5.955497382198953e-06,
"loss": 0.236,
"step": 759
},
{
"epoch": 2.685512367491166,
"grad_norm": 0.16567181235802067,
"learning_rate": 5.890052356020943e-06,
"loss": 0.2225,
"step": 760
},
{
"epoch": 2.6890459363957597,
"grad_norm": 0.17790702965628052,
"learning_rate": 5.824607329842932e-06,
"loss": 0.2208,
"step": 761
},
{
"epoch": 2.6925795053003534,
"grad_norm": 0.1906891882199232,
"learning_rate": 5.759162303664922e-06,
"loss": 0.2497,
"step": 762
},
{
"epoch": 2.696113074204947,
"grad_norm": 0.16870765126801932,
"learning_rate": 5.693717277486911e-06,
"loss": 0.2282,
"step": 763
},
{
"epoch": 2.6996466431095407,
"grad_norm": 0.18524540596037145,
"learning_rate": 5.6282722513089e-06,
"loss": 0.2446,
"step": 764
},
{
"epoch": 2.7031802120141344,
"grad_norm": 0.18434145297554605,
"learning_rate": 5.562827225130891e-06,
"loss": 0.2348,
"step": 765
},
{
"epoch": 2.706713780918728,
"grad_norm": 0.171064041185688,
"learning_rate": 5.49738219895288e-06,
"loss": 0.2306,
"step": 766
},
{
"epoch": 2.7102473498233217,
"grad_norm": 0.1812884263450971,
"learning_rate": 5.431937172774869e-06,
"loss": 0.2363,
"step": 767
},
{
"epoch": 2.713780918727915,
"grad_norm": 0.16878883377542475,
"learning_rate": 5.366492146596859e-06,
"loss": 0.2327,
"step": 768
},
{
"epoch": 2.7173144876325086,
"grad_norm": 0.18648845286469032,
"learning_rate": 5.301047120418849e-06,
"loss": 0.2294,
"step": 769
},
{
"epoch": 2.7208480565371023,
"grad_norm": 0.18393440372386363,
"learning_rate": 5.235602094240838e-06,
"loss": 0.2387,
"step": 770
},
{
"epoch": 2.724381625441696,
"grad_norm": 0.17522912008387853,
"learning_rate": 5.170157068062828e-06,
"loss": 0.2359,
"step": 771
},
{
"epoch": 2.7279151943462896,
"grad_norm": 0.1731305786571367,
"learning_rate": 5.104712041884817e-06,
"loss": 0.2408,
"step": 772
},
{
"epoch": 2.7314487632508833,
"grad_norm": 0.18392262080723032,
"learning_rate": 5.039267015706806e-06,
"loss": 0.2378,
"step": 773
},
{
"epoch": 2.734982332155477,
"grad_norm": 0.2324644454310263,
"learning_rate": 4.973821989528796e-06,
"loss": 0.2364,
"step": 774
},
{
"epoch": 2.7385159010600706,
"grad_norm": 0.22020168672975418,
"learning_rate": 4.908376963350786e-06,
"loss": 0.2412,
"step": 775
},
{
"epoch": 2.7420494699646643,
"grad_norm": 0.17317542374435127,
"learning_rate": 4.842931937172775e-06,
"loss": 0.2299,
"step": 776
},
{
"epoch": 2.745583038869258,
"grad_norm": 0.17453862428298303,
"learning_rate": 4.7774869109947645e-06,
"loss": 0.2162,
"step": 777
},
{
"epoch": 2.7491166077738516,
"grad_norm": 0.18955681041909164,
"learning_rate": 4.712041884816754e-06,
"loss": 0.2482,
"step": 778
},
{
"epoch": 2.7526501766784452,
"grad_norm": 0.18104170552403998,
"learning_rate": 4.646596858638744e-06,
"loss": 0.224,
"step": 779
},
{
"epoch": 2.756183745583039,
"grad_norm": 0.18492905369504056,
"learning_rate": 4.5811518324607335e-06,
"loss": 0.2345,
"step": 780
},
{
"epoch": 2.7597173144876326,
"grad_norm": 0.1706257903083117,
"learning_rate": 4.515706806282723e-06,
"loss": 0.2262,
"step": 781
},
{
"epoch": 2.7632508833922262,
"grad_norm": 0.17762579376198123,
"learning_rate": 4.450261780104712e-06,
"loss": 0.2458,
"step": 782
},
{
"epoch": 2.76678445229682,
"grad_norm": 0.18167233315938297,
"learning_rate": 4.3848167539267015e-06,
"loss": 0.2342,
"step": 783
},
{
"epoch": 2.7703180212014136,
"grad_norm": 0.17697110967380986,
"learning_rate": 4.319371727748691e-06,
"loss": 0.2144,
"step": 784
},
{
"epoch": 2.7738515901060072,
"grad_norm": 0.16888070118445714,
"learning_rate": 4.253926701570681e-06,
"loss": 0.2211,
"step": 785
},
{
"epoch": 2.777385159010601,
"grad_norm": 0.17244316217011102,
"learning_rate": 4.18848167539267e-06,
"loss": 0.2355,
"step": 786
},
{
"epoch": 2.7809187279151946,
"grad_norm": 0.17217798599995793,
"learning_rate": 4.12303664921466e-06,
"loss": 0.2315,
"step": 787
},
{
"epoch": 2.7844522968197882,
"grad_norm": 0.16900324423286736,
"learning_rate": 4.057591623036649e-06,
"loss": 0.2175,
"step": 788
},
{
"epoch": 2.787985865724382,
"grad_norm": 0.17252504195615134,
"learning_rate": 3.992146596858639e-06,
"loss": 0.2247,
"step": 789
},
{
"epoch": 2.791519434628975,
"grad_norm": 0.17503324123716169,
"learning_rate": 3.926701570680629e-06,
"loss": 0.2391,
"step": 790
},
{
"epoch": 2.795053003533569,
"grad_norm": 0.17887917773146636,
"learning_rate": 3.861256544502618e-06,
"loss": 0.2356,
"step": 791
},
{
"epoch": 2.7985865724381624,
"grad_norm": 0.16689747548257064,
"learning_rate": 3.7958115183246074e-06,
"loss": 0.2324,
"step": 792
},
{
"epoch": 2.802120141342756,
"grad_norm": 0.16566627704255155,
"learning_rate": 3.7303664921465967e-06,
"loss": 0.2196,
"step": 793
},
{
"epoch": 2.8056537102473498,
"grad_norm": 0.17460271204043498,
"learning_rate": 3.664921465968586e-06,
"loss": 0.2266,
"step": 794
},
{
"epoch": 2.8091872791519434,
"grad_norm": 0.16802373354888647,
"learning_rate": 3.5994764397905763e-06,
"loss": 0.2409,
"step": 795
},
{
"epoch": 2.812720848056537,
"grad_norm": 0.16973041195636776,
"learning_rate": 3.534031413612566e-06,
"loss": 0.2146,
"step": 796
},
{
"epoch": 2.8162544169611308,
"grad_norm": 0.172066590913796,
"learning_rate": 3.4685863874345554e-06,
"loss": 0.2408,
"step": 797
},
{
"epoch": 2.8197879858657244,
"grad_norm": 0.18316651716489063,
"learning_rate": 3.4031413612565448e-06,
"loss": 0.2229,
"step": 798
},
{
"epoch": 2.823321554770318,
"grad_norm": 0.16403280579710008,
"learning_rate": 3.337696335078534e-06,
"loss": 0.2217,
"step": 799
},
{
"epoch": 2.8268551236749118,
"grad_norm": 0.17741209944012906,
"learning_rate": 3.272251308900524e-06,
"loss": 0.2501,
"step": 800
},
{
"epoch": 2.8303886925795054,
"grad_norm": 0.16137396888889607,
"learning_rate": 3.2068062827225132e-06,
"loss": 0.2186,
"step": 801
},
{
"epoch": 2.833922261484099,
"grad_norm": 0.16949048941534778,
"learning_rate": 3.1413612565445026e-06,
"loss": 0.2174,
"step": 802
},
{
"epoch": 2.8374558303886923,
"grad_norm": 0.1686588906993951,
"learning_rate": 3.0759162303664924e-06,
"loss": 0.2299,
"step": 803
},
{
"epoch": 2.840989399293286,
"grad_norm": 0.16448688019936727,
"learning_rate": 3.0104712041884817e-06,
"loss": 0.2177,
"step": 804
},
{
"epoch": 2.8445229681978796,
"grad_norm": 0.16495219381817627,
"learning_rate": 2.9450261780104715e-06,
"loss": 0.2209,
"step": 805
},
{
"epoch": 2.8480565371024733,
"grad_norm": 0.17634472553603125,
"learning_rate": 2.879581151832461e-06,
"loss": 0.2348,
"step": 806
},
{
"epoch": 2.851590106007067,
"grad_norm": 0.16625742857043982,
"learning_rate": 2.81413612565445e-06,
"loss": 0.2264,
"step": 807
},
{
"epoch": 2.8551236749116606,
"grad_norm": 0.16880798094573282,
"learning_rate": 2.74869109947644e-06,
"loss": 0.2347,
"step": 808
},
{
"epoch": 2.8586572438162543,
"grad_norm": 0.16745510202120553,
"learning_rate": 2.6832460732984293e-06,
"loss": 0.2253,
"step": 809
},
{
"epoch": 2.862190812720848,
"grad_norm": 0.1657508156868396,
"learning_rate": 2.617801047120419e-06,
"loss": 0.2352,
"step": 810
},
{
"epoch": 2.8657243816254416,
"grad_norm": 0.1646821558180559,
"learning_rate": 2.5523560209424085e-06,
"loss": 0.2191,
"step": 811
},
{
"epoch": 2.8692579505300353,
"grad_norm": 0.16571228687865172,
"learning_rate": 2.486910994764398e-06,
"loss": 0.2189,
"step": 812
},
{
"epoch": 2.872791519434629,
"grad_norm": 0.17472138811402835,
"learning_rate": 2.4214659685863876e-06,
"loss": 0.2211,
"step": 813
},
{
"epoch": 2.8763250883392226,
"grad_norm": 0.17256595441300265,
"learning_rate": 2.356020942408377e-06,
"loss": 0.2164,
"step": 814
},
{
"epoch": 2.8798586572438163,
"grad_norm": 0.17025040207781883,
"learning_rate": 2.2905759162303667e-06,
"loss": 0.231,
"step": 815
},
{
"epoch": 2.88339222614841,
"grad_norm": 0.17098281011566185,
"learning_rate": 2.225130890052356e-06,
"loss": 0.2343,
"step": 816
},
{
"epoch": 2.8869257950530036,
"grad_norm": 0.1663593223270978,
"learning_rate": 2.1596858638743454e-06,
"loss": 0.2288,
"step": 817
},
{
"epoch": 2.8904593639575973,
"grad_norm": 0.17069617960827696,
"learning_rate": 2.094240837696335e-06,
"loss": 0.2371,
"step": 818
},
{
"epoch": 2.893992932862191,
"grad_norm": 0.16699093863345413,
"learning_rate": 2.0287958115183246e-06,
"loss": 0.2184,
"step": 819
},
{
"epoch": 2.8975265017667846,
"grad_norm": 0.17733698182044444,
"learning_rate": 1.9633507853403143e-06,
"loss": 0.223,
"step": 820
},
{
"epoch": 2.9010600706713783,
"grad_norm": 0.17166540116176068,
"learning_rate": 1.8979057591623037e-06,
"loss": 0.2216,
"step": 821
},
{
"epoch": 2.904593639575972,
"grad_norm": 0.1716140318997439,
"learning_rate": 1.832460732984293e-06,
"loss": 0.2289,
"step": 822
},
{
"epoch": 2.9081272084805656,
"grad_norm": 0.17364616113079412,
"learning_rate": 1.767015706806283e-06,
"loss": 0.23,
"step": 823
},
{
"epoch": 2.9116607773851593,
"grad_norm": 0.16775710455772982,
"learning_rate": 1.7015706806282724e-06,
"loss": 0.2295,
"step": 824
},
{
"epoch": 2.9151943462897525,
"grad_norm": 0.1614361444690107,
"learning_rate": 1.636125654450262e-06,
"loss": 0.2112,
"step": 825
},
{
"epoch": 2.918727915194346,
"grad_norm": 0.16919622335572776,
"learning_rate": 1.5706806282722513e-06,
"loss": 0.2316,
"step": 826
},
{
"epoch": 2.92226148409894,
"grad_norm": 0.16786393863730842,
"learning_rate": 1.5052356020942409e-06,
"loss": 0.2184,
"step": 827
},
{
"epoch": 2.9257950530035335,
"grad_norm": 0.16991202350514956,
"learning_rate": 1.4397905759162304e-06,
"loss": 0.2331,
"step": 828
},
{
"epoch": 2.929328621908127,
"grad_norm": 0.16786823572269294,
"learning_rate": 1.37434554973822e-06,
"loss": 0.2399,
"step": 829
},
{
"epoch": 2.932862190812721,
"grad_norm": 0.16438827374653608,
"learning_rate": 1.3089005235602096e-06,
"loss": 0.2275,
"step": 830
},
{
"epoch": 2.9363957597173145,
"grad_norm": 0.16846548637544675,
"learning_rate": 1.243455497382199e-06,
"loss": 0.2404,
"step": 831
},
{
"epoch": 2.939929328621908,
"grad_norm": 0.16565663831084615,
"learning_rate": 1.1780104712041885e-06,
"loss": 0.2292,
"step": 832
},
{
"epoch": 2.943462897526502,
"grad_norm": 0.16230690808269863,
"learning_rate": 1.112565445026178e-06,
"loss": 0.2192,
"step": 833
},
{
"epoch": 2.9469964664310955,
"grad_norm": 0.17771104973165297,
"learning_rate": 1.0471204188481676e-06,
"loss": 0.2331,
"step": 834
},
{
"epoch": 2.950530035335689,
"grad_norm": 0.1745845197810451,
"learning_rate": 9.816753926701572e-07,
"loss": 0.2358,
"step": 835
},
{
"epoch": 2.954063604240283,
"grad_norm": 0.1753891814760547,
"learning_rate": 9.162303664921465e-07,
"loss": 0.2311,
"step": 836
},
{
"epoch": 2.9575971731448765,
"grad_norm": 0.16785542988565672,
"learning_rate": 8.507853403141362e-07,
"loss": 0.2374,
"step": 837
},
{
"epoch": 2.9611307420494697,
"grad_norm": 0.16973867950381683,
"learning_rate": 7.853403141361256e-07,
"loss": 0.2289,
"step": 838
},
{
"epoch": 2.9646643109540634,
"grad_norm": 0.16705812729898264,
"learning_rate": 7.198952879581152e-07,
"loss": 0.2385,
"step": 839
},
{
"epoch": 2.968197879858657,
"grad_norm": 0.16378873880518796,
"learning_rate": 6.544502617801048e-07,
"loss": 0.2223,
"step": 840
},
{
"epoch": 2.9717314487632507,
"grad_norm": 0.1649356905574732,
"learning_rate": 5.890052356020942e-07,
"loss": 0.2213,
"step": 841
},
{
"epoch": 2.9752650176678443,
"grad_norm": 0.1687471752325434,
"learning_rate": 5.235602094240838e-07,
"loss": 0.2167,
"step": 842
},
{
"epoch": 2.978798586572438,
"grad_norm": 0.1641759936181568,
"learning_rate": 4.5811518324607326e-07,
"loss": 0.2236,
"step": 843
},
{
"epoch": 2.9823321554770317,
"grad_norm": 0.1666836130286124,
"learning_rate": 3.926701570680628e-07,
"loss": 0.2239,
"step": 844
},
{
"epoch": 2.9858657243816253,
"grad_norm": 0.172077591640406,
"learning_rate": 3.272251308900524e-07,
"loss": 0.2341,
"step": 845
},
{
"epoch": 2.989399293286219,
"grad_norm": 0.16566317657104862,
"learning_rate": 2.617801047120419e-07,
"loss": 0.2245,
"step": 846
},
{
"epoch": 2.9929328621908127,
"grad_norm": 0.16745361719266888,
"learning_rate": 1.963350785340314e-07,
"loss": 0.2194,
"step": 847
},
{
"epoch": 2.9964664310954063,
"grad_norm": 0.16767114268645117,
"learning_rate": 1.3089005235602095e-07,
"loss": 0.2351,
"step": 848
},
{
"epoch": 3.0,
"grad_norm": 0.16528168365944196,
"learning_rate": 6.544502617801048e-08,
"loss": 0.2072,
"step": 849
},
{
"epoch": 3.0,
"step": 849,
"total_flos": 9.439633427365102e+18,
"train_loss": 0.43516799947959933,
"train_runtime": 28401.7223,
"train_samples_per_second": 0.478,
"train_steps_per_second": 0.03
}
],
"logging_steps": 1,
"max_steps": 849,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.439633427365102e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}