{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.99675535366645,
"eval_steps": 500,
"global_step": 2310,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0012978585334198572,
"grad_norm": 51.281074849235615,
"learning_rate": 0.0,
"loss": 11.2228,
"step": 1
},
{
"epoch": 0.0025957170668397143,
"grad_norm": 52.288477767034706,
"learning_rate": 2.1645021645021646e-07,
"loss": 11.2142,
"step": 2
},
{
"epoch": 0.003893575600259572,
"grad_norm": 51.690439395204805,
"learning_rate": 4.329004329004329e-07,
"loss": 11.2982,
"step": 3
},
{
"epoch": 0.005191434133679429,
"grad_norm": 52.70272970065567,
"learning_rate": 6.493506493506494e-07,
"loss": 11.2202,
"step": 4
},
{
"epoch": 0.006489292667099286,
"grad_norm": 52.19249689221791,
"learning_rate": 8.658008658008658e-07,
"loss": 11.223,
"step": 5
},
{
"epoch": 0.007787151200519144,
"grad_norm": 52.51499888342824,
"learning_rate": 1.0822510822510822e-06,
"loss": 11.1764,
"step": 6
},
{
"epoch": 0.009085009733939001,
"grad_norm": 54.65313049626493,
"learning_rate": 1.2987012987012988e-06,
"loss": 11.1836,
"step": 7
},
{
"epoch": 0.010382868267358857,
"grad_norm": 56.26332260448297,
"learning_rate": 1.5151515151515152e-06,
"loss": 10.9434,
"step": 8
},
{
"epoch": 0.011680726800778715,
"grad_norm": 56.526613959934075,
"learning_rate": 1.7316017316017317e-06,
"loss": 10.8116,
"step": 9
},
{
"epoch": 0.012978585334198572,
"grad_norm": 73.22487510820876,
"learning_rate": 1.948051948051948e-06,
"loss": 10.0099,
"step": 10
},
{
"epoch": 0.01427644386761843,
"grad_norm": 78.76237638740871,
"learning_rate": 2.1645021645021643e-06,
"loss": 9.7022,
"step": 11
},
{
"epoch": 0.015574302401038288,
"grad_norm": 84.53260376952899,
"learning_rate": 2.3809523809523808e-06,
"loss": 9.2879,
"step": 12
},
{
"epoch": 0.016872160934458143,
"grad_norm": 93.89336284136559,
"learning_rate": 2.5974025974025976e-06,
"loss": 9.1068,
"step": 13
},
{
"epoch": 0.018170019467878003,
"grad_norm": 67.50935080000205,
"learning_rate": 2.813852813852814e-06,
"loss": 4.1366,
"step": 14
},
{
"epoch": 0.01946787800129786,
"grad_norm": 60.451555462271216,
"learning_rate": 3.0303030303030305e-06,
"loss": 3.7321,
"step": 15
},
{
"epoch": 0.020765736534717714,
"grad_norm": 48.941978109448335,
"learning_rate": 3.2467532467532465e-06,
"loss": 3.3276,
"step": 16
},
{
"epoch": 0.022063595068137574,
"grad_norm": 41.193189217962875,
"learning_rate": 3.4632034632034634e-06,
"loss": 3.0135,
"step": 17
},
{
"epoch": 0.02336145360155743,
"grad_norm": 19.806882723512313,
"learning_rate": 3.67965367965368e-06,
"loss": 2.1732,
"step": 18
},
{
"epoch": 0.02465931213497729,
"grad_norm": 6.601644234547365,
"learning_rate": 3.896103896103896e-06,
"loss": 1.5589,
"step": 19
},
{
"epoch": 0.025957170668397145,
"grad_norm": 5.420878318524882,
"learning_rate": 4.112554112554113e-06,
"loss": 1.4694,
"step": 20
},
{
"epoch": 0.027255029201817,
"grad_norm": 4.513535899746422,
"learning_rate": 4.329004329004329e-06,
"loss": 1.4127,
"step": 21
},
{
"epoch": 0.02855288773523686,
"grad_norm": 3.4399120070700926,
"learning_rate": 4.5454545454545455e-06,
"loss": 1.308,
"step": 22
},
{
"epoch": 0.029850746268656716,
"grad_norm": 2.737903832715275,
"learning_rate": 4.7619047619047615e-06,
"loss": 1.2391,
"step": 23
},
{
"epoch": 0.031148604802076575,
"grad_norm": 2.229792362203531,
"learning_rate": 4.978354978354978e-06,
"loss": 1.207,
"step": 24
},
{
"epoch": 0.03244646333549643,
"grad_norm": 1.67506952925642,
"learning_rate": 5.194805194805195e-06,
"loss": 1.164,
"step": 25
},
{
"epoch": 0.03374432186891629,
"grad_norm": 7.810491533013112,
"learning_rate": 5.411255411255411e-06,
"loss": 1.0519,
"step": 26
},
{
"epoch": 0.03504218040233614,
"grad_norm": 1.8418087052814074,
"learning_rate": 5.627705627705628e-06,
"loss": 1.0332,
"step": 27
},
{
"epoch": 0.036340038935756006,
"grad_norm": 1.6946651958442733,
"learning_rate": 5.844155844155844e-06,
"loss": 1.0257,
"step": 28
},
{
"epoch": 0.03763789746917586,
"grad_norm": 1.0837092117736122,
"learning_rate": 6.060606060606061e-06,
"loss": 0.9995,
"step": 29
},
{
"epoch": 0.03893575600259572,
"grad_norm": 1.022247931246769,
"learning_rate": 6.277056277056277e-06,
"loss": 0.9658,
"step": 30
},
{
"epoch": 0.04023361453601557,
"grad_norm": 1.0439465825515253,
"learning_rate": 6.493506493506493e-06,
"loss": 0.9258,
"step": 31
},
{
"epoch": 0.04153147306943543,
"grad_norm": 0.8476038489892367,
"learning_rate": 6.709956709956711e-06,
"loss": 0.9183,
"step": 32
},
{
"epoch": 0.04282933160285529,
"grad_norm": 0.7260505313959857,
"learning_rate": 6.926406926406927e-06,
"loss": 0.8859,
"step": 33
},
{
"epoch": 0.04412719013627515,
"grad_norm": 0.9334322529996619,
"learning_rate": 7.142857142857143e-06,
"loss": 0.8775,
"step": 34
},
{
"epoch": 0.045425048669695,
"grad_norm": 0.7507371805560344,
"learning_rate": 7.35930735930736e-06,
"loss": 0.8506,
"step": 35
},
{
"epoch": 0.04672290720311486,
"grad_norm": 0.7037392218293158,
"learning_rate": 7.5757575757575764e-06,
"loss": 0.8373,
"step": 36
},
{
"epoch": 0.048020765736534715,
"grad_norm": 0.7026115757535957,
"learning_rate": 7.792207792207792e-06,
"loss": 0.8148,
"step": 37
},
{
"epoch": 0.04931862426995458,
"grad_norm": 0.6329073089403997,
"learning_rate": 8.008658008658008e-06,
"loss": 0.7989,
"step": 38
},
{
"epoch": 0.050616482803374434,
"grad_norm": 0.576557174722755,
"learning_rate": 8.225108225108225e-06,
"loss": 0.801,
"step": 39
},
{
"epoch": 0.05191434133679429,
"grad_norm": 0.633584321792007,
"learning_rate": 8.441558441558442e-06,
"loss": 0.8154,
"step": 40
},
{
"epoch": 0.053212199870214145,
"grad_norm": 0.6357768126157509,
"learning_rate": 8.658008658008657e-06,
"loss": 0.7985,
"step": 41
},
{
"epoch": 0.054510058403634,
"grad_norm": 0.4606140950872704,
"learning_rate": 8.874458874458876e-06,
"loss": 0.7875,
"step": 42
},
{
"epoch": 0.055807916937053864,
"grad_norm": 0.42579840291728105,
"learning_rate": 9.090909090909091e-06,
"loss": 0.7882,
"step": 43
},
{
"epoch": 0.05710577547047372,
"grad_norm": 0.5127047756782175,
"learning_rate": 9.307359307359308e-06,
"loss": 0.7668,
"step": 44
},
{
"epoch": 0.058403634003893576,
"grad_norm": 0.5275747680292829,
"learning_rate": 9.523809523809523e-06,
"loss": 0.7556,
"step": 45
},
{
"epoch": 0.05970149253731343,
"grad_norm": 0.4422307893352111,
"learning_rate": 9.740259740259742e-06,
"loss": 0.7469,
"step": 46
},
{
"epoch": 0.06099935107073329,
"grad_norm": 0.3950972183567316,
"learning_rate": 9.956709956709957e-06,
"loss": 0.7257,
"step": 47
},
{
"epoch": 0.06229720960415315,
"grad_norm": 0.4294144227066294,
"learning_rate": 1.0173160173160174e-05,
"loss": 0.7082,
"step": 48
},
{
"epoch": 0.063595068137573,
"grad_norm": 0.4261355016492852,
"learning_rate": 1.038961038961039e-05,
"loss": 0.7202,
"step": 49
},
{
"epoch": 0.06489292667099286,
"grad_norm": 0.40006881327817506,
"learning_rate": 1.0606060606060607e-05,
"loss": 0.7006,
"step": 50
},
{
"epoch": 0.06619078520441272,
"grad_norm": 0.3484479390008924,
"learning_rate": 1.0822510822510823e-05,
"loss": 0.7141,
"step": 51
},
{
"epoch": 0.06748864373783257,
"grad_norm": 0.3508582969164509,
"learning_rate": 1.103896103896104e-05,
"loss": 0.7203,
"step": 52
},
{
"epoch": 0.06878650227125244,
"grad_norm": 0.3754749738655716,
"learning_rate": 1.1255411255411256e-05,
"loss": 0.7465,
"step": 53
},
{
"epoch": 0.07008436080467229,
"grad_norm": 0.33275270814242425,
"learning_rate": 1.1471861471861473e-05,
"loss": 0.6844,
"step": 54
},
{
"epoch": 0.07138221933809215,
"grad_norm": 0.29711493887953333,
"learning_rate": 1.1688311688311688e-05,
"loss": 0.668,
"step": 55
},
{
"epoch": 0.07268007787151201,
"grad_norm": 0.3254569707924083,
"learning_rate": 1.1904761904761905e-05,
"loss": 0.7234,
"step": 56
},
{
"epoch": 0.07397793640493186,
"grad_norm": 0.2925311216603525,
"learning_rate": 1.2121212121212122e-05,
"loss": 0.7004,
"step": 57
},
{
"epoch": 0.07527579493835172,
"grad_norm": 0.2781203466736231,
"learning_rate": 1.2337662337662339e-05,
"loss": 0.6845,
"step": 58
},
{
"epoch": 0.07657365347177157,
"grad_norm": 0.27946888261667213,
"learning_rate": 1.2554112554112554e-05,
"loss": 0.6999,
"step": 59
},
{
"epoch": 0.07787151200519143,
"grad_norm": 0.2728571313678063,
"learning_rate": 1.2770562770562773e-05,
"loss": 0.6639,
"step": 60
},
{
"epoch": 0.0791693705386113,
"grad_norm": 0.3093993935391829,
"learning_rate": 1.2987012987012986e-05,
"loss": 0.701,
"step": 61
},
{
"epoch": 0.08046722907203115,
"grad_norm": 0.2852724472177098,
"learning_rate": 1.3203463203463205e-05,
"loss": 0.681,
"step": 62
},
{
"epoch": 0.08176508760545101,
"grad_norm": 0.2693071822601781,
"learning_rate": 1.3419913419913421e-05,
"loss": 0.6679,
"step": 63
},
{
"epoch": 0.08306294613887086,
"grad_norm": 0.2883803733655785,
"learning_rate": 1.3636363636363637e-05,
"loss": 0.6871,
"step": 64
},
{
"epoch": 0.08436080467229072,
"grad_norm": 0.27168971444927753,
"learning_rate": 1.3852813852813853e-05,
"loss": 0.6478,
"step": 65
},
{
"epoch": 0.08565866320571058,
"grad_norm": 0.2780741659791045,
"learning_rate": 1.406926406926407e-05,
"loss": 0.6654,
"step": 66
},
{
"epoch": 0.08695652173913043,
"grad_norm": 0.2669958151004055,
"learning_rate": 1.4285714285714285e-05,
"loss": 0.6494,
"step": 67
},
{
"epoch": 0.0882543802725503,
"grad_norm": 0.2645854701003351,
"learning_rate": 1.4502164502164502e-05,
"loss": 0.6343,
"step": 68
},
{
"epoch": 0.08955223880597014,
"grad_norm": 0.27977755521966374,
"learning_rate": 1.471861471861472e-05,
"loss": 0.6703,
"step": 69
},
{
"epoch": 0.09085009733939,
"grad_norm": 0.2701714280796314,
"learning_rate": 1.4935064935064936e-05,
"loss": 0.6657,
"step": 70
},
{
"epoch": 0.09214795587280987,
"grad_norm": 0.3340236352400633,
"learning_rate": 1.5151515151515153e-05,
"loss": 0.6654,
"step": 71
},
{
"epoch": 0.09344581440622972,
"grad_norm": 0.25125625871192836,
"learning_rate": 1.5367965367965366e-05,
"loss": 0.6829,
"step": 72
},
{
"epoch": 0.09474367293964958,
"grad_norm": 0.27623404854696354,
"learning_rate": 1.5584415584415583e-05,
"loss": 0.6864,
"step": 73
},
{
"epoch": 0.09604153147306943,
"grad_norm": 0.2855287411905892,
"learning_rate": 1.5800865800865803e-05,
"loss": 0.6669,
"step": 74
},
{
"epoch": 0.0973393900064893,
"grad_norm": 0.2544109696892319,
"learning_rate": 1.6017316017316017e-05,
"loss": 0.6441,
"step": 75
},
{
"epoch": 0.09863724853990916,
"grad_norm": 0.29021289781813303,
"learning_rate": 1.6233766233766234e-05,
"loss": 0.6664,
"step": 76
},
{
"epoch": 0.099935107073329,
"grad_norm": 0.26812240880351107,
"learning_rate": 1.645021645021645e-05,
"loss": 0.6391,
"step": 77
},
{
"epoch": 0.10123296560674887,
"grad_norm": 0.27576904300300786,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.6355,
"step": 78
},
{
"epoch": 0.10253082414016872,
"grad_norm": 0.2814054287273717,
"learning_rate": 1.6883116883116884e-05,
"loss": 0.6265,
"step": 79
},
{
"epoch": 0.10382868267358858,
"grad_norm": 0.3103049215962741,
"learning_rate": 1.70995670995671e-05,
"loss": 0.6497,
"step": 80
},
{
"epoch": 0.10512654120700844,
"grad_norm": 0.2728333867240695,
"learning_rate": 1.7316017316017315e-05,
"loss": 0.628,
"step": 81
},
{
"epoch": 0.10642439974042829,
"grad_norm": 0.29691347602771223,
"learning_rate": 1.7532467532467535e-05,
"loss": 0.6481,
"step": 82
},
{
"epoch": 0.10772225827384815,
"grad_norm": 0.29273954514595735,
"learning_rate": 1.7748917748917752e-05,
"loss": 0.6261,
"step": 83
},
{
"epoch": 0.109020116807268,
"grad_norm": 0.3074962736781368,
"learning_rate": 1.7965367965367965e-05,
"loss": 0.6299,
"step": 84
},
{
"epoch": 0.11031797534068787,
"grad_norm": 0.29602233662175786,
"learning_rate": 1.8181818181818182e-05,
"loss": 0.6335,
"step": 85
},
{
"epoch": 0.11161583387410773,
"grad_norm": 0.2830666870801868,
"learning_rate": 1.83982683982684e-05,
"loss": 0.641,
"step": 86
},
{
"epoch": 0.11291369240752758,
"grad_norm": 0.3125259689124729,
"learning_rate": 1.8614718614718616e-05,
"loss": 0.6388,
"step": 87
},
{
"epoch": 0.11421155094094744,
"grad_norm": 0.26645549323423784,
"learning_rate": 1.8831168831168833e-05,
"loss": 0.6208,
"step": 88
},
{
"epoch": 0.11550940947436729,
"grad_norm": 0.28954783071217016,
"learning_rate": 1.9047619047619046e-05,
"loss": 0.6273,
"step": 89
},
{
"epoch": 0.11680726800778715,
"grad_norm": 0.28149679569001645,
"learning_rate": 1.9264069264069266e-05,
"loss": 0.6028,
"step": 90
},
{
"epoch": 0.11810512654120701,
"grad_norm": 0.2906262772721881,
"learning_rate": 1.9480519480519483e-05,
"loss": 0.6245,
"step": 91
},
{
"epoch": 0.11940298507462686,
"grad_norm": 0.2711185379042277,
"learning_rate": 1.9696969696969697e-05,
"loss": 0.6267,
"step": 92
},
{
"epoch": 0.12070084360804673,
"grad_norm": 0.3052664513793837,
"learning_rate": 1.9913419913419914e-05,
"loss": 0.6346,
"step": 93
},
{
"epoch": 0.12199870214146658,
"grad_norm": 0.29479074178005676,
"learning_rate": 2.012987012987013e-05,
"loss": 0.6255,
"step": 94
},
{
"epoch": 0.12329656067488644,
"grad_norm": 0.3687002197662538,
"learning_rate": 2.0346320346320347e-05,
"loss": 0.6269,
"step": 95
},
{
"epoch": 0.1245944192083063,
"grad_norm": 0.26974731920341294,
"learning_rate": 2.0562770562770564e-05,
"loss": 0.6355,
"step": 96
},
{
"epoch": 0.12589227774172615,
"grad_norm": 0.35521751114512884,
"learning_rate": 2.077922077922078e-05,
"loss": 0.6293,
"step": 97
},
{
"epoch": 0.127190136275146,
"grad_norm": 0.31122119266101045,
"learning_rate": 2.0995670995670998e-05,
"loss": 0.6548,
"step": 98
},
{
"epoch": 0.12848799480856588,
"grad_norm": 0.32784103974924345,
"learning_rate": 2.1212121212121215e-05,
"loss": 0.6409,
"step": 99
},
{
"epoch": 0.12978585334198572,
"grad_norm": 0.2862191321006967,
"learning_rate": 2.1428571428571428e-05,
"loss": 0.6287,
"step": 100
},
{
"epoch": 0.13108371187540557,
"grad_norm": 0.2888970770108121,
"learning_rate": 2.1645021645021645e-05,
"loss": 0.5825,
"step": 101
},
{
"epoch": 0.13238157040882545,
"grad_norm": 0.27541204550524634,
"learning_rate": 2.1861471861471862e-05,
"loss": 0.6056,
"step": 102
},
{
"epoch": 0.1336794289422453,
"grad_norm": 0.2829745345550545,
"learning_rate": 2.207792207792208e-05,
"loss": 0.6388,
"step": 103
},
{
"epoch": 0.13497728747566515,
"grad_norm": 0.31335331278223877,
"learning_rate": 2.2294372294372296e-05,
"loss": 0.6149,
"step": 104
},
{
"epoch": 0.136275146009085,
"grad_norm": 0.26183513844983125,
"learning_rate": 2.2510822510822512e-05,
"loss": 0.598,
"step": 105
},
{
"epoch": 0.13757300454250487,
"grad_norm": 0.3166303353223508,
"learning_rate": 2.272727272727273e-05,
"loss": 0.6153,
"step": 106
},
{
"epoch": 0.13887086307592472,
"grad_norm": 0.2827827597423759,
"learning_rate": 2.2943722943722946e-05,
"loss": 0.5878,
"step": 107
},
{
"epoch": 0.14016872160934457,
"grad_norm": 0.27950978868403287,
"learning_rate": 2.3160173160173163e-05,
"loss": 0.6022,
"step": 108
},
{
"epoch": 0.14146658014276445,
"grad_norm": 0.31785506543954495,
"learning_rate": 2.3376623376623376e-05,
"loss": 0.6419,
"step": 109
},
{
"epoch": 0.1427644386761843,
"grad_norm": 0.2760724448320942,
"learning_rate": 2.3593073593073593e-05,
"loss": 0.5892,
"step": 110
},
{
"epoch": 0.14406229720960415,
"grad_norm": 0.31705667668464776,
"learning_rate": 2.380952380952381e-05,
"loss": 0.5828,
"step": 111
},
{
"epoch": 0.14536015574302402,
"grad_norm": 0.2786427147511611,
"learning_rate": 2.4025974025974027e-05,
"loss": 0.6189,
"step": 112
},
{
"epoch": 0.14665801427644387,
"grad_norm": 0.33800188191224245,
"learning_rate": 2.4242424242424244e-05,
"loss": 0.5867,
"step": 113
},
{
"epoch": 0.14795587280986372,
"grad_norm": 0.3183986863565769,
"learning_rate": 2.4458874458874457e-05,
"loss": 0.6244,
"step": 114
},
{
"epoch": 0.14925373134328357,
"grad_norm": 0.346611504802979,
"learning_rate": 2.4675324675324678e-05,
"loss": 0.6114,
"step": 115
},
{
"epoch": 0.15055158987670345,
"grad_norm": 0.3193746967683076,
"learning_rate": 2.4891774891774894e-05,
"loss": 0.5847,
"step": 116
},
{
"epoch": 0.1518494484101233,
"grad_norm": 0.329720331399979,
"learning_rate": 2.5108225108225108e-05,
"loss": 0.6104,
"step": 117
},
{
"epoch": 0.15314730694354314,
"grad_norm": 0.30497761214035857,
"learning_rate": 2.5324675324675325e-05,
"loss": 0.6147,
"step": 118
},
{
"epoch": 0.15444516547696302,
"grad_norm": 0.3065657873353463,
"learning_rate": 2.5541125541125545e-05,
"loss": 0.5891,
"step": 119
},
{
"epoch": 0.15574302401038287,
"grad_norm": 0.3040591111660935,
"learning_rate": 2.575757575757576e-05,
"loss": 0.5874,
"step": 120
},
{
"epoch": 0.15704088254380272,
"grad_norm": 0.3176140258251669,
"learning_rate": 2.5974025974025972e-05,
"loss": 0.5891,
"step": 121
},
{
"epoch": 0.1583387410772226,
"grad_norm": 0.33129130491628744,
"learning_rate": 2.6190476190476192e-05,
"loss": 0.5754,
"step": 122
},
{
"epoch": 0.15963659961064244,
"grad_norm": 0.3400250207622185,
"learning_rate": 2.640692640692641e-05,
"loss": 0.5927,
"step": 123
},
{
"epoch": 0.1609344581440623,
"grad_norm": 0.3294442929975534,
"learning_rate": 2.6623376623376623e-05,
"loss": 0.6016,
"step": 124
},
{
"epoch": 0.16223231667748214,
"grad_norm": 0.27952039743370355,
"learning_rate": 2.6839826839826843e-05,
"loss": 0.5674,
"step": 125
},
{
"epoch": 0.16353017521090202,
"grad_norm": 0.3263152361115472,
"learning_rate": 2.7056277056277056e-05,
"loss": 0.6185,
"step": 126
},
{
"epoch": 0.16482803374432187,
"grad_norm": 0.34561117525982527,
"learning_rate": 2.7272727272727273e-05,
"loss": 0.6003,
"step": 127
},
{
"epoch": 0.16612589227774172,
"grad_norm": 0.36330136868220264,
"learning_rate": 2.7489177489177493e-05,
"loss": 0.5796,
"step": 128
},
{
"epoch": 0.1674237508111616,
"grad_norm": 0.3448144052747857,
"learning_rate": 2.7705627705627707e-05,
"loss": 0.5858,
"step": 129
},
{
"epoch": 0.16872160934458144,
"grad_norm": 0.30841385505522906,
"learning_rate": 2.792207792207792e-05,
"loss": 0.5913,
"step": 130
},
{
"epoch": 0.1700194678780013,
"grad_norm": 0.3823986000835476,
"learning_rate": 2.813852813852814e-05,
"loss": 0.6089,
"step": 131
},
{
"epoch": 0.17131732641142117,
"grad_norm": 0.3183137204294537,
"learning_rate": 2.8354978354978357e-05,
"loss": 0.5974,
"step": 132
},
{
"epoch": 0.17261518494484102,
"grad_norm": 0.3375228791953999,
"learning_rate": 2.857142857142857e-05,
"loss": 0.5919,
"step": 133
},
{
"epoch": 0.17391304347826086,
"grad_norm": 0.34769553896113353,
"learning_rate": 2.878787878787879e-05,
"loss": 0.587,
"step": 134
},
{
"epoch": 0.1752109020116807,
"grad_norm": 0.34053830322587214,
"learning_rate": 2.9004329004329005e-05,
"loss": 0.5846,
"step": 135
},
{
"epoch": 0.1765087605451006,
"grad_norm": 0.3327693629121813,
"learning_rate": 2.922077922077922e-05,
"loss": 0.5929,
"step": 136
},
{
"epoch": 0.17780661907852044,
"grad_norm": 0.37595317145253215,
"learning_rate": 2.943722943722944e-05,
"loss": 0.5836,
"step": 137
},
{
"epoch": 0.1791044776119403,
"grad_norm": 0.31124901930305726,
"learning_rate": 2.9653679653679655e-05,
"loss": 0.5946,
"step": 138
},
{
"epoch": 0.18040233614536016,
"grad_norm": 0.41500685318923003,
"learning_rate": 2.9870129870129872e-05,
"loss": 0.599,
"step": 139
},
{
"epoch": 0.18170019467878,
"grad_norm": 0.4422225800744917,
"learning_rate": 3.0086580086580092e-05,
"loss": 0.6079,
"step": 140
},
{
"epoch": 0.18299805321219986,
"grad_norm": 0.3911349391427895,
"learning_rate": 3.0303030303030306e-05,
"loss": 0.5911,
"step": 141
},
{
"epoch": 0.18429591174561974,
"grad_norm": 0.3565760473012978,
"learning_rate": 3.051948051948052e-05,
"loss": 0.5874,
"step": 142
},
{
"epoch": 0.1855937702790396,
"grad_norm": 0.3316833578762426,
"learning_rate": 3.073593073593073e-05,
"loss": 0.5987,
"step": 143
},
{
"epoch": 0.18689162881245944,
"grad_norm": 0.4255792906025628,
"learning_rate": 3.095238095238095e-05,
"loss": 0.5674,
"step": 144
},
{
"epoch": 0.18818948734587929,
"grad_norm": 0.3111389344438918,
"learning_rate": 3.1168831168831166e-05,
"loss": 0.5916,
"step": 145
},
{
"epoch": 0.18948734587929916,
"grad_norm": 0.40391893328316164,
"learning_rate": 3.1385281385281387e-05,
"loss": 0.5862,
"step": 146
},
{
"epoch": 0.190785204412719,
"grad_norm": 0.3571856870514297,
"learning_rate": 3.160173160173161e-05,
"loss": 0.5783,
"step": 147
},
{
"epoch": 0.19208306294613886,
"grad_norm": 0.34724535128608686,
"learning_rate": 3.181818181818182e-05,
"loss": 0.593,
"step": 148
},
{
"epoch": 0.19338092147955874,
"grad_norm": 0.36623311715616075,
"learning_rate": 3.2034632034632034e-05,
"loss": 0.5791,
"step": 149
},
{
"epoch": 0.1946787800129786,
"grad_norm": 0.35421377131407383,
"learning_rate": 3.2251082251082254e-05,
"loss": 0.5869,
"step": 150
},
{
"epoch": 0.19597663854639844,
"grad_norm": 0.3580175565804796,
"learning_rate": 3.246753246753247e-05,
"loss": 0.5731,
"step": 151
},
{
"epoch": 0.1972744970798183,
"grad_norm": 0.3779107544260428,
"learning_rate": 3.268398268398268e-05,
"loss": 0.5888,
"step": 152
},
{
"epoch": 0.19857235561323816,
"grad_norm": 0.381401724832965,
"learning_rate": 3.29004329004329e-05,
"loss": 0.5754,
"step": 153
},
{
"epoch": 0.199870214146658,
"grad_norm": 0.3996699371198549,
"learning_rate": 3.311688311688312e-05,
"loss": 0.5878,
"step": 154
},
{
"epoch": 0.20116807268007786,
"grad_norm": 0.3498521285804811,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.5624,
"step": 155
},
{
"epoch": 0.20246593121349774,
"grad_norm": 0.4329402753533996,
"learning_rate": 3.3549783549783555e-05,
"loss": 0.5823,
"step": 156
},
{
"epoch": 0.20376378974691758,
"grad_norm": 0.4715275117713498,
"learning_rate": 3.376623376623377e-05,
"loss": 0.5679,
"step": 157
},
{
"epoch": 0.20506164828033743,
"grad_norm": 0.4087297995001702,
"learning_rate": 3.398268398268398e-05,
"loss": 0.5492,
"step": 158
},
{
"epoch": 0.2063595068137573,
"grad_norm": 0.3963104181486302,
"learning_rate": 3.41991341991342e-05,
"loss": 0.5779,
"step": 159
},
{
"epoch": 0.20765736534717716,
"grad_norm": 0.4626784659383467,
"learning_rate": 3.4415584415584416e-05,
"loss": 0.5791,
"step": 160
},
{
"epoch": 0.208955223880597,
"grad_norm": 0.4629189431646934,
"learning_rate": 3.463203463203463e-05,
"loss": 0.5709,
"step": 161
},
{
"epoch": 0.21025308241401688,
"grad_norm": 0.4327284524192223,
"learning_rate": 3.484848484848485e-05,
"loss": 0.5821,
"step": 162
},
{
"epoch": 0.21155094094743673,
"grad_norm": 0.42226923421652224,
"learning_rate": 3.506493506493507e-05,
"loss": 0.579,
"step": 163
},
{
"epoch": 0.21284879948085658,
"grad_norm": 0.37986989822155737,
"learning_rate": 3.528138528138528e-05,
"loss": 0.5656,
"step": 164
},
{
"epoch": 0.21414665801427643,
"grad_norm": 0.4629547655665463,
"learning_rate": 3.5497835497835503e-05,
"loss": 0.5703,
"step": 165
},
{
"epoch": 0.2154445165476963,
"grad_norm": 0.41674661311211725,
"learning_rate": 3.571428571428572e-05,
"loss": 0.5775,
"step": 166
},
{
"epoch": 0.21674237508111616,
"grad_norm": 0.37812170353301494,
"learning_rate": 3.593073593073593e-05,
"loss": 0.5647,
"step": 167
},
{
"epoch": 0.218040233614536,
"grad_norm": 0.3533683709945352,
"learning_rate": 3.6147186147186144e-05,
"loss": 0.5742,
"step": 168
},
{
"epoch": 0.21933809214795588,
"grad_norm": 0.3327311378407231,
"learning_rate": 3.6363636363636364e-05,
"loss": 0.5616,
"step": 169
},
{
"epoch": 0.22063595068137573,
"grad_norm": 0.31532044110811386,
"learning_rate": 3.6580086580086584e-05,
"loss": 0.5434,
"step": 170
},
{
"epoch": 0.22193380921479558,
"grad_norm": 0.3191051407668251,
"learning_rate": 3.67965367965368e-05,
"loss": 0.5719,
"step": 171
},
{
"epoch": 0.22323166774821546,
"grad_norm": 0.36508138364995835,
"learning_rate": 3.701298701298702e-05,
"loss": 0.5496,
"step": 172
},
{
"epoch": 0.2245295262816353,
"grad_norm": 0.35917301960844417,
"learning_rate": 3.722943722943723e-05,
"loss": 0.5599,
"step": 173
},
{
"epoch": 0.22582738481505515,
"grad_norm": 0.332146935347223,
"learning_rate": 3.7445887445887445e-05,
"loss": 0.5729,
"step": 174
},
{
"epoch": 0.227125243348475,
"grad_norm": 0.3761709742507644,
"learning_rate": 3.7662337662337665e-05,
"loss": 0.5395,
"step": 175
},
{
"epoch": 0.22842310188189488,
"grad_norm": 0.357015737631827,
"learning_rate": 3.787878787878788e-05,
"loss": 0.5661,
"step": 176
},
{
"epoch": 0.22972096041531473,
"grad_norm": 0.34903223053324706,
"learning_rate": 3.809523809523809e-05,
"loss": 0.5633,
"step": 177
},
{
"epoch": 0.23101881894873458,
"grad_norm": 0.3592680565530814,
"learning_rate": 3.831168831168831e-05,
"loss": 0.5504,
"step": 178
},
{
"epoch": 0.23231667748215445,
"grad_norm": 0.31763219861115954,
"learning_rate": 3.852813852813853e-05,
"loss": 0.5693,
"step": 179
},
{
"epoch": 0.2336145360155743,
"grad_norm": 0.37831351950166914,
"learning_rate": 3.8744588744588746e-05,
"loss": 0.5545,
"step": 180
},
{
"epoch": 0.23491239454899415,
"grad_norm": 0.3032029950520603,
"learning_rate": 3.8961038961038966e-05,
"loss": 0.5483,
"step": 181
},
{
"epoch": 0.23621025308241403,
"grad_norm": 0.45423832636818023,
"learning_rate": 3.917748917748918e-05,
"loss": 0.5646,
"step": 182
},
{
"epoch": 0.23750811161583388,
"grad_norm": 0.3885683241280637,
"learning_rate": 3.939393939393939e-05,
"loss": 0.5782,
"step": 183
},
{
"epoch": 0.23880597014925373,
"grad_norm": 0.4113001708391826,
"learning_rate": 3.9610389610389614e-05,
"loss": 0.5709,
"step": 184
},
{
"epoch": 0.24010382868267358,
"grad_norm": 0.43060691545097807,
"learning_rate": 3.982683982683983e-05,
"loss": 0.5357,
"step": 185
},
{
"epoch": 0.24140168721609345,
"grad_norm": 0.48621329563873417,
"learning_rate": 4.004329004329004e-05,
"loss": 0.5438,
"step": 186
},
{
"epoch": 0.2426995457495133,
"grad_norm": 0.346819520203559,
"learning_rate": 4.025974025974026e-05,
"loss": 0.5448,
"step": 187
},
{
"epoch": 0.24399740428293315,
"grad_norm": 0.5771040244138606,
"learning_rate": 4.047619047619048e-05,
"loss": 0.5609,
"step": 188
},
{
"epoch": 0.24529526281635303,
"grad_norm": 0.5691856093486398,
"learning_rate": 4.0692640692640695e-05,
"loss": 0.5509,
"step": 189
},
{
"epoch": 0.24659312134977288,
"grad_norm": 0.5658078548327832,
"learning_rate": 4.0909090909090915e-05,
"loss": 0.5457,
"step": 190
},
{
"epoch": 0.24789097988319272,
"grad_norm": 0.32565141992028185,
"learning_rate": 4.112554112554113e-05,
"loss": 0.5576,
"step": 191
},
{
"epoch": 0.2491888384166126,
"grad_norm": 0.693035116005457,
"learning_rate": 4.134199134199134e-05,
"loss": 0.5818,
"step": 192
},
{
"epoch": 0.25048669695003245,
"grad_norm": 0.5767521454545272,
"learning_rate": 4.155844155844156e-05,
"loss": 0.5651,
"step": 193
},
{
"epoch": 0.2517845554834523,
"grad_norm": 0.5780821207088752,
"learning_rate": 4.1774891774891775e-05,
"loss": 0.569,
"step": 194
},
{
"epoch": 0.25308241401687215,
"grad_norm": 0.37604239901597153,
"learning_rate": 4.1991341991341996e-05,
"loss": 0.543,
"step": 195
},
{
"epoch": 0.254380272550292,
"grad_norm": 0.5156588377708116,
"learning_rate": 4.220779220779221e-05,
"loss": 0.5655,
"step": 196
},
{
"epoch": 0.2556781310837119,
"grad_norm": 0.547020541236707,
"learning_rate": 4.242424242424243e-05,
"loss": 0.5823,
"step": 197
},
{
"epoch": 0.25697598961713175,
"grad_norm": 0.4902045464021542,
"learning_rate": 4.264069264069264e-05,
"loss": 0.5819,
"step": 198
},
{
"epoch": 0.2582738481505516,
"grad_norm": 0.43892225186858413,
"learning_rate": 4.2857142857142856e-05,
"loss": 0.5407,
"step": 199
},
{
"epoch": 0.25957170668397145,
"grad_norm": 0.35311657422045256,
"learning_rate": 4.3073593073593077e-05,
"loss": 0.5333,
"step": 200
},
{
"epoch": 0.2608695652173913,
"grad_norm": 0.550191337628468,
"learning_rate": 4.329004329004329e-05,
"loss": 0.5691,
"step": 201
},
{
"epoch": 0.26216742375081115,
"grad_norm": 0.3252801360738619,
"learning_rate": 4.3506493506493503e-05,
"loss": 0.5533,
"step": 202
},
{
"epoch": 0.263465282284231,
"grad_norm": 0.4553304513015423,
"learning_rate": 4.3722943722943724e-05,
"loss": 0.5565,
"step": 203
},
{
"epoch": 0.2647631408176509,
"grad_norm": 0.3263307722581273,
"learning_rate": 4.3939393939393944e-05,
"loss": 0.5671,
"step": 204
},
{
"epoch": 0.26606099935107075,
"grad_norm": 0.4000844274004943,
"learning_rate": 4.415584415584416e-05,
"loss": 0.5399,
"step": 205
},
{
"epoch": 0.2673588578844906,
"grad_norm": 0.38431545582799964,
"learning_rate": 4.437229437229438e-05,
"loss": 0.5417,
"step": 206
},
{
"epoch": 0.26865671641791045,
"grad_norm": 0.40816346897613404,
"learning_rate": 4.458874458874459e-05,
"loss": 0.5592,
"step": 207
},
{
"epoch": 0.2699545749513303,
"grad_norm": 0.34584489381728045,
"learning_rate": 4.4805194805194805e-05,
"loss": 0.5438,
"step": 208
},
{
"epoch": 0.27125243348475014,
"grad_norm": 0.4537307823944973,
"learning_rate": 4.5021645021645025e-05,
"loss": 0.5399,
"step": 209
},
{
"epoch": 0.27255029201817,
"grad_norm": 0.5752635325535591,
"learning_rate": 4.523809523809524e-05,
"loss": 0.5672,
"step": 210
},
{
"epoch": 0.2738481505515899,
"grad_norm": 0.4083818210095887,
"learning_rate": 4.545454545454546e-05,
"loss": 0.5617,
"step": 211
},
{
"epoch": 0.27514600908500975,
"grad_norm": 0.3170399657064755,
"learning_rate": 4.567099567099568e-05,
"loss": 0.5352,
"step": 212
},
{
"epoch": 0.2764438676184296,
"grad_norm": 0.31917717130826856,
"learning_rate": 4.588744588744589e-05,
"loss": 0.5617,
"step": 213
},
{
"epoch": 0.27774172615184944,
"grad_norm": 0.3772448329589651,
"learning_rate": 4.6103896103896106e-05,
"loss": 0.5662,
"step": 214
},
{
"epoch": 0.2790395846852693,
"grad_norm": 0.3799776585928483,
"learning_rate": 4.6320346320346326e-05,
"loss": 0.5814,
"step": 215
},
{
"epoch": 0.28033744321868914,
"grad_norm": 0.376252486652199,
"learning_rate": 4.653679653679654e-05,
"loss": 0.5475,
"step": 216
},
{
"epoch": 0.28163530175210905,
"grad_norm": 0.3703755301826731,
"learning_rate": 4.675324675324675e-05,
"loss": 0.5488,
"step": 217
},
{
"epoch": 0.2829331602855289,
"grad_norm": 0.32861168704985866,
"learning_rate": 4.696969696969697e-05,
"loss": 0.554,
"step": 218
},
{
"epoch": 0.28423101881894874,
"grad_norm": 0.3475845025879547,
"learning_rate": 4.718614718614719e-05,
"loss": 0.5407,
"step": 219
},
{
"epoch": 0.2855288773523686,
"grad_norm": 0.3648655973805309,
"learning_rate": 4.740259740259741e-05,
"loss": 0.5466,
"step": 220
},
{
"epoch": 0.28682673588578844,
"grad_norm": 0.3350866523035428,
"learning_rate": 4.761904761904762e-05,
"loss": 0.5548,
"step": 221
},
{
"epoch": 0.2881245944192083,
"grad_norm": 0.43767143054287594,
"learning_rate": 4.783549783549784e-05,
"loss": 0.5684,
"step": 222
},
{
"epoch": 0.28942245295262814,
"grad_norm": 0.421178133286777,
"learning_rate": 4.8051948051948054e-05,
"loss": 0.5639,
"step": 223
},
{
"epoch": 0.29072031148604804,
"grad_norm": 0.37835877083504477,
"learning_rate": 4.826839826839827e-05,
"loss": 0.5566,
"step": 224
},
{
"epoch": 0.2920181700194679,
"grad_norm": 0.3417724143733512,
"learning_rate": 4.848484848484849e-05,
"loss": 0.5552,
"step": 225
},
{
"epoch": 0.29331602855288774,
"grad_norm": 0.3870541340632366,
"learning_rate": 4.87012987012987e-05,
"loss": 0.549,
"step": 226
},
{
"epoch": 0.2946138870863076,
"grad_norm": 0.4889598386044001,
"learning_rate": 4.8917748917748915e-05,
"loss": 0.5538,
"step": 227
},
{
"epoch": 0.29591174561972744,
"grad_norm": 0.4543222558469965,
"learning_rate": 4.9134199134199135e-05,
"loss": 0.5651,
"step": 228
},
{
"epoch": 0.2972096041531473,
"grad_norm": 0.38147571297168936,
"learning_rate": 4.9350649350649355e-05,
"loss": 0.5456,
"step": 229
},
{
"epoch": 0.29850746268656714,
"grad_norm": 0.48062886052178266,
"learning_rate": 4.956709956709957e-05,
"loss": 0.5519,
"step": 230
},
{
"epoch": 0.29980532121998704,
"grad_norm": 0.3436776708584428,
"learning_rate": 4.978354978354979e-05,
"loss": 0.5572,
"step": 231
},
{
"epoch": 0.3011031797534069,
"grad_norm": 0.48075516118965306,
"learning_rate": 5e-05,
"loss": 0.5566,
"step": 232
},
{
"epoch": 0.30240103828682674,
"grad_norm": 0.5764784128417795,
"learning_rate": 4.997594997594998e-05,
"loss": 0.5819,
"step": 233
},
{
"epoch": 0.3036988968202466,
"grad_norm": 0.396476527818061,
"learning_rate": 4.995189995189995e-05,
"loss": 0.5411,
"step": 234
},
{
"epoch": 0.30499675535366644,
"grad_norm": 0.46291378630567925,
"learning_rate": 4.992784992784993e-05,
"loss": 0.5552,
"step": 235
},
{
"epoch": 0.3062946138870863,
"grad_norm": 0.44861478710130637,
"learning_rate": 4.990379990379991e-05,
"loss": 0.544,
"step": 236
},
{
"epoch": 0.3075924724205062,
"grad_norm": 0.3873611746053732,
"learning_rate": 4.987974987974988e-05,
"loss": 0.5556,
"step": 237
},
{
"epoch": 0.30889033095392604,
"grad_norm": 0.41664468323948045,
"learning_rate": 4.985569985569986e-05,
"loss": 0.5771,
"step": 238
},
{
"epoch": 0.3101881894873459,
"grad_norm": 0.3859756658142421,
"learning_rate": 4.983164983164983e-05,
"loss": 0.532,
"step": 239
},
{
"epoch": 0.31148604802076574,
"grad_norm": 0.44937677319362224,
"learning_rate": 4.980759980759981e-05,
"loss": 0.5426,
"step": 240
},
{
"epoch": 0.3127839065541856,
"grad_norm": 0.4349437401518082,
"learning_rate": 4.978354978354979e-05,
"loss": 0.5259,
"step": 241
},
{
"epoch": 0.31408176508760544,
"grad_norm": 0.400324790012705,
"learning_rate": 4.9759499759499764e-05,
"loss": 0.5537,
"step": 242
},
{
"epoch": 0.3153796236210253,
"grad_norm": 0.43872297509664254,
"learning_rate": 4.973544973544973e-05,
"loss": 0.549,
"step": 243
},
{
"epoch": 0.3166774821544452,
"grad_norm": 0.4653708053643151,
"learning_rate": 4.971139971139971e-05,
"loss": 0.5254,
"step": 244
},
{
"epoch": 0.31797534068786504,
"grad_norm": 0.40941811760654495,
"learning_rate": 4.968734968734969e-05,
"loss": 0.568,
"step": 245
},
{
"epoch": 0.3192731992212849,
"grad_norm": 0.5368348479077355,
"learning_rate": 4.966329966329967e-05,
"loss": 0.5476,
"step": 246
},
{
"epoch": 0.32057105775470474,
"grad_norm": 0.4544642184839637,
"learning_rate": 4.963924963924964e-05,
"loss": 0.5566,
"step": 247
},
{
"epoch": 0.3218689162881246,
"grad_norm": 0.42978031279738266,
"learning_rate": 4.961519961519962e-05,
"loss": 0.548,
"step": 248
},
{
"epoch": 0.32316677482154443,
"grad_norm": 0.41191622365654873,
"learning_rate": 4.9591149591149594e-05,
"loss": 0.5458,
"step": 249
},
{
"epoch": 0.3244646333549643,
"grad_norm": 0.6074054124348204,
"learning_rate": 4.956709956709957e-05,
"loss": 0.5519,
"step": 250
},
{
"epoch": 0.3257624918883842,
"grad_norm": 0.4651053481351256,
"learning_rate": 4.9543049543049543e-05,
"loss": 0.5811,
"step": 251
},
{
"epoch": 0.32706035042180404,
"grad_norm": 0.4240240962916135,
"learning_rate": 4.951899951899952e-05,
"loss": 0.5523,
"step": 252
},
{
"epoch": 0.3283582089552239,
"grad_norm": 0.5066208761057746,
"learning_rate": 4.94949494949495e-05,
"loss": 0.544,
"step": 253
},
{
"epoch": 0.32965606748864373,
"grad_norm": 0.38109072762259544,
"learning_rate": 4.9470899470899475e-05,
"loss": 0.5538,
"step": 254
},
{
"epoch": 0.3309539260220636,
"grad_norm": 0.5117807003713138,
"learning_rate": 4.944684944684945e-05,
"loss": 0.5577,
"step": 255
},
{
"epoch": 0.33225178455548343,
"grad_norm": 0.44912086500472626,
"learning_rate": 4.9422799422799424e-05,
"loss": 0.5495,
"step": 256
},
{
"epoch": 0.33354964308890334,
"grad_norm": 0.3651331486905666,
"learning_rate": 4.93987493987494e-05,
"loss": 0.5631,
"step": 257
},
{
"epoch": 0.3348475016223232,
"grad_norm": 0.5611125950484844,
"learning_rate": 4.937469937469938e-05,
"loss": 0.5465,
"step": 258
},
{
"epoch": 0.33614536015574303,
"grad_norm": 0.5300284860526002,
"learning_rate": 4.9350649350649355e-05,
"loss": 0.5425,
"step": 259
},
{
"epoch": 0.3374432186891629,
"grad_norm": 0.42241934122178765,
"learning_rate": 4.932659932659932e-05,
"loss": 0.5613,
"step": 260
},
{
"epoch": 0.33874107722258273,
"grad_norm": 0.6480707951702842,
"learning_rate": 4.9302549302549305e-05,
"loss": 0.5443,
"step": 261
},
{
"epoch": 0.3400389357560026,
"grad_norm": 0.5458559898285835,
"learning_rate": 4.927849927849928e-05,
"loss": 0.5362,
"step": 262
},
{
"epoch": 0.34133679428942243,
"grad_norm": 0.4307852761395753,
"learning_rate": 4.925444925444926e-05,
"loss": 0.5405,
"step": 263
},
{
"epoch": 0.34263465282284233,
"grad_norm": 0.5693990395449862,
"learning_rate": 4.923039923039923e-05,
"loss": 0.5455,
"step": 264
},
{
"epoch": 0.3439325113562622,
"grad_norm": 0.4427765568418805,
"learning_rate": 4.9206349206349204e-05,
"loss": 0.5475,
"step": 265
},
{
"epoch": 0.34523036988968203,
"grad_norm": 0.4724926699957873,
"learning_rate": 4.9182299182299185e-05,
"loss": 0.5502,
"step": 266
},
{
"epoch": 0.3465282284231019,
"grad_norm": 0.6296467164625645,
"learning_rate": 4.915824915824916e-05,
"loss": 0.555,
"step": 267
},
{
"epoch": 0.34782608695652173,
"grad_norm": 0.521325771991002,
"learning_rate": 4.9134199134199135e-05,
"loss": 0.5521,
"step": 268
},
{
"epoch": 0.3491239454899416,
"grad_norm": 0.4920183923356473,
"learning_rate": 4.911014911014911e-05,
"loss": 0.5589,
"step": 269
},
{
"epoch": 0.3504218040233614,
"grad_norm": 0.6860051439883974,
"learning_rate": 4.908609908609909e-05,
"loss": 0.5396,
"step": 270
},
{
"epoch": 0.35171966255678133,
"grad_norm": 0.38098025544839875,
"learning_rate": 4.9062049062049066e-05,
"loss": 0.5439,
"step": 271
},
{
"epoch": 0.3530175210902012,
"grad_norm": 0.545523500518169,
"learning_rate": 4.903799903799904e-05,
"loss": 0.5285,
"step": 272
},
{
"epoch": 0.35431537962362103,
"grad_norm": 0.4773245042110645,
"learning_rate": 4.9013949013949016e-05,
"loss": 0.5506,
"step": 273
},
{
"epoch": 0.3556132381570409,
"grad_norm": 0.41823644467627,
"learning_rate": 4.898989898989899e-05,
"loss": 0.5382,
"step": 274
},
{
"epoch": 0.3569110966904607,
"grad_norm": 0.43108861210799143,
"learning_rate": 4.896584896584897e-05,
"loss": 0.5312,
"step": 275
},
{
"epoch": 0.3582089552238806,
"grad_norm": 0.35256122918946825,
"learning_rate": 4.894179894179895e-05,
"loss": 0.5507,
"step": 276
},
{
"epoch": 0.3595068137573005,
"grad_norm": 0.641603115121163,
"learning_rate": 4.8917748917748915e-05,
"loss": 0.5494,
"step": 277
},
{
"epoch": 0.36080467229072033,
"grad_norm": 0.42144449610145046,
"learning_rate": 4.8893698893698896e-05,
"loss": 0.5537,
"step": 278
},
{
"epoch": 0.3621025308241402,
"grad_norm": 0.4421221398296794,
"learning_rate": 4.886964886964887e-05,
"loss": 0.5305,
"step": 279
},
{
"epoch": 0.36340038935756,
"grad_norm": 0.34904354726043524,
"learning_rate": 4.884559884559885e-05,
"loss": 0.5151,
"step": 280
},
{
"epoch": 0.3646982478909799,
"grad_norm": 0.567323138161088,
"learning_rate": 4.882154882154882e-05,
"loss": 0.5394,
"step": 281
},
{
"epoch": 0.3659961064243997,
"grad_norm": 0.4275900383373202,
"learning_rate": 4.8797498797498795e-05,
"loss": 0.5793,
"step": 282
},
{
"epoch": 0.3672939649578196,
"grad_norm": 0.43590374764579676,
"learning_rate": 4.877344877344878e-05,
"loss": 0.5238,
"step": 283
},
{
"epoch": 0.3685918234912395,
"grad_norm": 0.3806185860855704,
"learning_rate": 4.874939874939875e-05,
"loss": 0.5311,
"step": 284
},
{
"epoch": 0.3698896820246593,
"grad_norm": 0.36597622680635733,
"learning_rate": 4.8725348725348726e-05,
"loss": 0.5483,
"step": 285
},
{
"epoch": 0.3711875405580792,
"grad_norm": 0.39934249219009466,
"learning_rate": 4.87012987012987e-05,
"loss": 0.5323,
"step": 286
},
{
"epoch": 0.372485399091499,
"grad_norm": 0.35489673738601485,
"learning_rate": 4.8677248677248676e-05,
"loss": 0.5197,
"step": 287
},
{
"epoch": 0.3737832576249189,
"grad_norm": 0.35597996373456253,
"learning_rate": 4.865319865319866e-05,
"loss": 0.541,
"step": 288
},
{
"epoch": 0.3750811161583387,
"grad_norm": 0.30995272924377104,
"learning_rate": 4.862914862914863e-05,
"loss": 0.531,
"step": 289
},
{
"epoch": 0.37637897469175857,
"grad_norm": 0.3041222657562842,
"learning_rate": 4.860509860509861e-05,
"loss": 0.5263,
"step": 290
},
{
"epoch": 0.3776768332251785,
"grad_norm": 0.27479710316885086,
"learning_rate": 4.858104858104858e-05,
"loss": 0.5179,
"step": 291
},
{
"epoch": 0.3789746917585983,
"grad_norm": 0.4108809131825242,
"learning_rate": 4.8556998556998563e-05,
"loss": 0.5285,
"step": 292
},
{
"epoch": 0.3802725502920182,
"grad_norm": 0.3283706178094482,
"learning_rate": 4.853294853294854e-05,
"loss": 0.5485,
"step": 293
},
{
"epoch": 0.381570408825438,
"grad_norm": 0.3628325275789365,
"learning_rate": 4.8508898508898506e-05,
"loss": 0.5342,
"step": 294
},
{
"epoch": 0.38286826735885787,
"grad_norm": 0.3545709020214379,
"learning_rate": 4.848484848484849e-05,
"loss": 0.5441,
"step": 295
},
{
"epoch": 0.3841661258922777,
"grad_norm": 0.27536849505708144,
"learning_rate": 4.846079846079846e-05,
"loss": 0.5189,
"step": 296
},
{
"epoch": 0.3854639844256976,
"grad_norm": 0.31314568760395595,
"learning_rate": 4.8436748436748444e-05,
"loss": 0.5165,
"step": 297
},
{
"epoch": 0.3867618429591175,
"grad_norm": 0.31727676668467136,
"learning_rate": 4.841269841269841e-05,
"loss": 0.5185,
"step": 298
},
{
"epoch": 0.3880597014925373,
"grad_norm": 0.35285183197833564,
"learning_rate": 4.838864838864839e-05,
"loss": 0.5328,
"step": 299
},
{
"epoch": 0.3893575600259572,
"grad_norm": 0.2990420731073892,
"learning_rate": 4.836459836459837e-05,
"loss": 0.532,
"step": 300
},
{
"epoch": 0.390655418559377,
"grad_norm": 0.38606448483559813,
"learning_rate": 4.834054834054834e-05,
"loss": 0.5461,
"step": 301
},
{
"epoch": 0.39195327709279687,
"grad_norm": 0.37652402001442803,
"learning_rate": 4.831649831649832e-05,
"loss": 0.5379,
"step": 302
},
{
"epoch": 0.3932511356262167,
"grad_norm": 0.34953978468725405,
"learning_rate": 4.829244829244829e-05,
"loss": 0.5359,
"step": 303
},
{
"epoch": 0.3945489941596366,
"grad_norm": 0.3382778166946982,
"learning_rate": 4.826839826839827e-05,
"loss": 0.5342,
"step": 304
},
{
"epoch": 0.3958468526930565,
"grad_norm": 0.34560665492104875,
"learning_rate": 4.824434824434825e-05,
"loss": 0.5324,
"step": 305
},
{
"epoch": 0.3971447112264763,
"grad_norm": 0.34496470111641636,
"learning_rate": 4.8220298220298224e-05,
"loss": 0.5339,
"step": 306
},
{
"epoch": 0.39844256975989617,
"grad_norm": 0.40001685434062584,
"learning_rate": 4.81962481962482e-05,
"loss": 0.5272,
"step": 307
},
{
"epoch": 0.399740428293316,
"grad_norm": 0.366032696592655,
"learning_rate": 4.8172198172198173e-05,
"loss": 0.5306,
"step": 308
},
{
"epoch": 0.40103828682673587,
"grad_norm": 0.37927598899770393,
"learning_rate": 4.814814814814815e-05,
"loss": 0.5591,
"step": 309
},
{
"epoch": 0.4023361453601557,
"grad_norm": 0.32812121834756386,
"learning_rate": 4.812409812409813e-05,
"loss": 0.5422,
"step": 310
},
{
"epoch": 0.4036340038935756,
"grad_norm": 0.35171717899329513,
"learning_rate": 4.81000481000481e-05,
"loss": 0.532,
"step": 311
},
{
"epoch": 0.40493186242699547,
"grad_norm": 0.3756784968486016,
"learning_rate": 4.807599807599808e-05,
"loss": 0.5392,
"step": 312
},
{
"epoch": 0.4062297209604153,
"grad_norm": 0.3426264703785813,
"learning_rate": 4.8051948051948054e-05,
"loss": 0.5457,
"step": 313
},
{
"epoch": 0.40752757949383517,
"grad_norm": 0.39836935230937326,
"learning_rate": 4.8027898027898036e-05,
"loss": 0.5268,
"step": 314
},
{
"epoch": 0.408825438027255,
"grad_norm": 0.33486717640072616,
"learning_rate": 4.8003848003848004e-05,
"loss": 0.5298,
"step": 315
},
{
"epoch": 0.41012329656067487,
"grad_norm": 0.3463640087410465,
"learning_rate": 4.797979797979798e-05,
"loss": 0.5372,
"step": 316
},
{
"epoch": 0.41142115509409477,
"grad_norm": 0.2981951724559669,
"learning_rate": 4.795574795574796e-05,
"loss": 0.5193,
"step": 317
},
{
"epoch": 0.4127190136275146,
"grad_norm": 0.37701472504733063,
"learning_rate": 4.7931697931697935e-05,
"loss": 0.5262,
"step": 318
},
{
"epoch": 0.41401687216093447,
"grad_norm": 0.2958251594721693,
"learning_rate": 4.790764790764791e-05,
"loss": 0.5262,
"step": 319
},
{
"epoch": 0.4153147306943543,
"grad_norm": 0.36512530778352836,
"learning_rate": 4.7883597883597884e-05,
"loss": 0.5451,
"step": 320
},
{
"epoch": 0.41661258922777417,
"grad_norm": 0.32275488837011096,
"learning_rate": 4.785954785954786e-05,
"loss": 0.5378,
"step": 321
},
{
"epoch": 0.417910447761194,
"grad_norm": 0.29968884353456,
"learning_rate": 4.783549783549784e-05,
"loss": 0.5308,
"step": 322
},
{
"epoch": 0.41920830629461386,
"grad_norm": 0.3472967912976659,
"learning_rate": 4.7811447811447815e-05,
"loss": 0.5262,
"step": 323
},
{
"epoch": 0.42050616482803377,
"grad_norm": 0.3256673670375662,
"learning_rate": 4.778739778739779e-05,
"loss": 0.5349,
"step": 324
},
{
"epoch": 0.4218040233614536,
"grad_norm": 0.3830575202323324,
"learning_rate": 4.7763347763347765e-05,
"loss": 0.5343,
"step": 325
},
{
"epoch": 0.42310188189487347,
"grad_norm": 0.34375094386741617,
"learning_rate": 4.773929773929774e-05,
"loss": 0.5295,
"step": 326
},
{
"epoch": 0.4243997404282933,
"grad_norm": 0.32100699117380493,
"learning_rate": 4.771524771524772e-05,
"loss": 0.5241,
"step": 327
},
{
"epoch": 0.42569759896171316,
"grad_norm": 0.3546414912790039,
"learning_rate": 4.769119769119769e-05,
"loss": 0.5292,
"step": 328
},
{
"epoch": 0.426995457495133,
"grad_norm": 0.367282717001635,
"learning_rate": 4.766714766714767e-05,
"loss": 0.525,
"step": 329
},
{
"epoch": 0.42829331602855286,
"grad_norm": 0.36332040957365974,
"learning_rate": 4.7643097643097646e-05,
"loss": 0.5395,
"step": 330
},
{
"epoch": 0.42959117456197277,
"grad_norm": 0.36242424332632034,
"learning_rate": 4.761904761904762e-05,
"loss": 0.5533,
"step": 331
},
{
"epoch": 0.4308890330953926,
"grad_norm": 0.36697609874383924,
"learning_rate": 4.7594997594997595e-05,
"loss": 0.5359,
"step": 332
},
{
"epoch": 0.43218689162881246,
"grad_norm": 0.33118162802731466,
"learning_rate": 4.757094757094757e-05,
"loss": 0.5317,
"step": 333
},
{
"epoch": 0.4334847501622323,
"grad_norm": 0.30441401984534905,
"learning_rate": 4.754689754689755e-05,
"loss": 0.5117,
"step": 334
},
{
"epoch": 0.43478260869565216,
"grad_norm": 0.38992931838953715,
"learning_rate": 4.7522847522847526e-05,
"loss": 0.5461,
"step": 335
},
{
"epoch": 0.436080467229072,
"grad_norm": 0.3335314021890073,
"learning_rate": 4.74987974987975e-05,
"loss": 0.521,
"step": 336
},
{
"epoch": 0.4373783257624919,
"grad_norm": 0.44568000339670255,
"learning_rate": 4.7474747474747476e-05,
"loss": 0.5351,
"step": 337
},
{
"epoch": 0.43867618429591176,
"grad_norm": 0.30521804239806394,
"learning_rate": 4.745069745069745e-05,
"loss": 0.5354,
"step": 338
},
{
"epoch": 0.4399740428293316,
"grad_norm": 0.40068879251857975,
"learning_rate": 4.742664742664743e-05,
"loss": 0.519,
"step": 339
},
{
"epoch": 0.44127190136275146,
"grad_norm": 0.2988879048992771,
"learning_rate": 4.740259740259741e-05,
"loss": 0.5219,
"step": 340
},
{
"epoch": 0.4425697598961713,
"grad_norm": 0.3568374963385924,
"learning_rate": 4.737854737854738e-05,
"loss": 0.5407,
"step": 341
},
{
"epoch": 0.44386761842959116,
"grad_norm": 0.3314619863009568,
"learning_rate": 4.7354497354497356e-05,
"loss": 0.5293,
"step": 342
},
{
"epoch": 0.445165476963011,
"grad_norm": 0.38859118585963526,
"learning_rate": 4.733044733044733e-05,
"loss": 0.5313,
"step": 343
},
{
"epoch": 0.4464633354964309,
"grad_norm": 0.29870660291558937,
"learning_rate": 4.730639730639731e-05,
"loss": 0.5235,
"step": 344
},
{
"epoch": 0.44776119402985076,
"grad_norm": 0.33559539703922564,
"learning_rate": 4.728234728234728e-05,
"loss": 0.5165,
"step": 345
},
{
"epoch": 0.4490590525632706,
"grad_norm": 0.29408841700456767,
"learning_rate": 4.725829725829726e-05,
"loss": 0.5385,
"step": 346
},
{
"epoch": 0.45035691109669046,
"grad_norm": 0.37054598216697,
"learning_rate": 4.723424723424724e-05,
"loss": 0.5107,
"step": 347
},
{
"epoch": 0.4516547696301103,
"grad_norm": 0.3250925011044857,
"learning_rate": 4.721019721019721e-05,
"loss": 0.5233,
"step": 348
},
{
"epoch": 0.45295262816353016,
"grad_norm": 0.35856402885603006,
"learning_rate": 4.718614718614719e-05,
"loss": 0.5055,
"step": 349
},
{
"epoch": 0.45425048669695,
"grad_norm": 0.34612856309129164,
"learning_rate": 4.716209716209716e-05,
"loss": 0.5404,
"step": 350
},
{
"epoch": 0.4555483452303699,
"grad_norm": 0.33303853027334285,
"learning_rate": 4.713804713804714e-05,
"loss": 0.5178,
"step": 351
},
{
"epoch": 0.45684620376378976,
"grad_norm": 0.32091795432054987,
"learning_rate": 4.711399711399712e-05,
"loss": 0.5222,
"step": 352
},
{
"epoch": 0.4581440622972096,
"grad_norm": 0.34934754787554123,
"learning_rate": 4.708994708994709e-05,
"loss": 0.5405,
"step": 353
},
{
"epoch": 0.45944192083062946,
"grad_norm": 0.2937202903692653,
"learning_rate": 4.706589706589707e-05,
"loss": 0.5283,
"step": 354
},
{
"epoch": 0.4607397793640493,
"grad_norm": 0.3464664667340698,
"learning_rate": 4.704184704184704e-05,
"loss": 0.5257,
"step": 355
},
{
"epoch": 0.46203763789746916,
"grad_norm": 0.3021689351056674,
"learning_rate": 4.7017797017797024e-05,
"loss": 0.5256,
"step": 356
},
{
"epoch": 0.46333549643088906,
"grad_norm": 0.3373492256075124,
"learning_rate": 4.6993746993747e-05,
"loss": 0.5262,
"step": 357
},
{
"epoch": 0.4646333549643089,
"grad_norm": 0.3279466476251607,
"learning_rate": 4.696969696969697e-05,
"loss": 0.5252,
"step": 358
},
{
"epoch": 0.46593121349772876,
"grad_norm": 0.3151794533478745,
"learning_rate": 4.694564694564695e-05,
"loss": 0.5355,
"step": 359
},
{
"epoch": 0.4672290720311486,
"grad_norm": 0.3676469350203011,
"learning_rate": 4.692159692159692e-05,
"loss": 0.5536,
"step": 360
},
{
"epoch": 0.46852693056456846,
"grad_norm": 0.2638129347242171,
"learning_rate": 4.6897546897546904e-05,
"loss": 0.5167,
"step": 361
},
{
"epoch": 0.4698247890979883,
"grad_norm": 0.32036105761534295,
"learning_rate": 4.687349687349687e-05,
"loss": 0.5105,
"step": 362
},
{
"epoch": 0.47112264763140815,
"grad_norm": 0.3312350329187521,
"learning_rate": 4.6849446849446854e-05,
"loss": 0.5403,
"step": 363
},
{
"epoch": 0.47242050616482806,
"grad_norm": 0.2691270807481844,
"learning_rate": 4.682539682539683e-05,
"loss": 0.5129,
"step": 364
},
{
"epoch": 0.4737183646982479,
"grad_norm": 0.31512841418214993,
"learning_rate": 4.68013468013468e-05,
"loss": 0.5188,
"step": 365
},
{
"epoch": 0.47501622323166776,
"grad_norm": 0.32669598998030175,
"learning_rate": 4.677729677729678e-05,
"loss": 0.5384,
"step": 366
},
{
"epoch": 0.4763140817650876,
"grad_norm": 0.3235775681673074,
"learning_rate": 4.675324675324675e-05,
"loss": 0.5194,
"step": 367
},
{
"epoch": 0.47761194029850745,
"grad_norm": 0.3095310587265592,
"learning_rate": 4.6729196729196734e-05,
"loss": 0.5323,
"step": 368
},
{
"epoch": 0.4789097988319273,
"grad_norm": 0.32341229991033627,
"learning_rate": 4.670514670514671e-05,
"loss": 0.5192,
"step": 369
},
{
"epoch": 0.48020765736534715,
"grad_norm": 0.3175500385279334,
"learning_rate": 4.6681096681096684e-05,
"loss": 0.53,
"step": 370
},
{
"epoch": 0.48150551589876706,
"grad_norm": 0.34206247662543693,
"learning_rate": 4.665704665704666e-05,
"loss": 0.5314,
"step": 371
},
{
"epoch": 0.4828033744321869,
"grad_norm": 0.3185235414836324,
"learning_rate": 4.6632996632996634e-05,
"loss": 0.5367,
"step": 372
},
{
"epoch": 0.48410123296560675,
"grad_norm": 0.28453878466972515,
"learning_rate": 4.6608946608946615e-05,
"loss": 0.5223,
"step": 373
},
{
"epoch": 0.4853990914990266,
"grad_norm": 0.2957039810409513,
"learning_rate": 4.658489658489659e-05,
"loss": 0.5005,
"step": 374
},
{
"epoch": 0.48669695003244645,
"grad_norm": 0.2940626178906619,
"learning_rate": 4.656084656084656e-05,
"loss": 0.5098,
"step": 375
},
{
"epoch": 0.4879948085658663,
"grad_norm": 0.33976965417394467,
"learning_rate": 4.653679653679654e-05,
"loss": 0.5155,
"step": 376
},
{
"epoch": 0.4892926670992862,
"grad_norm": 0.2993214412594064,
"learning_rate": 4.6512746512746514e-05,
"loss": 0.5183,
"step": 377
},
{
"epoch": 0.49059052563270605,
"grad_norm": 0.3550998192684892,
"learning_rate": 4.6488696488696496e-05,
"loss": 0.5404,
"step": 378
},
{
"epoch": 0.4918883841661259,
"grad_norm": 0.3961098073492471,
"learning_rate": 4.6464646464646464e-05,
"loss": 0.5411,
"step": 379
},
{
"epoch": 0.49318624269954575,
"grad_norm": 0.34269318810223304,
"learning_rate": 4.6440596440596445e-05,
"loss": 0.5206,
"step": 380
},
{
"epoch": 0.4944841012329656,
"grad_norm": 0.29509416892424534,
"learning_rate": 4.641654641654642e-05,
"loss": 0.5189,
"step": 381
},
{
"epoch": 0.49578195976638545,
"grad_norm": 0.32772442148274133,
"learning_rate": 4.6392496392496395e-05,
"loss": 0.5368,
"step": 382
},
{
"epoch": 0.4970798182998053,
"grad_norm": 0.2719733229054414,
"learning_rate": 4.636844636844637e-05,
"loss": 0.5027,
"step": 383
},
{
"epoch": 0.4983776768332252,
"grad_norm": 0.32827976870034653,
"learning_rate": 4.6344396344396344e-05,
"loss": 0.5225,
"step": 384
},
{
"epoch": 0.49967553536664505,
"grad_norm": 0.36529779846696075,
"learning_rate": 4.6320346320346326e-05,
"loss": 0.4961,
"step": 385
},
{
"epoch": 0.5009733939000649,
"grad_norm": 0.34737533192311987,
"learning_rate": 4.62962962962963e-05,
"loss": 0.5056,
"step": 386
},
{
"epoch": 0.5022712524334848,
"grad_norm": 0.32570267249669654,
"learning_rate": 4.6272246272246276e-05,
"loss": 0.5114,
"step": 387
},
{
"epoch": 0.5035691109669046,
"grad_norm": 0.3419484703073112,
"learning_rate": 4.624819624819625e-05,
"loss": 0.5141,
"step": 388
},
{
"epoch": 0.5048669695003245,
"grad_norm": 0.34141193067026915,
"learning_rate": 4.6224146224146225e-05,
"loss": 0.5296,
"step": 389
},
{
"epoch": 0.5061648280337443,
"grad_norm": 0.314416021500765,
"learning_rate": 4.620009620009621e-05,
"loss": 0.5202,
"step": 390
},
{
"epoch": 0.5074626865671642,
"grad_norm": 0.2608120507481533,
"learning_rate": 4.617604617604618e-05,
"loss": 0.507,
"step": 391
},
{
"epoch": 0.508760545100584,
"grad_norm": 0.3415961124944403,
"learning_rate": 4.615199615199615e-05,
"loss": 0.5513,
"step": 392
},
{
"epoch": 0.5100584036340039,
"grad_norm": 0.3545772824977214,
"learning_rate": 4.612794612794613e-05,
"loss": 0.5195,
"step": 393
},
{
"epoch": 0.5113562621674238,
"grad_norm": 0.2759048918555112,
"learning_rate": 4.6103896103896106e-05,
"loss": 0.5145,
"step": 394
},
{
"epoch": 0.5126541207008436,
"grad_norm": 0.29424902451922874,
"learning_rate": 4.607984607984609e-05,
"loss": 0.4965,
"step": 395
},
{
"epoch": 0.5139519792342635,
"grad_norm": 0.2865983981532377,
"learning_rate": 4.6055796055796055e-05,
"loss": 0.51,
"step": 396
},
{
"epoch": 0.5152498377676833,
"grad_norm": 0.2826395400080094,
"learning_rate": 4.603174603174603e-05,
"loss": 0.5177,
"step": 397
},
{
"epoch": 0.5165476963011032,
"grad_norm": 0.3154832346968727,
"learning_rate": 4.600769600769601e-05,
"loss": 0.5167,
"step": 398
},
{
"epoch": 0.517845554834523,
"grad_norm": 0.28657500627349963,
"learning_rate": 4.5983645983645986e-05,
"loss": 0.4915,
"step": 399
},
{
"epoch": 0.5191434133679429,
"grad_norm": 0.3108878669769025,
"learning_rate": 4.595959595959596e-05,
"loss": 0.5094,
"step": 400
},
{
"epoch": 0.5204412719013628,
"grad_norm": 0.3105653890499028,
"learning_rate": 4.5935545935545936e-05,
"loss": 0.5395,
"step": 401
},
{
"epoch": 0.5217391304347826,
"grad_norm": 0.3041614944176647,
"learning_rate": 4.591149591149592e-05,
"loss": 0.5015,
"step": 402
},
{
"epoch": 0.5230369889682025,
"grad_norm": 0.2774234169618742,
"learning_rate": 4.588744588744589e-05,
"loss": 0.4953,
"step": 403
},
{
"epoch": 0.5243348475016223,
"grad_norm": 0.3292325244305617,
"learning_rate": 4.586339586339587e-05,
"loss": 0.5326,
"step": 404
},
{
"epoch": 0.5256327060350422,
"grad_norm": 0.280230976708198,
"learning_rate": 4.583934583934584e-05,
"loss": 0.5186,
"step": 405
},
{
"epoch": 0.526930564568462,
"grad_norm": 0.29220060360384115,
"learning_rate": 4.5815295815295817e-05,
"loss": 0.5224,
"step": 406
},
{
"epoch": 0.5282284231018819,
"grad_norm": 0.2973987484773138,
"learning_rate": 4.57912457912458e-05,
"loss": 0.5198,
"step": 407
},
{
"epoch": 0.5295262816353018,
"grad_norm": 0.31013343434720114,
"learning_rate": 4.576719576719577e-05,
"loss": 0.5303,
"step": 408
},
{
"epoch": 0.5308241401687216,
"grad_norm": 0.3091402470001352,
"learning_rate": 4.574314574314574e-05,
"loss": 0.5176,
"step": 409
},
{
"epoch": 0.5321219987021415,
"grad_norm": 0.2974831281530903,
"learning_rate": 4.571909571909572e-05,
"loss": 0.5153,
"step": 410
},
{
"epoch": 0.5334198572355613,
"grad_norm": 0.30558238497100093,
"learning_rate": 4.56950456950457e-05,
"loss": 0.5171,
"step": 411
},
{
"epoch": 0.5347177157689812,
"grad_norm": 0.3050417584271822,
"learning_rate": 4.567099567099568e-05,
"loss": 0.51,
"step": 412
},
{
"epoch": 0.536015574302401,
"grad_norm": 0.3062163771011129,
"learning_rate": 4.564694564694565e-05,
"loss": 0.5182,
"step": 413
},
{
"epoch": 0.5373134328358209,
"grad_norm": 0.2918708356467548,
"learning_rate": 4.562289562289562e-05,
"loss": 0.5372,
"step": 414
},
{
"epoch": 0.5386112913692408,
"grad_norm": 0.3100802477130393,
"learning_rate": 4.55988455988456e-05,
"loss": 0.504,
"step": 415
},
{
"epoch": 0.5399091499026606,
"grad_norm": 0.28080959444703624,
"learning_rate": 4.557479557479558e-05,
"loss": 0.5133,
"step": 416
},
{
"epoch": 0.5412070084360805,
"grad_norm": 0.31510663571529784,
"learning_rate": 4.555074555074555e-05,
"loss": 0.5083,
"step": 417
},
{
"epoch": 0.5425048669695003,
"grad_norm": 0.2987074534694197,
"learning_rate": 4.552669552669553e-05,
"loss": 0.5233,
"step": 418
},
{
"epoch": 0.5438027255029202,
"grad_norm": 0.3018100646658762,
"learning_rate": 4.55026455026455e-05,
"loss": 0.5125,
"step": 419
},
{
"epoch": 0.54510058403634,
"grad_norm": 0.3068942632441547,
"learning_rate": 4.5478595478595484e-05,
"loss": 0.5141,
"step": 420
},
{
"epoch": 0.5463984425697599,
"grad_norm": 0.30401355001057445,
"learning_rate": 4.545454545454546e-05,
"loss": 0.5275,
"step": 421
},
{
"epoch": 0.5476963011031798,
"grad_norm": 0.3203825407640607,
"learning_rate": 4.543049543049543e-05,
"loss": 0.5374,
"step": 422
},
{
"epoch": 0.5489941596365996,
"grad_norm": 0.2890332355349151,
"learning_rate": 4.540644540644541e-05,
"loss": 0.4928,
"step": 423
},
{
"epoch": 0.5502920181700195,
"grad_norm": 0.3167375640999411,
"learning_rate": 4.538239538239538e-05,
"loss": 0.5238,
"step": 424
},
{
"epoch": 0.5515898767034393,
"grad_norm": 0.2838739525584143,
"learning_rate": 4.535834535834536e-05,
"loss": 0.5193,
"step": 425
},
{
"epoch": 0.5528877352368592,
"grad_norm": 0.31600565211015347,
"learning_rate": 4.533429533429533e-05,
"loss": 0.5408,
"step": 426
},
{
"epoch": 0.5541855937702791,
"grad_norm": 0.2936881048070723,
"learning_rate": 4.5310245310245314e-05,
"loss": 0.517,
"step": 427
},
{
"epoch": 0.5554834523036989,
"grad_norm": 0.2869325695453175,
"learning_rate": 4.528619528619529e-05,
"loss": 0.5272,
"step": 428
},
{
"epoch": 0.5567813108371188,
"grad_norm": 0.26396682696755225,
"learning_rate": 4.5262145262145264e-05,
"loss": 0.5092,
"step": 429
},
{
"epoch": 0.5580791693705386,
"grad_norm": 0.2778797627866081,
"learning_rate": 4.523809523809524e-05,
"loss": 0.5107,
"step": 430
},
{
"epoch": 0.5593770279039585,
"grad_norm": 0.29638166733777366,
"learning_rate": 4.521404521404521e-05,
"loss": 0.5207,
"step": 431
},
{
"epoch": 0.5606748864373783,
"grad_norm": 0.31086056140406293,
"learning_rate": 4.5189995189995195e-05,
"loss": 0.5213,
"step": 432
},
{
"epoch": 0.5619727449707982,
"grad_norm": 0.2777342628277087,
"learning_rate": 4.516594516594517e-05,
"loss": 0.5225,
"step": 433
},
{
"epoch": 0.5632706035042181,
"grad_norm": 0.31153041204212967,
"learning_rate": 4.5141895141895144e-05,
"loss": 0.5029,
"step": 434
},
{
"epoch": 0.5645684620376379,
"grad_norm": 0.2833387330391144,
"learning_rate": 4.511784511784512e-05,
"loss": 0.5269,
"step": 435
},
{
"epoch": 0.5658663205710578,
"grad_norm": 0.3208653719812624,
"learning_rate": 4.5093795093795094e-05,
"loss": 0.5027,
"step": 436
},
{
"epoch": 0.5671641791044776,
"grad_norm": 0.3024059727560176,
"learning_rate": 4.5069745069745075e-05,
"loss": 0.5219,
"step": 437
},
{
"epoch": 0.5684620376378975,
"grad_norm": 0.2822549725146957,
"learning_rate": 4.504569504569504e-05,
"loss": 0.5095,
"step": 438
},
{
"epoch": 0.5697598961713173,
"grad_norm": 0.3016137705597104,
"learning_rate": 4.5021645021645025e-05,
"loss": 0.5104,
"step": 439
},
{
"epoch": 0.5710577547047372,
"grad_norm": 0.3627945641240943,
"learning_rate": 4.4997594997595e-05,
"loss": 0.5264,
"step": 440
},
{
"epoch": 0.5723556132381571,
"grad_norm": 0.27507558856590775,
"learning_rate": 4.4973544973544974e-05,
"loss": 0.5139,
"step": 441
},
{
"epoch": 0.5736534717715769,
"grad_norm": 0.3593784232319199,
"learning_rate": 4.494949494949495e-05,
"loss": 0.5212,
"step": 442
},
{
"epoch": 0.5749513303049968,
"grad_norm": 0.29557452932516204,
"learning_rate": 4.4925444925444924e-05,
"loss": 0.502,
"step": 443
},
{
"epoch": 0.5762491888384166,
"grad_norm": 0.3005471775259294,
"learning_rate": 4.4901394901394906e-05,
"loss": 0.4912,
"step": 444
},
{
"epoch": 0.5775470473718365,
"grad_norm": 0.26919062978615377,
"learning_rate": 4.487734487734488e-05,
"loss": 0.5313,
"step": 445
},
{
"epoch": 0.5788449059052563,
"grad_norm": 0.2556241030912058,
"learning_rate": 4.4853294853294855e-05,
"loss": 0.5085,
"step": 446
},
{
"epoch": 0.5801427644386762,
"grad_norm": 0.2733433348389189,
"learning_rate": 4.482924482924483e-05,
"loss": 0.5232,
"step": 447
},
{
"epoch": 0.5814406229720961,
"grad_norm": 0.2699232360629045,
"learning_rate": 4.4805194805194805e-05,
"loss": 0.5049,
"step": 448
},
{
"epoch": 0.5827384815055159,
"grad_norm": 0.28747431650418886,
"learning_rate": 4.4781144781144786e-05,
"loss": 0.5112,
"step": 449
},
{
"epoch": 0.5840363400389358,
"grad_norm": 0.28804716410878617,
"learning_rate": 4.475709475709476e-05,
"loss": 0.4905,
"step": 450
},
{
"epoch": 0.5853341985723556,
"grad_norm": 0.2919584848714507,
"learning_rate": 4.4733044733044736e-05,
"loss": 0.5272,
"step": 451
},
{
"epoch": 0.5866320571057755,
"grad_norm": 0.27840735576549713,
"learning_rate": 4.470899470899471e-05,
"loss": 0.4885,
"step": 452
},
{
"epoch": 0.5879299156391953,
"grad_norm": 0.29490953800516345,
"learning_rate": 4.4684944684944685e-05,
"loss": 0.5164,
"step": 453
},
{
"epoch": 0.5892277741726152,
"grad_norm": 0.3240170627979527,
"learning_rate": 4.466089466089467e-05,
"loss": 0.5173,
"step": 454
},
{
"epoch": 0.5905256327060351,
"grad_norm": 0.2665880580848304,
"learning_rate": 4.4636844636844635e-05,
"loss": 0.5045,
"step": 455
},
{
"epoch": 0.5918234912394549,
"grad_norm": 0.36453608305554464,
"learning_rate": 4.4612794612794616e-05,
"loss": 0.5176,
"step": 456
},
{
"epoch": 0.5931213497728748,
"grad_norm": 0.2971475504780928,
"learning_rate": 4.458874458874459e-05,
"loss": 0.5047,
"step": 457
},
{
"epoch": 0.5944192083062946,
"grad_norm": 0.34100322893736984,
"learning_rate": 4.4564694564694566e-05,
"loss": 0.5111,
"step": 458
},
{
"epoch": 0.5957170668397145,
"grad_norm": 0.27240121549823265,
"learning_rate": 4.454064454064454e-05,
"loss": 0.4885,
"step": 459
},
{
"epoch": 0.5970149253731343,
"grad_norm": 0.31589767714076145,
"learning_rate": 4.4516594516594515e-05,
"loss": 0.4994,
"step": 460
},
{
"epoch": 0.5983127839065542,
"grad_norm": 0.2801464661937106,
"learning_rate": 4.44925444925445e-05,
"loss": 0.497,
"step": 461
},
{
"epoch": 0.5996106424399741,
"grad_norm": 0.33064691940201346,
"learning_rate": 4.446849446849447e-05,
"loss": 0.5336,
"step": 462
},
{
"epoch": 0.6009085009733939,
"grad_norm": 0.2866479133025442,
"learning_rate": 4.4444444444444447e-05,
"loss": 0.5346,
"step": 463
},
{
"epoch": 0.6022063595068138,
"grad_norm": 0.33165659226246563,
"learning_rate": 4.442039442039442e-05,
"loss": 0.5145,
"step": 464
},
{
"epoch": 0.6035042180402336,
"grad_norm": 0.4080281603274731,
"learning_rate": 4.4396344396344396e-05,
"loss": 0.5083,
"step": 465
},
{
"epoch": 0.6048020765736535,
"grad_norm": 0.32997017809734314,
"learning_rate": 4.437229437229438e-05,
"loss": 0.5108,
"step": 466
},
{
"epoch": 0.6060999351070734,
"grad_norm": 0.4193305220535223,
"learning_rate": 4.434824434824435e-05,
"loss": 0.5333,
"step": 467
},
{
"epoch": 0.6073977936404932,
"grad_norm": 0.33536790460482346,
"learning_rate": 4.432419432419432e-05,
"loss": 0.5315,
"step": 468
},
{
"epoch": 0.6086956521739131,
"grad_norm": 0.30331774787903265,
"learning_rate": 4.43001443001443e-05,
"loss": 0.5044,
"step": 469
},
{
"epoch": 0.6099935107073329,
"grad_norm": 0.3332494229182582,
"learning_rate": 4.427609427609428e-05,
"loss": 0.5271,
"step": 470
},
{
"epoch": 0.6112913692407528,
"grad_norm": 0.27631817841293893,
"learning_rate": 4.425204425204426e-05,
"loss": 0.4869,
"step": 471
},
{
"epoch": 0.6125892277741726,
"grad_norm": 0.30864904652906816,
"learning_rate": 4.4227994227994226e-05,
"loss": 0.5189,
"step": 472
},
{
"epoch": 0.6138870863075925,
"grad_norm": 0.25286124571299967,
"learning_rate": 4.420394420394421e-05,
"loss": 0.5243,
"step": 473
},
{
"epoch": 0.6151849448410124,
"grad_norm": 0.32194768257527906,
"learning_rate": 4.417989417989418e-05,
"loss": 0.517,
"step": 474
},
{
"epoch": 0.6164828033744322,
"grad_norm": 0.25295912491765415,
"learning_rate": 4.415584415584416e-05,
"loss": 0.5119,
"step": 475
},
{
"epoch": 0.6177806619078521,
"grad_norm": 0.30031239529360704,
"learning_rate": 4.413179413179413e-05,
"loss": 0.5167,
"step": 476
},
{
"epoch": 0.6190785204412719,
"grad_norm": 0.3457458465491688,
"learning_rate": 4.410774410774411e-05,
"loss": 0.5122,
"step": 477
},
{
"epoch": 0.6203763789746918,
"grad_norm": 0.3091494265523315,
"learning_rate": 4.408369408369409e-05,
"loss": 0.4996,
"step": 478
},
{
"epoch": 0.6216742375081116,
"grad_norm": 0.34841455852307157,
"learning_rate": 4.405964405964406e-05,
"loss": 0.5187,
"step": 479
},
{
"epoch": 0.6229720960415315,
"grad_norm": 0.28466491288804874,
"learning_rate": 4.403559403559404e-05,
"loss": 0.5081,
"step": 480
},
{
"epoch": 0.6242699545749514,
"grad_norm": 0.31239695738713696,
"learning_rate": 4.401154401154401e-05,
"loss": 0.5188,
"step": 481
},
{
"epoch": 0.6255678131083712,
"grad_norm": 0.2906342979686575,
"learning_rate": 4.398749398749399e-05,
"loss": 0.4979,
"step": 482
},
{
"epoch": 0.6268656716417911,
"grad_norm": 0.32742410431546853,
"learning_rate": 4.396344396344397e-05,
"loss": 0.5096,
"step": 483
},
{
"epoch": 0.6281635301752109,
"grad_norm": 0.28587236180759673,
"learning_rate": 4.3939393939393944e-05,
"loss": 0.5282,
"step": 484
},
{
"epoch": 0.6294613887086308,
"grad_norm": 0.2939115603096443,
"learning_rate": 4.391534391534391e-05,
"loss": 0.5172,
"step": 485
},
{
"epoch": 0.6307592472420506,
"grad_norm": 0.2873210431560928,
"learning_rate": 4.3891293891293894e-05,
"loss": 0.5229,
"step": 486
},
{
"epoch": 0.6320571057754705,
"grad_norm": 0.342818284796367,
"learning_rate": 4.386724386724387e-05,
"loss": 0.5272,
"step": 487
},
{
"epoch": 0.6333549643088904,
"grad_norm": 0.2663532951872564,
"learning_rate": 4.384319384319385e-05,
"loss": 0.5087,
"step": 488
},
{
"epoch": 0.6346528228423102,
"grad_norm": 0.25849575306366,
"learning_rate": 4.381914381914382e-05,
"loss": 0.503,
"step": 489
},
{
"epoch": 0.6359506813757301,
"grad_norm": 0.2682428237326465,
"learning_rate": 4.379509379509379e-05,
"loss": 0.5178,
"step": 490
},
{
"epoch": 0.6372485399091499,
"grad_norm": 0.2899634415594277,
"learning_rate": 4.3771043771043774e-05,
"loss": 0.4964,
"step": 491
},
{
"epoch": 0.6385463984425698,
"grad_norm": 0.3453086828842896,
"learning_rate": 4.374699374699375e-05,
"loss": 0.53,
"step": 492
},
{
"epoch": 0.6398442569759896,
"grad_norm": 0.34399408107909996,
"learning_rate": 4.3722943722943724e-05,
"loss": 0.5193,
"step": 493
},
{
"epoch": 0.6411421155094095,
"grad_norm": 0.3610030879163129,
"learning_rate": 4.36988936988937e-05,
"loss": 0.5144,
"step": 494
},
{
"epoch": 0.6424399740428294,
"grad_norm": 0.23843044383570788,
"learning_rate": 4.367484367484368e-05,
"loss": 0.4999,
"step": 495
},
{
"epoch": 0.6437378325762492,
"grad_norm": 0.3654439591676623,
"learning_rate": 4.3650793650793655e-05,
"loss": 0.4888,
"step": 496
},
{
"epoch": 0.6450356911096691,
"grad_norm": 0.28776010247656836,
"learning_rate": 4.362674362674363e-05,
"loss": 0.5022,
"step": 497
},
{
"epoch": 0.6463335496430889,
"grad_norm": 0.3647131705869751,
"learning_rate": 4.3602693602693604e-05,
"loss": 0.5092,
"step": 498
},
{
"epoch": 0.6476314081765088,
"grad_norm": 0.30736812776643446,
"learning_rate": 4.357864357864358e-05,
"loss": 0.5171,
"step": 499
},
{
"epoch": 0.6489292667099286,
"grad_norm": 0.36290147629484104,
"learning_rate": 4.355459355459356e-05,
"loss": 0.5223,
"step": 500
},
{
"epoch": 0.6502271252433485,
"grad_norm": 0.32228223382695725,
"learning_rate": 4.3530543530543535e-05,
"loss": 0.5099,
"step": 501
},
{
"epoch": 0.6515249837767684,
"grad_norm": 0.31393689007483594,
"learning_rate": 4.3506493506493503e-05,
"loss": 0.496,
"step": 502
},
{
"epoch": 0.6528228423101882,
"grad_norm": 0.2966759326603879,
"learning_rate": 4.3482443482443485e-05,
"loss": 0.5173,
"step": 503
},
{
"epoch": 0.6541207008436081,
"grad_norm": 0.2864744517114858,
"learning_rate": 4.345839345839346e-05,
"loss": 0.503,
"step": 504
},
{
"epoch": 0.6554185593770279,
"grad_norm": 0.28016826596559247,
"learning_rate": 4.343434343434344e-05,
"loss": 0.5095,
"step": 505
},
{
"epoch": 0.6567164179104478,
"grad_norm": 0.3045274480234983,
"learning_rate": 4.341029341029341e-05,
"loss": 0.5234,
"step": 506
},
{
"epoch": 0.6580142764438677,
"grad_norm": 0.2865539821179636,
"learning_rate": 4.3386243386243384e-05,
"loss": 0.5057,
"step": 507
},
{
"epoch": 0.6593121349772875,
"grad_norm": 0.28016725527352626,
"learning_rate": 4.3362193362193366e-05,
"loss": 0.5054,
"step": 508
},
{
"epoch": 0.6606099935107074,
"grad_norm": 0.2779087438851858,
"learning_rate": 4.333814333814334e-05,
"loss": 0.4844,
"step": 509
},
{
"epoch": 0.6619078520441272,
"grad_norm": 0.29308593442315034,
"learning_rate": 4.3314093314093315e-05,
"loss": 0.5026,
"step": 510
},
{
"epoch": 0.6632057105775471,
"grad_norm": 0.24617150101353785,
"learning_rate": 4.329004329004329e-05,
"loss": 0.505,
"step": 511
},
{
"epoch": 0.6645035691109669,
"grad_norm": 0.2801536462432465,
"learning_rate": 4.3265993265993265e-05,
"loss": 0.4957,
"step": 512
},
{
"epoch": 0.6658014276443868,
"grad_norm": 0.2590262249669081,
"learning_rate": 4.3241943241943246e-05,
"loss": 0.4976,
"step": 513
},
{
"epoch": 0.6670992861778067,
"grad_norm": 0.27675213215164485,
"learning_rate": 4.321789321789322e-05,
"loss": 0.5016,
"step": 514
},
{
"epoch": 0.6683971447112265,
"grad_norm": 0.3211262394859621,
"learning_rate": 4.3193843193843196e-05,
"loss": 0.5285,
"step": 515
},
{
"epoch": 0.6696950032446464,
"grad_norm": 0.2847594895492132,
"learning_rate": 4.316979316979317e-05,
"loss": 0.5174,
"step": 516
},
{
"epoch": 0.6709928617780662,
"grad_norm": 0.31731208678548406,
"learning_rate": 4.314574314574315e-05,
"loss": 0.5287,
"step": 517
},
{
"epoch": 0.6722907203114861,
"grad_norm": 0.26600293134050695,
"learning_rate": 4.312169312169313e-05,
"loss": 0.5105,
"step": 518
},
{
"epoch": 0.6735885788449059,
"grad_norm": 0.29880462234281113,
"learning_rate": 4.3097643097643095e-05,
"loss": 0.5375,
"step": 519
},
{
"epoch": 0.6748864373783258,
"grad_norm": 0.2652094878668775,
"learning_rate": 4.3073593073593077e-05,
"loss": 0.5033,
"step": 520
},
{
"epoch": 0.6761842959117457,
"grad_norm": 0.315140738606816,
"learning_rate": 4.304954304954305e-05,
"loss": 0.5238,
"step": 521
},
{
"epoch": 0.6774821544451655,
"grad_norm": 0.2852888179467452,
"learning_rate": 4.302549302549303e-05,
"loss": 0.5125,
"step": 522
},
{
"epoch": 0.6787800129785854,
"grad_norm": 0.3217782609108167,
"learning_rate": 4.3001443001443e-05,
"loss": 0.5084,
"step": 523
},
{
"epoch": 0.6800778715120052,
"grad_norm": 0.3067930968649758,
"learning_rate": 4.2977392977392976e-05,
"loss": 0.4999,
"step": 524
},
{
"epoch": 0.6813757300454251,
"grad_norm": 0.2937819263154037,
"learning_rate": 4.295334295334296e-05,
"loss": 0.5256,
"step": 525
},
{
"epoch": 0.6826735885788449,
"grad_norm": 0.32438054578281567,
"learning_rate": 4.292929292929293e-05,
"loss": 0.4907,
"step": 526
},
{
"epoch": 0.6839714471122648,
"grad_norm": 0.2742147889295781,
"learning_rate": 4.290524290524291e-05,
"loss": 0.5068,
"step": 527
},
{
"epoch": 0.6852693056456847,
"grad_norm": 0.35488588986537717,
"learning_rate": 4.288119288119288e-05,
"loss": 0.5248,
"step": 528
},
{
"epoch": 0.6865671641791045,
"grad_norm": 0.26229530604678386,
"learning_rate": 4.2857142857142856e-05,
"loss": 0.5037,
"step": 529
},
{
"epoch": 0.6878650227125244,
"grad_norm": 0.3461696681941986,
"learning_rate": 4.283309283309284e-05,
"loss": 0.5025,
"step": 530
},
{
"epoch": 0.6891628812459442,
"grad_norm": 0.266178675206237,
"learning_rate": 4.280904280904281e-05,
"loss": 0.4896,
"step": 531
},
{
"epoch": 0.6904607397793641,
"grad_norm": 0.34686998824653287,
"learning_rate": 4.278499278499279e-05,
"loss": 0.5034,
"step": 532
},
{
"epoch": 0.6917585983127839,
"grad_norm": 0.3320503579783302,
"learning_rate": 4.276094276094276e-05,
"loss": 0.5178,
"step": 533
},
{
"epoch": 0.6930564568462038,
"grad_norm": 0.3083799644603529,
"learning_rate": 4.273689273689274e-05,
"loss": 0.526,
"step": 534
},
{
"epoch": 0.6943543153796237,
"grad_norm": 0.36890093348582187,
"learning_rate": 4.271284271284272e-05,
"loss": 0.5311,
"step": 535
},
{
"epoch": 0.6956521739130435,
"grad_norm": 0.30993196929952377,
"learning_rate": 4.2688792688792686e-05,
"loss": 0.5246,
"step": 536
},
{
"epoch": 0.6969500324464634,
"grad_norm": 0.3135809737260554,
"learning_rate": 4.266474266474267e-05,
"loss": 0.5042,
"step": 537
},
{
"epoch": 0.6982478909798832,
"grad_norm": 0.32235641085299527,
"learning_rate": 4.264069264069264e-05,
"loss": 0.5226,
"step": 538
},
{
"epoch": 0.6995457495133031,
"grad_norm": 0.29955767973991637,
"learning_rate": 4.2616642616642624e-05,
"loss": 0.5003,
"step": 539
},
{
"epoch": 0.7008436080467229,
"grad_norm": 0.2728001277044658,
"learning_rate": 4.259259259259259e-05,
"loss": 0.5035,
"step": 540
},
{
"epoch": 0.7021414665801428,
"grad_norm": 0.3125500639615984,
"learning_rate": 4.256854256854257e-05,
"loss": 0.4982,
"step": 541
},
{
"epoch": 0.7034393251135627,
"grad_norm": 0.3214238247187388,
"learning_rate": 4.254449254449255e-05,
"loss": 0.5015,
"step": 542
},
{
"epoch": 0.7047371836469825,
"grad_norm": 0.34677033141949526,
"learning_rate": 4.2520442520442523e-05,
"loss": 0.5071,
"step": 543
},
{
"epoch": 0.7060350421804024,
"grad_norm": 0.320737794563556,
"learning_rate": 4.24963924963925e-05,
"loss": 0.5036,
"step": 544
},
{
"epoch": 0.7073329007138222,
"grad_norm": 0.28140631509820974,
"learning_rate": 4.247234247234247e-05,
"loss": 0.472,
"step": 545
},
{
"epoch": 0.7086307592472421,
"grad_norm": 0.2876262450309547,
"learning_rate": 4.244829244829245e-05,
"loss": 0.5033,
"step": 546
},
{
"epoch": 0.7099286177806619,
"grad_norm": 0.28203604302358143,
"learning_rate": 4.242424242424243e-05,
"loss": 0.4933,
"step": 547
},
{
"epoch": 0.7112264763140818,
"grad_norm": 0.3106404772330975,
"learning_rate": 4.2400192400192404e-05,
"loss": 0.4972,
"step": 548
},
{
"epoch": 0.7125243348475017,
"grad_norm": 0.25531650904916336,
"learning_rate": 4.237614237614238e-05,
"loss": 0.524,
"step": 549
},
{
"epoch": 0.7138221933809215,
"grad_norm": 0.3694832680055122,
"learning_rate": 4.2352092352092354e-05,
"loss": 0.5215,
"step": 550
},
{
"epoch": 0.7151200519143414,
"grad_norm": 0.29317455967258776,
"learning_rate": 4.232804232804233e-05,
"loss": 0.4978,
"step": 551
},
{
"epoch": 0.7164179104477612,
"grad_norm": 0.36952833950680053,
"learning_rate": 4.230399230399231e-05,
"loss": 0.5091,
"step": 552
},
{
"epoch": 0.7177157689811811,
"grad_norm": 0.3458300719165068,
"learning_rate": 4.227994227994228e-05,
"loss": 0.5179,
"step": 553
},
{
"epoch": 0.719013627514601,
"grad_norm": 0.35910338257547214,
"learning_rate": 4.225589225589226e-05,
"loss": 0.4967,
"step": 554
},
{
"epoch": 0.7203114860480208,
"grad_norm": 0.3832017565467235,
"learning_rate": 4.2231842231842234e-05,
"loss": 0.506,
"step": 555
},
{
"epoch": 0.7216093445814407,
"grad_norm": 0.3270524496685099,
"learning_rate": 4.220779220779221e-05,
"loss": 0.4956,
"step": 556
},
{
"epoch": 0.7229072031148605,
"grad_norm": 0.31306380662178745,
"learning_rate": 4.2183742183742184e-05,
"loss": 0.516,
"step": 557
},
{
"epoch": 0.7242050616482804,
"grad_norm": 0.294604631221543,
"learning_rate": 4.215969215969216e-05,
"loss": 0.5055,
"step": 558
},
{
"epoch": 0.7255029201817002,
"grad_norm": 0.3534780044338388,
"learning_rate": 4.213564213564214e-05,
"loss": 0.4975,
"step": 559
},
{
"epoch": 0.72680077871512,
"grad_norm": 0.33032987931239965,
"learning_rate": 4.2111592111592115e-05,
"loss": 0.5118,
"step": 560
},
{
"epoch": 0.72809863724854,
"grad_norm": 0.3196832192635056,
"learning_rate": 4.208754208754209e-05,
"loss": 0.5093,
"step": 561
},
{
"epoch": 0.7293964957819598,
"grad_norm": 0.36785704594666774,
"learning_rate": 4.2063492063492065e-05,
"loss": 0.5069,
"step": 562
},
{
"epoch": 0.7306943543153797,
"grad_norm": 0.3702503469527744,
"learning_rate": 4.203944203944204e-05,
"loss": 0.513,
"step": 563
},
{
"epoch": 0.7319922128487995,
"grad_norm": 0.3070674221331164,
"learning_rate": 4.201539201539202e-05,
"loss": 0.497,
"step": 564
},
{
"epoch": 0.7332900713822194,
"grad_norm": 0.35750959007798994,
"learning_rate": 4.1991341991341996e-05,
"loss": 0.5085,
"step": 565
},
{
"epoch": 0.7345879299156391,
"grad_norm": 0.2835364219292076,
"learning_rate": 4.196729196729197e-05,
"loss": 0.5091,
"step": 566
},
{
"epoch": 0.735885788449059,
"grad_norm": 0.2884098082465498,
"learning_rate": 4.1943241943241945e-05,
"loss": 0.4884,
"step": 567
},
{
"epoch": 0.737183646982479,
"grad_norm": 0.3203510406175552,
"learning_rate": 4.191919191919192e-05,
"loss": 0.4971,
"step": 568
},
{
"epoch": 0.7384815055158988,
"grad_norm": 0.27371373687668255,
"learning_rate": 4.18951418951419e-05,
"loss": 0.5095,
"step": 569
},
{
"epoch": 0.7397793640493187,
"grad_norm": 0.34717402203397457,
"learning_rate": 4.187109187109187e-05,
"loss": 0.5014,
"step": 570
},
{
"epoch": 0.7410772225827384,
"grad_norm": 0.30582393639621713,
"learning_rate": 4.184704184704185e-05,
"loss": 0.5181,
"step": 571
},
{
"epoch": 0.7423750811161584,
"grad_norm": 0.32112393843480735,
"learning_rate": 4.1822991822991826e-05,
"loss": 0.5006,
"step": 572
},
{
"epoch": 0.7436729396495781,
"grad_norm": 0.33979137877685406,
"learning_rate": 4.17989417989418e-05,
"loss": 0.5248,
"step": 573
},
{
"epoch": 0.744970798182998,
"grad_norm": 0.3209001348833202,
"learning_rate": 4.1774891774891775e-05,
"loss": 0.5013,
"step": 574
},
{
"epoch": 0.746268656716418,
"grad_norm": 0.3266878409508907,
"learning_rate": 4.175084175084175e-05,
"loss": 0.4915,
"step": 575
},
{
"epoch": 0.7475665152498377,
"grad_norm": 0.30503612561210064,
"learning_rate": 4.172679172679173e-05,
"loss": 0.5046,
"step": 576
},
{
"epoch": 0.7488643737832577,
"grad_norm": 0.28971405798539174,
"learning_rate": 4.1702741702741707e-05,
"loss": 0.494,
"step": 577
},
{
"epoch": 0.7501622323166774,
"grad_norm": 0.29780493895562776,
"learning_rate": 4.167869167869168e-05,
"loss": 0.492,
"step": 578
},
{
"epoch": 0.7514600908500974,
"grad_norm": 0.32685361493691256,
"learning_rate": 4.1654641654641656e-05,
"loss": 0.5101,
"step": 579
},
{
"epoch": 0.7527579493835171,
"grad_norm": 0.34614664346128227,
"learning_rate": 4.163059163059163e-05,
"loss": 0.4953,
"step": 580
},
{
"epoch": 0.754055807916937,
"grad_norm": 0.24964382812915295,
"learning_rate": 4.160654160654161e-05,
"loss": 0.4974,
"step": 581
},
{
"epoch": 0.755353666450357,
"grad_norm": 0.3266391465805975,
"learning_rate": 4.158249158249159e-05,
"loss": 0.5039,
"step": 582
},
{
"epoch": 0.7566515249837767,
"grad_norm": 0.34857873431761155,
"learning_rate": 4.155844155844156e-05,
"loss": 0.5056,
"step": 583
},
{
"epoch": 0.7579493835171967,
"grad_norm": 0.2921239047290261,
"learning_rate": 4.153439153439154e-05,
"loss": 0.4967,
"step": 584
},
{
"epoch": 0.7592472420506164,
"grad_norm": 0.33268372260105683,
"learning_rate": 4.151034151034151e-05,
"loss": 0.5066,
"step": 585
},
{
"epoch": 0.7605451005840363,
"grad_norm": 0.318064660501317,
"learning_rate": 4.148629148629149e-05,
"loss": 0.5014,
"step": 586
},
{
"epoch": 0.7618429591174561,
"grad_norm": 0.40789097570888044,
"learning_rate": 4.146224146224146e-05,
"loss": 0.5236,
"step": 587
},
{
"epoch": 0.763140817650876,
"grad_norm": 0.36460994866717067,
"learning_rate": 4.143819143819144e-05,
"loss": 0.5028,
"step": 588
},
{
"epoch": 0.764438676184296,
"grad_norm": 0.32621095461004207,
"learning_rate": 4.141414141414142e-05,
"loss": 0.5,
"step": 589
},
{
"epoch": 0.7657365347177157,
"grad_norm": 0.35949920016397585,
"learning_rate": 4.139009139009139e-05,
"loss": 0.5028,
"step": 590
},
{
"epoch": 0.7670343932511356,
"grad_norm": 0.26064162839844207,
"learning_rate": 4.136604136604137e-05,
"loss": 0.4926,
"step": 591
},
{
"epoch": 0.7683322517845554,
"grad_norm": 0.33955569046497985,
"learning_rate": 4.134199134199134e-05,
"loss": 0.491,
"step": 592
},
{
"epoch": 0.7696301103179753,
"grad_norm": 0.302141896432662,
"learning_rate": 4.131794131794132e-05,
"loss": 0.4884,
"step": 593
},
{
"epoch": 0.7709279688513953,
"grad_norm": 0.30640851435457384,
"learning_rate": 4.12938912938913e-05,
"loss": 0.4993,
"step": 594
},
{
"epoch": 0.772225827384815,
"grad_norm": 0.3040997259974209,
"learning_rate": 4.126984126984127e-05,
"loss": 0.5207,
"step": 595
},
{
"epoch": 0.773523685918235,
"grad_norm": 0.3146837697580934,
"learning_rate": 4.124579124579125e-05,
"loss": 0.5041,
"step": 596
},
{
"epoch": 0.7748215444516547,
"grad_norm": 0.2837662015282775,
"learning_rate": 4.122174122174122e-05,
"loss": 0.4958,
"step": 597
},
{
"epoch": 0.7761194029850746,
"grad_norm": 0.3005344336762094,
"learning_rate": 4.1197691197691204e-05,
"loss": 0.4995,
"step": 598
},
{
"epoch": 0.7774172615184944,
"grad_norm": 0.2593994091267606,
"learning_rate": 4.117364117364118e-05,
"loss": 0.4949,
"step": 599
},
{
"epoch": 0.7787151200519143,
"grad_norm": 0.31127336265884026,
"learning_rate": 4.114959114959115e-05,
"loss": 0.5026,
"step": 600
},
{
"epoch": 0.7800129785853342,
"grad_norm": 0.24332809263192706,
"learning_rate": 4.112554112554113e-05,
"loss": 0.506,
"step": 601
},
{
"epoch": 0.781310837118754,
"grad_norm": 0.268091615023721,
"learning_rate": 4.11014911014911e-05,
"loss": 0.5148,
"step": 602
},
{
"epoch": 0.782608695652174,
"grad_norm": 0.27606994914293354,
"learning_rate": 4.1077441077441085e-05,
"loss": 0.5058,
"step": 603
},
{
"epoch": 0.7839065541855937,
"grad_norm": 0.25091695629276883,
"learning_rate": 4.105339105339105e-05,
"loss": 0.4939,
"step": 604
},
{
"epoch": 0.7852044127190136,
"grad_norm": 0.25969299604058604,
"learning_rate": 4.1029341029341034e-05,
"loss": 0.5045,
"step": 605
},
{
"epoch": 0.7865022712524334,
"grad_norm": 0.28645937532292653,
"learning_rate": 4.100529100529101e-05,
"loss": 0.51,
"step": 606
},
{
"epoch": 0.7878001297858533,
"grad_norm": 0.2867588479387004,
"learning_rate": 4.0981240981240984e-05,
"loss": 0.4849,
"step": 607
},
{
"epoch": 0.7890979883192732,
"grad_norm": 0.2646936431763157,
"learning_rate": 4.095719095719096e-05,
"loss": 0.5139,
"step": 608
},
{
"epoch": 0.790395846852693,
"grad_norm": 0.3083485975619241,
"learning_rate": 4.093314093314093e-05,
"loss": 0.52,
"step": 609
},
{
"epoch": 0.791693705386113,
"grad_norm": 0.29615253606758346,
"learning_rate": 4.0909090909090915e-05,
"loss": 0.491,
"step": 610
},
{
"epoch": 0.7929915639195327,
"grad_norm": 0.29803496441037525,
"learning_rate": 4.088504088504089e-05,
"loss": 0.5054,
"step": 611
},
{
"epoch": 0.7942894224529526,
"grad_norm": 0.2854912264040868,
"learning_rate": 4.0860990860990864e-05,
"loss": 0.4838,
"step": 612
},
{
"epoch": 0.7955872809863724,
"grad_norm": 0.2861142997625756,
"learning_rate": 4.083694083694084e-05,
"loss": 0.4777,
"step": 613
},
{
"epoch": 0.7968851395197923,
"grad_norm": 0.28878974958497106,
"learning_rate": 4.0812890812890814e-05,
"loss": 0.5043,
"step": 614
},
{
"epoch": 0.7981829980532122,
"grad_norm": 0.28908126130671624,
"learning_rate": 4.0788840788840795e-05,
"loss": 0.5077,
"step": 615
},
{
"epoch": 0.799480856586632,
"grad_norm": 0.31059225731020423,
"learning_rate": 4.0764790764790763e-05,
"loss": 0.4846,
"step": 616
},
{
"epoch": 0.8007787151200519,
"grad_norm": 0.2990367658353648,
"learning_rate": 4.074074074074074e-05,
"loss": 0.5075,
"step": 617
},
{
"epoch": 0.8020765736534717,
"grad_norm": 0.29529728033655306,
"learning_rate": 4.071669071669072e-05,
"loss": 0.4876,
"step": 618
},
{
"epoch": 0.8033744321868916,
"grad_norm": 0.299331177429087,
"learning_rate": 4.0692640692640695e-05,
"loss": 0.5032,
"step": 619
},
{
"epoch": 0.8046722907203114,
"grad_norm": 0.2842657684401892,
"learning_rate": 4.066859066859067e-05,
"loss": 0.4889,
"step": 620
},
{
"epoch": 0.8059701492537313,
"grad_norm": 0.2692733717815561,
"learning_rate": 4.0644540644540644e-05,
"loss": 0.5009,
"step": 621
},
{
"epoch": 0.8072680077871512,
"grad_norm": 0.3436883319030681,
"learning_rate": 4.062049062049062e-05,
"loss": 0.5185,
"step": 622
},
{
"epoch": 0.808565866320571,
"grad_norm": 0.2653175278056993,
"learning_rate": 4.05964405964406e-05,
"loss": 0.4739,
"step": 623
},
{
"epoch": 0.8098637248539909,
"grad_norm": 0.31209657077735303,
"learning_rate": 4.0572390572390575e-05,
"loss": 0.4944,
"step": 624
},
{
"epoch": 0.8111615833874107,
"grad_norm": 0.3396825057908641,
"learning_rate": 4.054834054834055e-05,
"loss": 0.5203,
"step": 625
},
{
"epoch": 0.8124594419208306,
"grad_norm": 0.31652376647001546,
"learning_rate": 4.0524290524290525e-05,
"loss": 0.4973,
"step": 626
},
{
"epoch": 0.8137573004542504,
"grad_norm": 0.41987696956302806,
"learning_rate": 4.05002405002405e-05,
"loss": 0.5008,
"step": 627
},
{
"epoch": 0.8150551589876703,
"grad_norm": 0.32125784926625567,
"learning_rate": 4.047619047619048e-05,
"loss": 0.5107,
"step": 628
},
{
"epoch": 0.8163530175210902,
"grad_norm": 0.43302794720660975,
"learning_rate": 4.045214045214045e-05,
"loss": 0.5174,
"step": 629
},
{
"epoch": 0.81765087605451,
"grad_norm": 0.29529734876987174,
"learning_rate": 4.042809042809043e-05,
"loss": 0.4881,
"step": 630
},
{
"epoch": 0.8189487345879299,
"grad_norm": 0.4076264173563411,
"learning_rate": 4.0404040404040405e-05,
"loss": 0.5034,
"step": 631
},
{
"epoch": 0.8202465931213497,
"grad_norm": 0.30337707563686833,
"learning_rate": 4.037999037999039e-05,
"loss": 0.5119,
"step": 632
},
{
"epoch": 0.8215444516547696,
"grad_norm": 0.39849923453663594,
"learning_rate": 4.0355940355940355e-05,
"loss": 0.5216,
"step": 633
},
{
"epoch": 0.8228423101881895,
"grad_norm": 0.26447226558452136,
"learning_rate": 4.033189033189033e-05,
"loss": 0.5122,
"step": 634
},
{
"epoch": 0.8241401687216093,
"grad_norm": 0.36530243282807756,
"learning_rate": 4.030784030784031e-05,
"loss": 0.4918,
"step": 635
},
{
"epoch": 0.8254380272550292,
"grad_norm": 0.3160155549438362,
"learning_rate": 4.0283790283790286e-05,
"loss": 0.5024,
"step": 636
},
{
"epoch": 0.826735885788449,
"grad_norm": 0.33636766065888035,
"learning_rate": 4.025974025974026e-05,
"loss": 0.5075,
"step": 637
},
{
"epoch": 0.8280337443218689,
"grad_norm": 0.29456591212102723,
"learning_rate": 4.0235690235690236e-05,
"loss": 0.5241,
"step": 638
},
{
"epoch": 0.8293316028552887,
"grad_norm": 0.3220137418817016,
"learning_rate": 4.021164021164021e-05,
"loss": 0.5028,
"step": 639
},
{
"epoch": 0.8306294613887086,
"grad_norm": 0.279849046005973,
"learning_rate": 4.018759018759019e-05,
"loss": 0.4937,
"step": 640
},
{
"epoch": 0.8319273199221285,
"grad_norm": 0.34243863539028374,
"learning_rate": 4.016354016354017e-05,
"loss": 0.4992,
"step": 641
},
{
"epoch": 0.8332251784555483,
"grad_norm": 0.3077281111260478,
"learning_rate": 4.013949013949014e-05,
"loss": 0.5013,
"step": 642
},
{
"epoch": 0.8345230369889682,
"grad_norm": 0.2917135387110751,
"learning_rate": 4.0115440115440116e-05,
"loss": 0.5084,
"step": 643
},
{
"epoch": 0.835820895522388,
"grad_norm": 0.4035461806364624,
"learning_rate": 4.009139009139009e-05,
"loss": 0.5075,
"step": 644
},
{
"epoch": 0.8371187540558079,
"grad_norm": 0.28209498726622767,
"learning_rate": 4.006734006734007e-05,
"loss": 0.5018,
"step": 645
},
{
"epoch": 0.8384166125892277,
"grad_norm": 0.32365984928312647,
"learning_rate": 4.004329004329004e-05,
"loss": 0.5059,
"step": 646
},
{
"epoch": 0.8397144711226476,
"grad_norm": 0.27577530319773297,
"learning_rate": 4.001924001924002e-05,
"loss": 0.4833,
"step": 647
},
{
"epoch": 0.8410123296560675,
"grad_norm": 0.28322226416250573,
"learning_rate": 3.999518999519e-05,
"loss": 0.4918,
"step": 648
},
{
"epoch": 0.8423101881894873,
"grad_norm": 0.2855631382527533,
"learning_rate": 3.997113997113997e-05,
"loss": 0.4865,
"step": 649
},
{
"epoch": 0.8436080467229072,
"grad_norm": 0.32968743054146016,
"learning_rate": 3.9947089947089946e-05,
"loss": 0.4995,
"step": 650
},
{
"epoch": 0.844905905256327,
"grad_norm": 0.2546632505302031,
"learning_rate": 3.992303992303992e-05,
"loss": 0.5007,
"step": 651
},
{
"epoch": 0.8462037637897469,
"grad_norm": 0.3379202280608477,
"learning_rate": 3.98989898989899e-05,
"loss": 0.5191,
"step": 652
},
{
"epoch": 0.8475016223231667,
"grad_norm": 0.2828398301217406,
"learning_rate": 3.987493987493988e-05,
"loss": 0.511,
"step": 653
},
{
"epoch": 0.8487994808565866,
"grad_norm": 0.2759621170821239,
"learning_rate": 3.985088985088985e-05,
"loss": 0.4833,
"step": 654
},
{
"epoch": 0.8500973393900065,
"grad_norm": 0.30549235671106895,
"learning_rate": 3.982683982683983e-05,
"loss": 0.5041,
"step": 655
},
{
"epoch": 0.8513951979234263,
"grad_norm": 0.27394626074529427,
"learning_rate": 3.98027898027898e-05,
"loss": 0.4765,
"step": 656
},
{
"epoch": 0.8526930564568462,
"grad_norm": 0.28778374402990897,
"learning_rate": 3.9778739778739783e-05,
"loss": 0.5204,
"step": 657
},
{
"epoch": 0.853990914990266,
"grad_norm": 0.34038234379097493,
"learning_rate": 3.975468975468976e-05,
"loss": 0.4974,
"step": 658
},
{
"epoch": 0.8552887735236859,
"grad_norm": 0.2866059277868264,
"learning_rate": 3.973063973063973e-05,
"loss": 0.5054,
"step": 659
},
{
"epoch": 0.8565866320571057,
"grad_norm": 0.2927739560082091,
"learning_rate": 3.970658970658971e-05,
"loss": 0.4974,
"step": 660
},
{
"epoch": 0.8578844905905256,
"grad_norm": 0.3262341662863849,
"learning_rate": 3.968253968253968e-05,
"loss": 0.4989,
"step": 661
},
{
"epoch": 0.8591823491239455,
"grad_norm": 0.2845679103896212,
"learning_rate": 3.9658489658489664e-05,
"loss": 0.4874,
"step": 662
},
{
"epoch": 0.8604802076573653,
"grad_norm": 0.2655596774616536,
"learning_rate": 3.963443963443963e-05,
"loss": 0.4843,
"step": 663
},
{
"epoch": 0.8617780661907852,
"grad_norm": 0.2820305028454277,
"learning_rate": 3.9610389610389614e-05,
"loss": 0.5164,
"step": 664
},
{
"epoch": 0.863075924724205,
"grad_norm": 0.2940016391705861,
"learning_rate": 3.958633958633959e-05,
"loss": 0.4886,
"step": 665
},
{
"epoch": 0.8643737832576249,
"grad_norm": 0.2547644694514986,
"learning_rate": 3.956228956228956e-05,
"loss": 0.5051,
"step": 666
},
{
"epoch": 0.8656716417910447,
"grad_norm": 0.26023827863988136,
"learning_rate": 3.953823953823954e-05,
"loss": 0.5217,
"step": 667
},
{
"epoch": 0.8669695003244646,
"grad_norm": 0.27927530276749113,
"learning_rate": 3.951418951418951e-05,
"loss": 0.5092,
"step": 668
},
{
"epoch": 0.8682673588578845,
"grad_norm": 0.26218361367284654,
"learning_rate": 3.9490139490139494e-05,
"loss": 0.4889,
"step": 669
},
{
"epoch": 0.8695652173913043,
"grad_norm": 0.2574083587232944,
"learning_rate": 3.946608946608947e-05,
"loss": 0.498,
"step": 670
},
{
"epoch": 0.8708630759247242,
"grad_norm": 0.28970396117550484,
"learning_rate": 3.9442039442039444e-05,
"loss": 0.5005,
"step": 671
},
{
"epoch": 0.872160934458144,
"grad_norm": 0.253672603517478,
"learning_rate": 3.941798941798942e-05,
"loss": 0.4802,
"step": 672
},
{
"epoch": 0.8734587929915639,
"grad_norm": 0.28176527595198714,
"learning_rate": 3.939393939393939e-05,
"loss": 0.5009,
"step": 673
},
{
"epoch": 0.8747566515249838,
"grad_norm": 0.29970103183262975,
"learning_rate": 3.9369889369889375e-05,
"loss": 0.494,
"step": 674
},
{
"epoch": 0.8760545100584036,
"grad_norm": 0.28208623298136165,
"learning_rate": 3.934583934583935e-05,
"loss": 0.4784,
"step": 675
},
{
"epoch": 0.8773523685918235,
"grad_norm": 0.31291964058774646,
"learning_rate": 3.9321789321789324e-05,
"loss": 0.4869,
"step": 676
},
{
"epoch": 0.8786502271252433,
"grad_norm": 0.2979349183006068,
"learning_rate": 3.92977392977393e-05,
"loss": 0.5046,
"step": 677
},
{
"epoch": 0.8799480856586632,
"grad_norm": 0.2908014815865184,
"learning_rate": 3.9273689273689274e-05,
"loss": 0.5265,
"step": 678
},
{
"epoch": 0.881245944192083,
"grad_norm": 0.26416143914819024,
"learning_rate": 3.9249639249639256e-05,
"loss": 0.4818,
"step": 679
},
{
"epoch": 0.8825438027255029,
"grad_norm": 0.2933751247833131,
"learning_rate": 3.9225589225589224e-05,
"loss": 0.5012,
"step": 680
},
{
"epoch": 0.8838416612589228,
"grad_norm": 0.3126465117694497,
"learning_rate": 3.9201539201539205e-05,
"loss": 0.5146,
"step": 681
},
{
"epoch": 0.8851395197923426,
"grad_norm": 0.2524418661547154,
"learning_rate": 3.917748917748918e-05,
"loss": 0.5016,
"step": 682
},
{
"epoch": 0.8864373783257625,
"grad_norm": 0.25838371662824994,
"learning_rate": 3.9153439153439155e-05,
"loss": 0.4788,
"step": 683
},
{
"epoch": 0.8877352368591823,
"grad_norm": 0.26415640627712256,
"learning_rate": 3.912938912938913e-05,
"loss": 0.5026,
"step": 684
},
{
"epoch": 0.8890330953926022,
"grad_norm": 0.24247698201137077,
"learning_rate": 3.9105339105339104e-05,
"loss": 0.4785,
"step": 685
},
{
"epoch": 0.890330953926022,
"grad_norm": 0.2592072694481307,
"learning_rate": 3.9081289081289086e-05,
"loss": 0.4977,
"step": 686
},
{
"epoch": 0.8916288124594419,
"grad_norm": 0.2542832037438216,
"learning_rate": 3.905723905723906e-05,
"loss": 0.4965,
"step": 687
},
{
"epoch": 0.8929266709928618,
"grad_norm": 0.262249098843731,
"learning_rate": 3.9033189033189035e-05,
"loss": 0.4973,
"step": 688
},
{
"epoch": 0.8942245295262816,
"grad_norm": 0.31610300163969496,
"learning_rate": 3.900913900913901e-05,
"loss": 0.4825,
"step": 689
},
{
"epoch": 0.8955223880597015,
"grad_norm": 0.2740619461513036,
"learning_rate": 3.8985088985088985e-05,
"loss": 0.4975,
"step": 690
},
{
"epoch": 0.8968202465931213,
"grad_norm": 0.2741811791339238,
"learning_rate": 3.8961038961038966e-05,
"loss": 0.4959,
"step": 691
},
{
"epoch": 0.8981181051265412,
"grad_norm": 0.271574424484243,
"learning_rate": 3.893698893698894e-05,
"loss": 0.4871,
"step": 692
},
{
"epoch": 0.899415963659961,
"grad_norm": 0.23758978275848563,
"learning_rate": 3.891293891293891e-05,
"loss": 0.4862,
"step": 693
},
{
"epoch": 0.9007138221933809,
"grad_norm": 0.27263432443106744,
"learning_rate": 3.888888888888889e-05,
"loss": 0.5041,
"step": 694
},
{
"epoch": 0.9020116807268008,
"grad_norm": 0.2707424886921677,
"learning_rate": 3.8864838864838866e-05,
"loss": 0.4669,
"step": 695
},
{
"epoch": 0.9033095392602206,
"grad_norm": 0.25600324849109557,
"learning_rate": 3.884078884078885e-05,
"loss": 0.4915,
"step": 696
},
{
"epoch": 0.9046073977936405,
"grad_norm": 0.28010923150865535,
"learning_rate": 3.8816738816738815e-05,
"loss": 0.5038,
"step": 697
},
{
"epoch": 0.9059052563270603,
"grad_norm": 0.28506679888273495,
"learning_rate": 3.87926887926888e-05,
"loss": 0.4965,
"step": 698
},
{
"epoch": 0.9072031148604802,
"grad_norm": 0.26956128070889057,
"learning_rate": 3.876863876863877e-05,
"loss": 0.5141,
"step": 699
},
{
"epoch": 0.9085009733939,
"grad_norm": 0.25947152657252537,
"learning_rate": 3.8744588744588746e-05,
"loss": 0.477,
"step": 700
},
{
"epoch": 0.9097988319273199,
"grad_norm": 0.29443853949197607,
"learning_rate": 3.872053872053872e-05,
"loss": 0.5059,
"step": 701
},
{
"epoch": 0.9110966904607398,
"grad_norm": 0.23371391184316104,
"learning_rate": 3.8696488696488696e-05,
"loss": 0.4783,
"step": 702
},
{
"epoch": 0.9123945489941596,
"grad_norm": 0.2960003000238748,
"learning_rate": 3.867243867243868e-05,
"loss": 0.4975,
"step": 703
},
{
"epoch": 0.9136924075275795,
"grad_norm": 0.2447202791191107,
"learning_rate": 3.864838864838865e-05,
"loss": 0.4922,
"step": 704
},
{
"epoch": 0.9149902660609993,
"grad_norm": 0.26642870018861,
"learning_rate": 3.862433862433863e-05,
"loss": 0.5252,
"step": 705
},
{
"epoch": 0.9162881245944192,
"grad_norm": 0.2560976171455726,
"learning_rate": 3.86002886002886e-05,
"loss": 0.5012,
"step": 706
},
{
"epoch": 0.917585983127839,
"grad_norm": 0.28883925682227224,
"learning_rate": 3.8576238576238576e-05,
"loss": 0.509,
"step": 707
},
{
"epoch": 0.9188838416612589,
"grad_norm": 0.2513723680500846,
"learning_rate": 3.855218855218856e-05,
"loss": 0.5064,
"step": 708
},
{
"epoch": 0.9201817001946788,
"grad_norm": 0.26385183299541554,
"learning_rate": 3.852813852813853e-05,
"loss": 0.4844,
"step": 709
},
{
"epoch": 0.9214795587280986,
"grad_norm": 0.2739471730680778,
"learning_rate": 3.85040885040885e-05,
"loss": 0.5062,
"step": 710
},
{
"epoch": 0.9227774172615185,
"grad_norm": 0.29349547383156327,
"learning_rate": 3.848003848003848e-05,
"loss": 0.4663,
"step": 711
},
{
"epoch": 0.9240752757949383,
"grad_norm": 0.24335538681766872,
"learning_rate": 3.845598845598846e-05,
"loss": 0.4802,
"step": 712
},
{
"epoch": 0.9253731343283582,
"grad_norm": 0.26992374528269225,
"learning_rate": 3.843193843193844e-05,
"loss": 0.4737,
"step": 713
},
{
"epoch": 0.9266709928617781,
"grad_norm": 0.2601068353604271,
"learning_rate": 3.8407888407888407e-05,
"loss": 0.4871,
"step": 714
},
{
"epoch": 0.9279688513951979,
"grad_norm": 0.2758032793334676,
"learning_rate": 3.838383838383838e-05,
"loss": 0.497,
"step": 715
},
{
"epoch": 0.9292667099286178,
"grad_norm": 0.28995105804236226,
"learning_rate": 3.835978835978836e-05,
"loss": 0.4954,
"step": 716
},
{
"epoch": 0.9305645684620376,
"grad_norm": 0.2711126929498793,
"learning_rate": 3.833573833573834e-05,
"loss": 0.4899,
"step": 717
},
{
"epoch": 0.9318624269954575,
"grad_norm": 0.27659146601067053,
"learning_rate": 3.831168831168831e-05,
"loss": 0.4724,
"step": 718
},
{
"epoch": 0.9331602855288773,
"grad_norm": 0.27572149711082233,
"learning_rate": 3.828763828763829e-05,
"loss": 0.4771,
"step": 719
},
{
"epoch": 0.9344581440622972,
"grad_norm": 0.26747547869691946,
"learning_rate": 3.826358826358827e-05,
"loss": 0.508,
"step": 720
},
{
"epoch": 0.9357560025957171,
"grad_norm": 0.25233671196063495,
"learning_rate": 3.8239538239538244e-05,
"loss": 0.5112,
"step": 721
},
{
"epoch": 0.9370538611291369,
"grad_norm": 0.25520294945614,
"learning_rate": 3.821548821548822e-05,
"loss": 0.5015,
"step": 722
},
{
"epoch": 0.9383517196625568,
"grad_norm": 0.2638285175394357,
"learning_rate": 3.819143819143819e-05,
"loss": 0.5159,
"step": 723
},
{
"epoch": 0.9396495781959766,
"grad_norm": 0.2424275513941752,
"learning_rate": 3.816738816738817e-05,
"loss": 0.4934,
"step": 724
},
{
"epoch": 0.9409474367293965,
"grad_norm": 0.25978321355889966,
"learning_rate": 3.814333814333815e-05,
"loss": 0.493,
"step": 725
},
{
"epoch": 0.9422452952628163,
"grad_norm": 0.2717237370055125,
"learning_rate": 3.8119288119288124e-05,
"loss": 0.4848,
"step": 726
},
{
"epoch": 0.9435431537962362,
"grad_norm": 0.2649262510228781,
"learning_rate": 3.809523809523809e-05,
"loss": 0.5106,
"step": 727
},
{
"epoch": 0.9448410123296561,
"grad_norm": 0.2630223525297261,
"learning_rate": 3.8071188071188074e-05,
"loss": 0.4819,
"step": 728
},
{
"epoch": 0.9461388708630759,
"grad_norm": 0.2552060740231092,
"learning_rate": 3.804713804713805e-05,
"loss": 0.4846,
"step": 729
},
{
"epoch": 0.9474367293964958,
"grad_norm": 0.27269809006023926,
"learning_rate": 3.802308802308803e-05,
"loss": 0.4646,
"step": 730
},
{
"epoch": 0.9487345879299156,
"grad_norm": 0.26086007457601984,
"learning_rate": 3.7999037999038e-05,
"loss": 0.4881,
"step": 731
},
{
"epoch": 0.9500324464633355,
"grad_norm": 0.261121106269547,
"learning_rate": 3.797498797498797e-05,
"loss": 0.4676,
"step": 732
},
{
"epoch": 0.9513303049967553,
"grad_norm": 0.24120860308268785,
"learning_rate": 3.7950937950937954e-05,
"loss": 0.475,
"step": 733
},
{
"epoch": 0.9526281635301752,
"grad_norm": 0.23338097940225563,
"learning_rate": 3.792688792688793e-05,
"loss": 0.4913,
"step": 734
},
{
"epoch": 0.9539260220635951,
"grad_norm": 0.27856238721903515,
"learning_rate": 3.7902837902837904e-05,
"loss": 0.4817,
"step": 735
},
{
"epoch": 0.9552238805970149,
"grad_norm": 0.23489180306981414,
"learning_rate": 3.787878787878788e-05,
"loss": 0.4876,
"step": 736
},
{
"epoch": 0.9565217391304348,
"grad_norm": 0.24747777349253483,
"learning_rate": 3.7854737854737854e-05,
"loss": 0.4926,
"step": 737
},
{
"epoch": 0.9578195976638546,
"grad_norm": 0.2464671646367973,
"learning_rate": 3.7830687830687835e-05,
"loss": 0.4988,
"step": 738
},
{
"epoch": 0.9591174561972745,
"grad_norm": 0.2444290486715431,
"learning_rate": 3.780663780663781e-05,
"loss": 0.5102,
"step": 739
},
{
"epoch": 0.9604153147306943,
"grad_norm": 0.23262037897628499,
"learning_rate": 3.7782587782587785e-05,
"loss": 0.4928,
"step": 740
},
{
"epoch": 0.9617131732641142,
"grad_norm": 0.2320844900099328,
"learning_rate": 3.775853775853776e-05,
"loss": 0.4858,
"step": 741
},
{
"epoch": 0.9630110317975341,
"grad_norm": 0.25468927720147666,
"learning_rate": 3.773448773448774e-05,
"loss": 0.4978,
"step": 742
},
{
"epoch": 0.9643088903309539,
"grad_norm": 0.2686348081196404,
"learning_rate": 3.7710437710437716e-05,
"loss": 0.4774,
"step": 743
},
{
"epoch": 0.9656067488643738,
"grad_norm": 0.25462323553264127,
"learning_rate": 3.7686387686387684e-05,
"loss": 0.4967,
"step": 744
},
{
"epoch": 0.9669046073977936,
"grad_norm": 0.23826397601414423,
"learning_rate": 3.7662337662337665e-05,
"loss": 0.4803,
"step": 745
},
{
"epoch": 0.9682024659312135,
"grad_norm": 0.28828391446426194,
"learning_rate": 3.763828763828764e-05,
"loss": 0.4853,
"step": 746
},
{
"epoch": 0.9695003244646333,
"grad_norm": 0.28433469305996517,
"learning_rate": 3.761423761423762e-05,
"loss": 0.4952,
"step": 747
},
{
"epoch": 0.9707981829980532,
"grad_norm": 0.23492438563324666,
"learning_rate": 3.759018759018759e-05,
"loss": 0.4707,
"step": 748
},
{
"epoch": 0.9720960415314731,
"grad_norm": 0.24099143399264922,
"learning_rate": 3.7566137566137564e-05,
"loss": 0.5025,
"step": 749
},
{
"epoch": 0.9733939000648929,
"grad_norm": 0.26521862280904784,
"learning_rate": 3.7542087542087546e-05,
"loss": 0.4801,
"step": 750
},
{
"epoch": 0.9746917585983128,
"grad_norm": 0.25301181696938563,
"learning_rate": 3.751803751803752e-05,
"loss": 0.4812,
"step": 751
},
{
"epoch": 0.9759896171317326,
"grad_norm": 0.27340749938479236,
"learning_rate": 3.7493987493987495e-05,
"loss": 0.5218,
"step": 752
},
{
"epoch": 0.9772874756651525,
"grad_norm": 0.2758375543775843,
"learning_rate": 3.746993746993747e-05,
"loss": 0.5003,
"step": 753
},
{
"epoch": 0.9785853341985724,
"grad_norm": 0.2894871095962033,
"learning_rate": 3.7445887445887445e-05,
"loss": 0.4898,
"step": 754
},
{
"epoch": 0.9798831927319922,
"grad_norm": 0.24603375955455123,
"learning_rate": 3.7421837421837427e-05,
"loss": 0.4975,
"step": 755
},
{
"epoch": 0.9811810512654121,
"grad_norm": 0.3430084532330243,
"learning_rate": 3.73977873977874e-05,
"loss": 0.4986,
"step": 756
},
{
"epoch": 0.9824789097988319,
"grad_norm": 0.26260792839174424,
"learning_rate": 3.7373737373737376e-05,
"loss": 0.4948,
"step": 757
},
{
"epoch": 0.9837767683322518,
"grad_norm": 0.3417705691604155,
"learning_rate": 3.734968734968735e-05,
"loss": 0.4892,
"step": 758
},
{
"epoch": 0.9850746268656716,
"grad_norm": 0.2367132738219532,
"learning_rate": 3.7325637325637326e-05,
"loss": 0.4837,
"step": 759
},
{
"epoch": 0.9863724853990915,
"grad_norm": 0.35539307871553294,
"learning_rate": 3.730158730158731e-05,
"loss": 0.5081,
"step": 760
},
{
"epoch": 0.9876703439325114,
"grad_norm": 0.22735379818962181,
"learning_rate": 3.7277537277537275e-05,
"loss": 0.4791,
"step": 761
},
{
"epoch": 0.9889682024659312,
"grad_norm": 0.30959128828024085,
"learning_rate": 3.725348725348726e-05,
"loss": 0.4977,
"step": 762
},
{
"epoch": 0.9902660609993511,
"grad_norm": 0.24407830637335193,
"learning_rate": 3.722943722943723e-05,
"loss": 0.4889,
"step": 763
},
{
"epoch": 0.9915639195327709,
"grad_norm": 0.37047588851002045,
"learning_rate": 3.720538720538721e-05,
"loss": 0.4887,
"step": 764
},
{
"epoch": 0.9928617780661908,
"grad_norm": 0.3132029888056198,
"learning_rate": 3.718133718133718e-05,
"loss": 0.4808,
"step": 765
},
{
"epoch": 0.9941596365996106,
"grad_norm": 0.336725063578875,
"learning_rate": 3.7157287157287156e-05,
"loss": 0.4944,
"step": 766
},
{
"epoch": 0.9954574951330305,
"grad_norm": 0.29559283526551605,
"learning_rate": 3.713323713323714e-05,
"loss": 0.485,
"step": 767
},
{
"epoch": 0.9967553536664504,
"grad_norm": 0.3181505432986373,
"learning_rate": 3.710918710918711e-05,
"loss": 0.5109,
"step": 768
},
{
"epoch": 0.9980532121998702,
"grad_norm": 0.25792486030536,
"learning_rate": 3.708513708513709e-05,
"loss": 0.4837,
"step": 769
},
{
"epoch": 0.9993510707332901,
"grad_norm": 0.2662559802060241,
"learning_rate": 3.706108706108706e-05,
"loss": 0.4768,
"step": 770
},
{
"epoch": 1.0,
"grad_norm": 0.2662559802060241,
"learning_rate": 3.7037037037037037e-05,
"loss": 0.4749,
"step": 771
},
{
"epoch": 1.0012978585334198,
"grad_norm": 0.45710604360462226,
"learning_rate": 3.701298701298702e-05,
"loss": 0.4295,
"step": 772
},
{
"epoch": 1.0025957170668398,
"grad_norm": 0.3984763478659025,
"learning_rate": 3.698893698893699e-05,
"loss": 0.4211,
"step": 773
},
{
"epoch": 1.0038935756002596,
"grad_norm": 0.33604362226653633,
"learning_rate": 3.696488696488697e-05,
"loss": 0.4472,
"step": 774
},
{
"epoch": 1.0051914341336794,
"grad_norm": 0.35492193650921217,
"learning_rate": 3.694083694083694e-05,
"loss": 0.4144,
"step": 775
},
{
"epoch": 1.0064892926670992,
"grad_norm": 0.34395756575343545,
"learning_rate": 3.691678691678692e-05,
"loss": 0.4371,
"step": 776
},
{
"epoch": 1.0077871512005192,
"grad_norm": 0.3027073670465442,
"learning_rate": 3.68927368927369e-05,
"loss": 0.4418,
"step": 777
},
{
"epoch": 1.009085009733939,
"grad_norm": 0.33164997403681556,
"learning_rate": 3.686868686868687e-05,
"loss": 0.4236,
"step": 778
},
{
"epoch": 1.0103828682673588,
"grad_norm": 0.29836741552546153,
"learning_rate": 3.684463684463685e-05,
"loss": 0.4291,
"step": 779
},
{
"epoch": 1.0116807268007788,
"grad_norm": 0.31908724983675907,
"learning_rate": 3.682058682058682e-05,
"loss": 0.4444,
"step": 780
},
{
"epoch": 1.0129785853341986,
"grad_norm": 0.3063522863393575,
"learning_rate": 3.67965367965368e-05,
"loss": 0.4235,
"step": 781
},
{
"epoch": 1.0142764438676184,
"grad_norm": 0.286969078111983,
"learning_rate": 3.677248677248677e-05,
"loss": 0.4273,
"step": 782
},
{
"epoch": 1.0155743024010382,
"grad_norm": 0.30969412667904017,
"learning_rate": 3.674843674843675e-05,
"loss": 0.4179,
"step": 783
},
{
"epoch": 1.0168721609344582,
"grad_norm": 0.28846243749065087,
"learning_rate": 3.672438672438673e-05,
"loss": 0.445,
"step": 784
},
{
"epoch": 1.018170019467878,
"grad_norm": 0.3052289303187025,
"learning_rate": 3.6700336700336704e-05,
"loss": 0.4282,
"step": 785
},
{
"epoch": 1.0194678780012978,
"grad_norm": 0.28776824445687055,
"learning_rate": 3.667628667628668e-05,
"loss": 0.4361,
"step": 786
},
{
"epoch": 1.0207657365347178,
"grad_norm": 0.25471106959244577,
"learning_rate": 3.665223665223665e-05,
"loss": 0.4177,
"step": 787
},
{
"epoch": 1.0220635950681376,
"grad_norm": 0.2740291049864792,
"learning_rate": 3.662818662818663e-05,
"loss": 0.4394,
"step": 788
},
{
"epoch": 1.0233614536015574,
"grad_norm": 0.3017972640732574,
"learning_rate": 3.660413660413661e-05,
"loss": 0.4386,
"step": 789
},
{
"epoch": 1.0246593121349772,
"grad_norm": 0.23597430678446688,
"learning_rate": 3.6580086580086584e-05,
"loss": 0.4499,
"step": 790
},
{
"epoch": 1.0259571706683972,
"grad_norm": 0.2879673662024183,
"learning_rate": 3.655603655603656e-05,
"loss": 0.4293,
"step": 791
},
{
"epoch": 1.027255029201817,
"grad_norm": 0.2700731107191033,
"learning_rate": 3.6531986531986534e-05,
"loss": 0.4258,
"step": 792
},
{
"epoch": 1.0285528877352368,
"grad_norm": 0.25488494703667464,
"learning_rate": 3.650793650793651e-05,
"loss": 0.4074,
"step": 793
},
{
"epoch": 1.0298507462686568,
"grad_norm": 0.26394634723070287,
"learning_rate": 3.648388648388649e-05,
"loss": 0.4508,
"step": 794
},
{
"epoch": 1.0311486048020766,
"grad_norm": 0.27582489513269015,
"learning_rate": 3.645983645983646e-05,
"loss": 0.4412,
"step": 795
},
{
"epoch": 1.0324464633354964,
"grad_norm": 0.24726232362927064,
"learning_rate": 3.643578643578644e-05,
"loss": 0.4168,
"step": 796
},
{
"epoch": 1.0337443218689162,
"grad_norm": 0.28259272497475907,
"learning_rate": 3.6411736411736415e-05,
"loss": 0.4154,
"step": 797
},
{
"epoch": 1.0350421804023362,
"grad_norm": 0.2663149191606831,
"learning_rate": 3.638768638768639e-05,
"loss": 0.4189,
"step": 798
},
{
"epoch": 1.036340038935756,
"grad_norm": 0.23665837923986385,
"learning_rate": 3.6363636363636364e-05,
"loss": 0.4098,
"step": 799
},
{
"epoch": 1.0376378974691758,
"grad_norm": 0.26291163598038214,
"learning_rate": 3.633958633958634e-05,
"loss": 0.4311,
"step": 800
},
{
"epoch": 1.0389357560025958,
"grad_norm": 0.27970192834528934,
"learning_rate": 3.631553631553632e-05,
"loss": 0.4145,
"step": 801
},
{
"epoch": 1.0402336145360156,
"grad_norm": 0.2710942408225925,
"learning_rate": 3.6291486291486295e-05,
"loss": 0.4277,
"step": 802
},
{
"epoch": 1.0415314730694354,
"grad_norm": 0.2727015101923209,
"learning_rate": 3.626743626743627e-05,
"loss": 0.4232,
"step": 803
},
{
"epoch": 1.0428293316028552,
"grad_norm": 0.3128392157246963,
"learning_rate": 3.6243386243386245e-05,
"loss": 0.4335,
"step": 804
},
{
"epoch": 1.0441271901362752,
"grad_norm": 0.2349817412715045,
"learning_rate": 3.621933621933622e-05,
"loss": 0.4253,
"step": 805
},
{
"epoch": 1.045425048669695,
"grad_norm": 0.28497577605120694,
"learning_rate": 3.61952861952862e-05,
"loss": 0.4168,
"step": 806
},
{
"epoch": 1.0467229072031148,
"grad_norm": 0.2587789310002607,
"learning_rate": 3.617123617123617e-05,
"loss": 0.4217,
"step": 807
},
{
"epoch": 1.0480207657365348,
"grad_norm": 0.26828544455125314,
"learning_rate": 3.6147186147186144e-05,
"loss": 0.4074,
"step": 808
},
{
"epoch": 1.0493186242699546,
"grad_norm": 0.2598287776154231,
"learning_rate": 3.6123136123136125e-05,
"loss": 0.4184,
"step": 809
},
{
"epoch": 1.0506164828033744,
"grad_norm": 0.2703287966299121,
"learning_rate": 3.60990860990861e-05,
"loss": 0.4323,
"step": 810
},
{
"epoch": 1.0519143413367944,
"grad_norm": 0.318496639053465,
"learning_rate": 3.6075036075036075e-05,
"loss": 0.4209,
"step": 811
},
{
"epoch": 1.0532121998702142,
"grad_norm": 0.2697909678170961,
"learning_rate": 3.605098605098605e-05,
"loss": 0.4244,
"step": 812
},
{
"epoch": 1.054510058403634,
"grad_norm": 0.2969724315201258,
"learning_rate": 3.602693602693603e-05,
"loss": 0.4172,
"step": 813
},
{
"epoch": 1.0558079169370538,
"grad_norm": 0.30303096771353466,
"learning_rate": 3.6002886002886006e-05,
"loss": 0.404,
"step": 814
},
{
"epoch": 1.0571057754704738,
"grad_norm": 0.240870306849622,
"learning_rate": 3.597883597883598e-05,
"loss": 0.4226,
"step": 815
},
{
"epoch": 1.0584036340038936,
"grad_norm": 0.31791173210740425,
"learning_rate": 3.5954785954785956e-05,
"loss": 0.411,
"step": 816
},
{
"epoch": 1.0597014925373134,
"grad_norm": 0.2804167030463435,
"learning_rate": 3.593073593073593e-05,
"loss": 0.4297,
"step": 817
},
{
"epoch": 1.0609993510707332,
"grad_norm": 0.24716383688085644,
"learning_rate": 3.590668590668591e-05,
"loss": 0.434,
"step": 818
},
{
"epoch": 1.0622972096041532,
"grad_norm": 0.3039180273884032,
"learning_rate": 3.588263588263589e-05,
"loss": 0.4212,
"step": 819
},
{
"epoch": 1.063595068137573,
"grad_norm": 0.319488059258111,
"learning_rate": 3.5858585858585855e-05,
"loss": 0.4068,
"step": 820
},
{
"epoch": 1.0648929266709928,
"grad_norm": 0.22619788557964504,
"learning_rate": 3.5834535834535836e-05,
"loss": 0.4311,
"step": 821
},
{
"epoch": 1.0661907852044128,
"grad_norm": 0.34410773893661634,
"learning_rate": 3.581048581048581e-05,
"loss": 0.4349,
"step": 822
},
{
"epoch": 1.0674886437378326,
"grad_norm": 0.3100368353729728,
"learning_rate": 3.578643578643579e-05,
"loss": 0.4352,
"step": 823
},
{
"epoch": 1.0687865022712524,
"grad_norm": 0.2901826811884039,
"learning_rate": 3.576238576238576e-05,
"loss": 0.4157,
"step": 824
},
{
"epoch": 1.0700843608046724,
"grad_norm": 0.31611344846131356,
"learning_rate": 3.5738335738335735e-05,
"loss": 0.4693,
"step": 825
},
{
"epoch": 1.0713822193380922,
"grad_norm": 0.35227684302990314,
"learning_rate": 3.571428571428572e-05,
"loss": 0.4356,
"step": 826
},
{
"epoch": 1.072680077871512,
"grad_norm": 0.25990758753916315,
"learning_rate": 3.569023569023569e-05,
"loss": 0.4149,
"step": 827
},
{
"epoch": 1.0739779364049318,
"grad_norm": 0.33795998379210196,
"learning_rate": 3.5666185666185667e-05,
"loss": 0.4231,
"step": 828
},
{
"epoch": 1.0752757949383518,
"grad_norm": 0.260416289520159,
"learning_rate": 3.564213564213564e-05,
"loss": 0.4439,
"step": 829
},
{
"epoch": 1.0765736534717716,
"grad_norm": 0.2745403629951124,
"learning_rate": 3.5618085618085616e-05,
"loss": 0.4363,
"step": 830
},
{
"epoch": 1.0778715120051914,
"grad_norm": 0.30247544618833483,
"learning_rate": 3.55940355940356e-05,
"loss": 0.4195,
"step": 831
},
{
"epoch": 1.0791693705386114,
"grad_norm": 0.32082708667036386,
"learning_rate": 3.556998556998557e-05,
"loss": 0.4207,
"step": 832
},
{
"epoch": 1.0804672290720312,
"grad_norm": 0.2897448897920795,
"learning_rate": 3.554593554593555e-05,
"loss": 0.4256,
"step": 833
},
{
"epoch": 1.081765087605451,
"grad_norm": 0.2799359981594651,
"learning_rate": 3.552188552188552e-05,
"loss": 0.4566,
"step": 834
},
{
"epoch": 1.0830629461388708,
"grad_norm": 0.3069382837366587,
"learning_rate": 3.5497835497835503e-05,
"loss": 0.4422,
"step": 835
},
{
"epoch": 1.0843608046722908,
"grad_norm": 0.2432398771659819,
"learning_rate": 3.547378547378548e-05,
"loss": 0.4058,
"step": 836
},
{
"epoch": 1.0856586632057106,
"grad_norm": 0.2946884230374921,
"learning_rate": 3.5449735449735446e-05,
"loss": 0.4212,
"step": 837
},
{
"epoch": 1.0869565217391304,
"grad_norm": 0.2833055873065403,
"learning_rate": 3.542568542568543e-05,
"loss": 0.4236,
"step": 838
},
{
"epoch": 1.0882543802725504,
"grad_norm": 0.24633104520706475,
"learning_rate": 3.54016354016354e-05,
"loss": 0.4296,
"step": 839
},
{
"epoch": 1.0895522388059702,
"grad_norm": 0.30305714876494183,
"learning_rate": 3.5377585377585384e-05,
"loss": 0.4125,
"step": 840
},
{
"epoch": 1.09085009733939,
"grad_norm": 0.27515539449115956,
"learning_rate": 3.535353535353535e-05,
"loss": 0.4343,
"step": 841
},
{
"epoch": 1.0921479558728098,
"grad_norm": 0.2566570554342314,
"learning_rate": 3.532948532948533e-05,
"loss": 0.4259,
"step": 842
},
{
"epoch": 1.0934458144062298,
"grad_norm": 0.27580070959828384,
"learning_rate": 3.530543530543531e-05,
"loss": 0.4429,
"step": 843
},
{
"epoch": 1.0947436729396496,
"grad_norm": 0.40316368454416424,
"learning_rate": 3.528138528138528e-05,
"loss": 0.4471,
"step": 844
},
{
"epoch": 1.0960415314730694,
"grad_norm": 0.2693638650733015,
"learning_rate": 3.525733525733526e-05,
"loss": 0.4259,
"step": 845
},
{
"epoch": 1.0973393900064894,
"grad_norm": 0.3044405084823207,
"learning_rate": 3.523328523328523e-05,
"loss": 0.4187,
"step": 846
},
{
"epoch": 1.0986372485399092,
"grad_norm": 0.24933029657871267,
"learning_rate": 3.520923520923521e-05,
"loss": 0.4328,
"step": 847
},
{
"epoch": 1.099935107073329,
"grad_norm": 0.26906838606368316,
"learning_rate": 3.518518518518519e-05,
"loss": 0.4368,
"step": 848
},
{
"epoch": 1.1012329656067488,
"grad_norm": 0.26971278689485273,
"learning_rate": 3.5161135161135164e-05,
"loss": 0.4297,
"step": 849
},
{
"epoch": 1.1025308241401688,
"grad_norm": 0.2747922938561738,
"learning_rate": 3.513708513708514e-05,
"loss": 0.4373,
"step": 850
},
{
"epoch": 1.1038286826735886,
"grad_norm": 0.2761210141034917,
"learning_rate": 3.5113035113035113e-05,
"loss": 0.4382,
"step": 851
},
{
"epoch": 1.1051265412070084,
"grad_norm": 0.2964790930440498,
"learning_rate": 3.508898508898509e-05,
"loss": 0.4231,
"step": 852
},
{
"epoch": 1.1064243997404284,
"grad_norm": 0.25620765135533435,
"learning_rate": 3.506493506493507e-05,
"loss": 0.4408,
"step": 853
},
{
"epoch": 1.1077222582738482,
"grad_norm": 0.21064152286484897,
"learning_rate": 3.504088504088504e-05,
"loss": 0.417,
"step": 854
},
{
"epoch": 1.109020116807268,
"grad_norm": 0.2958430063839789,
"learning_rate": 3.501683501683502e-05,
"loss": 0.4339,
"step": 855
},
{
"epoch": 1.1103179753406878,
"grad_norm": 0.22482067437635359,
"learning_rate": 3.4992784992784994e-05,
"loss": 0.4295,
"step": 856
},
{
"epoch": 1.1116158338741078,
"grad_norm": 0.2955967782295721,
"learning_rate": 3.4968734968734976e-05,
"loss": 0.4015,
"step": 857
},
{
"epoch": 1.1129136924075276,
"grad_norm": 0.2843718668047481,
"learning_rate": 3.4944684944684944e-05,
"loss": 0.4218,
"step": 858
},
{
"epoch": 1.1142115509409474,
"grad_norm": 0.2764727160970055,
"learning_rate": 3.492063492063492e-05,
"loss": 0.4246,
"step": 859
},
{
"epoch": 1.1155094094743674,
"grad_norm": 0.2624405170772858,
"learning_rate": 3.48965848965849e-05,
"loss": 0.407,
"step": 860
},
{
"epoch": 1.1168072680077872,
"grad_norm": 0.2666157900698976,
"learning_rate": 3.4872534872534875e-05,
"loss": 0.4402,
"step": 861
},
{
"epoch": 1.118105126541207,
"grad_norm": 1.2301966564667228,
"learning_rate": 3.484848484848485e-05,
"loss": 0.4452,
"step": 862
},
{
"epoch": 1.1194029850746268,
"grad_norm": 0.3054038468605886,
"learning_rate": 3.4824434824434824e-05,
"loss": 0.4354,
"step": 863
},
{
"epoch": 1.1207008436080468,
"grad_norm": 0.2582864922377692,
"learning_rate": 3.48003848003848e-05,
"loss": 0.4495,
"step": 864
},
{
"epoch": 1.1219987021414666,
"grad_norm": 0.2500252146157672,
"learning_rate": 3.477633477633478e-05,
"loss": 0.4395,
"step": 865
},
{
"epoch": 1.1232965606748864,
"grad_norm": 0.29631159663934065,
"learning_rate": 3.4752284752284755e-05,
"loss": 0.4514,
"step": 866
},
{
"epoch": 1.1245944192083064,
"grad_norm": 0.28152136468600736,
"learning_rate": 3.472823472823473e-05,
"loss": 0.4397,
"step": 867
},
{
"epoch": 1.1258922777417262,
"grad_norm": 0.21131772623693806,
"learning_rate": 3.4704184704184705e-05,
"loss": 0.4285,
"step": 868
},
{
"epoch": 1.127190136275146,
"grad_norm": 0.25615102365106246,
"learning_rate": 3.468013468013468e-05,
"loss": 0.4133,
"step": 869
},
{
"epoch": 1.128487994808566,
"grad_norm": 0.23917269321305726,
"learning_rate": 3.465608465608466e-05,
"loss": 0.4435,
"step": 870
},
{
"epoch": 1.1297858533419858,
"grad_norm": 0.27161646725256705,
"learning_rate": 3.463203463203463e-05,
"loss": 0.4233,
"step": 871
},
{
"epoch": 1.1310837118754056,
"grad_norm": 0.2412152469942244,
"learning_rate": 3.460798460798461e-05,
"loss": 0.4461,
"step": 872
},
{
"epoch": 1.1323815704088254,
"grad_norm": 0.3179450028674149,
"learning_rate": 3.4583934583934586e-05,
"loss": 0.4119,
"step": 873
},
{
"epoch": 1.1336794289422454,
"grad_norm": 0.23384786069007624,
"learning_rate": 3.455988455988456e-05,
"loss": 0.4103,
"step": 874
},
{
"epoch": 1.1349772874756652,
"grad_norm": 0.2576089230129144,
"learning_rate": 3.4535834535834535e-05,
"loss": 0.4256,
"step": 875
},
{
"epoch": 1.136275146009085,
"grad_norm": 0.25522949186712174,
"learning_rate": 3.451178451178451e-05,
"loss": 0.4306,
"step": 876
},
{
"epoch": 1.1375730045425048,
"grad_norm": 0.30146067353056377,
"learning_rate": 3.448773448773449e-05,
"loss": 0.4373,
"step": 877
},
{
"epoch": 1.1388708630759248,
"grad_norm": 0.2821696115546317,
"learning_rate": 3.4463684463684466e-05,
"loss": 0.4262,
"step": 878
},
{
"epoch": 1.1401687216093446,
"grad_norm": 0.23559687187392536,
"learning_rate": 3.443963443963444e-05,
"loss": 0.4128,
"step": 879
},
{
"epoch": 1.1414665801427644,
"grad_norm": 0.3096443772818808,
"learning_rate": 3.4415584415584416e-05,
"loss": 0.424,
"step": 880
},
{
"epoch": 1.1427644386761844,
"grad_norm": 0.27131628167000316,
"learning_rate": 3.439153439153439e-05,
"loss": 0.4243,
"step": 881
},
{
"epoch": 1.1440622972096042,
"grad_norm": 0.3321877999366987,
"learning_rate": 3.436748436748437e-05,
"loss": 0.3962,
"step": 882
},
{
"epoch": 1.145360155743024,
"grad_norm": 0.252836911633108,
"learning_rate": 3.434343434343435e-05,
"loss": 0.4364,
"step": 883
},
{
"epoch": 1.146658014276444,
"grad_norm": 0.28472809342059574,
"learning_rate": 3.431938431938432e-05,
"loss": 0.4265,
"step": 884
},
{
"epoch": 1.1479558728098638,
"grad_norm": 0.25893718977038643,
"learning_rate": 3.4295334295334296e-05,
"loss": 0.4239,
"step": 885
},
{
"epoch": 1.1492537313432836,
"grad_norm": 0.29888823830438355,
"learning_rate": 3.427128427128427e-05,
"loss": 0.4641,
"step": 886
},
{
"epoch": 1.1505515898767034,
"grad_norm": 0.2336271423650757,
"learning_rate": 3.424723424723425e-05,
"loss": 0.4363,
"step": 887
},
{
"epoch": 1.1518494484101234,
"grad_norm": 0.28355868807379,
"learning_rate": 3.422318422318422e-05,
"loss": 0.4446,
"step": 888
},
{
"epoch": 1.1531473069435432,
"grad_norm": 0.25482834362533163,
"learning_rate": 3.41991341991342e-05,
"loss": 0.432,
"step": 889
},
{
"epoch": 1.154445165476963,
"grad_norm": 0.257058818612092,
"learning_rate": 3.417508417508418e-05,
"loss": 0.4238,
"step": 890
},
{
"epoch": 1.1557430240103828,
"grad_norm": 0.2964878759739716,
"learning_rate": 3.415103415103415e-05,
"loss": 0.4341,
"step": 891
},
{
"epoch": 1.1570408825438028,
"grad_norm": 0.24581976687613294,
"learning_rate": 3.412698412698413e-05,
"loss": 0.445,
"step": 892
},
{
"epoch": 1.1583387410772226,
"grad_norm": 0.2672951398900844,
"learning_rate": 3.41029341029341e-05,
"loss": 0.4146,
"step": 893
},
{
"epoch": 1.1596365996106424,
"grad_norm": 0.29744313702383335,
"learning_rate": 3.407888407888408e-05,
"loss": 0.4291,
"step": 894
},
{
"epoch": 1.1609344581440624,
"grad_norm": 0.2594025370257348,
"learning_rate": 3.405483405483406e-05,
"loss": 0.4393,
"step": 895
},
{
"epoch": 1.1622323166774822,
"grad_norm": 0.27322558327059043,
"learning_rate": 3.403078403078403e-05,
"loss": 0.413,
"step": 896
},
{
"epoch": 1.163530175210902,
"grad_norm": 0.27895427053368943,
"learning_rate": 3.400673400673401e-05,
"loss": 0.4282,
"step": 897
},
{
"epoch": 1.164828033744322,
"grad_norm": 0.3278143440045291,
"learning_rate": 3.398268398268398e-05,
"loss": 0.4503,
"step": 898
},
{
"epoch": 1.1661258922777418,
"grad_norm": 0.2878645741851875,
"learning_rate": 3.3958633958633964e-05,
"loss": 0.418,
"step": 899
},
{
"epoch": 1.1674237508111616,
"grad_norm": 0.279091054078343,
"learning_rate": 3.393458393458394e-05,
"loss": 0.4351,
"step": 900
},
{
"epoch": 1.1687216093445814,
"grad_norm": 0.300972554323965,
"learning_rate": 3.391053391053391e-05,
"loss": 0.4177,
"step": 901
},
{
"epoch": 1.1700194678780014,
"grad_norm": 0.2912604255538886,
"learning_rate": 3.388648388648389e-05,
"loss": 0.4239,
"step": 902
},
{
"epoch": 1.1713173264114212,
"grad_norm": 0.28729371845984225,
"learning_rate": 3.386243386243386e-05,
"loss": 0.4498,
"step": 903
},
{
"epoch": 1.172615184944841,
"grad_norm": 0.2983707424965568,
"learning_rate": 3.3838383838383844e-05,
"loss": 0.4093,
"step": 904
},
{
"epoch": 1.1739130434782608,
"grad_norm": 0.2396146655592429,
"learning_rate": 3.381433381433381e-05,
"loss": 0.4134,
"step": 905
},
{
"epoch": 1.1752109020116808,
"grad_norm": 0.25743340902304185,
"learning_rate": 3.3790283790283794e-05,
"loss": 0.4024,
"step": 906
},
{
"epoch": 1.1765087605451006,
"grad_norm": 0.27027531302973373,
"learning_rate": 3.376623376623377e-05,
"loss": 0.4518,
"step": 907
},
{
"epoch": 1.1778066190785204,
"grad_norm": 0.25280300819232365,
"learning_rate": 3.3742183742183743e-05,
"loss": 0.4185,
"step": 908
},
{
"epoch": 1.1791044776119404,
"grad_norm": 0.22682160703006224,
"learning_rate": 3.371813371813372e-05,
"loss": 0.4174,
"step": 909
},
{
"epoch": 1.1804023361453602,
"grad_norm": 0.23204503630025836,
"learning_rate": 3.369408369408369e-05,
"loss": 0.4177,
"step": 910
},
{
"epoch": 1.18170019467878,
"grad_norm": 0.25880635574030375,
"learning_rate": 3.3670033670033675e-05,
"loss": 0.4179,
"step": 911
},
{
"epoch": 1.1829980532122,
"grad_norm": 0.2522597708371833,
"learning_rate": 3.364598364598365e-05,
"loss": 0.4283,
"step": 912
},
{
"epoch": 1.1842959117456198,
"grad_norm": 0.2883869624140782,
"learning_rate": 3.3621933621933624e-05,
"loss": 0.4372,
"step": 913
},
{
"epoch": 1.1855937702790396,
"grad_norm": 0.25106486957221746,
"learning_rate": 3.35978835978836e-05,
"loss": 0.4281,
"step": 914
},
{
"epoch": 1.1868916288124594,
"grad_norm": 0.292526125260076,
"learning_rate": 3.3573833573833574e-05,
"loss": 0.4365,
"step": 915
},
{
"epoch": 1.1881894873458794,
"grad_norm": 0.2676690874911841,
"learning_rate": 3.3549783549783555e-05,
"loss": 0.4326,
"step": 916
},
{
"epoch": 1.1894873458792992,
"grad_norm": 0.26481922535161423,
"learning_rate": 3.352573352573353e-05,
"loss": 0.4274,
"step": 917
},
{
"epoch": 1.190785204412719,
"grad_norm": 0.2798203995195467,
"learning_rate": 3.35016835016835e-05,
"loss": 0.4368,
"step": 918
},
{
"epoch": 1.1920830629461387,
"grad_norm": 0.2861488916957064,
"learning_rate": 3.347763347763348e-05,
"loss": 0.4137,
"step": 919
},
{
"epoch": 1.1933809214795588,
"grad_norm": 0.29631412123985384,
"learning_rate": 3.3453583453583454e-05,
"loss": 0.4569,
"step": 920
},
{
"epoch": 1.1946787800129786,
"grad_norm": 0.2342734368682864,
"learning_rate": 3.3429533429533436e-05,
"loss": 0.4161,
"step": 921
},
{
"epoch": 1.1959766385463984,
"grad_norm": 0.2701444926397203,
"learning_rate": 3.3405483405483404e-05,
"loss": 0.4436,
"step": 922
},
{
"epoch": 1.1972744970798184,
"grad_norm": 0.24533441345926324,
"learning_rate": 3.3381433381433385e-05,
"loss": 0.4169,
"step": 923
},
{
"epoch": 1.1985723556132382,
"grad_norm": 0.2540671392711727,
"learning_rate": 3.335738335738336e-05,
"loss": 0.4157,
"step": 924
},
{
"epoch": 1.199870214146658,
"grad_norm": 0.24799812272450378,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.4185,
"step": 925
},
{
"epoch": 1.201168072680078,
"grad_norm": 0.22598144682861446,
"learning_rate": 3.330928330928331e-05,
"loss": 0.4316,
"step": 926
},
{
"epoch": 1.2024659312134978,
"grad_norm": 0.2448180172195478,
"learning_rate": 3.3285233285233284e-05,
"loss": 0.4224,
"step": 927
},
{
"epoch": 1.2037637897469176,
"grad_norm": 0.2765575264990623,
"learning_rate": 3.3261183261183266e-05,
"loss": 0.4152,
"step": 928
},
{
"epoch": 1.2050616482803373,
"grad_norm": 0.2804413742079038,
"learning_rate": 3.323713323713324e-05,
"loss": 0.4268,
"step": 929
},
{
"epoch": 1.2063595068137574,
"grad_norm": 0.241273559139525,
"learning_rate": 3.3213083213083216e-05,
"loss": 0.44,
"step": 930
},
{
"epoch": 1.2076573653471772,
"grad_norm": 0.26424527656796193,
"learning_rate": 3.318903318903319e-05,
"loss": 0.423,
"step": 931
},
{
"epoch": 1.208955223880597,
"grad_norm": 0.2618836109205494,
"learning_rate": 3.3164983164983165e-05,
"loss": 0.4263,
"step": 932
},
{
"epoch": 1.210253082414017,
"grad_norm": 0.232277628692469,
"learning_rate": 3.314093314093315e-05,
"loss": 0.4285,
"step": 933
},
{
"epoch": 1.2115509409474368,
"grad_norm": 0.22920838397957313,
"learning_rate": 3.311688311688312e-05,
"loss": 0.4222,
"step": 934
},
{
"epoch": 1.2128487994808566,
"grad_norm": 0.22768841564535697,
"learning_rate": 3.309283309283309e-05,
"loss": 0.4343,
"step": 935
},
{
"epoch": 1.2141466580142763,
"grad_norm": 0.2503532795435805,
"learning_rate": 3.306878306878307e-05,
"loss": 0.439,
"step": 936
},
{
"epoch": 1.2154445165476964,
"grad_norm": 0.22461109640165802,
"learning_rate": 3.3044733044733046e-05,
"loss": 0.4188,
"step": 937
},
{
"epoch": 1.2167423750811162,
"grad_norm": 0.209011116297864,
"learning_rate": 3.302068302068303e-05,
"loss": 0.4095,
"step": 938
},
{
"epoch": 1.218040233614536,
"grad_norm": 0.2266971578892572,
"learning_rate": 3.2996632996632995e-05,
"loss": 0.4354,
"step": 939
},
{
"epoch": 1.219338092147956,
"grad_norm": 0.22300168989786548,
"learning_rate": 3.297258297258297e-05,
"loss": 0.4225,
"step": 940
},
{
"epoch": 1.2206359506813758,
"grad_norm": 0.24047207294507855,
"learning_rate": 3.294853294853295e-05,
"loss": 0.4485,
"step": 941
},
{
"epoch": 1.2219338092147956,
"grad_norm": 0.26188839036093997,
"learning_rate": 3.2924482924482926e-05,
"loss": 0.4338,
"step": 942
},
{
"epoch": 1.2232316677482156,
"grad_norm": 0.2235845694258825,
"learning_rate": 3.29004329004329e-05,
"loss": 0.4242,
"step": 943
},
{
"epoch": 1.2245295262816354,
"grad_norm": 0.21723162921446287,
"learning_rate": 3.2876382876382876e-05,
"loss": 0.4241,
"step": 944
},
{
"epoch": 1.2258273848150552,
"grad_norm": 0.25526775092171644,
"learning_rate": 3.285233285233286e-05,
"loss": 0.4253,
"step": 945
},
{
"epoch": 1.227125243348475,
"grad_norm": 0.20573746450142508,
"learning_rate": 3.282828282828283e-05,
"loss": 0.4479,
"step": 946
},
{
"epoch": 1.228423101881895,
"grad_norm": 0.2510082428750361,
"learning_rate": 3.280423280423281e-05,
"loss": 0.4347,
"step": 947
},
{
"epoch": 1.2297209604153148,
"grad_norm": 0.2669964257731318,
"learning_rate": 3.278018278018278e-05,
"loss": 0.4291,
"step": 948
},
{
"epoch": 1.2310188189487346,
"grad_norm": 0.23768286255343224,
"learning_rate": 3.275613275613276e-05,
"loss": 0.4348,
"step": 949
},
{
"epoch": 1.2323166774821543,
"grad_norm": 0.253301068721141,
"learning_rate": 3.273208273208274e-05,
"loss": 0.4165,
"step": 950
},
{
"epoch": 1.2336145360155744,
"grad_norm": 0.24211175208943894,
"learning_rate": 3.270803270803271e-05,
"loss": 0.4525,
"step": 951
},
{
"epoch": 1.2349123945489942,
"grad_norm": 0.20694813799256812,
"learning_rate": 3.268398268398268e-05,
"loss": 0.4353,
"step": 952
},
{
"epoch": 1.236210253082414,
"grad_norm": 0.24139016385768045,
"learning_rate": 3.265993265993266e-05,
"loss": 0.4255,
"step": 953
},
{
"epoch": 1.237508111615834,
"grad_norm": 0.23298523425827472,
"learning_rate": 3.263588263588264e-05,
"loss": 0.4453,
"step": 954
},
{
"epoch": 1.2388059701492538,
"grad_norm": 0.22089226803029272,
"learning_rate": 3.261183261183262e-05,
"loss": 0.4142,
"step": 955
},
{
"epoch": 1.2401038286826735,
"grad_norm": 0.21279087400923866,
"learning_rate": 3.258778258778259e-05,
"loss": 0.3967,
"step": 956
},
{
"epoch": 1.2414016872160936,
"grad_norm": 0.21365588686190673,
"learning_rate": 3.256373256373256e-05,
"loss": 0.4265,
"step": 957
},
{
"epoch": 1.2426995457495134,
"grad_norm": 0.2297103141691502,
"learning_rate": 3.253968253968254e-05,
"loss": 0.4247,
"step": 958
},
{
"epoch": 1.2439974042829332,
"grad_norm": 0.21237733020443594,
"learning_rate": 3.251563251563252e-05,
"loss": 0.4187,
"step": 959
},
{
"epoch": 1.245295262816353,
"grad_norm": 0.25205287449171737,
"learning_rate": 3.249158249158249e-05,
"loss": 0.4195,
"step": 960
},
{
"epoch": 1.246593121349773,
"grad_norm": 0.2275565335826296,
"learning_rate": 3.246753246753247e-05,
"loss": 0.4222,
"step": 961
},
{
"epoch": 1.2478909798831928,
"grad_norm": 0.24497964692242122,
"learning_rate": 3.244348244348244e-05,
"loss": 0.4282,
"step": 962
},
{
"epoch": 1.2491888384166125,
"grad_norm": 0.2154307123634933,
"learning_rate": 3.2419432419432424e-05,
"loss": 0.4354,
"step": 963
},
{
"epoch": 1.2504866969500323,
"grad_norm": 0.21744389583650917,
"learning_rate": 3.23953823953824e-05,
"loss": 0.4138,
"step": 964
},
{
"epoch": 1.2517845554834524,
"grad_norm": 0.2430698493626977,
"learning_rate": 3.237133237133237e-05,
"loss": 0.4342,
"step": 965
},
{
"epoch": 1.2530824140168721,
"grad_norm": 0.2426482535744958,
"learning_rate": 3.234728234728235e-05,
"loss": 0.4253,
"step": 966
},
{
"epoch": 1.254380272550292,
"grad_norm": 0.22495203456758703,
"learning_rate": 3.232323232323233e-05,
"loss": 0.4308,
"step": 967
},
{
"epoch": 1.255678131083712,
"grad_norm": 0.24355897774937213,
"learning_rate": 3.2299182299182304e-05,
"loss": 0.4284,
"step": 968
},
{
"epoch": 1.2569759896171318,
"grad_norm": 0.2417579603003613,
"learning_rate": 3.227513227513227e-05,
"loss": 0.4236,
"step": 969
},
{
"epoch": 1.2582738481505515,
"grad_norm": 0.21950714489690643,
"learning_rate": 3.2251082251082254e-05,
"loss": 0.4319,
"step": 970
},
{
"epoch": 1.2595717066839716,
"grad_norm": 0.248967863409087,
"learning_rate": 3.222703222703223e-05,
"loss": 0.414,
"step": 971
},
{
"epoch": 1.2608695652173914,
"grad_norm": 0.2320971622059916,
"learning_rate": 3.220298220298221e-05,
"loss": 0.4396,
"step": 972
},
{
"epoch": 1.2621674237508111,
"grad_norm": 0.23309901348515835,
"learning_rate": 3.217893217893218e-05,
"loss": 0.423,
"step": 973
},
{
"epoch": 1.263465282284231,
"grad_norm": 0.2227973619365033,
"learning_rate": 3.215488215488215e-05,
"loss": 0.4256,
"step": 974
},
{
"epoch": 1.264763140817651,
"grad_norm": 0.245760384681702,
"learning_rate": 3.2130832130832135e-05,
"loss": 0.4326,
"step": 975
},
{
"epoch": 1.2660609993510707,
"grad_norm": 0.2789130551480554,
"learning_rate": 3.210678210678211e-05,
"loss": 0.4158,
"step": 976
},
{
"epoch": 1.2673588578844905,
"grad_norm": 0.24466337361794802,
"learning_rate": 3.2082732082732084e-05,
"loss": 0.4132,
"step": 977
},
{
"epoch": 1.2686567164179103,
"grad_norm": 0.26994667632692604,
"learning_rate": 3.205868205868206e-05,
"loss": 0.4193,
"step": 978
},
{
"epoch": 1.2699545749513304,
"grad_norm": 0.27229427206268936,
"learning_rate": 3.2034632034632034e-05,
"loss": 0.4438,
"step": 979
},
{
"epoch": 1.2712524334847501,
"grad_norm": 0.2562325137645907,
"learning_rate": 3.2010582010582015e-05,
"loss": 0.4356,
"step": 980
},
{
"epoch": 1.27255029201817,
"grad_norm": 0.25248121832020193,
"learning_rate": 3.198653198653199e-05,
"loss": 0.4143,
"step": 981
},
{
"epoch": 1.27384815055159,
"grad_norm": 0.2656639346583922,
"learning_rate": 3.1962481962481965e-05,
"loss": 0.4116,
"step": 982
},
{
"epoch": 1.2751460090850097,
"grad_norm": 0.22210045965124164,
"learning_rate": 3.193843193843194e-05,
"loss": 0.4194,
"step": 983
},
{
"epoch": 1.2764438676184295,
"grad_norm": 0.29306826782606415,
"learning_rate": 3.1914381914381914e-05,
"loss": 0.4148,
"step": 984
},
{
"epoch": 1.2777417261518496,
"grad_norm": 0.24612787413957143,
"learning_rate": 3.1890331890331896e-05,
"loss": 0.419,
"step": 985
},
{
"epoch": 1.2790395846852693,
"grad_norm": 0.24144328983707405,
"learning_rate": 3.1866281866281864e-05,
"loss": 0.4412,
"step": 986
},
{
"epoch": 1.2803374432186891,
"grad_norm": 0.24433831385926233,
"learning_rate": 3.1842231842231846e-05,
"loss": 0.4237,
"step": 987
},
{
"epoch": 1.2816353017521092,
"grad_norm": 0.29652784983616687,
"learning_rate": 3.181818181818182e-05,
"loss": 0.4052,
"step": 988
},
{
"epoch": 1.282933160285529,
"grad_norm": 0.22517076903481237,
"learning_rate": 3.1794131794131795e-05,
"loss": 0.437,
"step": 989
},
{
"epoch": 1.2842310188189487,
"grad_norm": 0.2695004176632199,
"learning_rate": 3.177008177008177e-05,
"loss": 0.4051,
"step": 990
},
{
"epoch": 1.2855288773523685,
"grad_norm": 0.2831795101586642,
"learning_rate": 3.1746031746031745e-05,
"loss": 0.4142,
"step": 991
},
{
"epoch": 1.2868267358857883,
"grad_norm": 0.2305605000016992,
"learning_rate": 3.1721981721981726e-05,
"loss": 0.4318,
"step": 992
},
{
"epoch": 1.2881245944192083,
"grad_norm": 0.27421133904277795,
"learning_rate": 3.16979316979317e-05,
"loss": 0.4303,
"step": 993
},
{
"epoch": 1.2894224529526281,
"grad_norm": 0.28015539417195207,
"learning_rate": 3.1673881673881676e-05,
"loss": 0.4332,
"step": 994
},
{
"epoch": 1.290720311486048,
"grad_norm": 0.2456153256375182,
"learning_rate": 3.164983164983165e-05,
"loss": 0.4377,
"step": 995
},
{
"epoch": 1.292018170019468,
"grad_norm": 0.2875091277813538,
"learning_rate": 3.1625781625781625e-05,
"loss": 0.4306,
"step": 996
},
{
"epoch": 1.2933160285528877,
"grad_norm": 0.24534524749679693,
"learning_rate": 3.160173160173161e-05,
"loss": 0.4389,
"step": 997
},
{
"epoch": 1.2946138870863075,
"grad_norm": 0.2604745981282834,
"learning_rate": 3.1577681577681575e-05,
"loss": 0.4221,
"step": 998
},
{
"epoch": 1.2959117456197276,
"grad_norm": 0.2519193028255322,
"learning_rate": 3.1553631553631556e-05,
"loss": 0.4613,
"step": 999
},
{
"epoch": 1.2972096041531473,
"grad_norm": 0.28305211286597437,
"learning_rate": 3.152958152958153e-05,
"loss": 0.4364,
"step": 1000
},
{
"epoch": 1.2985074626865671,
"grad_norm": 0.31302239991262554,
"learning_rate": 3.1505531505531506e-05,
"loss": 0.4315,
"step": 1001
},
{
"epoch": 1.2998053212199872,
"grad_norm": 0.21929034373943385,
"learning_rate": 3.148148148148148e-05,
"loss": 0.4389,
"step": 1002
},
{
"epoch": 1.301103179753407,
"grad_norm": 0.33967578917657276,
"learning_rate": 3.1457431457431456e-05,
"loss": 0.4245,
"step": 1003
},
{
"epoch": 1.3024010382868267,
"grad_norm": 0.26773599652804664,
"learning_rate": 3.143338143338144e-05,
"loss": 0.4322,
"step": 1004
},
{
"epoch": 1.3036988968202465,
"grad_norm": 0.3377829113222901,
"learning_rate": 3.140933140933141e-05,
"loss": 0.4156,
"step": 1005
},
{
"epoch": 1.3049967553536663,
"grad_norm": 0.28733563773070486,
"learning_rate": 3.1385281385281387e-05,
"loss": 0.4153,
"step": 1006
},
{
"epoch": 1.3062946138870863,
"grad_norm": 0.29064269118004,
"learning_rate": 3.136123136123136e-05,
"loss": 0.4248,
"step": 1007
},
{
"epoch": 1.3075924724205061,
"grad_norm": 0.3578706488624722,
"learning_rate": 3.1337181337181336e-05,
"loss": 0.4103,
"step": 1008
},
{
"epoch": 1.308890330953926,
"grad_norm": 0.2506804986945498,
"learning_rate": 3.131313131313132e-05,
"loss": 0.4455,
"step": 1009
},
{
"epoch": 1.310188189487346,
"grad_norm": 0.26163932223113945,
"learning_rate": 3.128908128908129e-05,
"loss": 0.418,
"step": 1010
},
{
"epoch": 1.3114860480207657,
"grad_norm": 0.2968145670132288,
"learning_rate": 3.126503126503126e-05,
"loss": 0.4419,
"step": 1011
},
{
"epoch": 1.3127839065541855,
"grad_norm": 0.29985902672925774,
"learning_rate": 3.124098124098124e-05,
"loss": 0.4397,
"step": 1012
},
{
"epoch": 1.3140817650876055,
"grad_norm": 0.2693824680991638,
"learning_rate": 3.121693121693122e-05,
"loss": 0.4477,
"step": 1013
},
{
"epoch": 1.3153796236210253,
"grad_norm": 0.2703405621428287,
"learning_rate": 3.11928811928812e-05,
"loss": 0.4325,
"step": 1014
},
{
"epoch": 1.3166774821544451,
"grad_norm": 0.2721038339798775,
"learning_rate": 3.1168831168831166e-05,
"loss": 0.4373,
"step": 1015
},
{
"epoch": 1.3179753406878651,
"grad_norm": 0.26849320227585655,
"learning_rate": 3.114478114478115e-05,
"loss": 0.4246,
"step": 1016
},
{
"epoch": 1.319273199221285,
"grad_norm": 0.28294666170474586,
"learning_rate": 3.112073112073112e-05,
"loss": 0.4405,
"step": 1017
},
{
"epoch": 1.3205710577547047,
"grad_norm": 0.7257885246786743,
"learning_rate": 3.10966810966811e-05,
"loss": 0.419,
"step": 1018
},
{
"epoch": 1.3218689162881245,
"grad_norm": 0.26474834284364107,
"learning_rate": 3.107263107263107e-05,
"loss": 0.4561,
"step": 1019
},
{
"epoch": 1.3231667748215443,
"grad_norm": 0.2836196696373746,
"learning_rate": 3.104858104858105e-05,
"loss": 0.4187,
"step": 1020
},
{
"epoch": 1.3244646333549643,
"grad_norm": 0.2978444678442113,
"learning_rate": 3.102453102453103e-05,
"loss": 0.4245,
"step": 1021
},
{
"epoch": 1.3257624918883841,
"grad_norm": 0.27039863643097406,
"learning_rate": 3.1000481000481e-05,
"loss": 0.4492,
"step": 1022
},
{
"epoch": 1.327060350421804,
"grad_norm": 0.28245910629768817,
"learning_rate": 3.097643097643098e-05,
"loss": 0.4205,
"step": 1023
},
{
"epoch": 1.328358208955224,
"grad_norm": 0.235926922135542,
"learning_rate": 3.095238095238095e-05,
"loss": 0.4176,
"step": 1024
},
{
"epoch": 1.3296560674886437,
"grad_norm": 0.24582624312732296,
"learning_rate": 3.092833092833093e-05,
"loss": 0.4409,
"step": 1025
},
{
"epoch": 1.3309539260220635,
"grad_norm": 0.25618143979663144,
"learning_rate": 3.090428090428091e-05,
"loss": 0.4411,
"step": 1026
},
{
"epoch": 1.3322517845554835,
"grad_norm": 0.3044844628215498,
"learning_rate": 3.0880230880230884e-05,
"loss": 0.4628,
"step": 1027
},
{
"epoch": 1.3335496430889033,
"grad_norm": 0.25846935424140755,
"learning_rate": 3.085618085618085e-05,
"loss": 0.431,
"step": 1028
},
{
"epoch": 1.3348475016223231,
"grad_norm": 0.2299843153442688,
"learning_rate": 3.0832130832130834e-05,
"loss": 0.4091,
"step": 1029
},
{
"epoch": 1.3361453601557431,
"grad_norm": 0.2461059374323843,
"learning_rate": 3.080808080808081e-05,
"loss": 0.4095,
"step": 1030
},
{
"epoch": 1.337443218689163,
"grad_norm": 0.2591709406665044,
"learning_rate": 3.078403078403079e-05,
"loss": 0.4311,
"step": 1031
},
{
"epoch": 1.3387410772225827,
"grad_norm": 0.22916233734032224,
"learning_rate": 3.075998075998076e-05,
"loss": 0.4485,
"step": 1032
},
{
"epoch": 1.3400389357560025,
"grad_norm": 0.2600469770283129,
"learning_rate": 3.073593073593073e-05,
"loss": 0.4356,
"step": 1033
},
{
"epoch": 1.3413367942894223,
"grad_norm": 0.24066487258683386,
"learning_rate": 3.0711880711880714e-05,
"loss": 0.4185,
"step": 1034
},
{
"epoch": 1.3426346528228423,
"grad_norm": 0.27145997155071,
"learning_rate": 3.068783068783069e-05,
"loss": 0.4145,
"step": 1035
},
{
"epoch": 1.3439325113562621,
"grad_norm": 0.22303833825329822,
"learning_rate": 3.0663780663780664e-05,
"loss": 0.4544,
"step": 1036
},
{
"epoch": 1.345230369889682,
"grad_norm": 0.2631308244918525,
"learning_rate": 3.063973063973064e-05,
"loss": 0.4171,
"step": 1037
},
{
"epoch": 1.346528228423102,
"grad_norm": 0.24013765031743725,
"learning_rate": 3.061568061568062e-05,
"loss": 0.4162,
"step": 1038
},
{
"epoch": 1.3478260869565217,
"grad_norm": 0.2563444998106366,
"learning_rate": 3.0591630591630595e-05,
"loss": 0.4228,
"step": 1039
},
{
"epoch": 1.3491239454899415,
"grad_norm": 0.26755087682195894,
"learning_rate": 3.056758056758057e-05,
"loss": 0.4393,
"step": 1040
},
{
"epoch": 1.3504218040233615,
"grad_norm": 0.2651622032346258,
"learning_rate": 3.0543530543530544e-05,
"loss": 0.4149,
"step": 1041
},
{
"epoch": 1.3517196625567813,
"grad_norm": 0.25334754508930496,
"learning_rate": 3.051948051948052e-05,
"loss": 0.3989,
"step": 1042
},
{
"epoch": 1.3530175210902011,
"grad_norm": 0.23721003539310276,
"learning_rate": 3.04954304954305e-05,
"loss": 0.4221,
"step": 1043
},
{
"epoch": 1.3543153796236211,
"grad_norm": 0.23037112505555338,
"learning_rate": 3.0471380471380472e-05,
"loss": 0.4182,
"step": 1044
},
{
"epoch": 1.355613238157041,
"grad_norm": 0.2436111206549788,
"learning_rate": 3.0447330447330447e-05,
"loss": 0.4109,
"step": 1045
},
{
"epoch": 1.3569110966904607,
"grad_norm": 0.24241659874410296,
"learning_rate": 3.0423280423280425e-05,
"loss": 0.4223,
"step": 1046
},
{
"epoch": 1.3582089552238805,
"grad_norm": 0.25749535429929715,
"learning_rate": 3.03992303992304e-05,
"loss": 0.4182,
"step": 1047
},
{
"epoch": 1.3595068137573005,
"grad_norm": 0.22344908087958898,
"learning_rate": 3.0375180375180378e-05,
"loss": 0.4321,
"step": 1048
},
{
"epoch": 1.3608046722907203,
"grad_norm": 0.23197019372372432,
"learning_rate": 3.0351130351130353e-05,
"loss": 0.4399,
"step": 1049
},
{
"epoch": 1.3621025308241401,
"grad_norm": 0.237479688071686,
"learning_rate": 3.0327080327080328e-05,
"loss": 0.4395,
"step": 1050
},
{
"epoch": 1.36340038935756,
"grad_norm": 0.2607112703183096,
"learning_rate": 3.0303030303030306e-05,
"loss": 0.4379,
"step": 1051
},
{
"epoch": 1.36469824789098,
"grad_norm": 0.2367521283120222,
"learning_rate": 3.027898027898028e-05,
"loss": 0.4102,
"step": 1052
},
{
"epoch": 1.3659961064243997,
"grad_norm": 0.29194250880671824,
"learning_rate": 3.025493025493026e-05,
"loss": 0.443,
"step": 1053
},
{
"epoch": 1.3672939649578195,
"grad_norm": 0.26648481874523033,
"learning_rate": 3.0230880230880233e-05,
"loss": 0.434,
"step": 1054
},
{
"epoch": 1.3685918234912395,
"grad_norm": 0.26285311775511155,
"learning_rate": 3.0206830206830205e-05,
"loss": 0.4094,
"step": 1055
},
{
"epoch": 1.3698896820246593,
"grad_norm": 0.2564406034453145,
"learning_rate": 3.0182780182780186e-05,
"loss": 0.4398,
"step": 1056
},
{
"epoch": 1.3711875405580791,
"grad_norm": 0.2796151000660689,
"learning_rate": 3.0158730158730158e-05,
"loss": 0.4276,
"step": 1057
},
{
"epoch": 1.3724853990914991,
"grad_norm": 0.26631625570118417,
"learning_rate": 3.013468013468014e-05,
"loss": 0.4343,
"step": 1058
},
{
"epoch": 1.373783257624919,
"grad_norm": 0.23806771079205116,
"learning_rate": 3.011063011063011e-05,
"loss": 0.4334,
"step": 1059
},
{
"epoch": 1.3750811161583387,
"grad_norm": 0.27717084828254007,
"learning_rate": 3.0086580086580092e-05,
"loss": 0.4445,
"step": 1060
},
{
"epoch": 1.3763789746917585,
"grad_norm": 0.25955745890460635,
"learning_rate": 3.0062530062530064e-05,
"loss": 0.4339,
"step": 1061
},
{
"epoch": 1.3776768332251785,
"grad_norm": 0.27833631860695396,
"learning_rate": 3.003848003848004e-05,
"loss": 0.4236,
"step": 1062
},
{
"epoch": 1.3789746917585983,
"grad_norm": 0.23547677910993067,
"learning_rate": 3.0014430014430017e-05,
"loss": 0.4451,
"step": 1063
},
{
"epoch": 1.3802725502920181,
"grad_norm": 0.2818218077693615,
"learning_rate": 2.999037999037999e-05,
"loss": 0.444,
"step": 1064
},
{
"epoch": 1.381570408825438,
"grad_norm": 0.22221888861761538,
"learning_rate": 2.996632996632997e-05,
"loss": 0.403,
"step": 1065
},
{
"epoch": 1.382868267358858,
"grad_norm": 0.2445641007844883,
"learning_rate": 2.9942279942279944e-05,
"loss": 0.4163,
"step": 1066
},
{
"epoch": 1.3841661258922777,
"grad_norm": 0.24596043479534688,
"learning_rate": 2.991822991822992e-05,
"loss": 0.4227,
"step": 1067
},
{
"epoch": 1.3854639844256975,
"grad_norm": 0.2278867553746751,
"learning_rate": 2.9894179894179897e-05,
"loss": 0.4252,
"step": 1068
},
{
"epoch": 1.3867618429591175,
"grad_norm": 0.2275831703154012,
"learning_rate": 2.9870129870129872e-05,
"loss": 0.4242,
"step": 1069
},
{
"epoch": 1.3880597014925373,
"grad_norm": 0.2792734033461531,
"learning_rate": 2.984607984607985e-05,
"loss": 0.4117,
"step": 1070
},
{
"epoch": 1.3893575600259571,
"grad_norm": 0.22328171573584032,
"learning_rate": 2.9822029822029825e-05,
"loss": 0.4115,
"step": 1071
},
{
"epoch": 1.3906554185593771,
"grad_norm": 0.2554385321095662,
"learning_rate": 2.9797979797979796e-05,
"loss": 0.4193,
"step": 1072
},
{
"epoch": 1.391953277092797,
"grad_norm": 0.23411526096024632,
"learning_rate": 2.9773929773929778e-05,
"loss": 0.401,
"step": 1073
},
{
"epoch": 1.3932511356262167,
"grad_norm": 0.2417797864860212,
"learning_rate": 2.974987974987975e-05,
"loss": 0.4175,
"step": 1074
},
{
"epoch": 1.3945489941596367,
"grad_norm": 0.22909915502634573,
"learning_rate": 2.972582972582973e-05,
"loss": 0.4145,
"step": 1075
},
{
"epoch": 1.3958468526930565,
"grad_norm": 0.24654124988803922,
"learning_rate": 2.9701779701779702e-05,
"loss": 0.4363,
"step": 1076
},
{
"epoch": 1.3971447112264763,
"grad_norm": 0.23841085503876774,
"learning_rate": 2.9677729677729677e-05,
"loss": 0.4263,
"step": 1077
},
{
"epoch": 1.3984425697598961,
"grad_norm": 0.2372620155067687,
"learning_rate": 2.9653679653679655e-05,
"loss": 0.4164,
"step": 1078
},
{
"epoch": 1.399740428293316,
"grad_norm": 0.2174655800712752,
"learning_rate": 2.962962962962963e-05,
"loss": 0.4131,
"step": 1079
},
{
"epoch": 1.401038286826736,
"grad_norm": 0.25800767555211196,
"learning_rate": 2.9605579605579608e-05,
"loss": 0.4353,
"step": 1080
},
{
"epoch": 1.4023361453601557,
"grad_norm": 0.22921820584725752,
"learning_rate": 2.9581529581529583e-05,
"loss": 0.4048,
"step": 1081
},
{
"epoch": 1.4036340038935755,
"grad_norm": 0.23402586035822698,
"learning_rate": 2.955747955747956e-05,
"loss": 0.4424,
"step": 1082
},
{
"epoch": 1.4049318624269955,
"grad_norm": 0.2352437377010652,
"learning_rate": 2.9533429533429536e-05,
"loss": 0.4026,
"step": 1083
},
{
"epoch": 1.4062297209604153,
"grad_norm": 0.2247412452603025,
"learning_rate": 2.950937950937951e-05,
"loss": 0.4336,
"step": 1084
},
{
"epoch": 1.4075275794938351,
"grad_norm": 0.22277806225908384,
"learning_rate": 2.948532948532949e-05,
"loss": 0.4114,
"step": 1085
},
{
"epoch": 1.4088254380272551,
"grad_norm": 0.241538295583898,
"learning_rate": 2.946127946127946e-05,
"loss": 0.4394,
"step": 1086
},
{
"epoch": 1.410123296560675,
"grad_norm": 0.2610774457770794,
"learning_rate": 2.943722943722944e-05,
"loss": 0.4078,
"step": 1087
},
{
"epoch": 1.4114211550940947,
"grad_norm": 0.28762989292521274,
"learning_rate": 2.9413179413179413e-05,
"loss": 0.4109,
"step": 1088
},
{
"epoch": 1.4127190136275147,
"grad_norm": 0.22764926899208376,
"learning_rate": 2.9389129389129388e-05,
"loss": 0.4113,
"step": 1089
},
{
"epoch": 1.4140168721609345,
"grad_norm": 0.2951748901233817,
"learning_rate": 2.9365079365079366e-05,
"loss": 0.4112,
"step": 1090
},
{
"epoch": 1.4153147306943543,
"grad_norm": 0.2491523290558,
"learning_rate": 2.934102934102934e-05,
"loss": 0.4061,
"step": 1091
},
{
"epoch": 1.416612589227774,
"grad_norm": 0.24774585578620387,
"learning_rate": 2.931697931697932e-05,
"loss": 0.4208,
"step": 1092
},
{
"epoch": 1.417910447761194,
"grad_norm": 0.2383954293588994,
"learning_rate": 2.9292929292929294e-05,
"loss": 0.4243,
"step": 1093
},
{
"epoch": 1.419208306294614,
"grad_norm": 0.2551439406874695,
"learning_rate": 2.926887926887927e-05,
"loss": 0.4254,
"step": 1094
},
{
"epoch": 1.4205061648280337,
"grad_norm": 0.24117464674091157,
"learning_rate": 2.9244829244829247e-05,
"loss": 0.4082,
"step": 1095
},
{
"epoch": 1.4218040233614535,
"grad_norm": 0.216503714519975,
"learning_rate": 2.922077922077922e-05,
"loss": 0.4242,
"step": 1096
},
{
"epoch": 1.4231018818948735,
"grad_norm": 0.2422351302114647,
"learning_rate": 2.91967291967292e-05,
"loss": 0.4181,
"step": 1097
},
{
"epoch": 1.4243997404282933,
"grad_norm": 0.22788810318626124,
"learning_rate": 2.9172679172679174e-05,
"loss": 0.4203,
"step": 1098
},
{
"epoch": 1.425697598961713,
"grad_norm": 0.24325054021695863,
"learning_rate": 2.9148629148629146e-05,
"loss": 0.423,
"step": 1099
},
{
"epoch": 1.4269954574951331,
"grad_norm": 0.23344217723128893,
"learning_rate": 2.9124579124579127e-05,
"loss": 0.4162,
"step": 1100
},
{
"epoch": 1.428293316028553,
"grad_norm": 0.2543879197815251,
"learning_rate": 2.91005291005291e-05,
"loss": 0.4376,
"step": 1101
},
{
"epoch": 1.4295911745619727,
"grad_norm": 0.2501682851968916,
"learning_rate": 2.907647907647908e-05,
"loss": 0.4164,
"step": 1102
},
{
"epoch": 1.4308890330953927,
"grad_norm": 0.21492688667239696,
"learning_rate": 2.905242905242905e-05,
"loss": 0.4113,
"step": 1103
},
{
"epoch": 1.4321868916288125,
"grad_norm": 0.2658354623358409,
"learning_rate": 2.9028379028379033e-05,
"loss": 0.4315,
"step": 1104
},
{
"epoch": 1.4334847501622323,
"grad_norm": 0.2831492165913101,
"learning_rate": 2.9004329004329005e-05,
"loss": 0.4338,
"step": 1105
},
{
"epoch": 1.434782608695652,
"grad_norm": 0.22471574523786844,
"learning_rate": 2.898027898027898e-05,
"loss": 0.4157,
"step": 1106
},
{
"epoch": 1.436080467229072,
"grad_norm": 0.24385415494263882,
"learning_rate": 2.8956228956228958e-05,
"loss": 0.4172,
"step": 1107
},
{
"epoch": 1.437378325762492,
"grad_norm": 0.2722198360023313,
"learning_rate": 2.8932178932178932e-05,
"loss": 0.407,
"step": 1108
},
{
"epoch": 1.4386761842959117,
"grad_norm": 0.21019395837235147,
"learning_rate": 2.890812890812891e-05,
"loss": 0.4189,
"step": 1109
},
{
"epoch": 1.4399740428293315,
"grad_norm": 0.23984115871051997,
"learning_rate": 2.8884078884078885e-05,
"loss": 0.4261,
"step": 1110
},
{
"epoch": 1.4412719013627515,
"grad_norm": 0.25738974208155574,
"learning_rate": 2.886002886002886e-05,
"loss": 0.4694,
"step": 1111
},
{
"epoch": 1.4425697598961713,
"grad_norm": 0.2739742760545878,
"learning_rate": 2.8835978835978838e-05,
"loss": 0.4284,
"step": 1112
},
{
"epoch": 1.443867618429591,
"grad_norm": 0.2563615388623274,
"learning_rate": 2.8811928811928813e-05,
"loss": 0.4187,
"step": 1113
},
{
"epoch": 1.4451654769630111,
"grad_norm": 0.2355873474417628,
"learning_rate": 2.878787878787879e-05,
"loss": 0.4201,
"step": 1114
},
{
"epoch": 1.446463335496431,
"grad_norm": 0.3037078461896459,
"learning_rate": 2.8763828763828766e-05,
"loss": 0.4164,
"step": 1115
},
{
"epoch": 1.4477611940298507,
"grad_norm": 0.2778889853166693,
"learning_rate": 2.8739778739778737e-05,
"loss": 0.4263,
"step": 1116
},
{
"epoch": 1.4490590525632707,
"grad_norm": 0.25304453875189337,
"learning_rate": 2.871572871572872e-05,
"loss": 0.4238,
"step": 1117
},
{
"epoch": 1.4503569110966905,
"grad_norm": 0.2617845594600046,
"learning_rate": 2.869167869167869e-05,
"loss": 0.4325,
"step": 1118
},
{
"epoch": 1.4516547696301103,
"grad_norm": 0.28455794197858575,
"learning_rate": 2.8667628667628672e-05,
"loss": 0.4156,
"step": 1119
},
{
"epoch": 1.45295262816353,
"grad_norm": 0.23740191563596869,
"learning_rate": 2.8643578643578643e-05,
"loss": 0.4261,
"step": 1120
},
{
"epoch": 1.45425048669695,
"grad_norm": 0.24072843737266889,
"learning_rate": 2.8619528619528618e-05,
"loss": 0.4151,
"step": 1121
},
{
"epoch": 1.45554834523037,
"grad_norm": 0.2700405419220064,
"learning_rate": 2.8595478595478596e-05,
"loss": 0.4252,
"step": 1122
},
{
"epoch": 1.4568462037637897,
"grad_norm": 0.2488359814096937,
"learning_rate": 2.857142857142857e-05,
"loss": 0.4428,
"step": 1123
},
{
"epoch": 1.4581440622972095,
"grad_norm": 0.23487079361910798,
"learning_rate": 2.854737854737855e-05,
"loss": 0.4216,
"step": 1124
},
{
"epoch": 1.4594419208306295,
"grad_norm": 0.2466576788103327,
"learning_rate": 2.8523328523328524e-05,
"loss": 0.4226,
"step": 1125
},
{
"epoch": 1.4607397793640493,
"grad_norm": 0.2391996649480345,
"learning_rate": 2.8499278499278502e-05,
"loss": 0.4096,
"step": 1126
},
{
"epoch": 1.462037637897469,
"grad_norm": 0.24258796808928063,
"learning_rate": 2.8475228475228477e-05,
"loss": 0.4145,
"step": 1127
},
{
"epoch": 1.4633354964308891,
"grad_norm": 0.2560406117346898,
"learning_rate": 2.845117845117845e-05,
"loss": 0.4312,
"step": 1128
},
{
"epoch": 1.464633354964309,
"grad_norm": 0.2861667925744788,
"learning_rate": 2.842712842712843e-05,
"loss": 0.4328,
"step": 1129
},
{
"epoch": 1.4659312134977287,
"grad_norm": 0.25402106108095945,
"learning_rate": 2.8403078403078404e-05,
"loss": 0.4258,
"step": 1130
},
{
"epoch": 1.4672290720311487,
"grad_norm": 0.24075563965908323,
"learning_rate": 2.8379028379028383e-05,
"loss": 0.4397,
"step": 1131
},
{
"epoch": 1.4685269305645685,
"grad_norm": 0.28522807177447185,
"learning_rate": 2.8354978354978357e-05,
"loss": 0.4303,
"step": 1132
},
{
"epoch": 1.4698247890979883,
"grad_norm": 0.26004049344709895,
"learning_rate": 2.833092833092833e-05,
"loss": 0.4294,
"step": 1133
},
{
"epoch": 1.471122647631408,
"grad_norm": 0.29853418709486346,
"learning_rate": 2.830687830687831e-05,
"loss": 0.4323,
"step": 1134
},
{
"epoch": 1.4724205061648281,
"grad_norm": 0.2633187635030568,
"learning_rate": 2.8282828282828282e-05,
"loss": 0.439,
"step": 1135
},
{
"epoch": 1.473718364698248,
"grad_norm": 0.3157910451013907,
"learning_rate": 2.8258778258778263e-05,
"loss": 0.4337,
"step": 1136
},
{
"epoch": 1.4750162232316677,
"grad_norm": 0.27203466331005977,
"learning_rate": 2.8234728234728235e-05,
"loss": 0.4198,
"step": 1137
},
{
"epoch": 1.4763140817650875,
"grad_norm": 0.2508023724498563,
"learning_rate": 2.821067821067821e-05,
"loss": 0.4267,
"step": 1138
},
{
"epoch": 1.4776119402985075,
"grad_norm": 0.2808852939089359,
"learning_rate": 2.8186628186628188e-05,
"loss": 0.4281,
"step": 1139
},
{
"epoch": 1.4789097988319273,
"grad_norm": 0.24576060851892864,
"learning_rate": 2.8162578162578162e-05,
"loss": 0.4206,
"step": 1140
},
{
"epoch": 1.480207657365347,
"grad_norm": 0.29052064234787545,
"learning_rate": 2.813852813852814e-05,
"loss": 0.413,
"step": 1141
},
{
"epoch": 1.4815055158987671,
"grad_norm": 0.23398439998245094,
"learning_rate": 2.8114478114478115e-05,
"loss": 0.4352,
"step": 1142
},
{
"epoch": 1.482803374432187,
"grad_norm": 0.25678491256047153,
"learning_rate": 2.809042809042809e-05,
"loss": 0.4346,
"step": 1143
},
{
"epoch": 1.4841012329656067,
"grad_norm": 0.2826048101734635,
"learning_rate": 2.8066378066378068e-05,
"loss": 0.4262,
"step": 1144
},
{
"epoch": 1.4853990914990267,
"grad_norm": 0.25015775061708817,
"learning_rate": 2.8042328042328043e-05,
"loss": 0.4129,
"step": 1145
},
{
"epoch": 1.4866969500324465,
"grad_norm": 0.2561977557181458,
"learning_rate": 2.801827801827802e-05,
"loss": 0.417,
"step": 1146
},
{
"epoch": 1.4879948085658663,
"grad_norm": 0.25036370521793533,
"learning_rate": 2.7994227994227996e-05,
"loss": 0.4079,
"step": 1147
},
{
"epoch": 1.4892926670992863,
"grad_norm": 0.28901223175805674,
"learning_rate": 2.7970177970177974e-05,
"loss": 0.4094,
"step": 1148
},
{
"epoch": 1.490590525632706,
"grad_norm": 0.23134484811007663,
"learning_rate": 2.794612794612795e-05,
"loss": 0.4476,
"step": 1149
},
{
"epoch": 1.491888384166126,
"grad_norm": 0.25137689970727467,
"learning_rate": 2.792207792207792e-05,
"loss": 0.4416,
"step": 1150
},
{
"epoch": 1.4931862426995457,
"grad_norm": 0.2524284266331274,
"learning_rate": 2.7898027898027902e-05,
"loss": 0.4295,
"step": 1151
},
{
"epoch": 1.4944841012329655,
"grad_norm": 0.22266682751444122,
"learning_rate": 2.7873977873977873e-05,
"loss": 0.4115,
"step": 1152
},
{
"epoch": 1.4957819597663855,
"grad_norm": 0.2085505106465029,
"learning_rate": 2.7849927849927855e-05,
"loss": 0.4271,
"step": 1153
},
{
"epoch": 1.4970798182998053,
"grad_norm": 0.2352572065506912,
"learning_rate": 2.7825877825877826e-05,
"loss": 0.4129,
"step": 1154
},
{
"epoch": 1.498377676833225,
"grad_norm": 0.2322270923460416,
"learning_rate": 2.78018278018278e-05,
"loss": 0.4404,
"step": 1155
},
{
"epoch": 1.499675535366645,
"grad_norm": 0.20327079840186968,
"learning_rate": 2.777777777777778e-05,
"loss": 0.3992,
"step": 1156
},
{
"epoch": 1.500973393900065,
"grad_norm": 0.22409767153079405,
"learning_rate": 2.7753727753727754e-05,
"loss": 0.4351,
"step": 1157
},
{
"epoch": 1.5022712524334847,
"grad_norm": 0.21789564363803948,
"learning_rate": 2.7729677729677732e-05,
"loss": 0.4263,
"step": 1158
},
{
"epoch": 1.5035691109669047,
"grad_norm": 0.23289144485137522,
"learning_rate": 2.7705627705627707e-05,
"loss": 0.4296,
"step": 1159
},
{
"epoch": 1.5048669695003245,
"grad_norm": 0.22790992420912343,
"learning_rate": 2.768157768157768e-05,
"loss": 0.4275,
"step": 1160
},
{
"epoch": 1.5061648280337443,
"grad_norm": 0.2180550660231808,
"learning_rate": 2.765752765752766e-05,
"loss": 0.4242,
"step": 1161
},
{
"epoch": 1.5074626865671643,
"grad_norm": 0.23490836544769822,
"learning_rate": 2.7633477633477635e-05,
"loss": 0.43,
"step": 1162
},
{
"epoch": 1.5087605451005839,
"grad_norm": 0.22192430223007012,
"learning_rate": 2.7609427609427613e-05,
"loss": 0.429,
"step": 1163
},
{
"epoch": 1.510058403634004,
"grad_norm": 0.2311428363766677,
"learning_rate": 2.7585377585377587e-05,
"loss": 0.4414,
"step": 1164
},
{
"epoch": 1.511356262167424,
"grad_norm": 0.2001408824188442,
"learning_rate": 2.756132756132756e-05,
"loss": 0.4225,
"step": 1165
},
{
"epoch": 1.5126541207008435,
"grad_norm": 0.2279619365636095,
"learning_rate": 2.753727753727754e-05,
"loss": 0.4149,
"step": 1166
},
{
"epoch": 1.5139519792342635,
"grad_norm": 0.24302952170040615,
"learning_rate": 2.7513227513227512e-05,
"loss": 0.4153,
"step": 1167
},
{
"epoch": 1.5152498377676833,
"grad_norm": 0.2252876935922963,
"learning_rate": 2.7489177489177493e-05,
"loss": 0.4135,
"step": 1168
},
{
"epoch": 1.516547696301103,
"grad_norm": 0.22162900859128726,
"learning_rate": 2.7465127465127465e-05,
"loss": 0.4283,
"step": 1169
},
{
"epoch": 1.517845554834523,
"grad_norm": 0.23728269459202284,
"learning_rate": 2.7441077441077446e-05,
"loss": 0.4284,
"step": 1170
},
{
"epoch": 1.519143413367943,
"grad_norm": 0.2073068726020532,
"learning_rate": 2.7417027417027418e-05,
"loss": 0.419,
"step": 1171
},
{
"epoch": 1.5204412719013627,
"grad_norm": 0.2308870482056988,
"learning_rate": 2.7392977392977392e-05,
"loss": 0.4409,
"step": 1172
},
{
"epoch": 1.5217391304347827,
"grad_norm": 0.21898533880032697,
"learning_rate": 2.736892736892737e-05,
"loss": 0.4171,
"step": 1173
},
{
"epoch": 1.5230369889682025,
"grad_norm": 0.21000995819843474,
"learning_rate": 2.7344877344877345e-05,
"loss": 0.417,
"step": 1174
},
{
"epoch": 1.5243348475016223,
"grad_norm": 0.2150245170655777,
"learning_rate": 2.7320827320827324e-05,
"loss": 0.4365,
"step": 1175
},
{
"epoch": 1.5256327060350423,
"grad_norm": 0.24290565598308295,
"learning_rate": 2.72967772967773e-05,
"loss": 0.4201,
"step": 1176
},
{
"epoch": 1.5269305645684619,
"grad_norm": 0.2304464719146474,
"learning_rate": 2.7272727272727273e-05,
"loss": 0.4149,
"step": 1177
},
{
"epoch": 1.528228423101882,
"grad_norm": 0.23523933221515506,
"learning_rate": 2.724867724867725e-05,
"loss": 0.453,
"step": 1178
},
{
"epoch": 1.529526281635302,
"grad_norm": 0.2253825255615944,
"learning_rate": 2.7224627224627226e-05,
"loss": 0.4209,
"step": 1179
},
{
"epoch": 1.5308241401687215,
"grad_norm": 0.2742775834013937,
"learning_rate": 2.7200577200577204e-05,
"loss": 0.442,
"step": 1180
},
{
"epoch": 1.5321219987021415,
"grad_norm": 0.2176528388600847,
"learning_rate": 2.717652717652718e-05,
"loss": 0.4329,
"step": 1181
},
{
"epoch": 1.5334198572355613,
"grad_norm": 0.23818415433225926,
"learning_rate": 2.715247715247715e-05,
"loss": 0.4187,
"step": 1182
},
{
"epoch": 1.534717715768981,
"grad_norm": 0.26109881547859903,
"learning_rate": 2.7128427128427132e-05,
"loss": 0.4251,
"step": 1183
},
{
"epoch": 1.536015574302401,
"grad_norm": 0.2196942384869763,
"learning_rate": 2.7104377104377103e-05,
"loss": 0.418,
"step": 1184
},
{
"epoch": 1.537313432835821,
"grad_norm": 0.2400322015222156,
"learning_rate": 2.7080327080327085e-05,
"loss": 0.4109,
"step": 1185
},
{
"epoch": 1.5386112913692407,
"grad_norm": 0.23150552647711828,
"learning_rate": 2.7056277056277056e-05,
"loss": 0.4264,
"step": 1186
},
{
"epoch": 1.5399091499026607,
"grad_norm": 0.22005403208488783,
"learning_rate": 2.703222703222703e-05,
"loss": 0.4039,
"step": 1187
},
{
"epoch": 1.5412070084360805,
"grad_norm": 0.22581597634393283,
"learning_rate": 2.700817700817701e-05,
"loss": 0.425,
"step": 1188
},
{
"epoch": 1.5425048669695003,
"grad_norm": 0.2382695341310579,
"learning_rate": 2.6984126984126984e-05,
"loss": 0.4496,
"step": 1189
},
{
"epoch": 1.5438027255029203,
"grad_norm": 0.2203961917107305,
"learning_rate": 2.6960076960076962e-05,
"loss": 0.4155,
"step": 1190
},
{
"epoch": 1.5451005840363399,
"grad_norm": 0.25210372953982285,
"learning_rate": 2.6936026936026937e-05,
"loss": 0.4342,
"step": 1191
},
{
"epoch": 1.54639844256976,
"grad_norm": 0.238604523146027,
"learning_rate": 2.691197691197691e-05,
"loss": 0.4323,
"step": 1192
},
{
"epoch": 1.54769630110318,
"grad_norm": 0.23138471132633792,
"learning_rate": 2.688792688792689e-05,
"loss": 0.4242,
"step": 1193
},
{
"epoch": 1.5489941596365995,
"grad_norm": 0.2320529813667351,
"learning_rate": 2.6863876863876865e-05,
"loss": 0.4363,
"step": 1194
},
{
"epoch": 1.5502920181700195,
"grad_norm": 0.22679612862184145,
"learning_rate": 2.6839826839826843e-05,
"loss": 0.4253,
"step": 1195
},
{
"epoch": 1.5515898767034393,
"grad_norm": 0.2665688161045152,
"learning_rate": 2.6815776815776818e-05,
"loss": 0.4222,
"step": 1196
},
{
"epoch": 1.552887735236859,
"grad_norm": 0.21178913986030537,
"learning_rate": 2.6791726791726796e-05,
"loss": 0.422,
"step": 1197
},
{
"epoch": 1.554185593770279,
"grad_norm": 0.24464931528999015,
"learning_rate": 2.676767676767677e-05,
"loss": 0.4241,
"step": 1198
},
{
"epoch": 1.5554834523036989,
"grad_norm": 0.22319718290311183,
"learning_rate": 2.6743626743626742e-05,
"loss": 0.4168,
"step": 1199
},
{
"epoch": 1.5567813108371187,
"grad_norm": 0.2302808693777694,
"learning_rate": 2.6719576719576723e-05,
"loss": 0.41,
"step": 1200
},
{
"epoch": 1.5580791693705387,
"grad_norm": 0.2317544115600513,
"learning_rate": 2.6695526695526695e-05,
"loss": 0.4555,
"step": 1201
},
{
"epoch": 1.5593770279039585,
"grad_norm": 0.2554067046842974,
"learning_rate": 2.6671476671476676e-05,
"loss": 0.4075,
"step": 1202
},
{
"epoch": 1.5606748864373783,
"grad_norm": 0.23832375199078534,
"learning_rate": 2.6647426647426648e-05,
"loss": 0.4149,
"step": 1203
},
{
"epoch": 1.5619727449707983,
"grad_norm": 0.2387421511606967,
"learning_rate": 2.6623376623376623e-05,
"loss": 0.4349,
"step": 1204
},
{
"epoch": 1.563270603504218,
"grad_norm": 0.24466721743899011,
"learning_rate": 2.65993265993266e-05,
"loss": 0.4158,
"step": 1205
},
{
"epoch": 1.5645684620376379,
"grad_norm": 0.23174513616055498,
"learning_rate": 2.6575276575276575e-05,
"loss": 0.4443,
"step": 1206
},
{
"epoch": 1.565866320571058,
"grad_norm": 0.218582346579111,
"learning_rate": 2.6551226551226554e-05,
"loss": 0.4228,
"step": 1207
},
{
"epoch": 1.5671641791044775,
"grad_norm": 0.23236180907143378,
"learning_rate": 2.652717652717653e-05,
"loss": 0.4198,
"step": 1208
},
{
"epoch": 1.5684620376378975,
"grad_norm": 0.2461597550351122,
"learning_rate": 2.6503126503126503e-05,
"loss": 0.4388,
"step": 1209
},
{
"epoch": 1.5697598961713173,
"grad_norm": 0.24135274528584466,
"learning_rate": 2.647907647907648e-05,
"loss": 0.4182,
"step": 1210
},
{
"epoch": 1.571057754704737,
"grad_norm": 0.23011430180334824,
"learning_rate": 2.6455026455026456e-05,
"loss": 0.4345,
"step": 1211
},
{
"epoch": 1.572355613238157,
"grad_norm": 0.25813925411615873,
"learning_rate": 2.6430976430976434e-05,
"loss": 0.4152,
"step": 1212
},
{
"epoch": 1.5736534717715769,
"grad_norm": 0.2361569395941438,
"learning_rate": 2.640692640692641e-05,
"loss": 0.4107,
"step": 1213
},
{
"epoch": 1.5749513303049967,
"grad_norm": 0.26363884372789825,
"learning_rate": 2.638287638287638e-05,
"loss": 0.4392,
"step": 1214
},
{
"epoch": 1.5762491888384167,
"grad_norm": 0.24244610329485705,
"learning_rate": 2.6358826358826362e-05,
"loss": 0.4164,
"step": 1215
},
{
"epoch": 1.5775470473718365,
"grad_norm": 0.2552987758465308,
"learning_rate": 2.6334776334776333e-05,
"loss": 0.4339,
"step": 1216
},
{
"epoch": 1.5788449059052563,
"grad_norm": 0.2622601300659554,
"learning_rate": 2.6310726310726315e-05,
"loss": 0.4081,
"step": 1217
},
{
"epoch": 1.5801427644386763,
"grad_norm": 0.23435950487013313,
"learning_rate": 2.6286676286676286e-05,
"loss": 0.4266,
"step": 1218
},
{
"epoch": 1.581440622972096,
"grad_norm": 0.31150362868262865,
"learning_rate": 2.6262626262626268e-05,
"loss": 0.4205,
"step": 1219
},
{
"epoch": 1.5827384815055159,
"grad_norm": 0.2356568945579236,
"learning_rate": 2.623857623857624e-05,
"loss": 0.4235,
"step": 1220
},
{
"epoch": 1.584036340038936,
"grad_norm": 0.2636851026847217,
"learning_rate": 2.6214526214526214e-05,
"loss": 0.4194,
"step": 1221
},
{
"epoch": 1.5853341985723555,
"grad_norm": 0.2609824789762705,
"learning_rate": 2.6190476190476192e-05,
"loss": 0.4386,
"step": 1222
},
{
"epoch": 1.5866320571057755,
"grad_norm": 0.2503475112982072,
"learning_rate": 2.6166426166426167e-05,
"loss": 0.4295,
"step": 1223
},
{
"epoch": 1.5879299156391953,
"grad_norm": 0.2748789264904923,
"learning_rate": 2.6142376142376145e-05,
"loss": 0.431,
"step": 1224
},
{
"epoch": 1.589227774172615,
"grad_norm": 0.2122856536086439,
"learning_rate": 2.611832611832612e-05,
"loss": 0.4151,
"step": 1225
},
{
"epoch": 1.590525632706035,
"grad_norm": 0.2882371321327433,
"learning_rate": 2.6094276094276095e-05,
"loss": 0.4242,
"step": 1226
},
{
"epoch": 1.5918234912394549,
"grad_norm": 0.22024360438567706,
"learning_rate": 2.6070226070226073e-05,
"loss": 0.4173,
"step": 1227
},
{
"epoch": 1.5931213497728747,
"grad_norm": 0.23708353175014626,
"learning_rate": 2.6046176046176048e-05,
"loss": 0.4251,
"step": 1228
},
{
"epoch": 1.5944192083062947,
"grad_norm": 0.2658200863217972,
"learning_rate": 2.6022126022126026e-05,
"loss": 0.4408,
"step": 1229
},
{
"epoch": 1.5957170668397145,
"grad_norm": 0.21583066555363375,
"learning_rate": 2.5998075998076e-05,
"loss": 0.4191,
"step": 1230
},
{
"epoch": 1.5970149253731343,
"grad_norm": 0.2777242614566809,
"learning_rate": 2.5974025974025972e-05,
"loss": 0.4393,
"step": 1231
},
{
"epoch": 1.5983127839065543,
"grad_norm": 0.23219187892619703,
"learning_rate": 2.5949975949975954e-05,
"loss": 0.4265,
"step": 1232
},
{
"epoch": 1.599610642439974,
"grad_norm": 0.29387387169794943,
"learning_rate": 2.5925925925925925e-05,
"loss": 0.4244,
"step": 1233
},
{
"epoch": 1.6009085009733939,
"grad_norm": 0.22151955032464254,
"learning_rate": 2.5901875901875906e-05,
"loss": 0.4085,
"step": 1234
},
{
"epoch": 1.602206359506814,
"grad_norm": 0.24242248455059523,
"learning_rate": 2.5877825877825878e-05,
"loss": 0.4285,
"step": 1235
},
{
"epoch": 1.6035042180402335,
"grad_norm": 0.2621217435997206,
"learning_rate": 2.5853775853775853e-05,
"loss": 0.4379,
"step": 1236
},
{
"epoch": 1.6048020765736535,
"grad_norm": 0.22823804591889496,
"learning_rate": 2.582972582972583e-05,
"loss": 0.4211,
"step": 1237
},
{
"epoch": 1.6060999351070735,
"grad_norm": 0.2353758128022499,
"learning_rate": 2.5805675805675806e-05,
"loss": 0.4127,
"step": 1238
},
{
"epoch": 1.607397793640493,
"grad_norm": 0.22811130965496038,
"learning_rate": 2.5781625781625784e-05,
"loss": 0.4246,
"step": 1239
},
{
"epoch": 1.608695652173913,
"grad_norm": 0.2366434232412805,
"learning_rate": 2.575757575757576e-05,
"loss": 0.4109,
"step": 1240
},
{
"epoch": 1.6099935107073329,
"grad_norm": 0.20375567441674386,
"learning_rate": 2.5733525733525737e-05,
"loss": 0.4146,
"step": 1241
},
{
"epoch": 1.6112913692407527,
"grad_norm": 0.2436449942466404,
"learning_rate": 2.570947570947571e-05,
"loss": 0.4162,
"step": 1242
},
{
"epoch": 1.6125892277741727,
"grad_norm": 0.22023021001348508,
"learning_rate": 2.5685425685425686e-05,
"loss": 0.4136,
"step": 1243
},
{
"epoch": 1.6138870863075925,
"grad_norm": 0.2069116265186359,
"learning_rate": 2.5661375661375664e-05,
"loss": 0.3999,
"step": 1244
},
{
"epoch": 1.6151849448410123,
"grad_norm": 0.24450308671714907,
"learning_rate": 2.563732563732564e-05,
"loss": 0.4352,
"step": 1245
},
{
"epoch": 1.6164828033744323,
"grad_norm": 0.2361666753423955,
"learning_rate": 2.5613275613275617e-05,
"loss": 0.4215,
"step": 1246
},
{
"epoch": 1.617780661907852,
"grad_norm": 0.24709753794900446,
"learning_rate": 2.5589225589225592e-05,
"loss": 0.4095,
"step": 1247
},
{
"epoch": 1.6190785204412719,
"grad_norm": 0.22770211218246428,
"learning_rate": 2.5565175565175563e-05,
"loss": 0.4208,
"step": 1248
},
{
"epoch": 1.6203763789746919,
"grad_norm": 0.24442985342584414,
"learning_rate": 2.5541125541125545e-05,
"loss": 0.4048,
"step": 1249
},
{
"epoch": 1.6216742375081115,
"grad_norm": 0.2449341182023967,
"learning_rate": 2.5517075517075516e-05,
"loss": 0.429,
"step": 1250
},
{
"epoch": 1.6229720960415315,
"grad_norm": 0.22314422338157636,
"learning_rate": 2.5493025493025498e-05,
"loss": 0.4161,
"step": 1251
},
{
"epoch": 1.6242699545749515,
"grad_norm": 0.22271710889727703,
"learning_rate": 2.546897546897547e-05,
"loss": 0.4223,
"step": 1252
},
{
"epoch": 1.625567813108371,
"grad_norm": 0.23943855813232637,
"learning_rate": 2.5444925444925444e-05,
"loss": 0.4263,
"step": 1253
},
{
"epoch": 1.626865671641791,
"grad_norm": 0.22346829290932305,
"learning_rate": 2.5420875420875422e-05,
"loss": 0.4002,
"step": 1254
},
{
"epoch": 1.6281635301752109,
"grad_norm": 0.21819410830608127,
"learning_rate": 2.5396825396825397e-05,
"loss": 0.4228,
"step": 1255
},
{
"epoch": 1.6294613887086307,
"grad_norm": 0.2487542450136884,
"learning_rate": 2.5372775372775375e-05,
"loss": 0.431,
"step": 1256
},
{
"epoch": 1.6307592472420507,
"grad_norm": 0.22276858066653343,
"learning_rate": 2.534872534872535e-05,
"loss": 0.3975,
"step": 1257
},
{
"epoch": 1.6320571057754705,
"grad_norm": 0.20406534653386582,
"learning_rate": 2.5324675324675325e-05,
"loss": 0.4308,
"step": 1258
},
{
"epoch": 1.6333549643088903,
"grad_norm": 0.2369459882014465,
"learning_rate": 2.5300625300625303e-05,
"loss": 0.4434,
"step": 1259
},
{
"epoch": 1.6346528228423103,
"grad_norm": 0.23054872564198348,
"learning_rate": 2.5276575276575278e-05,
"loss": 0.4296,
"step": 1260
},
{
"epoch": 1.63595068137573,
"grad_norm": 0.21314688817002478,
"learning_rate": 2.5252525252525256e-05,
"loss": 0.4234,
"step": 1261
},
{
"epoch": 1.6372485399091499,
"grad_norm": 0.22937591574682323,
"learning_rate": 2.522847522847523e-05,
"loss": 0.425,
"step": 1262
},
{
"epoch": 1.6385463984425699,
"grad_norm": 0.23974213218799267,
"learning_rate": 2.520442520442521e-05,
"loss": 0.4393,
"step": 1263
},
{
"epoch": 1.6398442569759895,
"grad_norm": 0.23441342590653153,
"learning_rate": 2.5180375180375184e-05,
"loss": 0.4474,
"step": 1264
},
{
"epoch": 1.6411421155094095,
"grad_norm": 0.22460634450789943,
"learning_rate": 2.5156325156325155e-05,
"loss": 0.4405,
"step": 1265
},
{
"epoch": 1.6424399740428295,
"grad_norm": 0.21099257965853147,
"learning_rate": 2.5132275132275137e-05,
"loss": 0.4147,
"step": 1266
},
{
"epoch": 1.643737832576249,
"grad_norm": 0.24160346011701583,
"learning_rate": 2.5108225108225108e-05,
"loss": 0.4397,
"step": 1267
},
{
"epoch": 1.645035691109669,
"grad_norm": 0.21504387068528427,
"learning_rate": 2.5084175084175086e-05,
"loss": 0.4134,
"step": 1268
},
{
"epoch": 1.6463335496430889,
"grad_norm": 0.20136235310740322,
"learning_rate": 2.506012506012506e-05,
"loss": 0.4352,
"step": 1269
},
{
"epoch": 1.6476314081765087,
"grad_norm": 0.20297036044525715,
"learning_rate": 2.5036075036075036e-05,
"loss": 0.4181,
"step": 1270
},
{
"epoch": 1.6489292667099287,
"grad_norm": 0.22303019507601843,
"learning_rate": 2.5012025012025014e-05,
"loss": 0.4028,
"step": 1271
},
{
"epoch": 1.6502271252433485,
"grad_norm": 0.22166881997259968,
"learning_rate": 2.498797498797499e-05,
"loss": 0.4347,
"step": 1272
},
{
"epoch": 1.6515249837767683,
"grad_norm": 0.20648838786480744,
"learning_rate": 2.4963924963924963e-05,
"loss": 0.4236,
"step": 1273
},
{
"epoch": 1.6528228423101883,
"grad_norm": 0.23349839066379247,
"learning_rate": 2.493987493987494e-05,
"loss": 0.419,
"step": 1274
},
{
"epoch": 1.654120700843608,
"grad_norm": 0.23063394385414213,
"learning_rate": 2.4915824915824916e-05,
"loss": 0.4152,
"step": 1275
},
{
"epoch": 1.6554185593770279,
"grad_norm": 0.2190005315364852,
"learning_rate": 2.4891774891774894e-05,
"loss": 0.4115,
"step": 1276
},
{
"epoch": 1.6567164179104479,
"grad_norm": 0.20078160624348626,
"learning_rate": 2.4867724867724866e-05,
"loss": 0.4245,
"step": 1277
},
{
"epoch": 1.6580142764438677,
"grad_norm": 0.24133729159661466,
"learning_rate": 2.4843674843674844e-05,
"loss": 0.4293,
"step": 1278
},
{
"epoch": 1.6593121349772875,
"grad_norm": 0.23794916923086656,
"learning_rate": 2.481962481962482e-05,
"loss": 0.4271,
"step": 1279
},
{
"epoch": 1.6606099935107075,
"grad_norm": 0.2574981267536903,
"learning_rate": 2.4795574795574797e-05,
"loss": 0.447,
"step": 1280
},
{
"epoch": 1.661907852044127,
"grad_norm": 0.23168835516119046,
"learning_rate": 2.4771524771524772e-05,
"loss": 0.4193,
"step": 1281
},
{
"epoch": 1.663205710577547,
"grad_norm": 0.23742732882318857,
"learning_rate": 2.474747474747475e-05,
"loss": 0.4476,
"step": 1282
},
{
"epoch": 1.6645035691109669,
"grad_norm": 0.2514178670168895,
"learning_rate": 2.4723424723424725e-05,
"loss": 0.4204,
"step": 1283
},
{
"epoch": 1.6658014276443867,
"grad_norm": 0.24624362103838568,
"learning_rate": 2.46993746993747e-05,
"loss": 0.4212,
"step": 1284
},
{
"epoch": 1.6670992861778067,
"grad_norm": 0.23275179943867672,
"learning_rate": 2.4675324675324678e-05,
"loss": 0.4312,
"step": 1285
},
{
"epoch": 1.6683971447112265,
"grad_norm": 0.24951766353093746,
"learning_rate": 2.4651274651274652e-05,
"loss": 0.4068,
"step": 1286
},
{
"epoch": 1.6696950032446463,
"grad_norm": 0.2052390086188538,
"learning_rate": 2.462722462722463e-05,
"loss": 0.4173,
"step": 1287
},
{
"epoch": 1.6709928617780663,
"grad_norm": 0.23198703435419166,
"learning_rate": 2.4603174603174602e-05,
"loss": 0.4177,
"step": 1288
},
{
"epoch": 1.672290720311486,
"grad_norm": 0.212107274947473,
"learning_rate": 2.457912457912458e-05,
"loss": 0.4166,
"step": 1289
},
{
"epoch": 1.6735885788449059,
"grad_norm": 0.2322378778891487,
"learning_rate": 2.4555074555074555e-05,
"loss": 0.434,
"step": 1290
},
{
"epoch": 1.6748864373783259,
"grad_norm": 0.21435317963286998,
"learning_rate": 2.4531024531024533e-05,
"loss": 0.4133,
"step": 1291
},
{
"epoch": 1.6761842959117457,
"grad_norm": 0.2156907157084962,
"learning_rate": 2.4506974506974508e-05,
"loss": 0.4285,
"step": 1292
},
{
"epoch": 1.6774821544451655,
"grad_norm": 0.2568679217265247,
"learning_rate": 2.4482924482924486e-05,
"loss": 0.4134,
"step": 1293
},
{
"epoch": 1.6787800129785855,
"grad_norm": 0.23974841708540973,
"learning_rate": 2.4458874458874457e-05,
"loss": 0.432,
"step": 1294
},
{
"epoch": 1.680077871512005,
"grad_norm": 0.24031934038845462,
"learning_rate": 2.4434824434824436e-05,
"loss": 0.4444,
"step": 1295
},
{
"epoch": 1.681375730045425,
"grad_norm": 0.2672955058745279,
"learning_rate": 2.441077441077441e-05,
"loss": 0.4369,
"step": 1296
},
{
"epoch": 1.6826735885788449,
"grad_norm": 0.250150232180256,
"learning_rate": 2.438672438672439e-05,
"loss": 0.4273,
"step": 1297
},
{
"epoch": 1.6839714471122647,
"grad_norm": 0.23626717443698833,
"learning_rate": 2.4362674362674363e-05,
"loss": 0.4236,
"step": 1298
},
{
"epoch": 1.6852693056456847,
"grad_norm": 0.22422589742898313,
"learning_rate": 2.4338624338624338e-05,
"loss": 0.4338,
"step": 1299
},
{
"epoch": 1.6865671641791045,
"grad_norm": 0.2308927071463409,
"learning_rate": 2.4314574314574316e-05,
"loss": 0.4386,
"step": 1300
},
{
"epoch": 1.6878650227125243,
"grad_norm": 0.2564472802977678,
"learning_rate": 2.429052429052429e-05,
"loss": 0.4117,
"step": 1301
},
{
"epoch": 1.6891628812459443,
"grad_norm": 0.23392174813654176,
"learning_rate": 2.426647426647427e-05,
"loss": 0.4154,
"step": 1302
},
{
"epoch": 1.690460739779364,
"grad_norm": 0.2397393509201778,
"learning_rate": 2.4242424242424244e-05,
"loss": 0.4117,
"step": 1303
},
{
"epoch": 1.6917585983127839,
"grad_norm": 0.22912904331451653,
"learning_rate": 2.4218374218374222e-05,
"loss": 0.4273,
"step": 1304
},
{
"epoch": 1.6930564568462039,
"grad_norm": 0.23255973129828944,
"learning_rate": 2.4194324194324193e-05,
"loss": 0.4199,
"step": 1305
},
{
"epoch": 1.6943543153796237,
"grad_norm": 0.2459474867528304,
"learning_rate": 2.417027417027417e-05,
"loss": 0.3879,
"step": 1306
},
{
"epoch": 1.6956521739130435,
"grad_norm": 0.21055785182005404,
"learning_rate": 2.4146224146224146e-05,
"loss": 0.4284,
"step": 1307
},
{
"epoch": 1.6969500324464635,
"grad_norm": 0.23246957442423627,
"learning_rate": 2.4122174122174125e-05,
"loss": 0.4131,
"step": 1308
},
{
"epoch": 1.698247890979883,
"grad_norm": 0.23994403266599254,
"learning_rate": 2.40981240981241e-05,
"loss": 0.4333,
"step": 1309
},
{
"epoch": 1.699545749513303,
"grad_norm": 0.2316301617751929,
"learning_rate": 2.4074074074074074e-05,
"loss": 0.4486,
"step": 1310
},
{
"epoch": 1.7008436080467229,
"grad_norm": 0.22426960542300423,
"learning_rate": 2.405002405002405e-05,
"loss": 0.4087,
"step": 1311
},
{
"epoch": 1.7021414665801426,
"grad_norm": 0.2150721322470821,
"learning_rate": 2.4025974025974027e-05,
"loss": 0.4073,
"step": 1312
},
{
"epoch": 1.7034393251135627,
"grad_norm": 0.2519319561232589,
"learning_rate": 2.4001924001924002e-05,
"loss": 0.4312,
"step": 1313
},
{
"epoch": 1.7047371836469825,
"grad_norm": 0.21097482449181498,
"learning_rate": 2.397787397787398e-05,
"loss": 0.4319,
"step": 1314
},
{
"epoch": 1.7060350421804023,
"grad_norm": 0.24220719812817199,
"learning_rate": 2.3953823953823955e-05,
"loss": 0.4203,
"step": 1315
},
{
"epoch": 1.7073329007138223,
"grad_norm": 0.22812751540222995,
"learning_rate": 2.392977392977393e-05,
"loss": 0.4325,
"step": 1316
},
{
"epoch": 1.708630759247242,
"grad_norm": 0.21991172303538978,
"learning_rate": 2.3905723905723908e-05,
"loss": 0.4307,
"step": 1317
},
{
"epoch": 1.7099286177806619,
"grad_norm": 0.24924033452167085,
"learning_rate": 2.3881673881673882e-05,
"loss": 0.4049,
"step": 1318
},
{
"epoch": 1.7112264763140819,
"grad_norm": 0.20641915282626047,
"learning_rate": 2.385762385762386e-05,
"loss": 0.4171,
"step": 1319
},
{
"epoch": 1.7125243348475017,
"grad_norm": 0.24635070927670322,
"learning_rate": 2.3833573833573835e-05,
"loss": 0.4273,
"step": 1320
},
{
"epoch": 1.7138221933809215,
"grad_norm": 0.22793016083957607,
"learning_rate": 2.380952380952381e-05,
"loss": 0.4028,
"step": 1321
},
{
"epoch": 1.7151200519143415,
"grad_norm": 0.21686313598641418,
"learning_rate": 2.3785473785473785e-05,
"loss": 0.4216,
"step": 1322
},
{
"epoch": 1.716417910447761,
"grad_norm": 0.20935343473455395,
"learning_rate": 2.3761423761423763e-05,
"loss": 0.4002,
"step": 1323
},
{
"epoch": 1.717715768981181,
"grad_norm": 0.22891020801302248,
"learning_rate": 2.3737373737373738e-05,
"loss": 0.411,
"step": 1324
},
{
"epoch": 1.719013627514601,
"grad_norm": 0.20360937762091913,
"learning_rate": 2.3713323713323716e-05,
"loss": 0.4445,
"step": 1325
},
{
"epoch": 1.7203114860480206,
"grad_norm": 0.21662567258205914,
"learning_rate": 2.368927368927369e-05,
"loss": 0.4182,
"step": 1326
},
{
"epoch": 1.7216093445814407,
"grad_norm": 0.2112365052652544,
"learning_rate": 2.3665223665223666e-05,
"loss": 0.4119,
"step": 1327
},
{
"epoch": 1.7229072031148605,
"grad_norm": 0.24045539800451038,
"learning_rate": 2.364117364117364e-05,
"loss": 0.4294,
"step": 1328
},
{
"epoch": 1.7242050616482802,
"grad_norm": 0.22344351793204972,
"learning_rate": 2.361712361712362e-05,
"loss": 0.4159,
"step": 1329
},
{
"epoch": 1.7255029201817003,
"grad_norm": 0.21385701142195507,
"learning_rate": 2.3593073593073593e-05,
"loss": 0.4208,
"step": 1330
},
{
"epoch": 1.72680077871512,
"grad_norm": 0.22306282993754703,
"learning_rate": 2.356902356902357e-05,
"loss": 0.417,
"step": 1331
},
{
"epoch": 1.7280986372485398,
"grad_norm": 0.23450328976859844,
"learning_rate": 2.3544973544973546e-05,
"loss": 0.397,
"step": 1332
},
{
"epoch": 1.7293964957819599,
"grad_norm": 0.23314049600175976,
"learning_rate": 2.352092352092352e-05,
"loss": 0.4297,
"step": 1333
},
{
"epoch": 1.7306943543153797,
"grad_norm": 0.21399972133644776,
"learning_rate": 2.34968734968735e-05,
"loss": 0.4366,
"step": 1334
},
{
"epoch": 1.7319922128487995,
"grad_norm": 0.22426899700350952,
"learning_rate": 2.3472823472823474e-05,
"loss": 0.433,
"step": 1335
},
{
"epoch": 1.7332900713822195,
"grad_norm": 0.21720443031252623,
"learning_rate": 2.3448773448773452e-05,
"loss": 0.4135,
"step": 1336
},
{
"epoch": 1.734587929915639,
"grad_norm": 0.22022369229968872,
"learning_rate": 2.3424723424723427e-05,
"loss": 0.4035,
"step": 1337
},
{
"epoch": 1.735885788449059,
"grad_norm": 0.2432882987842844,
"learning_rate": 2.34006734006734e-05,
"loss": 0.4274,
"step": 1338
},
{
"epoch": 1.737183646982479,
"grad_norm": 0.22954645223280482,
"learning_rate": 2.3376623376623376e-05,
"loss": 0.4265,
"step": 1339
},
{
"epoch": 1.7384815055158986,
"grad_norm": 0.23456332298959323,
"learning_rate": 2.3352573352573355e-05,
"loss": 0.4261,
"step": 1340
},
{
"epoch": 1.7397793640493187,
"grad_norm": 0.23090513352220413,
"learning_rate": 2.332852332852333e-05,
"loss": 0.4343,
"step": 1341
},
{
"epoch": 1.7410772225827384,
"grad_norm": 0.22635007188997747,
"learning_rate": 2.3304473304473308e-05,
"loss": 0.434,
"step": 1342
},
{
"epoch": 1.7423750811161582,
"grad_norm": 0.24328716551223983,
"learning_rate": 2.328042328042328e-05,
"loss": 0.4329,
"step": 1343
},
{
"epoch": 1.7436729396495783,
"grad_norm": 0.2245296632717372,
"learning_rate": 2.3256373256373257e-05,
"loss": 0.4135,
"step": 1344
},
{
"epoch": 1.744970798182998,
"grad_norm": 0.23430249945874695,
"learning_rate": 2.3232323232323232e-05,
"loss": 0.4178,
"step": 1345
},
{
"epoch": 1.7462686567164178,
"grad_norm": 0.21397181948116892,
"learning_rate": 2.320827320827321e-05,
"loss": 0.4236,
"step": 1346
},
{
"epoch": 1.7475665152498379,
"grad_norm": 0.21776737560072357,
"learning_rate": 2.3184223184223185e-05,
"loss": 0.425,
"step": 1347
},
{
"epoch": 1.7488643737832577,
"grad_norm": 0.23739059292954565,
"learning_rate": 2.3160173160173163e-05,
"loss": 0.4262,
"step": 1348
},
{
"epoch": 1.7501622323166774,
"grad_norm": 0.2207747605074272,
"learning_rate": 2.3136123136123138e-05,
"loss": 0.4097,
"step": 1349
},
{
"epoch": 1.7514600908500975,
"grad_norm": 0.19291564676436485,
"learning_rate": 2.3112073112073113e-05,
"loss": 0.4222,
"step": 1350
},
{
"epoch": 1.752757949383517,
"grad_norm": 0.20208738954938904,
"learning_rate": 2.308802308802309e-05,
"loss": 0.4199,
"step": 1351
},
{
"epoch": 1.754055807916937,
"grad_norm": 0.22066527169458836,
"learning_rate": 2.3063973063973065e-05,
"loss": 0.426,
"step": 1352
},
{
"epoch": 1.755353666450357,
"grad_norm": 0.22615489279435733,
"learning_rate": 2.3039923039923044e-05,
"loss": 0.4103,
"step": 1353
},
{
"epoch": 1.7566515249837766,
"grad_norm": 0.24657435823356594,
"learning_rate": 2.3015873015873015e-05,
"loss": 0.4006,
"step": 1354
},
{
"epoch": 1.7579493835171967,
"grad_norm": 0.2287984884377898,
"learning_rate": 2.2991822991822993e-05,
"loss": 0.4481,
"step": 1355
},
{
"epoch": 1.7592472420506164,
"grad_norm": 0.21060281438071618,
"learning_rate": 2.2967772967772968e-05,
"loss": 0.399,
"step": 1356
},
{
"epoch": 1.7605451005840362,
"grad_norm": 0.2265028463512503,
"learning_rate": 2.2943722943722946e-05,
"loss": 0.4258,
"step": 1357
},
{
"epoch": 1.7618429591174563,
"grad_norm": 0.21362689950493072,
"learning_rate": 2.291967291967292e-05,
"loss": 0.4178,
"step": 1358
},
{
"epoch": 1.763140817650876,
"grad_norm": 0.21905319629937445,
"learning_rate": 2.28956228956229e-05,
"loss": 0.435,
"step": 1359
},
{
"epoch": 1.7644386761842958,
"grad_norm": 0.2390674190592371,
"learning_rate": 2.287157287157287e-05,
"loss": 0.4374,
"step": 1360
},
{
"epoch": 1.7657365347177159,
"grad_norm": 0.262047575243414,
"learning_rate": 2.284752284752285e-05,
"loss": 0.4249,
"step": 1361
},
{
"epoch": 1.7670343932511356,
"grad_norm": 0.215263905999125,
"learning_rate": 2.2823472823472823e-05,
"loss": 0.4358,
"step": 1362
},
{
"epoch": 1.7683322517845554,
"grad_norm": 0.2566335364538136,
"learning_rate": 2.27994227994228e-05,
"loss": 0.4358,
"step": 1363
},
{
"epoch": 1.7696301103179755,
"grad_norm": 0.23396394290955702,
"learning_rate": 2.2775372775372776e-05,
"loss": 0.4345,
"step": 1364
},
{
"epoch": 1.7709279688513953,
"grad_norm": 0.2997951835182788,
"learning_rate": 2.275132275132275e-05,
"loss": 0.417,
"step": 1365
},
{
"epoch": 1.772225827384815,
"grad_norm": 0.25908281100055225,
"learning_rate": 2.272727272727273e-05,
"loss": 0.4283,
"step": 1366
},
{
"epoch": 1.773523685918235,
"grad_norm": 0.2710663340672724,
"learning_rate": 2.2703222703222704e-05,
"loss": 0.4405,
"step": 1367
},
{
"epoch": 1.7748215444516546,
"grad_norm": 0.25367420601807966,
"learning_rate": 2.267917267917268e-05,
"loss": 0.4149,
"step": 1368
},
{
"epoch": 1.7761194029850746,
"grad_norm": 0.2872738552914004,
"learning_rate": 2.2655122655122657e-05,
"loss": 0.4339,
"step": 1369
},
{
"epoch": 1.7774172615184944,
"grad_norm": 0.21919850447737751,
"learning_rate": 2.2631072631072632e-05,
"loss": 0.4445,
"step": 1370
},
{
"epoch": 1.7787151200519142,
"grad_norm": 0.27892242525176375,
"learning_rate": 2.2607022607022607e-05,
"loss": 0.4131,
"step": 1371
},
{
"epoch": 1.7800129785853342,
"grad_norm": 0.24026730555070555,
"learning_rate": 2.2582972582972585e-05,
"loss": 0.427,
"step": 1372
},
{
"epoch": 1.781310837118754,
"grad_norm": 0.2145688542497997,
"learning_rate": 2.255892255892256e-05,
"loss": 0.4391,
"step": 1373
},
{
"epoch": 1.7826086956521738,
"grad_norm": 0.23661442477067585,
"learning_rate": 2.2534872534872538e-05,
"loss": 0.41,
"step": 1374
},
{
"epoch": 1.7839065541855939,
"grad_norm": 0.2610547392581578,
"learning_rate": 2.2510822510822512e-05,
"loss": 0.4188,
"step": 1375
},
{
"epoch": 1.7852044127190136,
"grad_norm": 0.2493506467256105,
"learning_rate": 2.2486772486772487e-05,
"loss": 0.4206,
"step": 1376
},
{
"epoch": 1.7865022712524334,
"grad_norm": 0.2649167628997299,
"learning_rate": 2.2462722462722462e-05,
"loss": 0.4237,
"step": 1377
},
{
"epoch": 1.7878001297858535,
"grad_norm": 0.2534895267108062,
"learning_rate": 2.243867243867244e-05,
"loss": 0.4283,
"step": 1378
},
{
"epoch": 1.7890979883192732,
"grad_norm": 0.2709616156112994,
"learning_rate": 2.2414622414622415e-05,
"loss": 0.4113,
"step": 1379
},
{
"epoch": 1.790395846852693,
"grad_norm": 0.22792963880042075,
"learning_rate": 2.2390572390572393e-05,
"loss": 0.4267,
"step": 1380
},
{
"epoch": 1.791693705386113,
"grad_norm": 0.24622268955355062,
"learning_rate": 2.2366522366522368e-05,
"loss": 0.4051,
"step": 1381
},
{
"epoch": 1.7929915639195326,
"grad_norm": 0.27590035842972194,
"learning_rate": 2.2342472342472343e-05,
"loss": 0.4378,
"step": 1382
},
{
"epoch": 1.7942894224529526,
"grad_norm": 0.2264860712514965,
"learning_rate": 2.2318422318422317e-05,
"loss": 0.4171,
"step": 1383
},
{
"epoch": 1.7955872809863724,
"grad_norm": 0.27527712703496315,
"learning_rate": 2.2294372294372296e-05,
"loss": 0.4136,
"step": 1384
},
{
"epoch": 1.7968851395197922,
"grad_norm": 0.27052531643386396,
"learning_rate": 2.227032227032227e-05,
"loss": 0.3935,
"step": 1385
},
{
"epoch": 1.7981829980532122,
"grad_norm": 0.2479444281803134,
"learning_rate": 2.224627224627225e-05,
"loss": 0.4331,
"step": 1386
},
{
"epoch": 1.799480856586632,
"grad_norm": 0.2373284631481721,
"learning_rate": 2.2222222222222223e-05,
"loss": 0.417,
"step": 1387
},
{
"epoch": 1.8007787151200518,
"grad_norm": 0.2575638652052547,
"learning_rate": 2.2198172198172198e-05,
"loss": 0.4323,
"step": 1388
},
{
"epoch": 1.8020765736534718,
"grad_norm": 0.2407980171885747,
"learning_rate": 2.2174122174122176e-05,
"loss": 0.4127,
"step": 1389
},
{
"epoch": 1.8033744321868916,
"grad_norm": 0.21117036443387086,
"learning_rate": 2.215007215007215e-05,
"loss": 0.4191,
"step": 1390
},
{
"epoch": 1.8046722907203114,
"grad_norm": 0.20129164818193102,
"learning_rate": 2.212602212602213e-05,
"loss": 0.4115,
"step": 1391
},
{
"epoch": 1.8059701492537314,
"grad_norm": 0.2540001501490721,
"learning_rate": 2.2101972101972104e-05,
"loss": 0.431,
"step": 1392
},
{
"epoch": 1.8072680077871512,
"grad_norm": 0.24804686013462887,
"learning_rate": 2.207792207792208e-05,
"loss": 0.4228,
"step": 1393
},
{
"epoch": 1.808565866320571,
"grad_norm": 0.2022520818624456,
"learning_rate": 2.2053872053872053e-05,
"loss": 0.4203,
"step": 1394
},
{
"epoch": 1.809863724853991,
"grad_norm": 0.23238075295532062,
"learning_rate": 2.202982202982203e-05,
"loss": 0.425,
"step": 1395
},
{
"epoch": 1.8111615833874106,
"grad_norm": 0.21664815140422355,
"learning_rate": 2.2005772005772006e-05,
"loss": 0.4299,
"step": 1396
},
{
"epoch": 1.8124594419208306,
"grad_norm": 0.21744531033366538,
"learning_rate": 2.1981721981721985e-05,
"loss": 0.4019,
"step": 1397
},
{
"epoch": 1.8137573004542504,
"grad_norm": 0.2138946808987489,
"learning_rate": 2.1957671957671956e-05,
"loss": 0.4256,
"step": 1398
},
{
"epoch": 1.8150551589876702,
"grad_norm": 0.2224576706896047,
"learning_rate": 2.1933621933621934e-05,
"loss": 0.4147,
"step": 1399
},
{
"epoch": 1.8163530175210902,
"grad_norm": 0.2002966133486591,
"learning_rate": 2.190957190957191e-05,
"loss": 0.426,
"step": 1400
},
{
"epoch": 1.81765087605451,
"grad_norm": 0.202531126088113,
"learning_rate": 2.1885521885521887e-05,
"loss": 0.4429,
"step": 1401
},
{
"epoch": 1.8189487345879298,
"grad_norm": 0.20516498421820234,
"learning_rate": 2.1861471861471862e-05,
"loss": 0.4208,
"step": 1402
},
{
"epoch": 1.8202465931213498,
"grad_norm": 0.23389135616856488,
"learning_rate": 2.183742183742184e-05,
"loss": 0.4261,
"step": 1403
},
{
"epoch": 1.8215444516547696,
"grad_norm": 0.21459274849252136,
"learning_rate": 2.1813371813371815e-05,
"loss": 0.4346,
"step": 1404
},
{
"epoch": 1.8228423101881894,
"grad_norm": 0.2078147272516738,
"learning_rate": 2.178932178932179e-05,
"loss": 0.4092,
"step": 1405
},
{
"epoch": 1.8241401687216094,
"grad_norm": 0.24580373752808737,
"learning_rate": 2.1765271765271768e-05,
"loss": 0.4233,
"step": 1406
},
{
"epoch": 1.8254380272550292,
"grad_norm": 0.22454106978014404,
"learning_rate": 2.1741221741221743e-05,
"loss": 0.4147,
"step": 1407
},
{
"epoch": 1.826735885788449,
"grad_norm": 0.21599295500558674,
"learning_rate": 2.171717171717172e-05,
"loss": 0.4349,
"step": 1408
},
{
"epoch": 1.828033744321869,
"grad_norm": 0.2127033614651673,
"learning_rate": 2.1693121693121692e-05,
"loss": 0.4206,
"step": 1409
},
{
"epoch": 1.8293316028552886,
"grad_norm": 0.21925596786696352,
"learning_rate": 2.166907166907167e-05,
"loss": 0.4399,
"step": 1410
},
{
"epoch": 1.8306294613887086,
"grad_norm": 0.21016324905145667,
"learning_rate": 2.1645021645021645e-05,
"loss": 0.4222,
"step": 1411
},
{
"epoch": 1.8319273199221286,
"grad_norm": 0.22520381391920555,
"learning_rate": 2.1620971620971623e-05,
"loss": 0.4258,
"step": 1412
},
{
"epoch": 1.8332251784555482,
"grad_norm": 0.22141690462102792,
"learning_rate": 2.1596921596921598e-05,
"loss": 0.412,
"step": 1413
},
{
"epoch": 1.8345230369889682,
"grad_norm": 0.2429839281627191,
"learning_rate": 2.1572871572871576e-05,
"loss": 0.4269,
"step": 1414
},
{
"epoch": 1.835820895522388,
"grad_norm": 0.2160140354835784,
"learning_rate": 2.1548821548821547e-05,
"loss": 0.4205,
"step": 1415
},
{
"epoch": 1.8371187540558078,
"grad_norm": 0.2402260672982623,
"learning_rate": 2.1524771524771526e-05,
"loss": 0.4193,
"step": 1416
},
{
"epoch": 1.8384166125892278,
"grad_norm": 0.29744843810112265,
"learning_rate": 2.15007215007215e-05,
"loss": 0.4325,
"step": 1417
},
{
"epoch": 1.8397144711226476,
"grad_norm": 0.22530015703559994,
"learning_rate": 2.147667147667148e-05,
"loss": 0.4318,
"step": 1418
},
{
"epoch": 1.8410123296560674,
"grad_norm": 0.29390956909610316,
"learning_rate": 2.1452621452621453e-05,
"loss": 0.4288,
"step": 1419
},
{
"epoch": 1.8423101881894874,
"grad_norm": 0.23358318964698258,
"learning_rate": 2.1428571428571428e-05,
"loss": 0.4084,
"step": 1420
},
{
"epoch": 1.8436080467229072,
"grad_norm": 0.21167664114993395,
"learning_rate": 2.1404521404521406e-05,
"loss": 0.4313,
"step": 1421
},
{
"epoch": 1.844905905256327,
"grad_norm": 0.2460698519801602,
"learning_rate": 2.138047138047138e-05,
"loss": 0.4155,
"step": 1422
},
{
"epoch": 1.846203763789747,
"grad_norm": 0.23025941782631765,
"learning_rate": 2.135642135642136e-05,
"loss": 0.4057,
"step": 1423
},
{
"epoch": 1.8475016223231666,
"grad_norm": 0.19865359681586736,
"learning_rate": 2.1332371332371334e-05,
"loss": 0.4075,
"step": 1424
},
{
"epoch": 1.8487994808565866,
"grad_norm": 0.22150167838157933,
"learning_rate": 2.1308321308321312e-05,
"loss": 0.4338,
"step": 1425
},
{
"epoch": 1.8500973393900066,
"grad_norm": 0.27381218064289997,
"learning_rate": 2.1284271284271284e-05,
"loss": 0.4385,
"step": 1426
},
{
"epoch": 1.8513951979234262,
"grad_norm": 0.2386126810899565,
"learning_rate": 2.1260221260221262e-05,
"loss": 0.4138,
"step": 1427
},
{
"epoch": 1.8526930564568462,
"grad_norm": 0.23844253499070778,
"learning_rate": 2.1236171236171237e-05,
"loss": 0.417,
"step": 1428
},
{
"epoch": 1.853990914990266,
"grad_norm": 0.24734871284649604,
"learning_rate": 2.1212121212121215e-05,
"loss": 0.4266,
"step": 1429
},
{
"epoch": 1.8552887735236858,
"grad_norm": 0.2581372866509555,
"learning_rate": 2.118807118807119e-05,
"loss": 0.4073,
"step": 1430
},
{
"epoch": 1.8565866320571058,
"grad_norm": 0.20591243236055737,
"learning_rate": 2.1164021164021164e-05,
"loss": 0.3973,
"step": 1431
},
{
"epoch": 1.8578844905905256,
"grad_norm": 0.25393718244850216,
"learning_rate": 2.113997113997114e-05,
"loss": 0.4237,
"step": 1432
},
{
"epoch": 1.8591823491239454,
"grad_norm": 0.256757051595813,
"learning_rate": 2.1115921115921117e-05,
"loss": 0.4276,
"step": 1433
},
{
"epoch": 1.8604802076573654,
"grad_norm": 0.2199746107316156,
"learning_rate": 2.1091871091871092e-05,
"loss": 0.4027,
"step": 1434
},
{
"epoch": 1.8617780661907852,
"grad_norm": 0.22993418151409517,
"learning_rate": 2.106782106782107e-05,
"loss": 0.4258,
"step": 1435
},
{
"epoch": 1.863075924724205,
"grad_norm": 0.23986794245337564,
"learning_rate": 2.1043771043771045e-05,
"loss": 0.4092,
"step": 1436
},
{
"epoch": 1.864373783257625,
"grad_norm": 0.2503767269878855,
"learning_rate": 2.101972101972102e-05,
"loss": 0.4337,
"step": 1437
},
{
"epoch": 1.8656716417910446,
"grad_norm": 0.19966379931345576,
"learning_rate": 2.0995670995670998e-05,
"loss": 0.4083,
"step": 1438
},
{
"epoch": 1.8669695003244646,
"grad_norm": 0.22975695557758422,
"learning_rate": 2.0971620971620973e-05,
"loss": 0.4155,
"step": 1439
},
{
"epoch": 1.8682673588578846,
"grad_norm": 0.26927614268096606,
"learning_rate": 2.094757094757095e-05,
"loss": 0.3885,
"step": 1440
},
{
"epoch": 1.8695652173913042,
"grad_norm": 0.20373734252329936,
"learning_rate": 2.0923520923520926e-05,
"loss": 0.4291,
"step": 1441
},
{
"epoch": 1.8708630759247242,
"grad_norm": 0.2683958306016899,
"learning_rate": 2.08994708994709e-05,
"loss": 0.433,
"step": 1442
},
{
"epoch": 1.872160934458144,
"grad_norm": 0.26568578858883407,
"learning_rate": 2.0875420875420875e-05,
"loss": 0.4379,
"step": 1443
},
{
"epoch": 1.8734587929915638,
"grad_norm": 0.2365065973899857,
"learning_rate": 2.0851370851370853e-05,
"loss": 0.4372,
"step": 1444
},
{
"epoch": 1.8747566515249838,
"grad_norm": 0.2160536365282337,
"learning_rate": 2.0827320827320828e-05,
"loss": 0.4249,
"step": 1445
},
{
"epoch": 1.8760545100584036,
"grad_norm": 0.2698594967367338,
"learning_rate": 2.0803270803270806e-05,
"loss": 0.4251,
"step": 1446
},
{
"epoch": 1.8773523685918234,
"grad_norm": 0.240476141319818,
"learning_rate": 2.077922077922078e-05,
"loss": 0.4051,
"step": 1447
},
{
"epoch": 1.8786502271252434,
"grad_norm": 0.20313150197250998,
"learning_rate": 2.0755170755170756e-05,
"loss": 0.4142,
"step": 1448
},
{
"epoch": 1.8799480856586632,
"grad_norm": 0.2513888218859537,
"learning_rate": 2.073112073112073e-05,
"loss": 0.4235,
"step": 1449
},
{
"epoch": 1.881245944192083,
"grad_norm": 0.263020254508393,
"learning_rate": 2.070707070707071e-05,
"loss": 0.4133,
"step": 1450
},
{
"epoch": 1.882543802725503,
"grad_norm": 0.19807928758761542,
"learning_rate": 2.0683020683020683e-05,
"loss": 0.4211,
"step": 1451
},
{
"epoch": 1.8838416612589228,
"grad_norm": 0.28553479995616016,
"learning_rate": 2.065897065897066e-05,
"loss": 0.4295,
"step": 1452
},
{
"epoch": 1.8851395197923426,
"grad_norm": 0.22454512768715873,
"learning_rate": 2.0634920634920636e-05,
"loss": 0.4164,
"step": 1453
},
{
"epoch": 1.8864373783257626,
"grad_norm": 0.2212283630425153,
"learning_rate": 2.061087061087061e-05,
"loss": 0.4309,
"step": 1454
},
{
"epoch": 1.8877352368591822,
"grad_norm": 0.21880750363041376,
"learning_rate": 2.058682058682059e-05,
"loss": 0.4148,
"step": 1455
},
{
"epoch": 1.8890330953926022,
"grad_norm": 0.2698709703952382,
"learning_rate": 2.0562770562770564e-05,
"loss": 0.4339,
"step": 1456
},
{
"epoch": 1.890330953926022,
"grad_norm": 0.21631366892137663,
"learning_rate": 2.0538720538720542e-05,
"loss": 0.4209,
"step": 1457
},
{
"epoch": 1.8916288124594418,
"grad_norm": 0.22312561756649457,
"learning_rate": 2.0514670514670517e-05,
"loss": 0.4205,
"step": 1458
},
{
"epoch": 1.8929266709928618,
"grad_norm": 0.22982817420831553,
"learning_rate": 2.0490620490620492e-05,
"loss": 0.4127,
"step": 1459
},
{
"epoch": 1.8942245295262816,
"grad_norm": 0.23011803773822845,
"learning_rate": 2.0466570466570467e-05,
"loss": 0.4018,
"step": 1460
},
{
"epoch": 1.8955223880597014,
"grad_norm": 0.2156774757909124,
"learning_rate": 2.0442520442520445e-05,
"loss": 0.448,
"step": 1461
},
{
"epoch": 1.8968202465931214,
"grad_norm": 0.2465313564522942,
"learning_rate": 2.041847041847042e-05,
"loss": 0.4433,
"step": 1462
},
{
"epoch": 1.8981181051265412,
"grad_norm": 0.21906089100592563,
"learning_rate": 2.0394420394420398e-05,
"loss": 0.4103,
"step": 1463
},
{
"epoch": 1.899415963659961,
"grad_norm": 0.21024875722994074,
"learning_rate": 2.037037037037037e-05,
"loss": 0.3951,
"step": 1464
},
{
"epoch": 1.900713822193381,
"grad_norm": 0.18955824822069273,
"learning_rate": 2.0346320346320347e-05,
"loss": 0.4174,
"step": 1465
},
{
"epoch": 1.9020116807268008,
"grad_norm": 0.23144049792472646,
"learning_rate": 2.0322270322270322e-05,
"loss": 0.3998,
"step": 1466
},
{
"epoch": 1.9033095392602206,
"grad_norm": 0.21081067517865779,
"learning_rate": 2.02982202982203e-05,
"loss": 0.4103,
"step": 1467
},
{
"epoch": 1.9046073977936406,
"grad_norm": 0.2194010873045385,
"learning_rate": 2.0274170274170275e-05,
"loss": 0.4123,
"step": 1468
},
{
"epoch": 1.9059052563270602,
"grad_norm": 0.20757690294910305,
"learning_rate": 2.025012025012025e-05,
"loss": 0.4012,
"step": 1469
},
{
"epoch": 1.9072031148604802,
"grad_norm": 0.2628488845364361,
"learning_rate": 2.0226070226070225e-05,
"loss": 0.4118,
"step": 1470
},
{
"epoch": 1.9085009733939,
"grad_norm": 0.2503984380267546,
"learning_rate": 2.0202020202020203e-05,
"loss": 0.4277,
"step": 1471
},
{
"epoch": 1.9097988319273198,
"grad_norm": 0.20731051341055765,
"learning_rate": 2.0177970177970177e-05,
"loss": 0.4223,
"step": 1472
},
{
"epoch": 1.9110966904607398,
"grad_norm": 0.2469892137078129,
"learning_rate": 2.0153920153920156e-05,
"loss": 0.4278,
"step": 1473
},
{
"epoch": 1.9123945489941596,
"grad_norm": 0.2302509090632293,
"learning_rate": 2.012987012987013e-05,
"loss": 0.4286,
"step": 1474
},
{
"epoch": 1.9136924075275794,
"grad_norm": 0.21560820581873713,
"learning_rate": 2.0105820105820105e-05,
"loss": 0.4036,
"step": 1475
},
{
"epoch": 1.9149902660609994,
"grad_norm": 0.21761526673837062,
"learning_rate": 2.0081770081770083e-05,
"loss": 0.4383,
"step": 1476
},
{
"epoch": 1.9162881245944192,
"grad_norm": 0.25419859148323953,
"learning_rate": 2.0057720057720058e-05,
"loss": 0.4168,
"step": 1477
},
{
"epoch": 1.917585983127839,
"grad_norm": 0.21447148417291215,
"learning_rate": 2.0033670033670036e-05,
"loss": 0.4233,
"step": 1478
},
{
"epoch": 1.918883841661259,
"grad_norm": 0.22177181102304355,
"learning_rate": 2.000962000962001e-05,
"loss": 0.3973,
"step": 1479
},
{
"epoch": 1.9201817001946788,
"grad_norm": 0.2361964777550035,
"learning_rate": 1.9985569985569986e-05,
"loss": 0.4062,
"step": 1480
},
{
"epoch": 1.9214795587280986,
"grad_norm": 0.2268625180335479,
"learning_rate": 1.996151996151996e-05,
"loss": 0.4288,
"step": 1481
},
{
"epoch": 1.9227774172615186,
"grad_norm": 0.22109891172640594,
"learning_rate": 1.993746993746994e-05,
"loss": 0.4168,
"step": 1482
},
{
"epoch": 1.9240752757949382,
"grad_norm": 0.24199558362942594,
"learning_rate": 1.9913419913419914e-05,
"loss": 0.4368,
"step": 1483
},
{
"epoch": 1.9253731343283582,
"grad_norm": 0.23386196057480746,
"learning_rate": 1.9889369889369892e-05,
"loss": 0.4929,
"step": 1484
},
{
"epoch": 1.9266709928617782,
"grad_norm": 2.3863223206635507,
"learning_rate": 1.9865319865319866e-05,
"loss": 0.4321,
"step": 1485
},
{
"epoch": 1.9279688513951978,
"grad_norm": 0.2117744722116347,
"learning_rate": 1.984126984126984e-05,
"loss": 0.4046,
"step": 1486
},
{
"epoch": 1.9292667099286178,
"grad_norm": 0.23753639694866985,
"learning_rate": 1.9817219817219816e-05,
"loss": 0.4088,
"step": 1487
},
{
"epoch": 1.9305645684620376,
"grad_norm": 0.2029549567060751,
"learning_rate": 1.9793169793169794e-05,
"loss": 0.4129,
"step": 1488
},
{
"epoch": 1.9318624269954574,
"grad_norm": 0.20999056789664505,
"learning_rate": 1.976911976911977e-05,
"loss": 0.4149,
"step": 1489
},
{
"epoch": 1.9331602855288774,
"grad_norm": 0.25609740431868805,
"learning_rate": 1.9745069745069747e-05,
"loss": 0.4422,
"step": 1490
},
{
"epoch": 1.9344581440622972,
"grad_norm": 0.23240223312760538,
"learning_rate": 1.9721019721019722e-05,
"loss": 0.4376,
"step": 1491
},
{
"epoch": 1.935756002595717,
"grad_norm": 0.22288941915151747,
"learning_rate": 1.9696969696969697e-05,
"loss": 0.4031,
"step": 1492
},
{
"epoch": 1.937053861129137,
"grad_norm": 0.20823811668022293,
"learning_rate": 1.9672919672919675e-05,
"loss": 0.4141,
"step": 1493
},
{
"epoch": 1.9383517196625568,
"grad_norm": 0.22958571482808876,
"learning_rate": 1.964886964886965e-05,
"loss": 0.4205,
"step": 1494
},
{
"epoch": 1.9396495781959766,
"grad_norm": 0.24269527070284858,
"learning_rate": 1.9624819624819628e-05,
"loss": 0.414,
"step": 1495
},
{
"epoch": 1.9409474367293966,
"grad_norm": 0.2070554761070819,
"learning_rate": 1.9600769600769603e-05,
"loss": 0.4045,
"step": 1496
},
{
"epoch": 1.9422452952628162,
"grad_norm": 0.24376293095622897,
"learning_rate": 1.9576719576719577e-05,
"loss": 0.4257,
"step": 1497
},
{
"epoch": 1.9435431537962362,
"grad_norm": 0.2254498669697948,
"learning_rate": 1.9552669552669552e-05,
"loss": 0.421,
"step": 1498
},
{
"epoch": 1.9448410123296562,
"grad_norm": 0.21748513808130843,
"learning_rate": 1.952861952861953e-05,
"loss": 0.4062,
"step": 1499
},
{
"epoch": 1.9461388708630758,
"grad_norm": 0.2148376810996354,
"learning_rate": 1.9504569504569505e-05,
"loss": 0.4203,
"step": 1500
},
{
"epoch": 1.9474367293964958,
"grad_norm": 0.25871259714383205,
"learning_rate": 1.9480519480519483e-05,
"loss": 0.4229,
"step": 1501
},
{
"epoch": 1.9487345879299156,
"grad_norm": 0.19582347887373358,
"learning_rate": 1.9456469456469455e-05,
"loss": 0.4081,
"step": 1502
},
{
"epoch": 1.9500324464633354,
"grad_norm": 0.22789399470009464,
"learning_rate": 1.9432419432419433e-05,
"loss": 0.4245,
"step": 1503
},
{
"epoch": 1.9513303049967554,
"grad_norm": 0.23018173092515049,
"learning_rate": 1.9408369408369408e-05,
"loss": 0.4216,
"step": 1504
},
{
"epoch": 1.9526281635301752,
"grad_norm": 0.21444832133823605,
"learning_rate": 1.9384319384319386e-05,
"loss": 0.4147,
"step": 1505
},
{
"epoch": 1.953926022063595,
"grad_norm": 0.20562647584839736,
"learning_rate": 1.936026936026936e-05,
"loss": 0.4008,
"step": 1506
},
{
"epoch": 1.955223880597015,
"grad_norm": 0.23110467483063488,
"learning_rate": 1.933621933621934e-05,
"loss": 0.3972,
"step": 1507
},
{
"epoch": 1.9565217391304348,
"grad_norm": 0.21834150518213843,
"learning_rate": 1.9312169312169313e-05,
"loss": 0.4256,
"step": 1508
},
{
"epoch": 1.9578195976638546,
"grad_norm": 0.21958450668275112,
"learning_rate": 1.9288119288119288e-05,
"loss": 0.4088,
"step": 1509
},
{
"epoch": 1.9591174561972746,
"grad_norm": 0.20052094185224426,
"learning_rate": 1.9264069264069266e-05,
"loss": 0.4121,
"step": 1510
},
{
"epoch": 1.9604153147306942,
"grad_norm": 0.24326880305407378,
"learning_rate": 1.924001924001924e-05,
"loss": 0.422,
"step": 1511
},
{
"epoch": 1.9617131732641142,
"grad_norm": 0.2410106190975958,
"learning_rate": 1.921596921596922e-05,
"loss": 0.4085,
"step": 1512
},
{
"epoch": 1.9630110317975342,
"grad_norm": 0.20377491892233185,
"learning_rate": 1.919191919191919e-05,
"loss": 0.4312,
"step": 1513
},
{
"epoch": 1.9643088903309538,
"grad_norm": 0.22992091739225845,
"learning_rate": 1.916786916786917e-05,
"loss": 0.4283,
"step": 1514
},
{
"epoch": 1.9656067488643738,
"grad_norm": 0.23320180740415136,
"learning_rate": 1.9143819143819144e-05,
"loss": 0.4167,
"step": 1515
},
{
"epoch": 1.9669046073977936,
"grad_norm": 0.21478096347520134,
"learning_rate": 1.9119769119769122e-05,
"loss": 0.4373,
"step": 1516
},
{
"epoch": 1.9682024659312134,
"grad_norm": 0.24312143244424492,
"learning_rate": 1.9095719095719097e-05,
"loss": 0.4384,
"step": 1517
},
{
"epoch": 1.9695003244646334,
"grad_norm": 0.22013684407762923,
"learning_rate": 1.9071669071669075e-05,
"loss": 0.4244,
"step": 1518
},
{
"epoch": 1.9707981829980532,
"grad_norm": 0.22394887240003014,
"learning_rate": 1.9047619047619046e-05,
"loss": 0.419,
"step": 1519
},
{
"epoch": 1.972096041531473,
"grad_norm": 0.2319362635386066,
"learning_rate": 1.9023569023569024e-05,
"loss": 0.4059,
"step": 1520
},
{
"epoch": 1.973393900064893,
"grad_norm": 0.1980072863895625,
"learning_rate": 1.8999518999519e-05,
"loss": 0.4012,
"step": 1521
},
{
"epoch": 1.9746917585983128,
"grad_norm": 0.21340412052310542,
"learning_rate": 1.8975468975468977e-05,
"loss": 0.4252,
"step": 1522
},
{
"epoch": 1.9759896171317326,
"grad_norm": 0.20523875562201788,
"learning_rate": 1.8951418951418952e-05,
"loss": 0.3954,
"step": 1523
},
{
"epoch": 1.9772874756651526,
"grad_norm": 0.21593958433489607,
"learning_rate": 1.8927368927368927e-05,
"loss": 0.4204,
"step": 1524
},
{
"epoch": 1.9785853341985724,
"grad_norm": 0.1982941991349422,
"learning_rate": 1.8903318903318905e-05,
"loss": 0.3982,
"step": 1525
},
{
"epoch": 1.9798831927319922,
"grad_norm": 0.20466190375196575,
"learning_rate": 1.887926887926888e-05,
"loss": 0.4134,
"step": 1526
},
{
"epoch": 1.9811810512654122,
"grad_norm": 0.21442210205444864,
"learning_rate": 1.8855218855218858e-05,
"loss": 0.4221,
"step": 1527
},
{
"epoch": 1.9824789097988318,
"grad_norm": 0.2077434816627499,
"learning_rate": 1.8831168831168833e-05,
"loss": 0.4168,
"step": 1528
},
{
"epoch": 1.9837767683322518,
"grad_norm": 0.20932779718622976,
"learning_rate": 1.880711880711881e-05,
"loss": 0.4129,
"step": 1529
},
{
"epoch": 1.9850746268656716,
"grad_norm": 0.20073142812922465,
"learning_rate": 1.8783068783068782e-05,
"loss": 0.4282,
"step": 1530
},
{
"epoch": 1.9863724853990914,
"grad_norm": 0.21070295646641607,
"learning_rate": 1.875901875901876e-05,
"loss": 0.414,
"step": 1531
},
{
"epoch": 1.9876703439325114,
"grad_norm": 0.1983254407503139,
"learning_rate": 1.8734968734968735e-05,
"loss": 0.4246,
"step": 1532
},
{
"epoch": 1.9889682024659312,
"grad_norm": 0.2063440750783136,
"learning_rate": 1.8710918710918713e-05,
"loss": 0.4127,
"step": 1533
},
{
"epoch": 1.990266060999351,
"grad_norm": 0.20062950578015543,
"learning_rate": 1.8686868686868688e-05,
"loss": 0.404,
"step": 1534
},
{
"epoch": 1.991563919532771,
"grad_norm": 0.22191712136507424,
"learning_rate": 1.8662818662818663e-05,
"loss": 0.4119,
"step": 1535
},
{
"epoch": 1.9928617780661908,
"grad_norm": 0.2215336165604822,
"learning_rate": 1.8638768638768638e-05,
"loss": 0.4121,
"step": 1536
},
{
"epoch": 1.9941596365996106,
"grad_norm": 0.20271253230410582,
"learning_rate": 1.8614718614718616e-05,
"loss": 0.4023,
"step": 1537
},
{
"epoch": 1.9954574951330306,
"grad_norm": 0.26159702346568764,
"learning_rate": 1.859066859066859e-05,
"loss": 0.4102,
"step": 1538
},
{
"epoch": 1.9967553536664504,
"grad_norm": 0.21830457585192162,
"learning_rate": 1.856661856661857e-05,
"loss": 0.4129,
"step": 1539
},
{
"epoch": 1.9980532121998702,
"grad_norm": 0.21634003518886286,
"learning_rate": 1.8542568542568544e-05,
"loss": 0.3979,
"step": 1540
},
{
"epoch": 1.9993510707332902,
"grad_norm": 0.21520818627840688,
"learning_rate": 1.8518518518518518e-05,
"loss": 0.4072,
"step": 1541
},
{
"epoch": 2.0,
"grad_norm": 0.3162004070666462,
"learning_rate": 1.8494468494468496e-05,
"loss": 0.3731,
"step": 1542
},
{
"epoch": 2.00129785853342,
"grad_norm": 0.3002746540509754,
"learning_rate": 1.847041847041847e-05,
"loss": 0.351,
"step": 1543
},
{
"epoch": 2.0025957170668396,
"grad_norm": 0.22179389161987628,
"learning_rate": 1.844636844636845e-05,
"loss": 0.3591,
"step": 1544
},
{
"epoch": 2.0038935756002596,
"grad_norm": 0.29853309659990307,
"learning_rate": 1.8422318422318424e-05,
"loss": 0.3502,
"step": 1545
},
{
"epoch": 2.0051914341336796,
"grad_norm": 0.27887815631774426,
"learning_rate": 1.83982683982684e-05,
"loss": 0.3397,
"step": 1546
},
{
"epoch": 2.006489292667099,
"grad_norm": 0.21973812049478386,
"learning_rate": 1.8374218374218374e-05,
"loss": 0.3429,
"step": 1547
},
{
"epoch": 2.007787151200519,
"grad_norm": 0.26933885808676494,
"learning_rate": 1.8350168350168352e-05,
"loss": 0.3436,
"step": 1548
},
{
"epoch": 2.009085009733939,
"grad_norm": 0.2996171251030684,
"learning_rate": 1.8326118326118327e-05,
"loss": 0.3409,
"step": 1549
},
{
"epoch": 2.010382868267359,
"grad_norm": 0.24083124258461439,
"learning_rate": 1.8302068302068305e-05,
"loss": 0.3386,
"step": 1550
},
{
"epoch": 2.011680726800779,
"grad_norm": 0.30787012297971555,
"learning_rate": 1.827801827801828e-05,
"loss": 0.3478,
"step": 1551
},
{
"epoch": 2.0129785853341984,
"grad_norm": 0.2857849577396285,
"learning_rate": 1.8253968253968254e-05,
"loss": 0.347,
"step": 1552
},
{
"epoch": 2.0142764438676184,
"grad_norm": 0.2458814691129703,
"learning_rate": 1.822991822991823e-05,
"loss": 0.338,
"step": 1553
},
{
"epoch": 2.0155743024010384,
"grad_norm": 0.24217238914022393,
"learning_rate": 1.8205868205868207e-05,
"loss": 0.3527,
"step": 1554
},
{
"epoch": 2.016872160934458,
"grad_norm": 0.2708034381508514,
"learning_rate": 1.8181818181818182e-05,
"loss": 0.343,
"step": 1555
},
{
"epoch": 2.018170019467878,
"grad_norm": 0.2771300303365467,
"learning_rate": 1.815776815776816e-05,
"loss": 0.3578,
"step": 1556
},
{
"epoch": 2.019467878001298,
"grad_norm": 0.24314761286723605,
"learning_rate": 1.8133718133718135e-05,
"loss": 0.3628,
"step": 1557
},
{
"epoch": 2.0207657365347176,
"grad_norm": 0.24528635873262158,
"learning_rate": 1.810966810966811e-05,
"loss": 0.3478,
"step": 1558
},
{
"epoch": 2.0220635950681376,
"grad_norm": 0.2441718287388452,
"learning_rate": 1.8085618085618085e-05,
"loss": 0.3463,
"step": 1559
},
{
"epoch": 2.0233614536015576,
"grad_norm": 0.2429476875932917,
"learning_rate": 1.8061568061568063e-05,
"loss": 0.3533,
"step": 1560
},
{
"epoch": 2.024659312134977,
"grad_norm": 0.2584288721917747,
"learning_rate": 1.8037518037518038e-05,
"loss": 0.3443,
"step": 1561
},
{
"epoch": 2.025957170668397,
"grad_norm": 0.21619552093380776,
"learning_rate": 1.8013468013468016e-05,
"loss": 0.3313,
"step": 1562
},
{
"epoch": 2.027255029201817,
"grad_norm": 0.22416123032351348,
"learning_rate": 1.798941798941799e-05,
"loss": 0.3396,
"step": 1563
},
{
"epoch": 2.028552887735237,
"grad_norm": 0.23762867221262482,
"learning_rate": 1.7965367965367965e-05,
"loss": 0.3328,
"step": 1564
},
{
"epoch": 2.029850746268657,
"grad_norm": 0.23910726203342522,
"learning_rate": 1.7941317941317943e-05,
"loss": 0.3373,
"step": 1565
},
{
"epoch": 2.0311486048020764,
"grad_norm": 0.2270224344033579,
"learning_rate": 1.7917267917267918e-05,
"loss": 0.3414,
"step": 1566
},
{
"epoch": 2.0324464633354964,
"grad_norm": 0.22685492291746387,
"learning_rate": 1.7893217893217896e-05,
"loss": 0.334,
"step": 1567
},
{
"epoch": 2.0337443218689164,
"grad_norm": 0.23079013731214076,
"learning_rate": 1.7869167869167868e-05,
"loss": 0.3373,
"step": 1568
},
{
"epoch": 2.035042180402336,
"grad_norm": 0.23832074407215584,
"learning_rate": 1.7845117845117846e-05,
"loss": 0.3402,
"step": 1569
},
{
"epoch": 2.036340038935756,
"grad_norm": 0.2413146931238051,
"learning_rate": 1.782106782106782e-05,
"loss": 0.3476,
"step": 1570
},
{
"epoch": 2.037637897469176,
"grad_norm": 0.23685355784727574,
"learning_rate": 1.77970177970178e-05,
"loss": 0.3397,
"step": 1571
},
{
"epoch": 2.0389357560025956,
"grad_norm": 0.24437850977020956,
"learning_rate": 1.7772967772967774e-05,
"loss": 0.3445,
"step": 1572
},
{
"epoch": 2.0402336145360156,
"grad_norm": 0.22724458516557208,
"learning_rate": 1.7748917748917752e-05,
"loss": 0.3443,
"step": 1573
},
{
"epoch": 2.0415314730694356,
"grad_norm": 0.23475541449011794,
"learning_rate": 1.7724867724867723e-05,
"loss": 0.3432,
"step": 1574
},
{
"epoch": 2.042829331602855,
"grad_norm": 0.21469511225658197,
"learning_rate": 1.77008177008177e-05,
"loss": 0.3473,
"step": 1575
},
{
"epoch": 2.044127190136275,
"grad_norm": 0.22486022557380209,
"learning_rate": 1.7676767676767676e-05,
"loss": 0.3566,
"step": 1576
},
{
"epoch": 2.045425048669695,
"grad_norm": 0.22895350371242218,
"learning_rate": 1.7652717652717654e-05,
"loss": 0.3478,
"step": 1577
},
{
"epoch": 2.046722907203115,
"grad_norm": 0.24538812579393815,
"learning_rate": 1.762866762866763e-05,
"loss": 0.3332,
"step": 1578
},
{
"epoch": 2.048020765736535,
"grad_norm": 0.25912360209705504,
"learning_rate": 1.7604617604617604e-05,
"loss": 0.344,
"step": 1579
},
{
"epoch": 2.0493186242699544,
"grad_norm": 0.19959244601082998,
"learning_rate": 1.7580567580567582e-05,
"loss": 0.3337,
"step": 1580
},
{
"epoch": 2.0506164828033744,
"grad_norm": 0.22265382752478494,
"learning_rate": 1.7556517556517557e-05,
"loss": 0.3385,
"step": 1581
},
{
"epoch": 2.0519143413367944,
"grad_norm": 0.2165757161328648,
"learning_rate": 1.7532467532467535e-05,
"loss": 0.3339,
"step": 1582
},
{
"epoch": 2.053212199870214,
"grad_norm": 0.21372021503164076,
"learning_rate": 1.750841750841751e-05,
"loss": 0.3507,
"step": 1583
},
{
"epoch": 2.054510058403634,
"grad_norm": 0.2336377004408556,
"learning_rate": 1.7484367484367488e-05,
"loss": 0.3491,
"step": 1584
},
{
"epoch": 2.055807916937054,
"grad_norm": 0.2117993328839063,
"learning_rate": 1.746031746031746e-05,
"loss": 0.3407,
"step": 1585
},
{
"epoch": 2.0571057754704736,
"grad_norm": 0.21231266244658922,
"learning_rate": 1.7436267436267437e-05,
"loss": 0.3444,
"step": 1586
},
{
"epoch": 2.0584036340038936,
"grad_norm": 0.21032023819015722,
"learning_rate": 1.7412217412217412e-05,
"loss": 0.3398,
"step": 1587
},
{
"epoch": 2.0597014925373136,
"grad_norm": 0.2371048058409055,
"learning_rate": 1.738816738816739e-05,
"loss": 0.3445,
"step": 1588
},
{
"epoch": 2.060999351070733,
"grad_norm": 0.2059222075267882,
"learning_rate": 1.7364117364117365e-05,
"loss": 0.3443,
"step": 1589
},
{
"epoch": 2.062297209604153,
"grad_norm": 0.22719406397240552,
"learning_rate": 1.734006734006734e-05,
"loss": 0.3495,
"step": 1590
},
{
"epoch": 2.063595068137573,
"grad_norm": 0.2222707506963988,
"learning_rate": 1.7316017316017315e-05,
"loss": 0.3399,
"step": 1591
},
{
"epoch": 2.064892926670993,
"grad_norm": 0.22555682797470167,
"learning_rate": 1.7291967291967293e-05,
"loss": 0.3383,
"step": 1592
},
{
"epoch": 2.066190785204413,
"grad_norm": 0.22889368520998704,
"learning_rate": 1.7267917267917268e-05,
"loss": 0.3418,
"step": 1593
},
{
"epoch": 2.0674886437378324,
"grad_norm": 0.22102058324621057,
"learning_rate": 1.7243867243867246e-05,
"loss": 0.3325,
"step": 1594
},
{
"epoch": 2.0687865022712524,
"grad_norm": 0.23774221641545448,
"learning_rate": 1.721981721981722e-05,
"loss": 0.3369,
"step": 1595
},
{
"epoch": 2.0700843608046724,
"grad_norm": 0.24890061582412498,
"learning_rate": 1.7195767195767195e-05,
"loss": 0.3441,
"step": 1596
},
{
"epoch": 2.071382219338092,
"grad_norm": 0.2100873376295878,
"learning_rate": 1.7171717171717173e-05,
"loss": 0.3303,
"step": 1597
},
{
"epoch": 2.072680077871512,
"grad_norm": 0.22680230775268373,
"learning_rate": 1.7147667147667148e-05,
"loss": 0.3371,
"step": 1598
},
{
"epoch": 2.073977936404932,
"grad_norm": 0.20382351560763964,
"learning_rate": 1.7123617123617126e-05,
"loss": 0.3337,
"step": 1599
},
{
"epoch": 2.0752757949383516,
"grad_norm": 0.21534525112062658,
"learning_rate": 1.70995670995671e-05,
"loss": 0.3371,
"step": 1600
},
{
"epoch": 2.0765736534717716,
"grad_norm": 0.21827103094501965,
"learning_rate": 1.7075517075517076e-05,
"loss": 0.3357,
"step": 1601
},
{
"epoch": 2.0778715120051916,
"grad_norm": 0.21047818536264323,
"learning_rate": 1.705146705146705e-05,
"loss": 0.3358,
"step": 1602
},
{
"epoch": 2.079169370538611,
"grad_norm": 0.24359393587806577,
"learning_rate": 1.702741702741703e-05,
"loss": 0.3409,
"step": 1603
},
{
"epoch": 2.080467229072031,
"grad_norm": 0.32771668536616283,
"learning_rate": 1.7003367003367004e-05,
"loss": 0.3296,
"step": 1604
},
{
"epoch": 2.081765087605451,
"grad_norm": 0.2105738510166506,
"learning_rate": 1.6979316979316982e-05,
"loss": 0.3471,
"step": 1605
},
{
"epoch": 2.0830629461388708,
"grad_norm": 0.2609805732511619,
"learning_rate": 1.6955266955266957e-05,
"loss": 0.3395,
"step": 1606
},
{
"epoch": 2.084360804672291,
"grad_norm": 0.20813077643429093,
"learning_rate": 1.693121693121693e-05,
"loss": 0.3323,
"step": 1607
},
{
"epoch": 2.0856586632057104,
"grad_norm": 0.20588845509767667,
"learning_rate": 1.6907166907166906e-05,
"loss": 0.3395,
"step": 1608
},
{
"epoch": 2.0869565217391304,
"grad_norm": 0.2231300777445713,
"learning_rate": 1.6883116883116884e-05,
"loss": 0.3448,
"step": 1609
},
{
"epoch": 2.0882543802725504,
"grad_norm": 0.2262598516285643,
"learning_rate": 1.685906685906686e-05,
"loss": 0.3524,
"step": 1610
},
{
"epoch": 2.08955223880597,
"grad_norm": 0.2112479505042923,
"learning_rate": 1.6835016835016837e-05,
"loss": 0.335,
"step": 1611
},
{
"epoch": 2.09085009733939,
"grad_norm": 0.22189847243097133,
"learning_rate": 1.6810966810966812e-05,
"loss": 0.3312,
"step": 1612
},
{
"epoch": 2.09214795587281,
"grad_norm": 0.21750458672162346,
"learning_rate": 1.6786916786916787e-05,
"loss": 0.3382,
"step": 1613
},
{
"epoch": 2.0934458144062296,
"grad_norm": 0.22791119863516698,
"learning_rate": 1.6762866762866765e-05,
"loss": 0.3346,
"step": 1614
},
{
"epoch": 2.0947436729396496,
"grad_norm": 0.2253801306495037,
"learning_rate": 1.673881673881674e-05,
"loss": 0.3587,
"step": 1615
},
{
"epoch": 2.0960415314730696,
"grad_norm": 0.21941342953368462,
"learning_rate": 1.6714766714766718e-05,
"loss": 0.322,
"step": 1616
},
{
"epoch": 2.097339390006489,
"grad_norm": 0.24442422654973892,
"learning_rate": 1.6690716690716693e-05,
"loss": 0.3613,
"step": 1617
},
{
"epoch": 2.098637248539909,
"grad_norm": 0.20892381899190737,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.3372,
"step": 1618
},
{
"epoch": 2.099935107073329,
"grad_norm": 0.2204239619365003,
"learning_rate": 1.6642616642616642e-05,
"loss": 0.345,
"step": 1619
},
{
"epoch": 2.1012329656067488,
"grad_norm": 0.2433539902885864,
"learning_rate": 1.661856661856662e-05,
"loss": 0.3399,
"step": 1620
},
{
"epoch": 2.102530824140169,
"grad_norm": 0.21756713439856754,
"learning_rate": 1.6594516594516595e-05,
"loss": 0.3258,
"step": 1621
},
{
"epoch": 2.103828682673589,
"grad_norm": 0.21185754465198486,
"learning_rate": 1.6570466570466573e-05,
"loss": 0.3338,
"step": 1622
},
{
"epoch": 2.1051265412070084,
"grad_norm": 0.19886634840357484,
"learning_rate": 1.6546416546416545e-05,
"loss": 0.3379,
"step": 1623
},
{
"epoch": 2.1064243997404284,
"grad_norm": 0.23378633487014983,
"learning_rate": 1.6522366522366523e-05,
"loss": 0.3518,
"step": 1624
},
{
"epoch": 2.107722258273848,
"grad_norm": 0.22367396102680723,
"learning_rate": 1.6498316498316498e-05,
"loss": 0.3341,
"step": 1625
},
{
"epoch": 2.109020116807268,
"grad_norm": 0.2063911182342616,
"learning_rate": 1.6474266474266476e-05,
"loss": 0.3446,
"step": 1626
},
{
"epoch": 2.110317975340688,
"grad_norm": 0.22050227302851286,
"learning_rate": 1.645021645021645e-05,
"loss": 0.3359,
"step": 1627
},
{
"epoch": 2.1116158338741076,
"grad_norm": 0.24896315784423037,
"learning_rate": 1.642616642616643e-05,
"loss": 0.3311,
"step": 1628
},
{
"epoch": 2.1129136924075276,
"grad_norm": 0.24014540903069778,
"learning_rate": 1.6402116402116404e-05,
"loss": 0.3455,
"step": 1629
},
{
"epoch": 2.1142115509409476,
"grad_norm": 0.22208241595868494,
"learning_rate": 1.637806637806638e-05,
"loss": 0.3609,
"step": 1630
},
{
"epoch": 2.115509409474367,
"grad_norm": 0.25185975208100675,
"learning_rate": 1.6354016354016356e-05,
"loss": 0.3355,
"step": 1631
},
{
"epoch": 2.116807268007787,
"grad_norm": 0.25068577932779473,
"learning_rate": 1.632996632996633e-05,
"loss": 0.3378,
"step": 1632
},
{
"epoch": 2.118105126541207,
"grad_norm": 0.23743465262529753,
"learning_rate": 1.630591630591631e-05,
"loss": 0.3453,
"step": 1633
},
{
"epoch": 2.1194029850746268,
"grad_norm": 0.2873478730260964,
"learning_rate": 1.628186628186628e-05,
"loss": 0.3469,
"step": 1634
},
{
"epoch": 2.120700843608047,
"grad_norm": 0.23190098964145786,
"learning_rate": 1.625781625781626e-05,
"loss": 0.3299,
"step": 1635
},
{
"epoch": 2.1219987021414664,
"grad_norm": 0.24692649408648704,
"learning_rate": 1.6233766233766234e-05,
"loss": 0.3413,
"step": 1636
},
{
"epoch": 2.1232965606748864,
"grad_norm": 0.2601882707422891,
"learning_rate": 1.6209716209716212e-05,
"loss": 0.3237,
"step": 1637
},
{
"epoch": 2.1245944192083064,
"grad_norm": 0.2307540235499652,
"learning_rate": 1.6185666185666187e-05,
"loss": 0.3479,
"step": 1638
},
{
"epoch": 2.125892277741726,
"grad_norm": 0.24207343026907496,
"learning_rate": 1.6161616161616165e-05,
"loss": 0.342,
"step": 1639
},
{
"epoch": 2.127190136275146,
"grad_norm": 0.25704183477977455,
"learning_rate": 1.6137566137566136e-05,
"loss": 0.347,
"step": 1640
},
{
"epoch": 2.128487994808566,
"grad_norm": 0.2204667533876293,
"learning_rate": 1.6113516113516114e-05,
"loss": 0.3421,
"step": 1641
},
{
"epoch": 2.1297858533419856,
"grad_norm": 0.209970904431735,
"learning_rate": 1.608946608946609e-05,
"loss": 0.3434,
"step": 1642
},
{
"epoch": 2.1310837118754056,
"grad_norm": 0.2322772777553896,
"learning_rate": 1.6065416065416067e-05,
"loss": 0.3462,
"step": 1643
},
{
"epoch": 2.1323815704088256,
"grad_norm": 0.24539006715088627,
"learning_rate": 1.6041366041366042e-05,
"loss": 0.3474,
"step": 1644
},
{
"epoch": 2.133679428942245,
"grad_norm": 0.2071657985362404,
"learning_rate": 1.6017316017316017e-05,
"loss": 0.315,
"step": 1645
},
{
"epoch": 2.134977287475665,
"grad_norm": 0.22559015132345064,
"learning_rate": 1.5993265993265995e-05,
"loss": 0.3435,
"step": 1646
},
{
"epoch": 2.136275146009085,
"grad_norm": 0.21866568313609377,
"learning_rate": 1.596921596921597e-05,
"loss": 0.3214,
"step": 1647
},
{
"epoch": 2.1375730045425048,
"grad_norm": 0.22054227732700918,
"learning_rate": 1.5945165945165948e-05,
"loss": 0.3366,
"step": 1648
},
{
"epoch": 2.1388708630759248,
"grad_norm": 0.2513837156975892,
"learning_rate": 1.5921115921115923e-05,
"loss": 0.3661,
"step": 1649
},
{
"epoch": 2.140168721609345,
"grad_norm": 0.22217224038750824,
"learning_rate": 1.5897065897065898e-05,
"loss": 0.3377,
"step": 1650
},
{
"epoch": 2.1414665801427644,
"grad_norm": 0.21936038896279167,
"learning_rate": 1.5873015873015872e-05,
"loss": 0.3326,
"step": 1651
},
{
"epoch": 2.1427644386761844,
"grad_norm": 0.2064910202725993,
"learning_rate": 1.584896584896585e-05,
"loss": 0.3365,
"step": 1652
},
{
"epoch": 2.144062297209604,
"grad_norm": 0.20100161814145587,
"learning_rate": 1.5824915824915825e-05,
"loss": 0.3278,
"step": 1653
},
{
"epoch": 2.145360155743024,
"grad_norm": 0.21879340248910953,
"learning_rate": 1.5800865800865803e-05,
"loss": 0.3431,
"step": 1654
},
{
"epoch": 2.146658014276444,
"grad_norm": 0.2129168285291938,
"learning_rate": 1.5776815776815778e-05,
"loss": 0.3503,
"step": 1655
},
{
"epoch": 2.1479558728098636,
"grad_norm": 0.2091423056576906,
"learning_rate": 1.5752765752765753e-05,
"loss": 0.3492,
"step": 1656
},
{
"epoch": 2.1492537313432836,
"grad_norm": 0.2828239816378975,
"learning_rate": 1.5728715728715728e-05,
"loss": 0.3564,
"step": 1657
},
{
"epoch": 2.1505515898767036,
"grad_norm": 0.21413983501492062,
"learning_rate": 1.5704665704665706e-05,
"loss": 0.3544,
"step": 1658
},
{
"epoch": 2.151849448410123,
"grad_norm": 0.21542714936607865,
"learning_rate": 1.568061568061568e-05,
"loss": 0.3399,
"step": 1659
},
{
"epoch": 2.153147306943543,
"grad_norm": 0.23220715953725188,
"learning_rate": 1.565656565656566e-05,
"loss": 0.3322,
"step": 1660
},
{
"epoch": 2.154445165476963,
"grad_norm": 0.20221618002166372,
"learning_rate": 1.563251563251563e-05,
"loss": 0.3297,
"step": 1661
},
{
"epoch": 2.1557430240103828,
"grad_norm": 0.23075242395941706,
"learning_rate": 1.560846560846561e-05,
"loss": 0.3372,
"step": 1662
},
{
"epoch": 2.1570408825438028,
"grad_norm": 0.2065260094429175,
"learning_rate": 1.5584415584415583e-05,
"loss": 0.3396,
"step": 1663
},
{
"epoch": 2.158338741077223,
"grad_norm": 0.2048397557697161,
"learning_rate": 1.556036556036556e-05,
"loss": 0.338,
"step": 1664
},
{
"epoch": 2.1596365996106424,
"grad_norm": 0.21058822127057414,
"learning_rate": 1.5536315536315536e-05,
"loss": 0.3475,
"step": 1665
},
{
"epoch": 2.1609344581440624,
"grad_norm": 0.20856117486455514,
"learning_rate": 1.5512265512265514e-05,
"loss": 0.3565,
"step": 1666
},
{
"epoch": 2.162232316677482,
"grad_norm": 0.2101578604301147,
"learning_rate": 1.548821548821549e-05,
"loss": 0.3341,
"step": 1667
},
{
"epoch": 2.163530175210902,
"grad_norm": 0.1948843188896066,
"learning_rate": 1.5464165464165464e-05,
"loss": 0.3307,
"step": 1668
},
{
"epoch": 2.164828033744322,
"grad_norm": 0.22409902293345668,
"learning_rate": 1.5440115440115442e-05,
"loss": 0.3484,
"step": 1669
},
{
"epoch": 2.1661258922777415,
"grad_norm": 0.22671547696900604,
"learning_rate": 1.5416065416065417e-05,
"loss": 0.3523,
"step": 1670
},
{
"epoch": 2.1674237508111616,
"grad_norm": 0.21140422198147263,
"learning_rate": 1.5392015392015395e-05,
"loss": 0.3455,
"step": 1671
},
{
"epoch": 2.1687216093445816,
"grad_norm": 0.21245427765806485,
"learning_rate": 1.5367965367965366e-05,
"loss": 0.3429,
"step": 1672
},
{
"epoch": 2.170019467878001,
"grad_norm": 0.20475755840127652,
"learning_rate": 1.5343915343915344e-05,
"loss": 0.334,
"step": 1673
},
{
"epoch": 2.171317326411421,
"grad_norm": 0.22161947726300274,
"learning_rate": 1.531986531986532e-05,
"loss": 0.3445,
"step": 1674
},
{
"epoch": 2.172615184944841,
"grad_norm": 0.20686901638420155,
"learning_rate": 1.5295815295815297e-05,
"loss": 0.3423,
"step": 1675
},
{
"epoch": 2.1739130434782608,
"grad_norm": 0.21241410102955396,
"learning_rate": 1.5271765271765272e-05,
"loss": 0.3272,
"step": 1676
},
{
"epoch": 2.1752109020116808,
"grad_norm": 0.20178219626778365,
"learning_rate": 1.524771524771525e-05,
"loss": 0.3429,
"step": 1677
},
{
"epoch": 2.176508760545101,
"grad_norm": 0.24450118577688046,
"learning_rate": 1.5223665223665223e-05,
"loss": 0.3481,
"step": 1678
},
{
"epoch": 2.1778066190785204,
"grad_norm": 0.20454351303651527,
"learning_rate": 1.51996151996152e-05,
"loss": 0.3581,
"step": 1679
},
{
"epoch": 2.1791044776119404,
"grad_norm": 0.21613091983231933,
"learning_rate": 1.5175565175565176e-05,
"loss": 0.3299,
"step": 1680
},
{
"epoch": 2.1804023361453604,
"grad_norm": 0.23413467617760164,
"learning_rate": 1.5151515151515153e-05,
"loss": 0.3622,
"step": 1681
},
{
"epoch": 2.18170019467878,
"grad_norm": 0.21133908102093862,
"learning_rate": 1.512746512746513e-05,
"loss": 0.3323,
"step": 1682
},
{
"epoch": 2.1829980532122,
"grad_norm": 0.2117083520838165,
"learning_rate": 1.5103415103415102e-05,
"loss": 0.3394,
"step": 1683
},
{
"epoch": 2.1842959117456195,
"grad_norm": 0.21732760719194733,
"learning_rate": 1.5079365079365079e-05,
"loss": 0.3419,
"step": 1684
},
{
"epoch": 2.1855937702790396,
"grad_norm": 0.20428681560868892,
"learning_rate": 1.5055315055315055e-05,
"loss": 0.3346,
"step": 1685
},
{
"epoch": 2.1868916288124596,
"grad_norm": 0.21590103314227366,
"learning_rate": 1.5031265031265032e-05,
"loss": 0.3662,
"step": 1686
},
{
"epoch": 2.188189487345879,
"grad_norm": 0.21412757670479102,
"learning_rate": 1.5007215007215008e-05,
"loss": 0.3334,
"step": 1687
},
{
"epoch": 2.189487345879299,
"grad_norm": 0.2161794768756431,
"learning_rate": 1.4983164983164985e-05,
"loss": 0.334,
"step": 1688
},
{
"epoch": 2.190785204412719,
"grad_norm": 0.2121354087724828,
"learning_rate": 1.495911495911496e-05,
"loss": 0.344,
"step": 1689
},
{
"epoch": 2.1920830629461387,
"grad_norm": 0.20921645686294296,
"learning_rate": 1.4935064935064936e-05,
"loss": 0.3397,
"step": 1690
},
{
"epoch": 2.1933809214795588,
"grad_norm": 0.20966308146376983,
"learning_rate": 1.4911014911014912e-05,
"loss": 0.3464,
"step": 1691
},
{
"epoch": 2.194678780012979,
"grad_norm": 0.21908775100267822,
"learning_rate": 1.4886964886964889e-05,
"loss": 0.3317,
"step": 1692
},
{
"epoch": 2.1959766385463984,
"grad_norm": 0.22468058324603687,
"learning_rate": 1.4862914862914865e-05,
"loss": 0.3407,
"step": 1693
},
{
"epoch": 2.1972744970798184,
"grad_norm": 0.22075013149066078,
"learning_rate": 1.4838864838864838e-05,
"loss": 0.3283,
"step": 1694
},
{
"epoch": 2.198572355613238,
"grad_norm": 0.21551594408964253,
"learning_rate": 1.4814814814814815e-05,
"loss": 0.3199,
"step": 1695
},
{
"epoch": 2.199870214146658,
"grad_norm": 0.21485802609507748,
"learning_rate": 1.4790764790764791e-05,
"loss": 0.3287,
"step": 1696
},
{
"epoch": 2.201168072680078,
"grad_norm": 0.2151673200316436,
"learning_rate": 1.4766714766714768e-05,
"loss": 0.3426,
"step": 1697
},
{
"epoch": 2.2024659312134975,
"grad_norm": 0.22612559865377715,
"learning_rate": 1.4742664742664744e-05,
"loss": 0.3573,
"step": 1698
},
{
"epoch": 2.2037637897469176,
"grad_norm": 0.21837527709726925,
"learning_rate": 1.471861471861472e-05,
"loss": 0.3325,
"step": 1699
},
{
"epoch": 2.2050616482803376,
"grad_norm": 0.21236356177018392,
"learning_rate": 1.4694564694564694e-05,
"loss": 0.3553,
"step": 1700
},
{
"epoch": 2.206359506813757,
"grad_norm": 0.19590709294592334,
"learning_rate": 1.467051467051467e-05,
"loss": 0.3215,
"step": 1701
},
{
"epoch": 2.207657365347177,
"grad_norm": 0.22771176110161379,
"learning_rate": 1.4646464646464647e-05,
"loss": 0.3465,
"step": 1702
},
{
"epoch": 2.208955223880597,
"grad_norm": 0.22229793471063694,
"learning_rate": 1.4622414622414623e-05,
"loss": 0.3432,
"step": 1703
},
{
"epoch": 2.2102530824140167,
"grad_norm": 0.21252084935676113,
"learning_rate": 1.45983645983646e-05,
"loss": 0.3508,
"step": 1704
},
{
"epoch": 2.2115509409474368,
"grad_norm": 0.2202632087669158,
"learning_rate": 1.4574314574314573e-05,
"loss": 0.3369,
"step": 1705
},
{
"epoch": 2.2128487994808568,
"grad_norm": 0.21520394343961766,
"learning_rate": 1.455026455026455e-05,
"loss": 0.3327,
"step": 1706
},
{
"epoch": 2.2141466580142763,
"grad_norm": 0.23322877877504564,
"learning_rate": 1.4526214526214526e-05,
"loss": 0.3491,
"step": 1707
},
{
"epoch": 2.2154445165476964,
"grad_norm": 0.22519715240573737,
"learning_rate": 1.4502164502164502e-05,
"loss": 0.3403,
"step": 1708
},
{
"epoch": 2.2167423750811164,
"grad_norm": 0.22210131676178194,
"learning_rate": 1.4478114478114479e-05,
"loss": 0.344,
"step": 1709
},
{
"epoch": 2.218040233614536,
"grad_norm": 0.2119128486555464,
"learning_rate": 1.4454064454064455e-05,
"loss": 0.3562,
"step": 1710
},
{
"epoch": 2.219338092147956,
"grad_norm": 0.26673890205097306,
"learning_rate": 1.443001443001443e-05,
"loss": 0.3284,
"step": 1711
},
{
"epoch": 2.2206359506813755,
"grad_norm": 0.22633092379431619,
"learning_rate": 1.4405964405964406e-05,
"loss": 0.3383,
"step": 1712
},
{
"epoch": 2.2219338092147956,
"grad_norm": 0.21248117786007845,
"learning_rate": 1.4381914381914383e-05,
"loss": 0.3458,
"step": 1713
},
{
"epoch": 2.2232316677482156,
"grad_norm": 0.23708025124779677,
"learning_rate": 1.435786435786436e-05,
"loss": 0.3521,
"step": 1714
},
{
"epoch": 2.224529526281635,
"grad_norm": 0.2216080470200345,
"learning_rate": 1.4333814333814336e-05,
"loss": 0.338,
"step": 1715
},
{
"epoch": 2.225827384815055,
"grad_norm": 0.20064012115244553,
"learning_rate": 1.4309764309764309e-05,
"loss": 0.3323,
"step": 1716
},
{
"epoch": 2.227125243348475,
"grad_norm": 0.22946959688466415,
"learning_rate": 1.4285714285714285e-05,
"loss": 0.3533,
"step": 1717
},
{
"epoch": 2.2284231018818947,
"grad_norm": 0.23129226313271936,
"learning_rate": 1.4261664261664262e-05,
"loss": 0.3477,
"step": 1718
},
{
"epoch": 2.2297209604153148,
"grad_norm": 0.23801346245682223,
"learning_rate": 1.4237614237614238e-05,
"loss": 0.3525,
"step": 1719
},
{
"epoch": 2.2310188189487348,
"grad_norm": 0.20874849017749872,
"learning_rate": 1.4213564213564215e-05,
"loss": 0.3414,
"step": 1720
},
{
"epoch": 2.2323166774821543,
"grad_norm": 0.21629318124043379,
"learning_rate": 1.4189514189514191e-05,
"loss": 0.3379,
"step": 1721
},
{
"epoch": 2.2336145360155744,
"grad_norm": 0.22905806827791111,
"learning_rate": 1.4165464165464164e-05,
"loss": 0.337,
"step": 1722
},
{
"epoch": 2.234912394548994,
"grad_norm": 0.2279391640803175,
"learning_rate": 1.4141414141414141e-05,
"loss": 0.3536,
"step": 1723
},
{
"epoch": 2.236210253082414,
"grad_norm": 0.20743141809266266,
"learning_rate": 1.4117364117364117e-05,
"loss": 0.3331,
"step": 1724
},
{
"epoch": 2.237508111615834,
"grad_norm": 0.21930312729137855,
"learning_rate": 1.4093314093314094e-05,
"loss": 0.3426,
"step": 1725
},
{
"epoch": 2.2388059701492535,
"grad_norm": 0.21584060808323735,
"learning_rate": 1.406926406926407e-05,
"loss": 0.3326,
"step": 1726
},
{
"epoch": 2.2401038286826735,
"grad_norm": 0.20452984796446286,
"learning_rate": 1.4045214045214045e-05,
"loss": 0.3407,
"step": 1727
},
{
"epoch": 2.2414016872160936,
"grad_norm": 0.21278894400954876,
"learning_rate": 1.4021164021164022e-05,
"loss": 0.3272,
"step": 1728
},
{
"epoch": 2.242699545749513,
"grad_norm": 0.19989940211340443,
"learning_rate": 1.3997113997113998e-05,
"loss": 0.3268,
"step": 1729
},
{
"epoch": 2.243997404282933,
"grad_norm": 0.20756531216735427,
"learning_rate": 1.3973063973063974e-05,
"loss": 0.3428,
"step": 1730
},
{
"epoch": 2.245295262816353,
"grad_norm": 0.2289163023889471,
"learning_rate": 1.3949013949013951e-05,
"loss": 0.3531,
"step": 1731
},
{
"epoch": 2.2465931213497727,
"grad_norm": 0.19474714109733912,
"learning_rate": 1.3924963924963927e-05,
"loss": 0.3232,
"step": 1732
},
{
"epoch": 2.2478909798831928,
"grad_norm": 0.20474938955793603,
"learning_rate": 1.39009139009139e-05,
"loss": 0.3368,
"step": 1733
},
{
"epoch": 2.2491888384166128,
"grad_norm": 0.2134278581088506,
"learning_rate": 1.3876863876863877e-05,
"loss": 0.3332,
"step": 1734
},
{
"epoch": 2.2504866969500323,
"grad_norm": 0.20557863182364947,
"learning_rate": 1.3852813852813853e-05,
"loss": 0.3189,
"step": 1735
},
{
"epoch": 2.2517845554834524,
"grad_norm": 0.20536064005062202,
"learning_rate": 1.382876382876383e-05,
"loss": 0.3432,
"step": 1736
},
{
"epoch": 2.2530824140168724,
"grad_norm": 0.20167055126244063,
"learning_rate": 1.3804713804713806e-05,
"loss": 0.3333,
"step": 1737
},
{
"epoch": 2.254380272550292,
"grad_norm": 0.21070060115934763,
"learning_rate": 1.378066378066378e-05,
"loss": 0.3447,
"step": 1738
},
{
"epoch": 2.255678131083712,
"grad_norm": 0.2207628970020607,
"learning_rate": 1.3756613756613756e-05,
"loss": 0.3583,
"step": 1739
},
{
"epoch": 2.256975989617132,
"grad_norm": 0.2127435360633616,
"learning_rate": 1.3732563732563732e-05,
"loss": 0.3348,
"step": 1740
},
{
"epoch": 2.2582738481505515,
"grad_norm": 0.21284122659552568,
"learning_rate": 1.3708513708513709e-05,
"loss": 0.3625,
"step": 1741
},
{
"epoch": 2.2595717066839716,
"grad_norm": 0.1950008896432417,
"learning_rate": 1.3684463684463685e-05,
"loss": 0.3324,
"step": 1742
},
{
"epoch": 2.260869565217391,
"grad_norm": 0.2020508328430615,
"learning_rate": 1.3660413660413662e-05,
"loss": 0.3479,
"step": 1743
},
{
"epoch": 2.262167423750811,
"grad_norm": 0.20145545550530372,
"learning_rate": 1.3636363636363637e-05,
"loss": 0.3499,
"step": 1744
},
{
"epoch": 2.263465282284231,
"grad_norm": 0.20947296790103498,
"learning_rate": 1.3612313612313613e-05,
"loss": 0.3499,
"step": 1745
},
{
"epoch": 2.2647631408176507,
"grad_norm": 0.20527034801018748,
"learning_rate": 1.358826358826359e-05,
"loss": 0.3311,
"step": 1746
},
{
"epoch": 2.2660609993510707,
"grad_norm": 0.21571814965895064,
"learning_rate": 1.3564213564213566e-05,
"loss": 0.3445,
"step": 1747
},
{
"epoch": 2.2673588578844908,
"grad_norm": 0.1951070244580519,
"learning_rate": 1.3540163540163542e-05,
"loss": 0.3317,
"step": 1748
},
{
"epoch": 2.2686567164179103,
"grad_norm": 0.20164098805440273,
"learning_rate": 1.3516113516113516e-05,
"loss": 0.3489,
"step": 1749
},
{
"epoch": 2.2699545749513304,
"grad_norm": 0.2037235219687979,
"learning_rate": 1.3492063492063492e-05,
"loss": 0.3447,
"step": 1750
},
{
"epoch": 2.27125243348475,
"grad_norm": 0.21220292515247122,
"learning_rate": 1.3468013468013468e-05,
"loss": 0.3395,
"step": 1751
},
{
"epoch": 2.27255029201817,
"grad_norm": 0.2037471406983462,
"learning_rate": 1.3443963443963445e-05,
"loss": 0.3415,
"step": 1752
},
{
"epoch": 2.27384815055159,
"grad_norm": 0.20298389579886292,
"learning_rate": 1.3419913419913421e-05,
"loss": 0.3374,
"step": 1753
},
{
"epoch": 2.2751460090850095,
"grad_norm": 0.20691493521870025,
"learning_rate": 1.3395863395863398e-05,
"loss": 0.3519,
"step": 1754
},
{
"epoch": 2.2764438676184295,
"grad_norm": 0.2013802963124298,
"learning_rate": 1.3371813371813371e-05,
"loss": 0.3411,
"step": 1755
},
{
"epoch": 2.2777417261518496,
"grad_norm": 0.20436433898344716,
"learning_rate": 1.3347763347763347e-05,
"loss": 0.3447,
"step": 1756
},
{
"epoch": 2.279039584685269,
"grad_norm": 0.1926683227358606,
"learning_rate": 1.3323713323713324e-05,
"loss": 0.3326,
"step": 1757
},
{
"epoch": 2.280337443218689,
"grad_norm": 0.22362998880849946,
"learning_rate": 1.32996632996633e-05,
"loss": 0.338,
"step": 1758
},
{
"epoch": 2.281635301752109,
"grad_norm": 0.19696154548860742,
"learning_rate": 1.3275613275613277e-05,
"loss": 0.3302,
"step": 1759
},
{
"epoch": 2.2829331602855287,
"grad_norm": 0.21290597699073446,
"learning_rate": 1.3251563251563252e-05,
"loss": 0.3341,
"step": 1760
},
{
"epoch": 2.2842310188189487,
"grad_norm": 0.2025599154856036,
"learning_rate": 1.3227513227513228e-05,
"loss": 0.3292,
"step": 1761
},
{
"epoch": 2.2855288773523688,
"grad_norm": 0.20827197895684288,
"learning_rate": 1.3203463203463205e-05,
"loss": 0.348,
"step": 1762
},
{
"epoch": 2.2868267358857883,
"grad_norm": 0.2031367197949487,
"learning_rate": 1.3179413179413181e-05,
"loss": 0.3353,
"step": 1763
},
{
"epoch": 2.2881245944192083,
"grad_norm": 0.20119905187117781,
"learning_rate": 1.3155363155363157e-05,
"loss": 0.3291,
"step": 1764
},
{
"epoch": 2.2894224529526284,
"grad_norm": 0.19920483051203258,
"learning_rate": 1.3131313131313134e-05,
"loss": 0.3475,
"step": 1765
},
{
"epoch": 2.290720311486048,
"grad_norm": 0.3011556934952649,
"learning_rate": 1.3107263107263107e-05,
"loss": 0.3535,
"step": 1766
},
{
"epoch": 2.292018170019468,
"grad_norm": 0.21100649854372194,
"learning_rate": 1.3083213083213083e-05,
"loss": 0.3469,
"step": 1767
},
{
"epoch": 2.293316028552888,
"grad_norm": 0.1984331828091875,
"learning_rate": 1.305916305916306e-05,
"loss": 0.346,
"step": 1768
},
{
"epoch": 2.2946138870863075,
"grad_norm": 0.2128841448679342,
"learning_rate": 1.3035113035113036e-05,
"loss": 0.3426,
"step": 1769
},
{
"epoch": 2.2959117456197276,
"grad_norm": 0.20032623160831234,
"learning_rate": 1.3011063011063013e-05,
"loss": 0.3323,
"step": 1770
},
{
"epoch": 2.297209604153147,
"grad_norm": 0.21020072070413953,
"learning_rate": 1.2987012987012986e-05,
"loss": 0.345,
"step": 1771
},
{
"epoch": 2.298507462686567,
"grad_norm": 0.220140017454239,
"learning_rate": 1.2962962962962962e-05,
"loss": 0.338,
"step": 1772
},
{
"epoch": 2.299805321219987,
"grad_norm": 0.20602192883284073,
"learning_rate": 1.2938912938912939e-05,
"loss": 0.3295,
"step": 1773
},
{
"epoch": 2.3011031797534067,
"grad_norm": 0.19782214494971023,
"learning_rate": 1.2914862914862915e-05,
"loss": 0.3443,
"step": 1774
},
{
"epoch": 2.3024010382868267,
"grad_norm": 0.2212349266901331,
"learning_rate": 1.2890812890812892e-05,
"loss": 0.3372,
"step": 1775
},
{
"epoch": 2.3036988968202468,
"grad_norm": 0.20379287350155195,
"learning_rate": 1.2866762866762868e-05,
"loss": 0.3298,
"step": 1776
},
{
"epoch": 2.3049967553536663,
"grad_norm": 0.22633645234017427,
"learning_rate": 1.2842712842712843e-05,
"loss": 0.3358,
"step": 1777
},
{
"epoch": 2.3062946138870863,
"grad_norm": 0.22571485246122885,
"learning_rate": 1.281866281866282e-05,
"loss": 0.3535,
"step": 1778
},
{
"epoch": 2.3075924724205064,
"grad_norm": 0.1971894196957368,
"learning_rate": 1.2794612794612796e-05,
"loss": 0.3437,
"step": 1779
},
{
"epoch": 2.308890330953926,
"grad_norm": 0.21039001676553398,
"learning_rate": 1.2770562770562773e-05,
"loss": 0.3436,
"step": 1780
},
{
"epoch": 2.310188189487346,
"grad_norm": 0.21586183237415976,
"learning_rate": 1.2746512746512749e-05,
"loss": 0.3371,
"step": 1781
},
{
"epoch": 2.3114860480207655,
"grad_norm": 0.22449090420078008,
"learning_rate": 1.2722462722462722e-05,
"loss": 0.3426,
"step": 1782
},
{
"epoch": 2.3127839065541855,
"grad_norm": 0.20000822387430392,
"learning_rate": 1.2698412698412699e-05,
"loss": 0.3273,
"step": 1783
},
{
"epoch": 2.3140817650876055,
"grad_norm": 0.19796366453881245,
"learning_rate": 1.2674362674362675e-05,
"loss": 0.341,
"step": 1784
},
{
"epoch": 2.315379623621025,
"grad_norm": 0.20625398962095803,
"learning_rate": 1.2650312650312651e-05,
"loss": 0.3327,
"step": 1785
},
{
"epoch": 2.316677482154445,
"grad_norm": 0.2018595792474862,
"learning_rate": 1.2626262626262628e-05,
"loss": 0.3425,
"step": 1786
},
{
"epoch": 2.317975340687865,
"grad_norm": 0.21810379177370443,
"learning_rate": 1.2602212602212604e-05,
"loss": 0.3411,
"step": 1787
},
{
"epoch": 2.3192731992212847,
"grad_norm": 0.21122946645750976,
"learning_rate": 1.2578162578162577e-05,
"loss": 0.3519,
"step": 1788
},
{
"epoch": 2.3205710577547047,
"grad_norm": 0.21682608106667023,
"learning_rate": 1.2554112554112554e-05,
"loss": 0.3391,
"step": 1789
},
{
"epoch": 2.3218689162881248,
"grad_norm": 0.21596334841986267,
"learning_rate": 1.253006253006253e-05,
"loss": 0.3478,
"step": 1790
},
{
"epoch": 2.3231667748215443,
"grad_norm": 0.21752697824305056,
"learning_rate": 1.2506012506012507e-05,
"loss": 0.3327,
"step": 1791
},
{
"epoch": 2.3244646333549643,
"grad_norm": 0.19911928426673434,
"learning_rate": 1.2481962481962482e-05,
"loss": 0.3337,
"step": 1792
},
{
"epoch": 2.3257624918883844,
"grad_norm": 0.21067483324446745,
"learning_rate": 1.2457912457912458e-05,
"loss": 0.3383,
"step": 1793
},
{
"epoch": 2.327060350421804,
"grad_norm": 0.21855693528672904,
"learning_rate": 1.2433862433862433e-05,
"loss": 0.3392,
"step": 1794
},
{
"epoch": 2.328358208955224,
"grad_norm": 0.2313881551522217,
"learning_rate": 1.240981240981241e-05,
"loss": 0.344,
"step": 1795
},
{
"epoch": 2.329656067488644,
"grad_norm": 0.21647217328647403,
"learning_rate": 1.2385762385762386e-05,
"loss": 0.3498,
"step": 1796
},
{
"epoch": 2.3309539260220635,
"grad_norm": 0.23563572241047098,
"learning_rate": 1.2361712361712362e-05,
"loss": 0.3492,
"step": 1797
},
{
"epoch": 2.3322517845554835,
"grad_norm": 0.22879253600129817,
"learning_rate": 1.2337662337662339e-05,
"loss": 0.3476,
"step": 1798
},
{
"epoch": 2.3335496430889036,
"grad_norm": 0.2194770239864021,
"learning_rate": 1.2313612313612315e-05,
"loss": 0.3499,
"step": 1799
},
{
"epoch": 2.334847501622323,
"grad_norm": 0.2323432486506921,
"learning_rate": 1.228956228956229e-05,
"loss": 0.3498,
"step": 1800
},
{
"epoch": 2.336145360155743,
"grad_norm": 0.2379829541757471,
"learning_rate": 1.2265512265512267e-05,
"loss": 0.3455,
"step": 1801
},
{
"epoch": 2.3374432186891627,
"grad_norm": 0.22305188673020696,
"learning_rate": 1.2241462241462243e-05,
"loss": 0.3464,
"step": 1802
},
{
"epoch": 2.3387410772225827,
"grad_norm": 0.20626742501221118,
"learning_rate": 1.2217412217412218e-05,
"loss": 0.3383,
"step": 1803
},
{
"epoch": 2.3400389357560027,
"grad_norm": 0.20476597434634727,
"learning_rate": 1.2193362193362194e-05,
"loss": 0.3271,
"step": 1804
},
{
"epoch": 2.3413367942894223,
"grad_norm": 0.2361285638480301,
"learning_rate": 1.2169312169312169e-05,
"loss": 0.363,
"step": 1805
},
{
"epoch": 2.3426346528228423,
"grad_norm": 0.21885176640576573,
"learning_rate": 1.2145262145262145e-05,
"loss": 0.3322,
"step": 1806
},
{
"epoch": 2.3439325113562623,
"grad_norm": 0.2186287737946408,
"learning_rate": 1.2121212121212122e-05,
"loss": 0.3517,
"step": 1807
},
{
"epoch": 2.345230369889682,
"grad_norm": 0.22242925263530489,
"learning_rate": 1.2097162097162097e-05,
"loss": 0.3556,
"step": 1808
},
{
"epoch": 2.346528228423102,
"grad_norm": 0.21839105455863506,
"learning_rate": 1.2073112073112073e-05,
"loss": 0.3537,
"step": 1809
},
{
"epoch": 2.3478260869565215,
"grad_norm": 0.19288692731778082,
"learning_rate": 1.204906204906205e-05,
"loss": 0.3267,
"step": 1810
},
{
"epoch": 2.3491239454899415,
"grad_norm": 0.21299009964604498,
"learning_rate": 1.2025012025012024e-05,
"loss": 0.3343,
"step": 1811
},
{
"epoch": 2.3504218040233615,
"grad_norm": 0.20073387684513502,
"learning_rate": 1.2000962000962001e-05,
"loss": 0.3366,
"step": 1812
},
{
"epoch": 2.351719662556781,
"grad_norm": 0.20924433976474296,
"learning_rate": 1.1976911976911977e-05,
"loss": 0.3503,
"step": 1813
},
{
"epoch": 2.353017521090201,
"grad_norm": 0.2024959718616962,
"learning_rate": 1.1952861952861954e-05,
"loss": 0.3398,
"step": 1814
},
{
"epoch": 2.354315379623621,
"grad_norm": 0.20136147992617448,
"learning_rate": 1.192881192881193e-05,
"loss": 0.3329,
"step": 1815
},
{
"epoch": 2.3556132381570407,
"grad_norm": 0.2023856480954257,
"learning_rate": 1.1904761904761905e-05,
"loss": 0.3431,
"step": 1816
},
{
"epoch": 2.3569110966904607,
"grad_norm": 0.19571106019001464,
"learning_rate": 1.1880711880711882e-05,
"loss": 0.3409,
"step": 1817
},
{
"epoch": 2.3582089552238807,
"grad_norm": 0.20043918045056716,
"learning_rate": 1.1856661856661858e-05,
"loss": 0.3457,
"step": 1818
},
{
"epoch": 2.3595068137573003,
"grad_norm": 0.20301799957042976,
"learning_rate": 1.1832611832611833e-05,
"loss": 0.358,
"step": 1819
},
{
"epoch": 2.3608046722907203,
"grad_norm": 0.1985053306211951,
"learning_rate": 1.180856180856181e-05,
"loss": 0.3476,
"step": 1820
},
{
"epoch": 2.3621025308241403,
"grad_norm": 0.1946833217729552,
"learning_rate": 1.1784511784511786e-05,
"loss": 0.3381,
"step": 1821
},
{
"epoch": 2.36340038935756,
"grad_norm": 0.20694128635780762,
"learning_rate": 1.176046176046176e-05,
"loss": 0.3484,
"step": 1822
},
{
"epoch": 2.36469824789098,
"grad_norm": 0.19680244677531286,
"learning_rate": 1.1736411736411737e-05,
"loss": 0.3412,
"step": 1823
},
{
"epoch": 2.3659961064244,
"grad_norm": 0.19475058414044913,
"learning_rate": 1.1712361712361713e-05,
"loss": 0.3338,
"step": 1824
},
{
"epoch": 2.3672939649578195,
"grad_norm": 0.20017845696117334,
"learning_rate": 1.1688311688311688e-05,
"loss": 0.3292,
"step": 1825
},
{
"epoch": 2.3685918234912395,
"grad_norm": 0.21405181485690658,
"learning_rate": 1.1664261664261665e-05,
"loss": 0.3319,
"step": 1826
},
{
"epoch": 2.3698896820246595,
"grad_norm": 0.2255906580166369,
"learning_rate": 1.164021164021164e-05,
"loss": 0.3583,
"step": 1827
},
{
"epoch": 2.371187540558079,
"grad_norm": 0.19990250671178067,
"learning_rate": 1.1616161616161616e-05,
"loss": 0.337,
"step": 1828
},
{
"epoch": 2.372485399091499,
"grad_norm": 0.19827952220037648,
"learning_rate": 1.1592111592111592e-05,
"loss": 0.3432,
"step": 1829
},
{
"epoch": 2.3737832576249187,
"grad_norm": 0.19939279257051523,
"learning_rate": 1.1568061568061569e-05,
"loss": 0.3374,
"step": 1830
},
{
"epoch": 2.3750811161583387,
"grad_norm": 0.1906800360211246,
"learning_rate": 1.1544011544011545e-05,
"loss": 0.3423,
"step": 1831
},
{
"epoch": 2.3763789746917587,
"grad_norm": 0.2027119176012166,
"learning_rate": 1.1519961519961522e-05,
"loss": 0.3431,
"step": 1832
},
{
"epoch": 2.3776768332251783,
"grad_norm": 0.20771653103434248,
"learning_rate": 1.1495911495911497e-05,
"loss": 0.3621,
"step": 1833
},
{
"epoch": 2.3789746917585983,
"grad_norm": 0.18554441127828813,
"learning_rate": 1.1471861471861473e-05,
"loss": 0.326,
"step": 1834
},
{
"epoch": 2.3802725502920183,
"grad_norm": 0.19747340923777565,
"learning_rate": 1.144781144781145e-05,
"loss": 0.3247,
"step": 1835
},
{
"epoch": 2.381570408825438,
"grad_norm": 0.20647886448091093,
"learning_rate": 1.1423761423761424e-05,
"loss": 0.3341,
"step": 1836
},
{
"epoch": 2.382868267358858,
"grad_norm": 0.1957627449624196,
"learning_rate": 1.13997113997114e-05,
"loss": 0.3328,
"step": 1837
},
{
"epoch": 2.3841661258922775,
"grad_norm": 0.19525704689585352,
"learning_rate": 1.1375661375661376e-05,
"loss": 0.347,
"step": 1838
},
{
"epoch": 2.3854639844256975,
"grad_norm": 0.20804623050610585,
"learning_rate": 1.1351611351611352e-05,
"loss": 0.3313,
"step": 1839
},
{
"epoch": 2.3867618429591175,
"grad_norm": 0.21139171271078994,
"learning_rate": 1.1327561327561329e-05,
"loss": 0.3295,
"step": 1840
},
{
"epoch": 2.388059701492537,
"grad_norm": 0.19695078516317913,
"learning_rate": 1.1303511303511303e-05,
"loss": 0.3244,
"step": 1841
},
{
"epoch": 2.389357560025957,
"grad_norm": 0.21329704668287522,
"learning_rate": 1.127946127946128e-05,
"loss": 0.3496,
"step": 1842
},
{
"epoch": 2.390655418559377,
"grad_norm": 0.19692253824786773,
"learning_rate": 1.1255411255411256e-05,
"loss": 0.3414,
"step": 1843
},
{
"epoch": 2.3919532770927967,
"grad_norm": 0.20145116731281673,
"learning_rate": 1.1231361231361231e-05,
"loss": 0.3411,
"step": 1844
},
{
"epoch": 2.3932511356262167,
"grad_norm": 0.21094040457540925,
"learning_rate": 1.1207311207311207e-05,
"loss": 0.3425,
"step": 1845
},
{
"epoch": 2.3945489941596367,
"grad_norm": 0.20933806797553264,
"learning_rate": 1.1183261183261184e-05,
"loss": 0.3447,
"step": 1846
},
{
"epoch": 2.3958468526930563,
"grad_norm": 0.22025133979918343,
"learning_rate": 1.1159211159211159e-05,
"loss": 0.3276,
"step": 1847
},
{
"epoch": 2.3971447112264763,
"grad_norm": 0.2033688921914023,
"learning_rate": 1.1135161135161135e-05,
"loss": 0.3427,
"step": 1848
},
{
"epoch": 2.3984425697598963,
"grad_norm": 0.20001119671379927,
"learning_rate": 1.1111111111111112e-05,
"loss": 0.3291,
"step": 1849
},
{
"epoch": 2.399740428293316,
"grad_norm": 0.2096895946679153,
"learning_rate": 1.1087061087061088e-05,
"loss": 0.3529,
"step": 1850
},
{
"epoch": 2.401038286826736,
"grad_norm": 0.20086557980683176,
"learning_rate": 1.1063011063011065e-05,
"loss": 0.3386,
"step": 1851
},
{
"epoch": 2.402336145360156,
"grad_norm": 0.19654561939004062,
"learning_rate": 1.103896103896104e-05,
"loss": 0.347,
"step": 1852
},
{
"epoch": 2.4036340038935755,
"grad_norm": 0.20190747910646842,
"learning_rate": 1.1014911014911016e-05,
"loss": 0.3355,
"step": 1853
},
{
"epoch": 2.4049318624269955,
"grad_norm": 0.20073723518377382,
"learning_rate": 1.0990860990860992e-05,
"loss": 0.3506,
"step": 1854
},
{
"epoch": 2.4062297209604155,
"grad_norm": 0.19812022485550956,
"learning_rate": 1.0966810966810967e-05,
"loss": 0.3326,
"step": 1855
},
{
"epoch": 2.407527579493835,
"grad_norm": 0.20293093391898026,
"learning_rate": 1.0942760942760944e-05,
"loss": 0.3437,
"step": 1856
},
{
"epoch": 2.408825438027255,
"grad_norm": 0.18445064925773152,
"learning_rate": 1.091871091871092e-05,
"loss": 0.3412,
"step": 1857
},
{
"epoch": 2.4101232965606747,
"grad_norm": 0.18982196010225733,
"learning_rate": 1.0894660894660895e-05,
"loss": 0.3334,
"step": 1858
},
{
"epoch": 2.4114211550940947,
"grad_norm": 0.19790842980140105,
"learning_rate": 1.0870610870610871e-05,
"loss": 0.341,
"step": 1859
},
{
"epoch": 2.4127190136275147,
"grad_norm": 0.21825470803362326,
"learning_rate": 1.0846560846560846e-05,
"loss": 0.3463,
"step": 1860
},
{
"epoch": 2.4140168721609343,
"grad_norm": 0.19742393864907667,
"learning_rate": 1.0822510822510823e-05,
"loss": 0.3275,
"step": 1861
},
{
"epoch": 2.4153147306943543,
"grad_norm": 0.19224523357142126,
"learning_rate": 1.0798460798460799e-05,
"loss": 0.3325,
"step": 1862
},
{
"epoch": 2.4166125892277743,
"grad_norm": 0.19938049624693138,
"learning_rate": 1.0774410774410774e-05,
"loss": 0.3262,
"step": 1863
},
{
"epoch": 2.417910447761194,
"grad_norm": 0.19485806663699845,
"learning_rate": 1.075036075036075e-05,
"loss": 0.3357,
"step": 1864
},
{
"epoch": 2.419208306294614,
"grad_norm": 0.20583844260408463,
"learning_rate": 1.0726310726310727e-05,
"loss": 0.3408,
"step": 1865
},
{
"epoch": 2.420506164828034,
"grad_norm": 0.19685425374116253,
"learning_rate": 1.0702260702260703e-05,
"loss": 0.34,
"step": 1866
},
{
"epoch": 2.4218040233614535,
"grad_norm": 0.19475698760728785,
"learning_rate": 1.067821067821068e-05,
"loss": 0.3415,
"step": 1867
},
{
"epoch": 2.4231018818948735,
"grad_norm": 0.197170239994665,
"learning_rate": 1.0654160654160656e-05,
"loss": 0.3328,
"step": 1868
},
{
"epoch": 2.424399740428293,
"grad_norm": 0.2312526818007591,
"learning_rate": 1.0630110630110631e-05,
"loss": 0.3517,
"step": 1869
},
{
"epoch": 2.425697598961713,
"grad_norm": 0.21684364271507464,
"learning_rate": 1.0606060606060607e-05,
"loss": 0.3383,
"step": 1870
},
{
"epoch": 2.426995457495133,
"grad_norm": 0.1985082584320083,
"learning_rate": 1.0582010582010582e-05,
"loss": 0.3246,
"step": 1871
},
{
"epoch": 2.4282933160285527,
"grad_norm": 0.21714639429646007,
"learning_rate": 1.0557960557960559e-05,
"loss": 0.3497,
"step": 1872
},
{
"epoch": 2.4295911745619727,
"grad_norm": 0.22321524923011035,
"learning_rate": 1.0533910533910535e-05,
"loss": 0.3553,
"step": 1873
},
{
"epoch": 2.4308890330953927,
"grad_norm": 0.23372261304917366,
"learning_rate": 1.050986050986051e-05,
"loss": 0.3686,
"step": 1874
},
{
"epoch": 2.4321868916288123,
"grad_norm": 0.20001574330722693,
"learning_rate": 1.0485810485810486e-05,
"loss": 0.3351,
"step": 1875
},
{
"epoch": 2.4334847501622323,
"grad_norm": 0.20673565072923414,
"learning_rate": 1.0461760461760463e-05,
"loss": 0.3596,
"step": 1876
},
{
"epoch": 2.4347826086956523,
"grad_norm": 0.20095320206649409,
"learning_rate": 1.0437710437710438e-05,
"loss": 0.3328,
"step": 1877
},
{
"epoch": 2.436080467229072,
"grad_norm": 0.2789190666720269,
"learning_rate": 1.0413660413660414e-05,
"loss": 0.3509,
"step": 1878
},
{
"epoch": 2.437378325762492,
"grad_norm": 0.2079166722723446,
"learning_rate": 1.038961038961039e-05,
"loss": 0.3492,
"step": 1879
},
{
"epoch": 2.438676184295912,
"grad_norm": 0.21591855675091434,
"learning_rate": 1.0365560365560365e-05,
"loss": 0.3375,
"step": 1880
},
{
"epoch": 2.4399740428293315,
"grad_norm": 0.21397550791993689,
"learning_rate": 1.0341510341510342e-05,
"loss": 0.3495,
"step": 1881
},
{
"epoch": 2.4412719013627515,
"grad_norm": 0.3807729201699182,
"learning_rate": 1.0317460317460318e-05,
"loss": 0.3449,
"step": 1882
},
{
"epoch": 2.4425697598961715,
"grad_norm": 0.19473145374740133,
"learning_rate": 1.0293410293410295e-05,
"loss": 0.3412,
"step": 1883
},
{
"epoch": 2.443867618429591,
"grad_norm": 0.20643794857809838,
"learning_rate": 1.0269360269360271e-05,
"loss": 0.3548,
"step": 1884
},
{
"epoch": 2.445165476963011,
"grad_norm": 0.19967818102155932,
"learning_rate": 1.0245310245310246e-05,
"loss": 0.3251,
"step": 1885
},
{
"epoch": 2.446463335496431,
"grad_norm": 0.2068701964008534,
"learning_rate": 1.0221260221260222e-05,
"loss": 0.3503,
"step": 1886
},
{
"epoch": 2.4477611940298507,
"grad_norm": 0.19485423370421984,
"learning_rate": 1.0197210197210199e-05,
"loss": 0.3316,
"step": 1887
},
{
"epoch": 2.4490590525632707,
"grad_norm": 0.20945089504608722,
"learning_rate": 1.0173160173160174e-05,
"loss": 0.3723,
"step": 1888
},
{
"epoch": 2.4503569110966903,
"grad_norm": 0.21961631414794303,
"learning_rate": 1.014911014911015e-05,
"loss": 0.3555,
"step": 1889
},
{
"epoch": 2.4516547696301103,
"grad_norm": 0.20913009484803424,
"learning_rate": 1.0125060125060125e-05,
"loss": 0.3433,
"step": 1890
},
{
"epoch": 2.4529526281635303,
"grad_norm": 0.1975805461078145,
"learning_rate": 1.0101010101010101e-05,
"loss": 0.3522,
"step": 1891
},
{
"epoch": 2.45425048669695,
"grad_norm": 0.18824749115573988,
"learning_rate": 1.0076960076960078e-05,
"loss": 0.3262,
"step": 1892
},
{
"epoch": 2.45554834523037,
"grad_norm": 0.19363390712933798,
"learning_rate": 1.0052910052910053e-05,
"loss": 0.3304,
"step": 1893
},
{
"epoch": 2.45684620376379,
"grad_norm": 0.20877531448498393,
"learning_rate": 1.0028860028860029e-05,
"loss": 0.3502,
"step": 1894
},
{
"epoch": 2.4581440622972095,
"grad_norm": 0.2061242277033731,
"learning_rate": 1.0004810004810006e-05,
"loss": 0.3382,
"step": 1895
},
{
"epoch": 2.4594419208306295,
"grad_norm": 0.20527048632536887,
"learning_rate": 9.98075998075998e-06,
"loss": 0.3391,
"step": 1896
},
{
"epoch": 2.460739779364049,
"grad_norm": 0.20055534262640298,
"learning_rate": 9.956709956709957e-06,
"loss": 0.3564,
"step": 1897
},
{
"epoch": 2.462037637897469,
"grad_norm": 0.19785197665929594,
"learning_rate": 9.932659932659933e-06,
"loss": 0.3443,
"step": 1898
},
{
"epoch": 2.463335496430889,
"grad_norm": 0.2037702638037453,
"learning_rate": 9.908609908609908e-06,
"loss": 0.348,
"step": 1899
},
{
"epoch": 2.4646333549643087,
"grad_norm": 0.21220856247877268,
"learning_rate": 9.884559884559884e-06,
"loss": 0.3428,
"step": 1900
},
{
"epoch": 2.4659312134977287,
"grad_norm": 0.2219575251336399,
"learning_rate": 9.860509860509861e-06,
"loss": 0.3418,
"step": 1901
},
{
"epoch": 2.4672290720311487,
"grad_norm": 0.19647517582803717,
"learning_rate": 9.836459836459837e-06,
"loss": 0.3345,
"step": 1902
},
{
"epoch": 2.4685269305645683,
"grad_norm": 0.19654210291881336,
"learning_rate": 9.812409812409814e-06,
"loss": 0.3439,
"step": 1903
},
{
"epoch": 2.4698247890979883,
"grad_norm": 0.20215632118085225,
"learning_rate": 9.788359788359789e-06,
"loss": 0.343,
"step": 1904
},
{
"epoch": 2.4711226476314083,
"grad_norm": 0.21700717313986337,
"learning_rate": 9.764309764309765e-06,
"loss": 0.3422,
"step": 1905
},
{
"epoch": 2.472420506164828,
"grad_norm": 0.22639821333763582,
"learning_rate": 9.740259740259742e-06,
"loss": 0.3545,
"step": 1906
},
{
"epoch": 2.473718364698248,
"grad_norm": 0.20194895913978017,
"learning_rate": 9.716209716209716e-06,
"loss": 0.3389,
"step": 1907
},
{
"epoch": 2.475016223231668,
"grad_norm": 0.20577729584744323,
"learning_rate": 9.692159692159693e-06,
"loss": 0.3399,
"step": 1908
},
{
"epoch": 2.4763140817650875,
"grad_norm": 0.19912299573383097,
"learning_rate": 9.66810966810967e-06,
"loss": 0.3373,
"step": 1909
},
{
"epoch": 2.4776119402985075,
"grad_norm": 0.21269657354433175,
"learning_rate": 9.644059644059644e-06,
"loss": 0.3266,
"step": 1910
},
{
"epoch": 2.4789097988319275,
"grad_norm": 0.20736564955023581,
"learning_rate": 9.62000962000962e-06,
"loss": 0.3778,
"step": 1911
},
{
"epoch": 2.480207657365347,
"grad_norm": 0.20499397029892405,
"learning_rate": 9.595959595959595e-06,
"loss": 0.3476,
"step": 1912
},
{
"epoch": 2.481505515898767,
"grad_norm": 0.21222717155575171,
"learning_rate": 9.571909571909572e-06,
"loss": 0.3614,
"step": 1913
},
{
"epoch": 2.482803374432187,
"grad_norm": 0.19863505520154515,
"learning_rate": 9.547859547859548e-06,
"loss": 0.3405,
"step": 1914
},
{
"epoch": 2.4841012329656067,
"grad_norm": 0.18907377927162114,
"learning_rate": 9.523809523809523e-06,
"loss": 0.3286,
"step": 1915
},
{
"epoch": 2.4853990914990267,
"grad_norm": 0.21676854818994787,
"learning_rate": 9.4997594997595e-06,
"loss": 0.3464,
"step": 1916
},
{
"epoch": 2.4866969500324463,
"grad_norm": 0.20682249845168457,
"learning_rate": 9.475709475709476e-06,
"loss": 0.3482,
"step": 1917
},
{
"epoch": 2.4879948085658663,
"grad_norm": 0.20359288013276863,
"learning_rate": 9.451659451659452e-06,
"loss": 0.3361,
"step": 1918
},
{
"epoch": 2.4892926670992863,
"grad_norm": 0.20782718092266766,
"learning_rate": 9.427609427609429e-06,
"loss": 0.3437,
"step": 1919
},
{
"epoch": 2.490590525632706,
"grad_norm": 0.19751302818390967,
"learning_rate": 9.403559403559405e-06,
"loss": 0.3291,
"step": 1920
},
{
"epoch": 2.491888384166126,
"grad_norm": 0.21156577333606844,
"learning_rate": 9.37950937950938e-06,
"loss": 0.3402,
"step": 1921
},
{
"epoch": 2.493186242699546,
"grad_norm": 0.20413132600430678,
"learning_rate": 9.355459355459357e-06,
"loss": 0.3483,
"step": 1922
},
{
"epoch": 2.4944841012329655,
"grad_norm": 0.1966838578810161,
"learning_rate": 9.331409331409331e-06,
"loss": 0.3412,
"step": 1923
},
{
"epoch": 2.4957819597663855,
"grad_norm": 0.19973328891467468,
"learning_rate": 9.307359307359308e-06,
"loss": 0.3458,
"step": 1924
},
{
"epoch": 2.497079818299805,
"grad_norm": 0.2020638046963019,
"learning_rate": 9.283309283309284e-06,
"loss": 0.3393,
"step": 1925
},
{
"epoch": 2.498377676833225,
"grad_norm": 0.22766804942928376,
"learning_rate": 9.259259259259259e-06,
"loss": 0.3255,
"step": 1926
},
{
"epoch": 2.499675535366645,
"grad_norm": 0.1997890343589566,
"learning_rate": 9.235209235209236e-06,
"loss": 0.3379,
"step": 1927
},
{
"epoch": 2.5009733939000647,
"grad_norm": 0.19417057565689014,
"learning_rate": 9.211159211159212e-06,
"loss": 0.3362,
"step": 1928
},
{
"epoch": 2.5022712524334847,
"grad_norm": 0.2159121005299441,
"learning_rate": 9.187109187109187e-06,
"loss": 0.333,
"step": 1929
},
{
"epoch": 2.5035691109669047,
"grad_norm": 0.2152179356087276,
"learning_rate": 9.163059163059163e-06,
"loss": 0.3299,
"step": 1930
},
{
"epoch": 2.5048669695003243,
"grad_norm": 0.19905359149185228,
"learning_rate": 9.13900913900914e-06,
"loss": 0.3402,
"step": 1931
},
{
"epoch": 2.5061648280337443,
"grad_norm": 0.20691948697032309,
"learning_rate": 9.114959114959115e-06,
"loss": 0.3513,
"step": 1932
},
{
"epoch": 2.5074626865671643,
"grad_norm": 0.19894235448528472,
"learning_rate": 9.090909090909091e-06,
"loss": 0.3288,
"step": 1933
},
{
"epoch": 2.508760545100584,
"grad_norm": 0.21082966995035995,
"learning_rate": 9.066859066859068e-06,
"loss": 0.3332,
"step": 1934
},
{
"epoch": 2.510058403634004,
"grad_norm": 0.20169901454411296,
"learning_rate": 9.042809042809042e-06,
"loss": 0.3223,
"step": 1935
},
{
"epoch": 2.511356262167424,
"grad_norm": 0.21866454354506568,
"learning_rate": 9.018759018759019e-06,
"loss": 0.3392,
"step": 1936
},
{
"epoch": 2.5126541207008435,
"grad_norm": 0.21257413698744998,
"learning_rate": 8.994708994708995e-06,
"loss": 0.3452,
"step": 1937
},
{
"epoch": 2.5139519792342635,
"grad_norm": 0.19946752360820091,
"learning_rate": 8.970658970658972e-06,
"loss": 0.3426,
"step": 1938
},
{
"epoch": 2.5152498377676835,
"grad_norm": 0.2048047114149695,
"learning_rate": 8.946608946608948e-06,
"loss": 0.3496,
"step": 1939
},
{
"epoch": 2.516547696301103,
"grad_norm": 0.2060713206544726,
"learning_rate": 8.922558922558923e-06,
"loss": 0.3466,
"step": 1940
},
{
"epoch": 2.517845554834523,
"grad_norm": 0.21499517524206285,
"learning_rate": 8.8985088985089e-06,
"loss": 0.3391,
"step": 1941
},
{
"epoch": 2.519143413367943,
"grad_norm": 0.2043639715896453,
"learning_rate": 8.874458874458876e-06,
"loss": 0.3463,
"step": 1942
},
{
"epoch": 2.5204412719013627,
"grad_norm": 0.2183414837407522,
"learning_rate": 8.85040885040885e-06,
"loss": 0.3427,
"step": 1943
},
{
"epoch": 2.5217391304347827,
"grad_norm": 0.20285948319637043,
"learning_rate": 8.826358826358827e-06,
"loss": 0.3377,
"step": 1944
},
{
"epoch": 2.5230369889682027,
"grad_norm": 0.19391653715006987,
"learning_rate": 8.802308802308802e-06,
"loss": 0.3431,
"step": 1945
},
{
"epoch": 2.5243348475016223,
"grad_norm": 0.19725005677345353,
"learning_rate": 8.778258778258778e-06,
"loss": 0.3402,
"step": 1946
},
{
"epoch": 2.5256327060350423,
"grad_norm": 0.20776675755315593,
"learning_rate": 8.754208754208755e-06,
"loss": 0.3432,
"step": 1947
},
{
"epoch": 2.526930564568462,
"grad_norm": 0.20770257485418178,
"learning_rate": 8.73015873015873e-06,
"loss": 0.3309,
"step": 1948
},
{
"epoch": 2.528228423101882,
"grad_norm": 0.19473645944952384,
"learning_rate": 8.706108706108706e-06,
"loss": 0.3335,
"step": 1949
},
{
"epoch": 2.529526281635302,
"grad_norm": 0.20760911826948453,
"learning_rate": 8.682058682058683e-06,
"loss": 0.3418,
"step": 1950
},
{
"epoch": 2.5308241401687215,
"grad_norm": 0.2094780858154728,
"learning_rate": 8.658008658008657e-06,
"loss": 0.3425,
"step": 1951
},
{
"epoch": 2.5321219987021415,
"grad_norm": 0.19840367931370975,
"learning_rate": 8.633958633958634e-06,
"loss": 0.3417,
"step": 1952
},
{
"epoch": 2.533419857235561,
"grad_norm": 0.19624410854082408,
"learning_rate": 8.60990860990861e-06,
"loss": 0.3456,
"step": 1953
},
{
"epoch": 2.534717715768981,
"grad_norm": 0.19653675310930965,
"learning_rate": 8.585858585858587e-06,
"loss": 0.3287,
"step": 1954
},
{
"epoch": 2.536015574302401,
"grad_norm": 0.2033148019089131,
"learning_rate": 8.561808561808563e-06,
"loss": 0.3416,
"step": 1955
},
{
"epoch": 2.5373134328358207,
"grad_norm": 0.19360893151616687,
"learning_rate": 8.537758537758538e-06,
"loss": 0.3554,
"step": 1956
},
{
"epoch": 2.5386112913692407,
"grad_norm": 0.18776659365965748,
"learning_rate": 8.513708513708514e-06,
"loss": 0.3394,
"step": 1957
},
{
"epoch": 2.5399091499026607,
"grad_norm": 0.21687377756718906,
"learning_rate": 8.489658489658491e-06,
"loss": 0.3447,
"step": 1958
},
{
"epoch": 2.5412070084360803,
"grad_norm": 0.19788163493461905,
"learning_rate": 8.465608465608466e-06,
"loss": 0.3383,
"step": 1959
},
{
"epoch": 2.5425048669695003,
"grad_norm": 0.20991331629937682,
"learning_rate": 8.441558441558442e-06,
"loss": 0.3405,
"step": 1960
},
{
"epoch": 2.5438027255029203,
"grad_norm": 0.20315647768031084,
"learning_rate": 8.417508417508419e-06,
"loss": 0.3454,
"step": 1961
},
{
"epoch": 2.54510058403634,
"grad_norm": 0.23523075305367147,
"learning_rate": 8.393458393458393e-06,
"loss": 0.3618,
"step": 1962
},
{
"epoch": 2.54639844256976,
"grad_norm": 0.2157971364103013,
"learning_rate": 8.36940836940837e-06,
"loss": 0.3497,
"step": 1963
},
{
"epoch": 2.54769630110318,
"grad_norm": 0.20499915960426765,
"learning_rate": 8.345358345358346e-06,
"loss": 0.3309,
"step": 1964
},
{
"epoch": 2.5489941596365995,
"grad_norm": 0.20698252650858404,
"learning_rate": 8.321308321308321e-06,
"loss": 0.3331,
"step": 1965
},
{
"epoch": 2.5502920181700195,
"grad_norm": 0.20188066664017376,
"learning_rate": 8.297258297258298e-06,
"loss": 0.346,
"step": 1966
},
{
"epoch": 2.5515898767034395,
"grad_norm": 0.22092684369504248,
"learning_rate": 8.273208273208272e-06,
"loss": 0.3487,
"step": 1967
},
{
"epoch": 2.552887735236859,
"grad_norm": 0.20340970119145418,
"learning_rate": 8.249158249158249e-06,
"loss": 0.3341,
"step": 1968
},
{
"epoch": 2.554185593770279,
"grad_norm": 0.20252572395415291,
"learning_rate": 8.225108225108225e-06,
"loss": 0.3332,
"step": 1969
},
{
"epoch": 2.555483452303699,
"grad_norm": 0.1879834295873596,
"learning_rate": 8.201058201058202e-06,
"loss": 0.3212,
"step": 1970
},
{
"epoch": 2.5567813108371187,
"grad_norm": 0.19299674074621231,
"learning_rate": 8.177008177008178e-06,
"loss": 0.3404,
"step": 1971
},
{
"epoch": 2.5580791693705387,
"grad_norm": 0.20796973098772337,
"learning_rate": 8.152958152958155e-06,
"loss": 0.3389,
"step": 1972
},
{
"epoch": 2.5593770279039587,
"grad_norm": 0.21058263937992205,
"learning_rate": 8.12890812890813e-06,
"loss": 0.3382,
"step": 1973
},
{
"epoch": 2.5606748864373783,
"grad_norm": 0.19698041890440957,
"learning_rate": 8.104858104858106e-06,
"loss": 0.3406,
"step": 1974
},
{
"epoch": 2.5619727449707983,
"grad_norm": 0.20441051012340986,
"learning_rate": 8.080808080808082e-06,
"loss": 0.3568,
"step": 1975
},
{
"epoch": 2.5632706035042183,
"grad_norm": 0.20023706231769592,
"learning_rate": 8.056758056758057e-06,
"loss": 0.3453,
"step": 1976
},
{
"epoch": 2.564568462037638,
"grad_norm": 0.19384483347770198,
"learning_rate": 8.032708032708034e-06,
"loss": 0.3541,
"step": 1977
},
{
"epoch": 2.565866320571058,
"grad_norm": 0.1902457140768143,
"learning_rate": 8.008658008658008e-06,
"loss": 0.3259,
"step": 1978
},
{
"epoch": 2.5671641791044775,
"grad_norm": 0.2065028347871094,
"learning_rate": 7.984607984607985e-06,
"loss": 0.3543,
"step": 1979
},
{
"epoch": 2.5684620376378975,
"grad_norm": 0.19772657046385608,
"learning_rate": 7.960557960557961e-06,
"loss": 0.3353,
"step": 1980
},
{
"epoch": 2.569759896171317,
"grad_norm": 0.18849021172503813,
"learning_rate": 7.936507936507936e-06,
"loss": 0.3223,
"step": 1981
},
{
"epoch": 2.571057754704737,
"grad_norm": 0.208888741548615,
"learning_rate": 7.912457912457913e-06,
"loss": 0.3444,
"step": 1982
},
{
"epoch": 2.572355613238157,
"grad_norm": 0.22525608656131163,
"learning_rate": 7.888407888407889e-06,
"loss": 0.3361,
"step": 1983
},
{
"epoch": 2.5736534717715767,
"grad_norm": 0.207515663353734,
"learning_rate": 7.864357864357864e-06,
"loss": 0.3368,
"step": 1984
},
{
"epoch": 2.5749513303049967,
"grad_norm": 0.2154517901626009,
"learning_rate": 7.84030784030784e-06,
"loss": 0.3345,
"step": 1985
},
{
"epoch": 2.5762491888384167,
"grad_norm": 0.2059272342950417,
"learning_rate": 7.816257816257815e-06,
"loss": 0.3657,
"step": 1986
},
{
"epoch": 2.5775470473718363,
"grad_norm": 0.20247815650907755,
"learning_rate": 7.792207792207792e-06,
"loss": 0.3248,
"step": 1987
},
{
"epoch": 2.5788449059052563,
"grad_norm": 0.18967243424535923,
"learning_rate": 7.768157768157768e-06,
"loss": 0.3345,
"step": 1988
},
{
"epoch": 2.5801427644386763,
"grad_norm": 0.20514510031749197,
"learning_rate": 7.744107744107745e-06,
"loss": 0.3407,
"step": 1989
},
{
"epoch": 2.581440622972096,
"grad_norm": 0.2047332936097657,
"learning_rate": 7.720057720057721e-06,
"loss": 0.3404,
"step": 1990
},
{
"epoch": 2.582738481505516,
"grad_norm": 0.19001160907023656,
"learning_rate": 7.696007696007697e-06,
"loss": 0.3363,
"step": 1991
},
{
"epoch": 2.584036340038936,
"grad_norm": 0.2007791373982576,
"learning_rate": 7.671957671957672e-06,
"loss": 0.345,
"step": 1992
},
{
"epoch": 2.5853341985723555,
"grad_norm": 0.1995014007677766,
"learning_rate": 7.647907647907649e-06,
"loss": 0.3373,
"step": 1993
},
{
"epoch": 2.5866320571057755,
"grad_norm": 0.22960201823541218,
"learning_rate": 7.623857623857625e-06,
"loss": 0.35,
"step": 1994
},
{
"epoch": 2.5879299156391955,
"grad_norm": 0.2041708033798508,
"learning_rate": 7.5998075998076e-06,
"loss": 0.3299,
"step": 1995
},
{
"epoch": 2.589227774172615,
"grad_norm": 0.19021132164070584,
"learning_rate": 7.5757575757575764e-06,
"loss": 0.3434,
"step": 1996
},
{
"epoch": 2.590525632706035,
"grad_norm": 0.19160256744066273,
"learning_rate": 7.551707551707551e-06,
"loss": 0.3376,
"step": 1997
},
{
"epoch": 2.591823491239455,
"grad_norm": 0.1955416018816942,
"learning_rate": 7.527657527657528e-06,
"loss": 0.3407,
"step": 1998
},
{
"epoch": 2.5931213497728747,
"grad_norm": 0.1994363091424892,
"learning_rate": 7.503607503607504e-06,
"loss": 0.3271,
"step": 1999
},
{
"epoch": 2.5944192083062947,
"grad_norm": 0.19719206828799127,
"learning_rate": 7.47955747955748e-06,
"loss": 0.3491,
"step": 2000
},
{
"epoch": 2.5957170668397147,
"grad_norm": 0.21854928408023744,
"learning_rate": 7.455507455507456e-06,
"loss": 0.3559,
"step": 2001
},
{
"epoch": 2.5970149253731343,
"grad_norm": 0.2004433126098414,
"learning_rate": 7.431457431457433e-06,
"loss": 0.353,
"step": 2002
},
{
"epoch": 2.5983127839065543,
"grad_norm": 0.21521520240395595,
"learning_rate": 7.4074074074074075e-06,
"loss": 0.3533,
"step": 2003
},
{
"epoch": 2.5996106424399743,
"grad_norm": 0.1904741360498635,
"learning_rate": 7.383357383357384e-06,
"loss": 0.3375,
"step": 2004
},
{
"epoch": 2.600908500973394,
"grad_norm": 0.20148594815991572,
"learning_rate": 7.35930735930736e-06,
"loss": 0.3404,
"step": 2005
},
{
"epoch": 2.602206359506814,
"grad_norm": 0.20222565117203797,
"learning_rate": 7.335257335257335e-06,
"loss": 0.3325,
"step": 2006
},
{
"epoch": 2.6035042180402335,
"grad_norm": 0.2086022602926794,
"learning_rate": 7.311207311207312e-06,
"loss": 0.3323,
"step": 2007
},
{
"epoch": 2.6048020765736535,
"grad_norm": 0.19695846789935645,
"learning_rate": 7.2871572871572864e-06,
"loss": 0.3419,
"step": 2008
},
{
"epoch": 2.6060999351070735,
"grad_norm": 0.2003548511519371,
"learning_rate": 7.263107263107263e-06,
"loss": 0.3356,
"step": 2009
},
{
"epoch": 2.607397793640493,
"grad_norm": 0.20970980037255266,
"learning_rate": 7.239057239057239e-06,
"loss": 0.3385,
"step": 2010
},
{
"epoch": 2.608695652173913,
"grad_norm": 0.19898685255635887,
"learning_rate": 7.215007215007215e-06,
"loss": 0.3427,
"step": 2011
},
{
"epoch": 2.6099935107073327,
"grad_norm": 0.20226511821516943,
"learning_rate": 7.1909571909571915e-06,
"loss": 0.3272,
"step": 2012
},
{
"epoch": 2.6112913692407527,
"grad_norm": 0.18832236422091425,
"learning_rate": 7.166907166907168e-06,
"loss": 0.3354,
"step": 2013
},
{
"epoch": 2.6125892277741727,
"grad_norm": 0.18327010064211902,
"learning_rate": 7.142857142857143e-06,
"loss": 0.3286,
"step": 2014
},
{
"epoch": 2.6138870863075923,
"grad_norm": 0.19895899041097867,
"learning_rate": 7.118807118807119e-06,
"loss": 0.3297,
"step": 2015
},
{
"epoch": 2.6151849448410123,
"grad_norm": 0.2069075643758518,
"learning_rate": 7.094757094757096e-06,
"loss": 0.3302,
"step": 2016
},
{
"epoch": 2.6164828033744323,
"grad_norm": 0.19524313780033972,
"learning_rate": 7.0707070707070704e-06,
"loss": 0.3365,
"step": 2017
},
{
"epoch": 2.617780661907852,
"grad_norm": 0.19938059659766813,
"learning_rate": 7.046657046657047e-06,
"loss": 0.3338,
"step": 2018
},
{
"epoch": 2.619078520441272,
"grad_norm": 0.19493636421814658,
"learning_rate": 7.0226070226070225e-06,
"loss": 0.3315,
"step": 2019
},
{
"epoch": 2.620376378974692,
"grad_norm": 0.19839871511366805,
"learning_rate": 6.998556998556999e-06,
"loss": 0.3328,
"step": 2020
},
{
"epoch": 2.6216742375081115,
"grad_norm": 0.20131069199570611,
"learning_rate": 6.9745069745069755e-06,
"loss": 0.3509,
"step": 2021
},
{
"epoch": 2.6229720960415315,
"grad_norm": 0.19462988449384028,
"learning_rate": 6.95045695045695e-06,
"loss": 0.3491,
"step": 2022
},
{
"epoch": 2.6242699545749515,
"grad_norm": 0.21282789287292148,
"learning_rate": 6.926406926406927e-06,
"loss": 0.34,
"step": 2023
},
{
"epoch": 2.625567813108371,
"grad_norm": 0.1993074561130325,
"learning_rate": 6.902356902356903e-06,
"loss": 0.3466,
"step": 2024
},
{
"epoch": 2.626865671641791,
"grad_norm": 0.20351150350698377,
"learning_rate": 6.878306878306878e-06,
"loss": 0.3598,
"step": 2025
},
{
"epoch": 2.628163530175211,
"grad_norm": 0.19771094464637703,
"learning_rate": 6.854256854256854e-06,
"loss": 0.336,
"step": 2026
},
{
"epoch": 2.6294613887086307,
"grad_norm": 0.21482701918153888,
"learning_rate": 6.830206830206831e-06,
"loss": 0.34,
"step": 2027
},
{
"epoch": 2.6307592472420507,
"grad_norm": 0.2003506016053636,
"learning_rate": 6.8061568061568065e-06,
"loss": 0.3337,
"step": 2028
},
{
"epoch": 2.6320571057754707,
"grad_norm": 0.19105711872310324,
"learning_rate": 6.782106782106783e-06,
"loss": 0.3384,
"step": 2029
},
{
"epoch": 2.6333549643088903,
"grad_norm": 0.20159758006212594,
"learning_rate": 6.758056758056758e-06,
"loss": 0.3375,
"step": 2030
},
{
"epoch": 2.6346528228423103,
"grad_norm": 0.20143060805909305,
"learning_rate": 6.734006734006734e-06,
"loss": 0.3491,
"step": 2031
},
{
"epoch": 2.6359506813757303,
"grad_norm": 0.19866829942887484,
"learning_rate": 6.709956709956711e-06,
"loss": 0.3376,
"step": 2032
},
{
"epoch": 2.63724853990915,
"grad_norm": 0.2037924476485615,
"learning_rate": 6.6859066859066855e-06,
"loss": 0.3353,
"step": 2033
},
{
"epoch": 2.63854639844257,
"grad_norm": 0.19840737231971423,
"learning_rate": 6.661856661856662e-06,
"loss": 0.3419,
"step": 2034
},
{
"epoch": 2.6398442569759895,
"grad_norm": 0.2012069800711804,
"learning_rate": 6.637806637806638e-06,
"loss": 0.3374,
"step": 2035
},
{
"epoch": 2.6411421155094095,
"grad_norm": 0.2083568986903432,
"learning_rate": 6.613756613756614e-06,
"loss": 0.3535,
"step": 2036
},
{
"epoch": 2.6424399740428295,
"grad_norm": 0.19697725314574688,
"learning_rate": 6.5897065897065905e-06,
"loss": 0.3449,
"step": 2037
},
{
"epoch": 2.643737832576249,
"grad_norm": 0.19818150221871475,
"learning_rate": 6.565656565656567e-06,
"loss": 0.3409,
"step": 2038
},
{
"epoch": 2.645035691109669,
"grad_norm": 0.1978527267292534,
"learning_rate": 6.541606541606542e-06,
"loss": 0.3373,
"step": 2039
},
{
"epoch": 2.6463335496430886,
"grad_norm": 0.18635435314872623,
"learning_rate": 6.517556517556518e-06,
"loss": 0.3342,
"step": 2040
},
{
"epoch": 2.6476314081765087,
"grad_norm": 0.19790356410543516,
"learning_rate": 6.493506493506493e-06,
"loss": 0.3436,
"step": 2041
},
{
"epoch": 2.6489292667099287,
"grad_norm": 0.20316872420087156,
"learning_rate": 6.4694564694564695e-06,
"loss": 0.3448,
"step": 2042
},
{
"epoch": 2.6502271252433482,
"grad_norm": 0.19838963723019196,
"learning_rate": 6.445406445406446e-06,
"loss": 0.3467,
"step": 2043
},
{
"epoch": 2.6515249837767683,
"grad_norm": 0.19768166770591664,
"learning_rate": 6.4213564213564216e-06,
"loss": 0.3412,
"step": 2044
},
{
"epoch": 2.6528228423101883,
"grad_norm": 0.18870632230633216,
"learning_rate": 6.397306397306398e-06,
"loss": 0.3451,
"step": 2045
},
{
"epoch": 2.654120700843608,
"grad_norm": 0.19129582515190627,
"learning_rate": 6.3732563732563745e-06,
"loss": 0.3454,
"step": 2046
},
{
"epoch": 2.655418559377028,
"grad_norm": 0.20596889818566627,
"learning_rate": 6.349206349206349e-06,
"loss": 0.3534,
"step": 2047
},
{
"epoch": 2.656716417910448,
"grad_norm": 0.1951677844464869,
"learning_rate": 6.325156325156326e-06,
"loss": 0.3465,
"step": 2048
},
{
"epoch": 2.6580142764438675,
"grad_norm": 0.1839023960232542,
"learning_rate": 6.301106301106302e-06,
"loss": 0.3237,
"step": 2049
},
{
"epoch": 2.6593121349772875,
"grad_norm": 0.18411994909099402,
"learning_rate": 6.277056277056277e-06,
"loss": 0.3402,
"step": 2050
},
{
"epoch": 2.6606099935107075,
"grad_norm": 0.19031776842501638,
"learning_rate": 6.2530062530062535e-06,
"loss": 0.3354,
"step": 2051
},
{
"epoch": 2.661907852044127,
"grad_norm": 0.21384183187184516,
"learning_rate": 6.228956228956229e-06,
"loss": 0.342,
"step": 2052
},
{
"epoch": 2.663205710577547,
"grad_norm": 0.18478906620074495,
"learning_rate": 6.204906204906205e-06,
"loss": 0.3348,
"step": 2053
},
{
"epoch": 2.664503569110967,
"grad_norm": 0.20269895193238666,
"learning_rate": 6.180856180856181e-06,
"loss": 0.3351,
"step": 2054
},
{
"epoch": 2.6658014276443867,
"grad_norm": 0.20051102629513898,
"learning_rate": 6.156806156806158e-06,
"loss": 0.3385,
"step": 2055
},
{
"epoch": 2.6670992861778067,
"grad_norm": 0.18318606924324213,
"learning_rate": 6.132756132756133e-06,
"loss": 0.3364,
"step": 2056
},
{
"epoch": 2.6683971447112267,
"grad_norm": 0.19493171826087835,
"learning_rate": 6.108706108706109e-06,
"loss": 0.3348,
"step": 2057
},
{
"epoch": 2.6696950032446463,
"grad_norm": 0.18393836124851373,
"learning_rate": 6.0846560846560845e-06,
"loss": 0.3364,
"step": 2058
},
{
"epoch": 2.6709928617780663,
"grad_norm": 0.19573395789299228,
"learning_rate": 6.060606060606061e-06,
"loss": 0.3501,
"step": 2059
},
{
"epoch": 2.6722907203114863,
"grad_norm": 0.19289300424261566,
"learning_rate": 6.036556036556037e-06,
"loss": 0.3331,
"step": 2060
},
{
"epoch": 2.673588578844906,
"grad_norm": 0.20391450990957627,
"learning_rate": 6.012506012506012e-06,
"loss": 0.3523,
"step": 2061
},
{
"epoch": 2.674886437378326,
"grad_norm": 0.20029237980281211,
"learning_rate": 5.988455988455989e-06,
"loss": 0.3333,
"step": 2062
},
{
"epoch": 2.676184295911746,
"grad_norm": 0.1866620677693766,
"learning_rate": 5.964405964405965e-06,
"loss": 0.3311,
"step": 2063
},
{
"epoch": 2.6774821544451655,
"grad_norm": 0.19121945839733376,
"learning_rate": 5.940355940355941e-06,
"loss": 0.3411,
"step": 2064
},
{
"epoch": 2.6787800129785855,
"grad_norm": 0.19768175445567718,
"learning_rate": 5.916305916305916e-06,
"loss": 0.3331,
"step": 2065
},
{
"epoch": 2.680077871512005,
"grad_norm": 0.19564286501355985,
"learning_rate": 5.892255892255893e-06,
"loss": 0.3494,
"step": 2066
},
{
"epoch": 2.681375730045425,
"grad_norm": 0.18680263316102796,
"learning_rate": 5.8682058682058685e-06,
"loss": 0.3261,
"step": 2067
},
{
"epoch": 2.6826735885788446,
"grad_norm": 0.18888920820250896,
"learning_rate": 5.844155844155844e-06,
"loss": 0.3319,
"step": 2068
},
{
"epoch": 2.6839714471122647,
"grad_norm": 0.19397052933336428,
"learning_rate": 5.82010582010582e-06,
"loss": 0.3361,
"step": 2069
},
{
"epoch": 2.6852693056456847,
"grad_norm": 0.19077138187186174,
"learning_rate": 5.796055796055796e-06,
"loss": 0.3299,
"step": 2070
},
{
"epoch": 2.6865671641791042,
"grad_norm": 0.21127811092361148,
"learning_rate": 5.772005772005773e-06,
"loss": 0.3351,
"step": 2071
},
{
"epoch": 2.6878650227125243,
"grad_norm": 0.19439235797880994,
"learning_rate": 5.747955747955748e-06,
"loss": 0.3311,
"step": 2072
},
{
"epoch": 2.6891628812459443,
"grad_norm": 0.18768581482039637,
"learning_rate": 5.723905723905725e-06,
"loss": 0.3408,
"step": 2073
},
{
"epoch": 2.690460739779364,
"grad_norm": 0.20150498398104075,
"learning_rate": 5.6998556998557e-06,
"loss": 0.3297,
"step": 2074
},
{
"epoch": 2.691758598312784,
"grad_norm": 0.2030708714806736,
"learning_rate": 5.675805675805676e-06,
"loss": 0.3464,
"step": 2075
},
{
"epoch": 2.693056456846204,
"grad_norm": 0.19331515757587614,
"learning_rate": 5.651755651755652e-06,
"loss": 0.341,
"step": 2076
},
{
"epoch": 2.6943543153796234,
"grad_norm": 0.20425020368275043,
"learning_rate": 5.627705627705628e-06,
"loss": 0.3533,
"step": 2077
},
{
"epoch": 2.6956521739130435,
"grad_norm": 0.2332987934303732,
"learning_rate": 5.603655603655604e-06,
"loss": 0.3395,
"step": 2078
},
{
"epoch": 2.6969500324464635,
"grad_norm": 0.19691484544610982,
"learning_rate": 5.579605579605579e-06,
"loss": 0.3493,
"step": 2079
},
{
"epoch": 2.698247890979883,
"grad_norm": 0.2052051327790077,
"learning_rate": 5.555555555555556e-06,
"loss": 0.3622,
"step": 2080
},
{
"epoch": 2.699545749513303,
"grad_norm": 0.19298325427314583,
"learning_rate": 5.531505531505532e-06,
"loss": 0.3275,
"step": 2081
},
{
"epoch": 2.700843608046723,
"grad_norm": 0.19205634748728384,
"learning_rate": 5.507455507455508e-06,
"loss": 0.3364,
"step": 2082
},
{
"epoch": 2.7021414665801426,
"grad_norm": 0.199120001609843,
"learning_rate": 5.4834054834054835e-06,
"loss": 0.3604,
"step": 2083
},
{
"epoch": 2.7034393251135627,
"grad_norm": 0.19279039644707233,
"learning_rate": 5.45935545935546e-06,
"loss": 0.3531,
"step": 2084
},
{
"epoch": 2.7047371836469827,
"grad_norm": 0.19816454924229257,
"learning_rate": 5.435305435305436e-06,
"loss": 0.3386,
"step": 2085
},
{
"epoch": 2.7060350421804023,
"grad_norm": 0.1978192250026057,
"learning_rate": 5.411255411255411e-06,
"loss": 0.3353,
"step": 2086
},
{
"epoch": 2.7073329007138223,
"grad_norm": 0.1866947813546459,
"learning_rate": 5.387205387205387e-06,
"loss": 0.3361,
"step": 2087
},
{
"epoch": 2.7086307592472423,
"grad_norm": 0.19130243354442364,
"learning_rate": 5.363155363155363e-06,
"loss": 0.3339,
"step": 2088
},
{
"epoch": 2.709928617780662,
"grad_norm": 0.20123966523314857,
"learning_rate": 5.33910533910534e-06,
"loss": 0.352,
"step": 2089
},
{
"epoch": 2.711226476314082,
"grad_norm": 0.204082189254048,
"learning_rate": 5.3150553150553154e-06,
"loss": 0.3465,
"step": 2090
},
{
"epoch": 2.712524334847502,
"grad_norm": 0.18978137246025184,
"learning_rate": 5.291005291005291e-06,
"loss": 0.3436,
"step": 2091
},
{
"epoch": 2.7138221933809215,
"grad_norm": 0.18714999050678918,
"learning_rate": 5.2669552669552675e-06,
"loss": 0.3191,
"step": 2092
},
{
"epoch": 2.7151200519143415,
"grad_norm": 0.19155333104247205,
"learning_rate": 5.242905242905243e-06,
"loss": 0.3298,
"step": 2093
},
{
"epoch": 2.716417910447761,
"grad_norm": 0.19864886304787258,
"learning_rate": 5.218855218855219e-06,
"loss": 0.337,
"step": 2094
},
{
"epoch": 2.717715768981181,
"grad_norm": 0.20900011549166994,
"learning_rate": 5.194805194805195e-06,
"loss": 0.3351,
"step": 2095
},
{
"epoch": 2.719013627514601,
"grad_norm": 0.19283552005981475,
"learning_rate": 5.170755170755171e-06,
"loss": 0.3415,
"step": 2096
},
{
"epoch": 2.7203114860480206,
"grad_norm": 0.19735002707288396,
"learning_rate": 5.146705146705147e-06,
"loss": 0.3511,
"step": 2097
},
{
"epoch": 2.7216093445814407,
"grad_norm": 0.19007560174944538,
"learning_rate": 5.122655122655123e-06,
"loss": 0.3337,
"step": 2098
},
{
"epoch": 2.7229072031148602,
"grad_norm": 0.1945528470127734,
"learning_rate": 5.0986050986050994e-06,
"loss": 0.3333,
"step": 2099
},
{
"epoch": 2.7242050616482802,
"grad_norm": 0.19541816607187268,
"learning_rate": 5.074555074555075e-06,
"loss": 0.3375,
"step": 2100
},
{
"epoch": 2.7255029201817003,
"grad_norm": 0.1965347736989204,
"learning_rate": 5.050505050505051e-06,
"loss": 0.3374,
"step": 2101
},
{
"epoch": 2.72680077871512,
"grad_norm": 0.20176319781405136,
"learning_rate": 5.026455026455026e-06,
"loss": 0.3292,
"step": 2102
},
{
"epoch": 2.72809863724854,
"grad_norm": 0.20334136636920452,
"learning_rate": 5.002405002405003e-06,
"loss": 0.3546,
"step": 2103
},
{
"epoch": 2.72939649578196,
"grad_norm": 0.20179646454271286,
"learning_rate": 4.978354978354978e-06,
"loss": 0.3401,
"step": 2104
},
{
"epoch": 2.7306943543153794,
"grad_norm": 0.20239489126680496,
"learning_rate": 4.954304954304954e-06,
"loss": 0.362,
"step": 2105
},
{
"epoch": 2.7319922128487995,
"grad_norm": 0.1870990703654753,
"learning_rate": 4.9302549302549305e-06,
"loss": 0.3443,
"step": 2106
},
{
"epoch": 2.7332900713822195,
"grad_norm": 0.18557759206233268,
"learning_rate": 4.906204906204907e-06,
"loss": 0.3313,
"step": 2107
},
{
"epoch": 2.734587929915639,
"grad_norm": 0.19525861207810466,
"learning_rate": 4.8821548821548826e-06,
"loss": 0.3471,
"step": 2108
},
{
"epoch": 2.735885788449059,
"grad_norm": 0.1863895368451734,
"learning_rate": 4.858104858104858e-06,
"loss": 0.3302,
"step": 2109
},
{
"epoch": 2.737183646982479,
"grad_norm": 0.19520260521522104,
"learning_rate": 4.834054834054835e-06,
"loss": 0.3285,
"step": 2110
},
{
"epoch": 2.7384815055158986,
"grad_norm": 0.19296947043696586,
"learning_rate": 4.81000481000481e-06,
"loss": 0.3493,
"step": 2111
},
{
"epoch": 2.7397793640493187,
"grad_norm": 0.19207052807282113,
"learning_rate": 4.785954785954786e-06,
"loss": 0.3557,
"step": 2112
},
{
"epoch": 2.7410772225827387,
"grad_norm": 0.19274032451154754,
"learning_rate": 4.7619047619047615e-06,
"loss": 0.3363,
"step": 2113
},
{
"epoch": 2.7423750811161582,
"grad_norm": 0.1991025079179398,
"learning_rate": 4.737854737854738e-06,
"loss": 0.3383,
"step": 2114
},
{
"epoch": 2.7436729396495783,
"grad_norm": 0.20322726939044952,
"learning_rate": 4.7138047138047145e-06,
"loss": 0.3309,
"step": 2115
},
{
"epoch": 2.7449707981829983,
"grad_norm": 0.1953261545822402,
"learning_rate": 4.68975468975469e-06,
"loss": 0.3324,
"step": 2116
},
{
"epoch": 2.746268656716418,
"grad_norm": 0.20572870381079225,
"learning_rate": 4.665704665704666e-06,
"loss": 0.3519,
"step": 2117
},
{
"epoch": 2.747566515249838,
"grad_norm": 0.1896514715456808,
"learning_rate": 4.641654641654642e-06,
"loss": 0.3269,
"step": 2118
},
{
"epoch": 2.748864373783258,
"grad_norm": 0.20395631514698148,
"learning_rate": 4.617604617604618e-06,
"loss": 0.3378,
"step": 2119
},
{
"epoch": 2.7501622323166774,
"grad_norm": 0.18572879939359394,
"learning_rate": 4.5935545935545934e-06,
"loss": 0.3577,
"step": 2120
},
{
"epoch": 2.7514600908500975,
"grad_norm": 0.1888314312446457,
"learning_rate": 4.56950456950457e-06,
"loss": 0.3323,
"step": 2121
},
{
"epoch": 2.752757949383517,
"grad_norm": 0.19430952155026918,
"learning_rate": 4.5454545454545455e-06,
"loss": 0.3439,
"step": 2122
},
{
"epoch": 2.754055807916937,
"grad_norm": 0.42636074237494337,
"learning_rate": 4.521404521404521e-06,
"loss": 0.3473,
"step": 2123
},
{
"epoch": 2.755353666450357,
"grad_norm": 0.2086969030917497,
"learning_rate": 4.497354497354498e-06,
"loss": 0.3651,
"step": 2124
},
{
"epoch": 2.7566515249837766,
"grad_norm": 0.18713017378956393,
"learning_rate": 4.473304473304474e-06,
"loss": 0.3391,
"step": 2125
},
{
"epoch": 2.7579493835171967,
"grad_norm": 0.1997677484748378,
"learning_rate": 4.44925444925445e-06,
"loss": 0.3296,
"step": 2126
},
{
"epoch": 2.7592472420506162,
"grad_norm": 0.19085510775569323,
"learning_rate": 4.425204425204425e-06,
"loss": 0.3306,
"step": 2127
},
{
"epoch": 2.7605451005840362,
"grad_norm": 0.1897332096553123,
"learning_rate": 4.401154401154401e-06,
"loss": 0.3319,
"step": 2128
},
{
"epoch": 2.7618429591174563,
"grad_norm": 0.18447234964642742,
"learning_rate": 4.377104377104377e-06,
"loss": 0.3381,
"step": 2129
},
{
"epoch": 2.763140817650876,
"grad_norm": 0.19659685588771536,
"learning_rate": 4.353054353054353e-06,
"loss": 0.3411,
"step": 2130
},
{
"epoch": 2.764438676184296,
"grad_norm": 0.19285636988233915,
"learning_rate": 4.329004329004329e-06,
"loss": 0.3363,
"step": 2131
},
{
"epoch": 2.765736534717716,
"grad_norm": 0.1873294345390938,
"learning_rate": 4.304954304954305e-06,
"loss": 0.3479,
"step": 2132
},
{
"epoch": 2.7670343932511354,
"grad_norm": 0.20641352605961297,
"learning_rate": 4.280904280904282e-06,
"loss": 0.3479,
"step": 2133
},
{
"epoch": 2.7683322517845554,
"grad_norm": 0.19611830886976564,
"learning_rate": 4.256854256854257e-06,
"loss": 0.3598,
"step": 2134
},
{
"epoch": 2.7696301103179755,
"grad_norm": 0.19362363231720492,
"learning_rate": 4.232804232804233e-06,
"loss": 0.3587,
"step": 2135
},
{
"epoch": 2.770927968851395,
"grad_norm": 0.18330247717729053,
"learning_rate": 4.208754208754209e-06,
"loss": 0.3395,
"step": 2136
},
{
"epoch": 2.772225827384815,
"grad_norm": 0.194933200111678,
"learning_rate": 4.184704184704185e-06,
"loss": 0.3508,
"step": 2137
},
{
"epoch": 2.773523685918235,
"grad_norm": 0.21351784132569623,
"learning_rate": 4.1606541606541606e-06,
"loss": 0.3569,
"step": 2138
},
{
"epoch": 2.7748215444516546,
"grad_norm": 0.2022298125861802,
"learning_rate": 4.136604136604136e-06,
"loss": 0.3452,
"step": 2139
},
{
"epoch": 2.7761194029850746,
"grad_norm": 0.19008703004655647,
"learning_rate": 4.112554112554113e-06,
"loss": 0.3261,
"step": 2140
},
{
"epoch": 2.7774172615184947,
"grad_norm": 0.1996334755494925,
"learning_rate": 4.088504088504089e-06,
"loss": 0.3348,
"step": 2141
},
{
"epoch": 2.7787151200519142,
"grad_norm": 0.2016162252782046,
"learning_rate": 4.064454064454065e-06,
"loss": 0.339,
"step": 2142
},
{
"epoch": 2.7800129785853342,
"grad_norm": 0.20145152124370821,
"learning_rate": 4.040404040404041e-06,
"loss": 0.3384,
"step": 2143
},
{
"epoch": 2.7813108371187543,
"grad_norm": 0.20921251143803626,
"learning_rate": 4.016354016354017e-06,
"loss": 0.3449,
"step": 2144
},
{
"epoch": 2.782608695652174,
"grad_norm": 0.18838640289015987,
"learning_rate": 3.9923039923039925e-06,
"loss": 0.3249,
"step": 2145
},
{
"epoch": 2.783906554185594,
"grad_norm": 0.203141862957814,
"learning_rate": 3.968253968253968e-06,
"loss": 0.3474,
"step": 2146
},
{
"epoch": 2.785204412719014,
"grad_norm": 0.18959922111635913,
"learning_rate": 3.9442039442039446e-06,
"loss": 0.3358,
"step": 2147
},
{
"epoch": 2.7865022712524334,
"grad_norm": 0.2034563222680623,
"learning_rate": 3.92015392015392e-06,
"loss": 0.3399,
"step": 2148
},
{
"epoch": 2.7878001297858535,
"grad_norm": 0.197654010004629,
"learning_rate": 3.896103896103896e-06,
"loss": 0.3384,
"step": 2149
},
{
"epoch": 2.7890979883192735,
"grad_norm": 0.20680720078235312,
"learning_rate": 3.872053872053872e-06,
"loss": 0.3326,
"step": 2150
},
{
"epoch": 2.790395846852693,
"grad_norm": 0.1927827514450044,
"learning_rate": 3.848003848003849e-06,
"loss": 0.3276,
"step": 2151
},
{
"epoch": 2.791693705386113,
"grad_norm": 0.1899459712119537,
"learning_rate": 3.823953823953824e-06,
"loss": 0.3368,
"step": 2152
},
{
"epoch": 2.7929915639195326,
"grad_norm": 0.18276333170806777,
"learning_rate": 3.7999037999038e-06,
"loss": 0.3252,
"step": 2153
},
{
"epoch": 2.7942894224529526,
"grad_norm": 0.19256844716061763,
"learning_rate": 3.7758537758537756e-06,
"loss": 0.3483,
"step": 2154
},
{
"epoch": 2.795587280986372,
"grad_norm": 0.18473354170932832,
"learning_rate": 3.751803751803752e-06,
"loss": 0.3329,
"step": 2155
},
{
"epoch": 2.7968851395197922,
"grad_norm": 0.20272211241013205,
"learning_rate": 3.727753727753728e-06,
"loss": 0.363,
"step": 2156
},
{
"epoch": 2.7981829980532122,
"grad_norm": 0.210319586938317,
"learning_rate": 3.7037037037037037e-06,
"loss": 0.3547,
"step": 2157
},
{
"epoch": 2.799480856586632,
"grad_norm": 0.18619320502174647,
"learning_rate": 3.67965367965368e-06,
"loss": 0.3282,
"step": 2158
},
{
"epoch": 2.800778715120052,
"grad_norm": 0.1772781571540208,
"learning_rate": 3.655603655603656e-06,
"loss": 0.33,
"step": 2159
},
{
"epoch": 2.802076573653472,
"grad_norm": 0.19401337158226317,
"learning_rate": 3.6315536315536315e-06,
"loss": 0.3418,
"step": 2160
},
{
"epoch": 2.8033744321868914,
"grad_norm": 0.20093342296511638,
"learning_rate": 3.6075036075036075e-06,
"loss": 0.3324,
"step": 2161
},
{
"epoch": 2.8046722907203114,
"grad_norm": 0.18887641527097687,
"learning_rate": 3.583453583453584e-06,
"loss": 0.3256,
"step": 2162
},
{
"epoch": 2.8059701492537314,
"grad_norm": 0.17961558157640115,
"learning_rate": 3.5594035594035596e-06,
"loss": 0.3328,
"step": 2163
},
{
"epoch": 2.807268007787151,
"grad_norm": 0.20347532828593798,
"learning_rate": 3.5353535353535352e-06,
"loss": 0.3421,
"step": 2164
},
{
"epoch": 2.808565866320571,
"grad_norm": 0.18010386527024905,
"learning_rate": 3.5113035113035113e-06,
"loss": 0.326,
"step": 2165
},
{
"epoch": 2.809863724853991,
"grad_norm": 0.18682472145471216,
"learning_rate": 3.4872534872534877e-06,
"loss": 0.3277,
"step": 2166
},
{
"epoch": 2.8111615833874106,
"grad_norm": 0.18668496331694528,
"learning_rate": 3.4632034632034634e-06,
"loss": 0.3441,
"step": 2167
},
{
"epoch": 2.8124594419208306,
"grad_norm": 0.18876447344150002,
"learning_rate": 3.439153439153439e-06,
"loss": 0.3543,
"step": 2168
},
{
"epoch": 2.8137573004542507,
"grad_norm": 0.1880026989268264,
"learning_rate": 3.4151034151034154e-06,
"loss": 0.3419,
"step": 2169
},
{
"epoch": 2.8150551589876702,
"grad_norm": 0.19326058199934312,
"learning_rate": 3.3910533910533915e-06,
"loss": 0.3332,
"step": 2170
},
{
"epoch": 2.8163530175210902,
"grad_norm": 0.18329023377490067,
"learning_rate": 3.367003367003367e-06,
"loss": 0.3632,
"step": 2171
},
{
"epoch": 2.8176508760545103,
"grad_norm": 0.19371890235019304,
"learning_rate": 3.3429533429533427e-06,
"loss": 0.3377,
"step": 2172
},
{
"epoch": 2.81894873458793,
"grad_norm": 0.18600523979644987,
"learning_rate": 3.318903318903319e-06,
"loss": 0.3469,
"step": 2173
},
{
"epoch": 2.82024659312135,
"grad_norm": 0.19389227471137455,
"learning_rate": 3.2948532948532953e-06,
"loss": 0.3387,
"step": 2174
},
{
"epoch": 2.82154445165477,
"grad_norm": 0.18954299093028096,
"learning_rate": 3.270803270803271e-06,
"loss": 0.3279,
"step": 2175
},
{
"epoch": 2.8228423101881894,
"grad_norm": 0.19152410986871543,
"learning_rate": 3.2467532467532465e-06,
"loss": 0.3374,
"step": 2176
},
{
"epoch": 2.8241401687216094,
"grad_norm": 0.196063104407719,
"learning_rate": 3.222703222703223e-06,
"loss": 0.3385,
"step": 2177
},
{
"epoch": 2.8254380272550295,
"grad_norm": 0.18850542953314792,
"learning_rate": 3.198653198653199e-06,
"loss": 0.349,
"step": 2178
},
{
"epoch": 2.826735885788449,
"grad_norm": 0.20124590955928826,
"learning_rate": 3.1746031746031746e-06,
"loss": 0.3415,
"step": 2179
},
{
"epoch": 2.828033744321869,
"grad_norm": 0.20248550914830157,
"learning_rate": 3.150553150553151e-06,
"loss": 0.3461,
"step": 2180
},
{
"epoch": 2.8293316028552886,
"grad_norm": 0.20035006058516966,
"learning_rate": 3.1265031265031267e-06,
"loss": 0.3432,
"step": 2181
},
{
"epoch": 2.8306294613887086,
"grad_norm": 0.1855009910859687,
"learning_rate": 3.1024531024531023e-06,
"loss": 0.3187,
"step": 2182
},
{
"epoch": 2.8319273199221286,
"grad_norm": 0.2143582348750643,
"learning_rate": 3.078403078403079e-06,
"loss": 0.3405,
"step": 2183
},
{
"epoch": 2.833225178455548,
"grad_norm": 0.19882736391050926,
"learning_rate": 3.0543530543530544e-06,
"loss": 0.3437,
"step": 2184
},
{
"epoch": 2.8345230369889682,
"grad_norm": 0.19373263047996803,
"learning_rate": 3.0303030303030305e-06,
"loss": 0.3391,
"step": 2185
},
{
"epoch": 2.835820895522388,
"grad_norm": 0.19378870740750723,
"learning_rate": 3.006253006253006e-06,
"loss": 0.3478,
"step": 2186
},
{
"epoch": 2.837118754055808,
"grad_norm": 0.24107367256249349,
"learning_rate": 2.9822029822029826e-06,
"loss": 0.3458,
"step": 2187
},
{
"epoch": 2.838416612589228,
"grad_norm": 0.18521053064833032,
"learning_rate": 2.958152958152958e-06,
"loss": 0.3383,
"step": 2188
},
{
"epoch": 2.8397144711226474,
"grad_norm": 0.1881021394765785,
"learning_rate": 2.9341029341029342e-06,
"loss": 0.3517,
"step": 2189
},
{
"epoch": 2.8410123296560674,
"grad_norm": 0.20171700206005888,
"learning_rate": 2.91005291005291e-06,
"loss": 0.3689,
"step": 2190
},
{
"epoch": 2.8423101881894874,
"grad_norm": 0.18467275960567497,
"learning_rate": 2.8860028860028863e-06,
"loss": 0.3271,
"step": 2191
},
{
"epoch": 2.843608046722907,
"grad_norm": 0.1935758058556294,
"learning_rate": 2.8619528619528624e-06,
"loss": 0.3457,
"step": 2192
},
{
"epoch": 2.844905905256327,
"grad_norm": 0.20313128506718564,
"learning_rate": 2.837902837902838e-06,
"loss": 0.3315,
"step": 2193
},
{
"epoch": 2.846203763789747,
"grad_norm": 0.2694894225052093,
"learning_rate": 2.813852813852814e-06,
"loss": 0.3651,
"step": 2194
},
{
"epoch": 2.8475016223231666,
"grad_norm": 0.191890934429557,
"learning_rate": 2.7898027898027897e-06,
"loss": 0.3457,
"step": 2195
},
{
"epoch": 2.8487994808565866,
"grad_norm": 0.18753053839949832,
"learning_rate": 2.765752765752766e-06,
"loss": 0.3438,
"step": 2196
},
{
"epoch": 2.8500973393900066,
"grad_norm": 0.18148036317355992,
"learning_rate": 2.7417027417027418e-06,
"loss": 0.3271,
"step": 2197
},
{
"epoch": 2.851395197923426,
"grad_norm": 0.20235355112701617,
"learning_rate": 2.717652717652718e-06,
"loss": 0.3468,
"step": 2198
},
{
"epoch": 2.8526930564568462,
"grad_norm": 0.19882040438487505,
"learning_rate": 2.6936026936026934e-06,
"loss": 0.3649,
"step": 2199
},
{
"epoch": 2.8539909149902662,
"grad_norm": 0.1919921579971501,
"learning_rate": 2.66955266955267e-06,
"loss": 0.3373,
"step": 2200
},
{
"epoch": 2.855288773523686,
"grad_norm": 0.19166967508036267,
"learning_rate": 2.6455026455026455e-06,
"loss": 0.3407,
"step": 2201
},
{
"epoch": 2.856586632057106,
"grad_norm": 0.18413982998209266,
"learning_rate": 2.6214526214526216e-06,
"loss": 0.3281,
"step": 2202
},
{
"epoch": 2.857884490590526,
"grad_norm": 0.1963183914870018,
"learning_rate": 2.5974025974025976e-06,
"loss": 0.3519,
"step": 2203
},
{
"epoch": 2.8591823491239454,
"grad_norm": 0.19124327719338702,
"learning_rate": 2.5733525733525737e-06,
"loss": 0.3546,
"step": 2204
},
{
"epoch": 2.8604802076573654,
"grad_norm": 0.1908455405574935,
"learning_rate": 2.5493025493025497e-06,
"loss": 0.3441,
"step": 2205
},
{
"epoch": 2.8617780661907855,
"grad_norm": 0.20074510401322135,
"learning_rate": 2.5252525252525253e-06,
"loss": 0.3623,
"step": 2206
},
{
"epoch": 2.863075924724205,
"grad_norm": 0.19215908226190542,
"learning_rate": 2.5012025012025014e-06,
"loss": 0.3323,
"step": 2207
},
{
"epoch": 2.864373783257625,
"grad_norm": 0.18434791337783116,
"learning_rate": 2.477152477152477e-06,
"loss": 0.3319,
"step": 2208
},
{
"epoch": 2.8656716417910446,
"grad_norm": 0.19538722948283108,
"learning_rate": 2.4531024531024535e-06,
"loss": 0.3275,
"step": 2209
},
{
"epoch": 2.8669695003244646,
"grad_norm": 0.1882905413491712,
"learning_rate": 2.429052429052429e-06,
"loss": 0.3301,
"step": 2210
},
{
"epoch": 2.8682673588578846,
"grad_norm": 0.18413079624889964,
"learning_rate": 2.405002405002405e-06,
"loss": 0.3316,
"step": 2211
},
{
"epoch": 2.869565217391304,
"grad_norm": 0.18114340544426985,
"learning_rate": 2.3809523809523808e-06,
"loss": 0.3293,
"step": 2212
},
{
"epoch": 2.8708630759247242,
"grad_norm": 0.18339952929530665,
"learning_rate": 2.3569023569023572e-06,
"loss": 0.3385,
"step": 2213
},
{
"epoch": 2.872160934458144,
"grad_norm": 0.19149046899206099,
"learning_rate": 2.332852332852333e-06,
"loss": 0.3362,
"step": 2214
},
{
"epoch": 2.873458792991564,
"grad_norm": 0.21144971266274312,
"learning_rate": 2.308802308802309e-06,
"loss": 0.3426,
"step": 2215
},
{
"epoch": 2.874756651524984,
"grad_norm": 0.18252025886810355,
"learning_rate": 2.284752284752285e-06,
"loss": 0.3461,
"step": 2216
},
{
"epoch": 2.8760545100584034,
"grad_norm": 0.18661014168815393,
"learning_rate": 2.2607022607022606e-06,
"loss": 0.3532,
"step": 2217
},
{
"epoch": 2.8773523685918234,
"grad_norm": 0.19921497008286657,
"learning_rate": 2.236652236652237e-06,
"loss": 0.3456,
"step": 2218
},
{
"epoch": 2.8786502271252434,
"grad_norm": 0.19199558097836697,
"learning_rate": 2.2126022126022127e-06,
"loss": 0.336,
"step": 2219
},
{
"epoch": 2.879948085658663,
"grad_norm": 0.18697397277980365,
"learning_rate": 2.1885521885521887e-06,
"loss": 0.3274,
"step": 2220
},
{
"epoch": 2.881245944192083,
"grad_norm": 0.18651465996633548,
"learning_rate": 2.1645021645021643e-06,
"loss": 0.3362,
"step": 2221
},
{
"epoch": 2.882543802725503,
"grad_norm": 0.18811253057615676,
"learning_rate": 2.140452140452141e-06,
"loss": 0.3441,
"step": 2222
},
{
"epoch": 2.8838416612589226,
"grad_norm": 0.1807114251046355,
"learning_rate": 2.1164021164021164e-06,
"loss": 0.3237,
"step": 2223
},
{
"epoch": 2.8851395197923426,
"grad_norm": 0.18668302514006135,
"learning_rate": 2.0923520923520925e-06,
"loss": 0.3556,
"step": 2224
},
{
"epoch": 2.8864373783257626,
"grad_norm": 0.1951670359448047,
"learning_rate": 2.068302068302068e-06,
"loss": 0.3181,
"step": 2225
},
{
"epoch": 2.887735236859182,
"grad_norm": 0.1874121175894903,
"learning_rate": 2.0442520442520446e-06,
"loss": 0.3447,
"step": 2226
},
{
"epoch": 2.8890330953926022,
"grad_norm": 0.18533011275342226,
"learning_rate": 2.0202020202020206e-06,
"loss": 0.3363,
"step": 2227
},
{
"epoch": 2.8903309539260222,
"grad_norm": 0.18793474012414535,
"learning_rate": 1.9961519961519962e-06,
"loss": 0.345,
"step": 2228
},
{
"epoch": 2.891628812459442,
"grad_norm": 0.18922319032385707,
"learning_rate": 1.9721019721019723e-06,
"loss": 0.3386,
"step": 2229
},
{
"epoch": 2.892926670992862,
"grad_norm": 0.19294630111421893,
"learning_rate": 1.948051948051948e-06,
"loss": 0.3513,
"step": 2230
},
{
"epoch": 2.894224529526282,
"grad_norm": 0.1831585914628477,
"learning_rate": 1.9240019240019244e-06,
"loss": 0.3185,
"step": 2231
},
{
"epoch": 2.8955223880597014,
"grad_norm": 0.18959540248425852,
"learning_rate": 1.8999518999519e-06,
"loss": 0.3448,
"step": 2232
},
{
"epoch": 2.8968202465931214,
"grad_norm": 0.18655567267427287,
"learning_rate": 1.875901875901876e-06,
"loss": 0.3262,
"step": 2233
},
{
"epoch": 2.8981181051265414,
"grad_norm": 0.18952603893507794,
"learning_rate": 1.8518518518518519e-06,
"loss": 0.3282,
"step": 2234
},
{
"epoch": 2.899415963659961,
"grad_norm": 0.1876885696047965,
"learning_rate": 1.827801827801828e-06,
"loss": 0.3435,
"step": 2235
},
{
"epoch": 2.900713822193381,
"grad_norm": 0.1865624408496654,
"learning_rate": 1.8037518037518038e-06,
"loss": 0.3331,
"step": 2236
},
{
"epoch": 2.902011680726801,
"grad_norm": 0.19122160432352084,
"learning_rate": 1.7797017797017798e-06,
"loss": 0.3371,
"step": 2237
},
{
"epoch": 2.9033095392602206,
"grad_norm": 0.19352943277773518,
"learning_rate": 1.7556517556517556e-06,
"loss": 0.3405,
"step": 2238
},
{
"epoch": 2.9046073977936406,
"grad_norm": 0.19319000412284978,
"learning_rate": 1.7316017316017317e-06,
"loss": 0.3399,
"step": 2239
},
{
"epoch": 2.90590525632706,
"grad_norm": 0.19327464804923486,
"learning_rate": 1.7075517075517077e-06,
"loss": 0.3319,
"step": 2240
},
{
"epoch": 2.90720311486048,
"grad_norm": 0.19976992892290674,
"learning_rate": 1.6835016835016836e-06,
"loss": 0.3432,
"step": 2241
},
{
"epoch": 2.9085009733939,
"grad_norm": 0.1892168913000648,
"learning_rate": 1.6594516594516596e-06,
"loss": 0.3463,
"step": 2242
},
{
"epoch": 2.90979883192732,
"grad_norm": 0.19443589296751324,
"learning_rate": 1.6354016354016354e-06,
"loss": 0.357,
"step": 2243
},
{
"epoch": 2.91109669046074,
"grad_norm": 0.18449321307823713,
"learning_rate": 1.6113516113516115e-06,
"loss": 0.345,
"step": 2244
},
{
"epoch": 2.9123945489941594,
"grad_norm": 0.1858254182171351,
"learning_rate": 1.5873015873015873e-06,
"loss": 0.3447,
"step": 2245
},
{
"epoch": 2.9136924075275794,
"grad_norm": 0.2004597208671706,
"learning_rate": 1.5632515632515634e-06,
"loss": 0.3366,
"step": 2246
},
{
"epoch": 2.9149902660609994,
"grad_norm": 0.18322062527491037,
"learning_rate": 1.5392015392015394e-06,
"loss": 0.3375,
"step": 2247
},
{
"epoch": 2.916288124594419,
"grad_norm": 0.1905983562403602,
"learning_rate": 1.5151515151515152e-06,
"loss": 0.3411,
"step": 2248
},
{
"epoch": 2.917585983127839,
"grad_norm": 0.1867573407190652,
"learning_rate": 1.4911014911014913e-06,
"loss": 0.336,
"step": 2249
},
{
"epoch": 2.918883841661259,
"grad_norm": 0.19058206255135468,
"learning_rate": 1.4670514670514671e-06,
"loss": 0.3474,
"step": 2250
},
{
"epoch": 2.9201817001946786,
"grad_norm": 0.18685316734046567,
"learning_rate": 1.4430014430014432e-06,
"loss": 0.3502,
"step": 2251
},
{
"epoch": 2.9214795587280986,
"grad_norm": 0.18640666171276632,
"learning_rate": 1.418951418951419e-06,
"loss": 0.3317,
"step": 2252
},
{
"epoch": 2.9227774172615186,
"grad_norm": 0.19155464358225485,
"learning_rate": 1.3949013949013948e-06,
"loss": 0.3352,
"step": 2253
},
{
"epoch": 2.924075275794938,
"grad_norm": 0.1868855983928718,
"learning_rate": 1.3708513708513709e-06,
"loss": 0.328,
"step": 2254
},
{
"epoch": 2.925373134328358,
"grad_norm": 0.18433362766279043,
"learning_rate": 1.3468013468013467e-06,
"loss": 0.3452,
"step": 2255
},
{
"epoch": 2.9266709928617782,
"grad_norm": 0.19259127006608057,
"learning_rate": 1.3227513227513228e-06,
"loss": 0.3404,
"step": 2256
},
{
"epoch": 2.927968851395198,
"grad_norm": 0.18080624245350022,
"learning_rate": 1.2987012987012988e-06,
"loss": 0.3266,
"step": 2257
},
{
"epoch": 2.929266709928618,
"grad_norm": 0.17871124072334996,
"learning_rate": 1.2746512746512749e-06,
"loss": 0.3395,
"step": 2258
},
{
"epoch": 2.930564568462038,
"grad_norm": 0.18709418548907147,
"learning_rate": 1.2506012506012507e-06,
"loss": 0.346,
"step": 2259
},
{
"epoch": 2.9318624269954574,
"grad_norm": 0.18683092960850883,
"learning_rate": 1.2265512265512267e-06,
"loss": 0.336,
"step": 2260
},
{
"epoch": 2.9331602855288774,
"grad_norm": 0.18777575130149565,
"learning_rate": 1.2025012025012026e-06,
"loss": 0.3368,
"step": 2261
},
{
"epoch": 2.9344581440622974,
"grad_norm": 0.18324636658598714,
"learning_rate": 1.1784511784511786e-06,
"loss": 0.3292,
"step": 2262
},
{
"epoch": 2.935756002595717,
"grad_norm": 0.1851227917603969,
"learning_rate": 1.1544011544011545e-06,
"loss": 0.3378,
"step": 2263
},
{
"epoch": 2.937053861129137,
"grad_norm": 0.19847788748302606,
"learning_rate": 1.1303511303511303e-06,
"loss": 0.3754,
"step": 2264
},
{
"epoch": 2.938351719662557,
"grad_norm": 0.177806134860686,
"learning_rate": 1.1063011063011063e-06,
"loss": 0.327,
"step": 2265
},
{
"epoch": 2.9396495781959766,
"grad_norm": 0.197095005642012,
"learning_rate": 1.0822510822510822e-06,
"loss": 0.344,
"step": 2266
},
{
"epoch": 2.9409474367293966,
"grad_norm": 0.18894739122645604,
"learning_rate": 1.0582010582010582e-06,
"loss": 0.3437,
"step": 2267
},
{
"epoch": 2.942245295262816,
"grad_norm": 0.1763401949490533,
"learning_rate": 1.034151034151034e-06,
"loss": 0.3285,
"step": 2268
},
{
"epoch": 2.943543153796236,
"grad_norm": 0.1786777654803748,
"learning_rate": 1.0101010101010103e-06,
"loss": 0.3264,
"step": 2269
},
{
"epoch": 2.9448410123296562,
"grad_norm": 0.18511698934832105,
"learning_rate": 9.860509860509861e-07,
"loss": 0.3403,
"step": 2270
},
{
"epoch": 2.946138870863076,
"grad_norm": 0.18872890425471747,
"learning_rate": 9.620009620009622e-07,
"loss": 0.3288,
"step": 2271
},
{
"epoch": 2.947436729396496,
"grad_norm": 0.18279278752067737,
"learning_rate": 9.37950937950938e-07,
"loss": 0.3422,
"step": 2272
},
{
"epoch": 2.9487345879299154,
"grad_norm": 0.18006141885171842,
"learning_rate": 9.13900913900914e-07,
"loss": 0.3446,
"step": 2273
},
{
"epoch": 2.9500324464633354,
"grad_norm": 0.19260565452121156,
"learning_rate": 8.898508898508899e-07,
"loss": 0.3398,
"step": 2274
},
{
"epoch": 2.9513303049967554,
"grad_norm": 0.21921401490874187,
"learning_rate": 8.658008658008658e-07,
"loss": 0.3568,
"step": 2275
},
{
"epoch": 2.952628163530175,
"grad_norm": 0.18842161857636638,
"learning_rate": 8.417508417508418e-07,
"loss": 0.3498,
"step": 2276
},
{
"epoch": 2.953926022063595,
"grad_norm": 0.17891619649785445,
"learning_rate": 8.177008177008177e-07,
"loss": 0.3272,
"step": 2277
},
{
"epoch": 2.955223880597015,
"grad_norm": 0.17755022636746284,
"learning_rate": 7.936507936507937e-07,
"loss": 0.3231,
"step": 2278
},
{
"epoch": 2.9565217391304346,
"grad_norm": 0.1863595372909174,
"learning_rate": 7.696007696007697e-07,
"loss": 0.3408,
"step": 2279
},
{
"epoch": 2.9578195976638546,
"grad_norm": 0.17943740925178142,
"learning_rate": 7.455507455507456e-07,
"loss": 0.3222,
"step": 2280
},
{
"epoch": 2.9591174561972746,
"grad_norm": 0.1875857112899972,
"learning_rate": 7.215007215007216e-07,
"loss": 0.33,
"step": 2281
},
{
"epoch": 2.960415314730694,
"grad_norm": 0.18672707365536773,
"learning_rate": 6.974506974506974e-07,
"loss": 0.3377,
"step": 2282
},
{
"epoch": 2.961713173264114,
"grad_norm": 0.18854762948864245,
"learning_rate": 6.734006734006734e-07,
"loss": 0.3511,
"step": 2283
},
{
"epoch": 2.9630110317975342,
"grad_norm": 0.1846633145163194,
"learning_rate": 6.493506493506494e-07,
"loss": 0.3301,
"step": 2284
},
{
"epoch": 2.964308890330954,
"grad_norm": 0.1782259793697707,
"learning_rate": 6.253006253006253e-07,
"loss": 0.3327,
"step": 2285
},
{
"epoch": 2.965606748864374,
"grad_norm": 0.19406554087810526,
"learning_rate": 6.012506012506013e-07,
"loss": 0.3374,
"step": 2286
},
{
"epoch": 2.966904607397794,
"grad_norm": 0.18974085077422986,
"learning_rate": 5.772005772005772e-07,
"loss": 0.3193,
"step": 2287
},
{
"epoch": 2.9682024659312134,
"grad_norm": 0.19068994562305627,
"learning_rate": 5.531505531505532e-07,
"loss": 0.3387,
"step": 2288
},
{
"epoch": 2.9695003244646334,
"grad_norm": 0.17821215477258306,
"learning_rate": 5.291005291005291e-07,
"loss": 0.3328,
"step": 2289
},
{
"epoch": 2.9707981829980534,
"grad_norm": 0.18413236462451124,
"learning_rate": 5.050505050505052e-07,
"loss": 0.3362,
"step": 2290
},
{
"epoch": 2.972096041531473,
"grad_norm": 0.18085396718066815,
"learning_rate": 4.810004810004811e-07,
"loss": 0.3302,
"step": 2291
},
{
"epoch": 2.973393900064893,
"grad_norm": 0.18231587065998014,
"learning_rate": 4.56950456950457e-07,
"loss": 0.3338,
"step": 2292
},
{
"epoch": 2.974691758598313,
"grad_norm": 0.18433591352078926,
"learning_rate": 4.329004329004329e-07,
"loss": 0.3373,
"step": 2293
},
{
"epoch": 2.9759896171317326,
"grad_norm": 0.17897899773682865,
"learning_rate": 4.0885040885040886e-07,
"loss": 0.3303,
"step": 2294
},
{
"epoch": 2.9772874756651526,
"grad_norm": 0.18401504997308174,
"learning_rate": 3.8480038480038485e-07,
"loss": 0.3295,
"step": 2295
},
{
"epoch": 2.9785853341985726,
"grad_norm": 0.18601416230069387,
"learning_rate": 3.607503607503608e-07,
"loss": 0.3384,
"step": 2296
},
{
"epoch": 2.979883192731992,
"grad_norm": 0.18294283749703283,
"learning_rate": 3.367003367003367e-07,
"loss": 0.3414,
"step": 2297
},
{
"epoch": 2.981181051265412,
"grad_norm": 0.18248918456066632,
"learning_rate": 3.1265031265031267e-07,
"loss": 0.3522,
"step": 2298
},
{
"epoch": 2.982478909798832,
"grad_norm": 0.18259396477447506,
"learning_rate": 2.886002886002886e-07,
"loss": 0.3194,
"step": 2299
},
{
"epoch": 2.983776768332252,
"grad_norm": 0.1885739190189894,
"learning_rate": 2.6455026455026455e-07,
"loss": 0.3407,
"step": 2300
},
{
"epoch": 2.9850746268656714,
"grad_norm": 0.18564276265962515,
"learning_rate": 2.4050024050024055e-07,
"loss": 0.3431,
"step": 2301
},
{
"epoch": 2.9863724853990914,
"grad_norm": 0.17794361853046303,
"learning_rate": 2.1645021645021646e-07,
"loss": 0.3228,
"step": 2302
},
{
"epoch": 2.9876703439325114,
"grad_norm": 0.18130600626680995,
"learning_rate": 1.9240019240019243e-07,
"loss": 0.3336,
"step": 2303
},
{
"epoch": 2.988968202465931,
"grad_norm": 0.18435788490293198,
"learning_rate": 1.6835016835016834e-07,
"loss": 0.3308,
"step": 2304
},
{
"epoch": 2.990266060999351,
"grad_norm": 0.18127043211913135,
"learning_rate": 1.443001443001443e-07,
"loss": 0.3449,
"step": 2305
},
{
"epoch": 2.991563919532771,
"grad_norm": 0.1799885551759602,
"learning_rate": 1.2025012025012027e-07,
"loss": 0.3267,
"step": 2306
},
{
"epoch": 2.9928617780661906,
"grad_norm": 0.18020390891152432,
"learning_rate": 9.620009620009621e-08,
"loss": 0.3376,
"step": 2307
},
{
"epoch": 2.9941596365996106,
"grad_norm": 0.18299772015112434,
"learning_rate": 7.215007215007215e-08,
"loss": 0.3404,
"step": 2308
},
{
"epoch": 2.9954574951330306,
"grad_norm": 0.17991344814248053,
"learning_rate": 4.8100048100048107e-08,
"loss": 0.3293,
"step": 2309
},
{
"epoch": 2.99675535366645,
"grad_norm": 0.18545369820383786,
"learning_rate": 2.4050024050024053e-08,
"loss": 0.3376,
"step": 2310
},
{
"epoch": 2.99675535366645,
"step": 2310,
"total_flos": 2.5679880641918796e+19,
"train_loss": 0.5017085661361744,
"train_runtime": 66070.1568,
"train_samples_per_second": 0.56,
"train_steps_per_second": 0.035
}
],
"logging_steps": 1,
"max_steps": 2310,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.5679880641918796e+19,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}