{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.996199941537562,
"eval_steps": 500,
"global_step": 639,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00467699503069278,
"grad_norm": 0.5310407876968384,
"learning_rate": 9.98435054773083e-06,
"loss": 15.7421,
"step": 1
},
{
"epoch": 0.00935399006138556,
"grad_norm": 0.6928378343582153,
"learning_rate": 9.96870109546166e-06,
"loss": 22.4837,
"step": 2
},
{
"epoch": 0.014030985092078339,
"grad_norm": 0.5985817909240723,
"learning_rate": 9.953051643192489e-06,
"loss": 19.2438,
"step": 3
},
{
"epoch": 0.01870798012277112,
"grad_norm": 0.3373333215713501,
"learning_rate": 9.937402190923318e-06,
"loss": 17.8898,
"step": 4
},
{
"epoch": 0.0233849751534639,
"grad_norm": 0.41831186413764954,
"learning_rate": 9.921752738654147e-06,
"loss": 17.4461,
"step": 5
},
{
"epoch": 0.028061970184156678,
"grad_norm": 0.3748932182788849,
"learning_rate": 9.906103286384977e-06,
"loss": 17.7855,
"step": 6
},
{
"epoch": 0.03273896521484946,
"grad_norm": 0.3773082494735718,
"learning_rate": 9.890453834115806e-06,
"loss": 16.9238,
"step": 7
},
{
"epoch": 0.03741596024554224,
"grad_norm": 0.3397878408432007,
"learning_rate": 9.874804381846637e-06,
"loss": 16.7176,
"step": 8
},
{
"epoch": 0.04209295527623502,
"grad_norm": 0.5136957168579102,
"learning_rate": 9.859154929577466e-06,
"loss": 15.4666,
"step": 9
},
{
"epoch": 0.0467699503069278,
"grad_norm": 0.3085887134075165,
"learning_rate": 9.843505477308296e-06,
"loss": 13.7508,
"step": 10
},
{
"epoch": 0.05144694533762058,
"grad_norm": 0.3942926228046417,
"learning_rate": 9.827856025039125e-06,
"loss": 17.3618,
"step": 11
},
{
"epoch": 0.056123940368313356,
"grad_norm": 0.27711015939712524,
"learning_rate": 9.812206572769954e-06,
"loss": 16.2495,
"step": 12
},
{
"epoch": 0.06080093539900614,
"grad_norm": 0.27834147214889526,
"learning_rate": 9.796557120500783e-06,
"loss": 15.9543,
"step": 13
},
{
"epoch": 0.06547793042969892,
"grad_norm": 0.24677161872386932,
"learning_rate": 9.780907668231613e-06,
"loss": 15.6386,
"step": 14
},
{
"epoch": 0.0701549254603917,
"grad_norm": 0.36406269669532776,
"learning_rate": 9.765258215962442e-06,
"loss": 15.6255,
"step": 15
},
{
"epoch": 0.07483192049108447,
"grad_norm": 0.307948499917984,
"learning_rate": 9.749608763693271e-06,
"loss": 14.9189,
"step": 16
},
{
"epoch": 0.07950891552177726,
"grad_norm": 0.2980886697769165,
"learning_rate": 9.7339593114241e-06,
"loss": 15.0406,
"step": 17
},
{
"epoch": 0.08418591055247004,
"grad_norm": 0.412708044052124,
"learning_rate": 9.71830985915493e-06,
"loss": 14.7356,
"step": 18
},
{
"epoch": 0.08886290558316282,
"grad_norm": 0.2903729975223541,
"learning_rate": 9.70266040688576e-06,
"loss": 14.8173,
"step": 19
},
{
"epoch": 0.0935399006138556,
"grad_norm": 0.2171318084001541,
"learning_rate": 9.687010954616589e-06,
"loss": 15.0565,
"step": 20
},
{
"epoch": 0.09821689564454837,
"grad_norm": 0.4166527986526489,
"learning_rate": 9.671361502347418e-06,
"loss": 14.1942,
"step": 21
},
{
"epoch": 0.10289389067524116,
"grad_norm": 0.2564053237438202,
"learning_rate": 9.655712050078247e-06,
"loss": 14.3627,
"step": 22
},
{
"epoch": 0.10757088570593394,
"grad_norm": 0.254341185092926,
"learning_rate": 9.640062597809078e-06,
"loss": 13.7315,
"step": 23
},
{
"epoch": 0.11224788073662671,
"grad_norm": 0.24184982478618622,
"learning_rate": 9.624413145539908e-06,
"loss": 13.9785,
"step": 24
},
{
"epoch": 0.1169248757673195,
"grad_norm": 0.3118051588535309,
"learning_rate": 9.608763693270737e-06,
"loss": 15.1744,
"step": 25
},
{
"epoch": 0.12160187079801228,
"grad_norm": 0.2545301020145416,
"learning_rate": 9.593114241001566e-06,
"loss": 15.9676,
"step": 26
},
{
"epoch": 0.12627886582870507,
"grad_norm": 0.2265356183052063,
"learning_rate": 9.577464788732394e-06,
"loss": 14.8985,
"step": 27
},
{
"epoch": 0.13095586085939784,
"grad_norm": 0.2141331285238266,
"learning_rate": 9.561815336463225e-06,
"loss": 13.6498,
"step": 28
},
{
"epoch": 0.1356328558900906,
"grad_norm": 0.27572301030158997,
"learning_rate": 9.546165884194054e-06,
"loss": 13.9124,
"step": 29
},
{
"epoch": 0.1403098509207834,
"grad_norm": 0.1987282633781433,
"learning_rate": 9.530516431924883e-06,
"loss": 12.9095,
"step": 30
},
{
"epoch": 0.14498684595147618,
"grad_norm": 0.2444925159215927,
"learning_rate": 9.514866979655713e-06,
"loss": 13.365,
"step": 31
},
{
"epoch": 0.14966384098216895,
"grad_norm": 0.4400818645954132,
"learning_rate": 9.499217527386542e-06,
"loss": 13.2832,
"step": 32
},
{
"epoch": 0.15434083601286175,
"grad_norm": 0.2764039933681488,
"learning_rate": 9.483568075117371e-06,
"loss": 14.3228,
"step": 33
},
{
"epoch": 0.15901783104355452,
"grad_norm": 0.21101799607276917,
"learning_rate": 9.4679186228482e-06,
"loss": 14.156,
"step": 34
},
{
"epoch": 0.1636948260742473,
"grad_norm": 0.267008513212204,
"learning_rate": 9.45226917057903e-06,
"loss": 14.1084,
"step": 35
},
{
"epoch": 0.1683718211049401,
"grad_norm": 0.2759203016757965,
"learning_rate": 9.43661971830986e-06,
"loss": 13.5008,
"step": 36
},
{
"epoch": 0.17304881613563286,
"grad_norm": 0.2793346643447876,
"learning_rate": 9.42097026604069e-06,
"loss": 14.2747,
"step": 37
},
{
"epoch": 0.17772581116632563,
"grad_norm": 0.25120246410369873,
"learning_rate": 9.40532081377152e-06,
"loss": 13.4347,
"step": 38
},
{
"epoch": 0.1824028061970184,
"grad_norm": 0.1591794341802597,
"learning_rate": 9.389671361502349e-06,
"loss": 12.619,
"step": 39
},
{
"epoch": 0.1870798012277112,
"grad_norm": 0.2054363638162613,
"learning_rate": 9.374021909233178e-06,
"loss": 11.8876,
"step": 40
},
{
"epoch": 0.19175679625840397,
"grad_norm": 0.23818843066692352,
"learning_rate": 9.358372456964007e-06,
"loss": 13.4683,
"step": 41
},
{
"epoch": 0.19643379128909674,
"grad_norm": 0.32269319891929626,
"learning_rate": 9.342723004694837e-06,
"loss": 12.5351,
"step": 42
},
{
"epoch": 0.20111078631978954,
"grad_norm": 0.29193466901779175,
"learning_rate": 9.327073552425666e-06,
"loss": 11.917,
"step": 43
},
{
"epoch": 0.2057877813504823,
"grad_norm": 0.20844891667366028,
"learning_rate": 9.311424100156495e-06,
"loss": 12.0984,
"step": 44
},
{
"epoch": 0.21046477638117508,
"grad_norm": 0.26920032501220703,
"learning_rate": 9.295774647887325e-06,
"loss": 14.1542,
"step": 45
},
{
"epoch": 0.21514177141186788,
"grad_norm": 0.20874425768852234,
"learning_rate": 9.280125195618154e-06,
"loss": 13.9397,
"step": 46
},
{
"epoch": 0.21981876644256065,
"grad_norm": 0.28703245520591736,
"learning_rate": 9.264475743348983e-06,
"loss": 12.7704,
"step": 47
},
{
"epoch": 0.22449576147325342,
"grad_norm": 0.23402653634548187,
"learning_rate": 9.248826291079813e-06,
"loss": 12.8326,
"step": 48
},
{
"epoch": 0.22917275650394622,
"grad_norm": 0.28065574169158936,
"learning_rate": 9.233176838810642e-06,
"loss": 11.4735,
"step": 49
},
{
"epoch": 0.233849751534639,
"grad_norm": 0.21932877600193024,
"learning_rate": 9.217527386541471e-06,
"loss": 12.2491,
"step": 50
},
{
"epoch": 0.23852674656533177,
"grad_norm": 0.24466539919376373,
"learning_rate": 9.2018779342723e-06,
"loss": 12.3501,
"step": 51
},
{
"epoch": 0.24320374159602456,
"grad_norm": 0.17424331605434418,
"learning_rate": 9.186228482003131e-06,
"loss": 12.6445,
"step": 52
},
{
"epoch": 0.24788073662671734,
"grad_norm": 0.2298133671283722,
"learning_rate": 9.17057902973396e-06,
"loss": 12.5759,
"step": 53
},
{
"epoch": 0.25255773165741013,
"grad_norm": 0.30562305450439453,
"learning_rate": 9.15492957746479e-06,
"loss": 13.4988,
"step": 54
},
{
"epoch": 0.2572347266881029,
"grad_norm": 0.21225547790527344,
"learning_rate": 9.13928012519562e-06,
"loss": 13.3909,
"step": 55
},
{
"epoch": 0.2619117217187957,
"grad_norm": 0.3120986223220825,
"learning_rate": 9.123630672926449e-06,
"loss": 13.8276,
"step": 56
},
{
"epoch": 0.2665887167494885,
"grad_norm": 0.18036110699176788,
"learning_rate": 9.107981220657278e-06,
"loss": 13.7724,
"step": 57
},
{
"epoch": 0.2712657117801812,
"grad_norm": 0.22987115383148193,
"learning_rate": 9.092331768388107e-06,
"loss": 12.2669,
"step": 58
},
{
"epoch": 0.275942706810874,
"grad_norm": 0.23878921568393707,
"learning_rate": 9.076682316118937e-06,
"loss": 12.7097,
"step": 59
},
{
"epoch": 0.2806197018415668,
"grad_norm": 0.20319631695747375,
"learning_rate": 9.061032863849766e-06,
"loss": 12.2795,
"step": 60
},
{
"epoch": 0.28529669687225956,
"grad_norm": 0.18609336018562317,
"learning_rate": 9.045383411580595e-06,
"loss": 11.2712,
"step": 61
},
{
"epoch": 0.28997369190295236,
"grad_norm": 0.21320512890815735,
"learning_rate": 9.029733959311425e-06,
"loss": 11.6637,
"step": 62
},
{
"epoch": 0.29465068693364516,
"grad_norm": 0.23330001533031464,
"learning_rate": 9.014084507042254e-06,
"loss": 12.9509,
"step": 63
},
{
"epoch": 0.2993276819643379,
"grad_norm": 0.21313583850860596,
"learning_rate": 8.998435054773083e-06,
"loss": 13.8547,
"step": 64
},
{
"epoch": 0.3040046769950307,
"grad_norm": 0.20739194750785828,
"learning_rate": 8.982785602503912e-06,
"loss": 12.735,
"step": 65
},
{
"epoch": 0.3086816720257235,
"grad_norm": 0.2453576922416687,
"learning_rate": 8.967136150234742e-06,
"loss": 12.7951,
"step": 66
},
{
"epoch": 0.31335866705641624,
"grad_norm": 0.21135878562927246,
"learning_rate": 8.951486697965573e-06,
"loss": 13.7611,
"step": 67
},
{
"epoch": 0.31803566208710904,
"grad_norm": 0.2257193773984909,
"learning_rate": 8.935837245696402e-06,
"loss": 11.3833,
"step": 68
},
{
"epoch": 0.32271265711780184,
"grad_norm": 0.1934535950422287,
"learning_rate": 8.920187793427231e-06,
"loss": 11.4428,
"step": 69
},
{
"epoch": 0.3273896521484946,
"grad_norm": 0.19537678360939026,
"learning_rate": 8.90453834115806e-06,
"loss": 13.1129,
"step": 70
},
{
"epoch": 0.3320666471791874,
"grad_norm": 0.2596362233161926,
"learning_rate": 8.888888888888888e-06,
"loss": 11.9323,
"step": 71
},
{
"epoch": 0.3367436422098802,
"grad_norm": 0.28119221329689026,
"learning_rate": 8.87323943661972e-06,
"loss": 12.1397,
"step": 72
},
{
"epoch": 0.3414206372405729,
"grad_norm": 0.2443932145833969,
"learning_rate": 8.857589984350549e-06,
"loss": 11.1756,
"step": 73
},
{
"epoch": 0.3460976322712657,
"grad_norm": 0.23586861789226532,
"learning_rate": 8.841940532081378e-06,
"loss": 12.2808,
"step": 74
},
{
"epoch": 0.3507746273019585,
"grad_norm": 0.2984711229801178,
"learning_rate": 8.826291079812207e-06,
"loss": 11.8437,
"step": 75
},
{
"epoch": 0.35545162233265126,
"grad_norm": 0.2404984086751938,
"learning_rate": 8.810641627543037e-06,
"loss": 11.6321,
"step": 76
},
{
"epoch": 0.36012861736334406,
"grad_norm": 0.22745920717716217,
"learning_rate": 8.794992175273866e-06,
"loss": 12.969,
"step": 77
},
{
"epoch": 0.3648056123940368,
"grad_norm": 0.22989057004451752,
"learning_rate": 8.779342723004695e-06,
"loss": 12.1793,
"step": 78
},
{
"epoch": 0.3694826074247296,
"grad_norm": 0.22097162902355194,
"learning_rate": 8.763693270735524e-06,
"loss": 12.5693,
"step": 79
},
{
"epoch": 0.3741596024554224,
"grad_norm": 0.19985444843769073,
"learning_rate": 8.748043818466354e-06,
"loss": 13.3868,
"step": 80
},
{
"epoch": 0.37883659748611515,
"grad_norm": 0.2339348942041397,
"learning_rate": 8.732394366197183e-06,
"loss": 11.5905,
"step": 81
},
{
"epoch": 0.38351359251680794,
"grad_norm": 0.28241512179374695,
"learning_rate": 8.716744913928014e-06,
"loss": 12.6998,
"step": 82
},
{
"epoch": 0.38819058754750074,
"grad_norm": 0.2848986089229584,
"learning_rate": 8.701095461658843e-06,
"loss": 11.3058,
"step": 83
},
{
"epoch": 0.3928675825781935,
"grad_norm": 0.2118872106075287,
"learning_rate": 8.685446009389673e-06,
"loss": 10.4664,
"step": 84
},
{
"epoch": 0.3975445776088863,
"grad_norm": 0.16718249022960663,
"learning_rate": 8.669796557120502e-06,
"loss": 13.2492,
"step": 85
},
{
"epoch": 0.4022215726395791,
"grad_norm": 0.2131660282611847,
"learning_rate": 8.65414710485133e-06,
"loss": 12.0166,
"step": 86
},
{
"epoch": 0.4068985676702718,
"grad_norm": 0.2012370079755783,
"learning_rate": 8.63849765258216e-06,
"loss": 12.5326,
"step": 87
},
{
"epoch": 0.4115755627009646,
"grad_norm": 0.2684880793094635,
"learning_rate": 8.62284820031299e-06,
"loss": 12.8071,
"step": 88
},
{
"epoch": 0.4162525577316574,
"grad_norm": 0.2500629127025604,
"learning_rate": 8.60719874804382e-06,
"loss": 12.1628,
"step": 89
},
{
"epoch": 0.42092955276235017,
"grad_norm": 0.18125677108764648,
"learning_rate": 8.591549295774648e-06,
"loss": 11.3137,
"step": 90
},
{
"epoch": 0.42560654779304297,
"grad_norm": 0.1830630898475647,
"learning_rate": 8.575899843505478e-06,
"loss": 11.8507,
"step": 91
},
{
"epoch": 0.43028354282373577,
"grad_norm": 0.1481466144323349,
"learning_rate": 8.560250391236307e-06,
"loss": 10.8795,
"step": 92
},
{
"epoch": 0.4349605378544285,
"grad_norm": 0.18768347799777985,
"learning_rate": 8.544600938967136e-06,
"loss": 11.2509,
"step": 93
},
{
"epoch": 0.4396375328851213,
"grad_norm": 0.22724182903766632,
"learning_rate": 8.528951486697966e-06,
"loss": 11.6564,
"step": 94
},
{
"epoch": 0.4443145279158141,
"grad_norm": 0.1806531399488449,
"learning_rate": 8.513302034428795e-06,
"loss": 11.9111,
"step": 95
},
{
"epoch": 0.44899152294650685,
"grad_norm": 0.2578674554824829,
"learning_rate": 8.497652582159626e-06,
"loss": 13.1609,
"step": 96
},
{
"epoch": 0.45366851797719965,
"grad_norm": 0.21666157245635986,
"learning_rate": 8.482003129890455e-06,
"loss": 12.3285,
"step": 97
},
{
"epoch": 0.45834551300789245,
"grad_norm": 0.2574619948863983,
"learning_rate": 8.466353677621285e-06,
"loss": 11.4998,
"step": 98
},
{
"epoch": 0.4630225080385852,
"grad_norm": 0.28588882088661194,
"learning_rate": 8.450704225352114e-06,
"loss": 11.0233,
"step": 99
},
{
"epoch": 0.467699503069278,
"grad_norm": 0.28356659412384033,
"learning_rate": 8.435054773082943e-06,
"loss": 10.9355,
"step": 100
},
{
"epoch": 0.4723764980999708,
"grad_norm": 0.18748782575130463,
"learning_rate": 8.419405320813773e-06,
"loss": 13.5926,
"step": 101
},
{
"epoch": 0.47705349313066353,
"grad_norm": 0.17172126471996307,
"learning_rate": 8.403755868544602e-06,
"loss": 11.4017,
"step": 102
},
{
"epoch": 0.48173048816135633,
"grad_norm": 0.1956973671913147,
"learning_rate": 8.388106416275431e-06,
"loss": 12.1463,
"step": 103
},
{
"epoch": 0.48640748319204913,
"grad_norm": 0.30823975801467896,
"learning_rate": 8.37245696400626e-06,
"loss": 10.2949,
"step": 104
},
{
"epoch": 0.49108447822274187,
"grad_norm": 0.23158958554267883,
"learning_rate": 8.35680751173709e-06,
"loss": 11.2003,
"step": 105
},
{
"epoch": 0.49576147325343467,
"grad_norm": 0.23977261781692505,
"learning_rate": 8.341158059467919e-06,
"loss": 11.5904,
"step": 106
},
{
"epoch": 0.5004384682841274,
"grad_norm": 0.17250728607177734,
"learning_rate": 8.325508607198748e-06,
"loss": 11.2648,
"step": 107
},
{
"epoch": 0.5051154633148203,
"grad_norm": 0.23300261795520782,
"learning_rate": 8.309859154929578e-06,
"loss": 11.9646,
"step": 108
},
{
"epoch": 0.509792458345513,
"grad_norm": 0.2430488020181656,
"learning_rate": 8.294209702660407e-06,
"loss": 12.046,
"step": 109
},
{
"epoch": 0.5144694533762058,
"grad_norm": 0.18206799030303955,
"learning_rate": 8.278560250391236e-06,
"loss": 12.0767,
"step": 110
},
{
"epoch": 0.5191464484068986,
"grad_norm": 0.25876322388648987,
"learning_rate": 8.262910798122067e-06,
"loss": 11.7794,
"step": 111
},
{
"epoch": 0.5238234434375914,
"grad_norm": 0.28936639428138733,
"learning_rate": 8.247261345852897e-06,
"loss": 10.3819,
"step": 112
},
{
"epoch": 0.5285004384682841,
"grad_norm": 0.214036762714386,
"learning_rate": 8.231611893583726e-06,
"loss": 10.3209,
"step": 113
},
{
"epoch": 0.533177433498977,
"grad_norm": 0.23764470219612122,
"learning_rate": 8.215962441314555e-06,
"loss": 10.8417,
"step": 114
},
{
"epoch": 0.5378544285296697,
"grad_norm": 0.2604602575302124,
"learning_rate": 8.200312989045383e-06,
"loss": 12.534,
"step": 115
},
{
"epoch": 0.5425314235603624,
"grad_norm": 0.24597330391407013,
"learning_rate": 8.184663536776214e-06,
"loss": 12.348,
"step": 116
},
{
"epoch": 0.5472084185910553,
"grad_norm": 0.2204928994178772,
"learning_rate": 8.169014084507043e-06,
"loss": 10.979,
"step": 117
},
{
"epoch": 0.551885413621748,
"grad_norm": 0.15487593412399292,
"learning_rate": 8.153364632237872e-06,
"loss": 11.0756,
"step": 118
},
{
"epoch": 0.5565624086524408,
"grad_norm": 0.23864871263504028,
"learning_rate": 8.137715179968702e-06,
"loss": 11.66,
"step": 119
},
{
"epoch": 0.5612394036831336,
"grad_norm": 0.22024200856685638,
"learning_rate": 8.122065727699531e-06,
"loss": 10.7713,
"step": 120
},
{
"epoch": 0.5659163987138264,
"grad_norm": 0.19292014837265015,
"learning_rate": 8.10641627543036e-06,
"loss": 9.4704,
"step": 121
},
{
"epoch": 0.5705933937445191,
"grad_norm": 0.16765080392360687,
"learning_rate": 8.09076682316119e-06,
"loss": 10.7993,
"step": 122
},
{
"epoch": 0.575270388775212,
"grad_norm": 0.26758840680122375,
"learning_rate": 8.075117370892019e-06,
"loss": 11.354,
"step": 123
},
{
"epoch": 0.5799473838059047,
"grad_norm": 0.25225985050201416,
"learning_rate": 8.059467918622848e-06,
"loss": 11.2162,
"step": 124
},
{
"epoch": 0.5846243788365975,
"grad_norm": 0.22062422335147858,
"learning_rate": 8.043818466353678e-06,
"loss": 9.9452,
"step": 125
},
{
"epoch": 0.5893013738672903,
"grad_norm": 0.2589726746082306,
"learning_rate": 8.028169014084509e-06,
"loss": 11.6098,
"step": 126
},
{
"epoch": 0.5939783688979831,
"grad_norm": 0.23492346704006195,
"learning_rate": 8.012519561815338e-06,
"loss": 10.6918,
"step": 127
},
{
"epoch": 0.5986553639286758,
"grad_norm": 0.29631978273391724,
"learning_rate": 7.996870109546167e-06,
"loss": 11.4451,
"step": 128
},
{
"epoch": 0.6033323589593687,
"grad_norm": 0.195633202791214,
"learning_rate": 7.981220657276996e-06,
"loss": 11.3396,
"step": 129
},
{
"epoch": 0.6080093539900614,
"grad_norm": 0.14094115793704987,
"learning_rate": 7.965571205007824e-06,
"loss": 10.9388,
"step": 130
},
{
"epoch": 0.6126863490207541,
"grad_norm": 0.2307533323764801,
"learning_rate": 7.949921752738655e-06,
"loss": 12.2129,
"step": 131
},
{
"epoch": 0.617363344051447,
"grad_norm": 0.2004641741514206,
"learning_rate": 7.934272300469484e-06,
"loss": 9.9139,
"step": 132
},
{
"epoch": 0.6220403390821397,
"grad_norm": 0.22784000635147095,
"learning_rate": 7.918622848200314e-06,
"loss": 10.2306,
"step": 133
},
{
"epoch": 0.6267173341128325,
"grad_norm": 0.21663011610507965,
"learning_rate": 7.902973395931143e-06,
"loss": 9.9467,
"step": 134
},
{
"epoch": 0.6313943291435253,
"grad_norm": 0.18714800477027893,
"learning_rate": 7.887323943661972e-06,
"loss": 9.7232,
"step": 135
},
{
"epoch": 0.6360713241742181,
"grad_norm": 0.23525570333003998,
"learning_rate": 7.871674491392802e-06,
"loss": 9.9539,
"step": 136
},
{
"epoch": 0.6407483192049108,
"grad_norm": 0.22870206832885742,
"learning_rate": 7.856025039123631e-06,
"loss": 11.9964,
"step": 137
},
{
"epoch": 0.6454253142356037,
"grad_norm": 0.19730104506015778,
"learning_rate": 7.84037558685446e-06,
"loss": 10.8391,
"step": 138
},
{
"epoch": 0.6501023092662964,
"grad_norm": 0.1873929351568222,
"learning_rate": 7.82472613458529e-06,
"loss": 10.7179,
"step": 139
},
{
"epoch": 0.6547793042969892,
"grad_norm": 0.14801403880119324,
"learning_rate": 7.809076682316119e-06,
"loss": 10.9041,
"step": 140
},
{
"epoch": 0.659456299327682,
"grad_norm": 0.21909023821353912,
"learning_rate": 7.79342723004695e-06,
"loss": 11.5497,
"step": 141
},
{
"epoch": 0.6641332943583748,
"grad_norm": 0.20469622313976288,
"learning_rate": 7.77777777777778e-06,
"loss": 11.0387,
"step": 142
},
{
"epoch": 0.6688102893890675,
"grad_norm": 0.20616918802261353,
"learning_rate": 7.762128325508608e-06,
"loss": 9.5392,
"step": 143
},
{
"epoch": 0.6734872844197604,
"grad_norm": 0.1846546232700348,
"learning_rate": 7.746478873239436e-06,
"loss": 11.5538,
"step": 144
},
{
"epoch": 0.6781642794504531,
"grad_norm": 0.17778314650058746,
"learning_rate": 7.730829420970265e-06,
"loss": 12.8435,
"step": 145
},
{
"epoch": 0.6828412744811458,
"grad_norm": 0.24238605797290802,
"learning_rate": 7.715179968701096e-06,
"loss": 9.4674,
"step": 146
},
{
"epoch": 0.6875182695118387,
"grad_norm": 0.20961545407772064,
"learning_rate": 7.699530516431926e-06,
"loss": 10.1325,
"step": 147
},
{
"epoch": 0.6921952645425314,
"grad_norm": 0.20476683974266052,
"learning_rate": 7.683881064162755e-06,
"loss": 11.1375,
"step": 148
},
{
"epoch": 0.6968722595732242,
"grad_norm": 0.22241833806037903,
"learning_rate": 7.668231611893584e-06,
"loss": 9.6296,
"step": 149
},
{
"epoch": 0.701549254603917,
"grad_norm": 0.2302970439195633,
"learning_rate": 7.652582159624414e-06,
"loss": 10.8763,
"step": 150
},
{
"epoch": 0.7062262496346098,
"grad_norm": 0.20484097301959991,
"learning_rate": 7.636932707355243e-06,
"loss": 9.0306,
"step": 151
},
{
"epoch": 0.7109032446653025,
"grad_norm": 0.20411114394664764,
"learning_rate": 7.621283255086073e-06,
"loss": 11.5865,
"step": 152
},
{
"epoch": 0.7155802396959953,
"grad_norm": 0.37148869037628174,
"learning_rate": 7.6056338028169015e-06,
"loss": 10.4929,
"step": 153
},
{
"epoch": 0.7202572347266881,
"grad_norm": 0.19864030182361603,
"learning_rate": 7.589984350547731e-06,
"loss": 10.4561,
"step": 154
},
{
"epoch": 0.7249342297573809,
"grad_norm": 0.21187515556812286,
"learning_rate": 7.574334898278561e-06,
"loss": 9.6848,
"step": 155
},
{
"epoch": 0.7296112247880736,
"grad_norm": 0.18564990162849426,
"learning_rate": 7.55868544600939e-06,
"loss": 11.2932,
"step": 156
},
{
"epoch": 0.7342882198187665,
"grad_norm": 0.21274517476558685,
"learning_rate": 7.54303599374022e-06,
"loss": 10.2206,
"step": 157
},
{
"epoch": 0.7389652148494592,
"grad_norm": 0.23622578382492065,
"learning_rate": 7.527386541471049e-06,
"loss": 9.4342,
"step": 158
},
{
"epoch": 0.743642209880152,
"grad_norm": 0.21262332797050476,
"learning_rate": 7.511737089201878e-06,
"loss": 11.4181,
"step": 159
},
{
"epoch": 0.7483192049108448,
"grad_norm": 0.22142890095710754,
"learning_rate": 7.496087636932708e-06,
"loss": 10.4912,
"step": 160
},
{
"epoch": 0.7529961999415375,
"grad_norm": 0.219626322388649,
"learning_rate": 7.480438184663538e-06,
"loss": 10.902,
"step": 161
},
{
"epoch": 0.7576731949722303,
"grad_norm": 0.19913645088672638,
"learning_rate": 7.464788732394367e-06,
"loss": 8.9078,
"step": 162
},
{
"epoch": 0.7623501900029231,
"grad_norm": 0.19409991800785065,
"learning_rate": 7.449139280125196e-06,
"loss": 10.7111,
"step": 163
},
{
"epoch": 0.7670271850336159,
"grad_norm": 0.20056220889091492,
"learning_rate": 7.433489827856026e-06,
"loss": 11.438,
"step": 164
},
{
"epoch": 0.7717041800643086,
"grad_norm": 0.19502754509449005,
"learning_rate": 7.417840375586856e-06,
"loss": 10.1837,
"step": 165
},
{
"epoch": 0.7763811750950015,
"grad_norm": 0.17272567749023438,
"learning_rate": 7.402190923317685e-06,
"loss": 10.7406,
"step": 166
},
{
"epoch": 0.7810581701256942,
"grad_norm": 0.19558610022068024,
"learning_rate": 7.386541471048514e-06,
"loss": 10.1322,
"step": 167
},
{
"epoch": 0.785735165156387,
"grad_norm": 0.2161480039358139,
"learning_rate": 7.370892018779343e-06,
"loss": 9.7506,
"step": 168
},
{
"epoch": 0.7904121601870798,
"grad_norm": 0.25595343112945557,
"learning_rate": 7.355242566510172e-06,
"loss": 11.0059,
"step": 169
},
{
"epoch": 0.7950891552177726,
"grad_norm": 0.21218866109848022,
"learning_rate": 7.339593114241002e-06,
"loss": 11.2122,
"step": 170
},
{
"epoch": 0.7997661502484653,
"grad_norm": 0.1922176331281662,
"learning_rate": 7.3239436619718316e-06,
"loss": 11.0585,
"step": 171
},
{
"epoch": 0.8044431452791582,
"grad_norm": 0.1726471334695816,
"learning_rate": 7.308294209702661e-06,
"loss": 11.3007,
"step": 172
},
{
"epoch": 0.8091201403098509,
"grad_norm": 0.20865805447101593,
"learning_rate": 7.29264475743349e-06,
"loss": 12.5848,
"step": 173
},
{
"epoch": 0.8137971353405437,
"grad_norm": 0.2097303569316864,
"learning_rate": 7.2769953051643195e-06,
"loss": 11.694,
"step": 174
},
{
"epoch": 0.8184741303712365,
"grad_norm": 0.22343699634075165,
"learning_rate": 7.26134585289515e-06,
"loss": 9.9861,
"step": 175
},
{
"epoch": 0.8231511254019293,
"grad_norm": 0.19908592104911804,
"learning_rate": 7.245696400625979e-06,
"loss": 10.7263,
"step": 176
},
{
"epoch": 0.827828120432622,
"grad_norm": 0.2062506079673767,
"learning_rate": 7.230046948356808e-06,
"loss": 10.7234,
"step": 177
},
{
"epoch": 0.8325051154633148,
"grad_norm": 0.23186688125133514,
"learning_rate": 7.2143974960876376e-06,
"loss": 10.7846,
"step": 178
},
{
"epoch": 0.8371821104940076,
"grad_norm": 0.20528610050678253,
"learning_rate": 7.198748043818467e-06,
"loss": 10.6732,
"step": 179
},
{
"epoch": 0.8418591055247003,
"grad_norm": 0.21028846502304077,
"learning_rate": 7.183098591549297e-06,
"loss": 9.5007,
"step": 180
},
{
"epoch": 0.8465361005553932,
"grad_norm": 0.1943686306476593,
"learning_rate": 7.167449139280126e-06,
"loss": 10.6163,
"step": 181
},
{
"epoch": 0.8512130955860859,
"grad_norm": 0.15791501104831696,
"learning_rate": 7.151799687010955e-06,
"loss": 10.4564,
"step": 182
},
{
"epoch": 0.8558900906167787,
"grad_norm": 0.15603427588939667,
"learning_rate": 7.136150234741784e-06,
"loss": 11.4006,
"step": 183
},
{
"epoch": 0.8605670856474715,
"grad_norm": 0.1737872064113617,
"learning_rate": 7.120500782472613e-06,
"loss": 10.2583,
"step": 184
},
{
"epoch": 0.8652440806781643,
"grad_norm": 0.16742144525051117,
"learning_rate": 7.1048513302034435e-06,
"loss": 9.6543,
"step": 185
},
{
"epoch": 0.869921075708857,
"grad_norm": 0.2204071581363678,
"learning_rate": 7.089201877934273e-06,
"loss": 10.6068,
"step": 186
},
{
"epoch": 0.8745980707395499,
"grad_norm": 0.17526549100875854,
"learning_rate": 7.073552425665102e-06,
"loss": 10.5927,
"step": 187
},
{
"epoch": 0.8792750657702426,
"grad_norm": 0.18857762217521667,
"learning_rate": 7.0579029733959315e-06,
"loss": 10.0686,
"step": 188
},
{
"epoch": 0.8839520608009354,
"grad_norm": 0.16617538034915924,
"learning_rate": 7.042253521126761e-06,
"loss": 11.0356,
"step": 189
},
{
"epoch": 0.8886290558316282,
"grad_norm": 0.20443867146968842,
"learning_rate": 7.026604068857591e-06,
"loss": 9.764,
"step": 190
},
{
"epoch": 0.893306050862321,
"grad_norm": 0.16466206312179565,
"learning_rate": 7.01095461658842e-06,
"loss": 8.9783,
"step": 191
},
{
"epoch": 0.8979830458930137,
"grad_norm": 0.2051703780889511,
"learning_rate": 6.9953051643192495e-06,
"loss": 10.5345,
"step": 192
},
{
"epoch": 0.9026600409237066,
"grad_norm": 0.19935429096221924,
"learning_rate": 6.979655712050079e-06,
"loss": 10.1047,
"step": 193
},
{
"epoch": 0.9073370359543993,
"grad_norm": 0.14471961557865143,
"learning_rate": 6.964006259780907e-06,
"loss": 8.9315,
"step": 194
},
{
"epoch": 0.912014030985092,
"grad_norm": 0.21026520431041718,
"learning_rate": 6.948356807511738e-06,
"loss": 11.0192,
"step": 195
},
{
"epoch": 0.9166910260157849,
"grad_norm": 0.22124925255775452,
"learning_rate": 6.932707355242568e-06,
"loss": 10.7211,
"step": 196
},
{
"epoch": 0.9213680210464776,
"grad_norm": 0.6166573166847229,
"learning_rate": 6.917057902973396e-06,
"loss": 10.1654,
"step": 197
},
{
"epoch": 0.9260450160771704,
"grad_norm": 0.14892670512199402,
"learning_rate": 6.901408450704225e-06,
"loss": 9.6949,
"step": 198
},
{
"epoch": 0.9307220111078632,
"grad_norm": 0.17058013379573822,
"learning_rate": 6.885758998435055e-06,
"loss": 9.9864,
"step": 199
},
{
"epoch": 0.935399006138556,
"grad_norm": 0.19176752865314484,
"learning_rate": 6.870109546165885e-06,
"loss": 9.7219,
"step": 200
},
{
"epoch": 0.9400760011692487,
"grad_norm": 0.1923060268163681,
"learning_rate": 6.854460093896714e-06,
"loss": 9.0111,
"step": 201
},
{
"epoch": 0.9447529961999416,
"grad_norm": 0.22771762311458588,
"learning_rate": 6.8388106416275434e-06,
"loss": 9.9277,
"step": 202
},
{
"epoch": 0.9494299912306343,
"grad_norm": 0.21972382068634033,
"learning_rate": 6.823161189358373e-06,
"loss": 10.5451,
"step": 203
},
{
"epoch": 0.9541069862613271,
"grad_norm": 0.32944294810295105,
"learning_rate": 6.807511737089203e-06,
"loss": 9.8053,
"step": 204
},
{
"epoch": 0.9587839812920199,
"grad_norm": 0.1875985562801361,
"learning_rate": 6.791862284820032e-06,
"loss": 10.3256,
"step": 205
},
{
"epoch": 0.9634609763227127,
"grad_norm": 0.17583012580871582,
"learning_rate": 6.7762128325508615e-06,
"loss": 10.4922,
"step": 206
},
{
"epoch": 0.9681379713534054,
"grad_norm": 0.22149552404880524,
"learning_rate": 6.760563380281691e-06,
"loss": 10.1547,
"step": 207
},
{
"epoch": 0.9728149663840983,
"grad_norm": 0.18506276607513428,
"learning_rate": 6.74491392801252e-06,
"loss": 10.5188,
"step": 208
},
{
"epoch": 0.977491961414791,
"grad_norm": 0.21199573576450348,
"learning_rate": 6.72926447574335e-06,
"loss": 11.3258,
"step": 209
},
{
"epoch": 0.9821689564454837,
"grad_norm": 0.18747669458389282,
"learning_rate": 6.71361502347418e-06,
"loss": 10.251,
"step": 210
},
{
"epoch": 0.9868459514761766,
"grad_norm": 0.1887262761592865,
"learning_rate": 6.697965571205008e-06,
"loss": 9.2012,
"step": 211
},
{
"epoch": 0.9915229465068693,
"grad_norm": 0.16557927429676056,
"learning_rate": 6.682316118935837e-06,
"loss": 9.2171,
"step": 212
},
{
"epoch": 0.9961999415375621,
"grad_norm": 0.19340123236179352,
"learning_rate": 6.666666666666667e-06,
"loss": 9.4988,
"step": 213
},
{
"epoch": 1.0046769950306929,
"grad_norm": 0.3001099228858948,
"learning_rate": 6.651017214397497e-06,
"loss": 11.8577,
"step": 214
},
{
"epoch": 1.0093539900613855,
"grad_norm": 0.18085287511348724,
"learning_rate": 6.635367762128326e-06,
"loss": 10.4356,
"step": 215
},
{
"epoch": 1.0140309850920783,
"grad_norm": 0.17791183292865753,
"learning_rate": 6.619718309859155e-06,
"loss": 10.3929,
"step": 216
},
{
"epoch": 1.0187079801227712,
"grad_norm": 0.20649202167987823,
"learning_rate": 6.604068857589985e-06,
"loss": 9.342,
"step": 217
},
{
"epoch": 1.0233849751534638,
"grad_norm": 0.2049955129623413,
"learning_rate": 6.588419405320814e-06,
"loss": 10.656,
"step": 218
},
{
"epoch": 1.0280619701841567,
"grad_norm": 0.18064165115356445,
"learning_rate": 6.572769953051644e-06,
"loss": 10.1633,
"step": 219
},
{
"epoch": 1.0327389652148495,
"grad_norm": 0.1652020812034607,
"learning_rate": 6.5571205007824735e-06,
"loss": 8.9937,
"step": 220
},
{
"epoch": 1.0374159602455422,
"grad_norm": 0.16658996045589447,
"learning_rate": 6.541471048513303e-06,
"loss": 11.0051,
"step": 221
},
{
"epoch": 1.042092955276235,
"grad_norm": 0.1875378042459488,
"learning_rate": 6.525821596244132e-06,
"loss": 9.7089,
"step": 222
},
{
"epoch": 1.0467699503069279,
"grad_norm": 0.19267050921916962,
"learning_rate": 6.510172143974961e-06,
"loss": 10.0252,
"step": 223
},
{
"epoch": 1.0514469453376205,
"grad_norm": 0.2656681537628174,
"learning_rate": 6.4945226917057916e-06,
"loss": 9.7082,
"step": 224
},
{
"epoch": 1.0561239403683134,
"grad_norm": 0.16058804094791412,
"learning_rate": 6.478873239436621e-06,
"loss": 9.6689,
"step": 225
},
{
"epoch": 1.0608009353990062,
"grad_norm": 0.14145280420780182,
"learning_rate": 6.463223787167449e-06,
"loss": 8.6923,
"step": 226
},
{
"epoch": 1.0654779304296988,
"grad_norm": 0.14217382669448853,
"learning_rate": 6.447574334898279e-06,
"loss": 10.4302,
"step": 227
},
{
"epoch": 1.0701549254603917,
"grad_norm": 0.18387371301651,
"learning_rate": 6.431924882629108e-06,
"loss": 9.5514,
"step": 228
},
{
"epoch": 1.0748319204910846,
"grad_norm": 0.15731996297836304,
"learning_rate": 6.416275430359938e-06,
"loss": 9.2854,
"step": 229
},
{
"epoch": 1.0795089155217772,
"grad_norm": 0.1794990450143814,
"learning_rate": 6.400625978090767e-06,
"loss": 11.0837,
"step": 230
},
{
"epoch": 1.08418591055247,
"grad_norm": 0.19289837777614594,
"learning_rate": 6.384976525821597e-06,
"loss": 9.3129,
"step": 231
},
{
"epoch": 1.088862905583163,
"grad_norm": 0.1858958899974823,
"learning_rate": 6.369327073552426e-06,
"loss": 10.7238,
"step": 232
},
{
"epoch": 1.0935399006138555,
"grad_norm": 0.26388686895370483,
"learning_rate": 6.353677621283255e-06,
"loss": 9.2242,
"step": 233
},
{
"epoch": 1.0982168956445484,
"grad_norm": 0.17551296949386597,
"learning_rate": 6.3380281690140855e-06,
"loss": 8.3665,
"step": 234
},
{
"epoch": 1.1028938906752412,
"grad_norm": 0.20290863513946533,
"learning_rate": 6.322378716744915e-06,
"loss": 9.6916,
"step": 235
},
{
"epoch": 1.1075708857059339,
"grad_norm": 0.11323179304599762,
"learning_rate": 6.306729264475744e-06,
"loss": 10.218,
"step": 236
},
{
"epoch": 1.1122478807366267,
"grad_norm": 0.22893109917640686,
"learning_rate": 6.291079812206573e-06,
"loss": 10.3068,
"step": 237
},
{
"epoch": 1.1169248757673196,
"grad_norm": 0.1943362057209015,
"learning_rate": 6.275430359937402e-06,
"loss": 9.738,
"step": 238
},
{
"epoch": 1.1216018707980122,
"grad_norm": 0.22017931938171387,
"learning_rate": 6.259780907668233e-06,
"loss": 8.5765,
"step": 239
},
{
"epoch": 1.126278865828705,
"grad_norm": 0.1584814190864563,
"learning_rate": 6.244131455399062e-06,
"loss": 11.0436,
"step": 240
},
{
"epoch": 1.130955860859398,
"grad_norm": 0.182816743850708,
"learning_rate": 6.228482003129891e-06,
"loss": 11.1518,
"step": 241
},
{
"epoch": 1.1356328558900906,
"grad_norm": 0.21375828981399536,
"learning_rate": 6.21283255086072e-06,
"loss": 10.0972,
"step": 242
},
{
"epoch": 1.1403098509207834,
"grad_norm": 0.1926356703042984,
"learning_rate": 6.197183098591549e-06,
"loss": 9.0861,
"step": 243
},
{
"epoch": 1.1449868459514763,
"grad_norm": 0.13788476586341858,
"learning_rate": 6.181533646322379e-06,
"loss": 9.1896,
"step": 244
},
{
"epoch": 1.149663840982169,
"grad_norm": 0.24886344373226166,
"learning_rate": 6.165884194053209e-06,
"loss": 8.9126,
"step": 245
},
{
"epoch": 1.1543408360128617,
"grad_norm": 0.21492387354373932,
"learning_rate": 6.150234741784038e-06,
"loss": 9.1809,
"step": 246
},
{
"epoch": 1.1590178310435546,
"grad_norm": 0.20666466653347015,
"learning_rate": 6.134585289514867e-06,
"loss": 9.8609,
"step": 247
},
{
"epoch": 1.1636948260742472,
"grad_norm": 0.12884530425071716,
"learning_rate": 6.118935837245697e-06,
"loss": 9.0015,
"step": 248
},
{
"epoch": 1.16837182110494,
"grad_norm": 0.2109869420528412,
"learning_rate": 6.103286384976527e-06,
"loss": 8.4398,
"step": 249
},
{
"epoch": 1.173048816135633,
"grad_norm": 0.1602170467376709,
"learning_rate": 6.087636932707356e-06,
"loss": 8.9123,
"step": 250
},
{
"epoch": 1.1777258111663256,
"grad_norm": 0.1901443898677826,
"learning_rate": 6.071987480438185e-06,
"loss": 9.3279,
"step": 251
},
{
"epoch": 1.1824028061970184,
"grad_norm": 0.12106055021286011,
"learning_rate": 6.056338028169015e-06,
"loss": 8.8215,
"step": 252
},
{
"epoch": 1.1870798012277113,
"grad_norm": 0.15600277483463287,
"learning_rate": 6.040688575899843e-06,
"loss": 9.5461,
"step": 253
},
{
"epoch": 1.191756796258404,
"grad_norm": 0.211564302444458,
"learning_rate": 6.025039123630674e-06,
"loss": 9.9196,
"step": 254
},
{
"epoch": 1.1964337912890968,
"grad_norm": 0.16480544209480286,
"learning_rate": 6.0093896713615026e-06,
"loss": 9.488,
"step": 255
},
{
"epoch": 1.2011107863197896,
"grad_norm": 0.22194457054138184,
"learning_rate": 5.993740219092332e-06,
"loss": 10.415,
"step": 256
},
{
"epoch": 1.2057877813504823,
"grad_norm": 0.27972927689552307,
"learning_rate": 5.978090766823161e-06,
"loss": 9.3022,
"step": 257
},
{
"epoch": 1.2104647763811751,
"grad_norm": 0.23484700918197632,
"learning_rate": 5.9624413145539905e-06,
"loss": 8.218,
"step": 258
},
{
"epoch": 1.215141771411868,
"grad_norm": 0.20119240880012512,
"learning_rate": 5.946791862284821e-06,
"loss": 8.204,
"step": 259
},
{
"epoch": 1.2198187664425606,
"grad_norm": 0.19867953658103943,
"learning_rate": 5.93114241001565e-06,
"loss": 9.4491,
"step": 260
},
{
"epoch": 1.2244957614732535,
"grad_norm": 0.19878610968589783,
"learning_rate": 5.915492957746479e-06,
"loss": 10.683,
"step": 261
},
{
"epoch": 1.2291727565039463,
"grad_norm": 0.18710929155349731,
"learning_rate": 5.8998435054773086e-06,
"loss": 10.2426,
"step": 262
},
{
"epoch": 1.233849751534639,
"grad_norm": 0.1873483806848526,
"learning_rate": 5.884194053208139e-06,
"loss": 10.1553,
"step": 263
},
{
"epoch": 1.2385267465653318,
"grad_norm": 0.26153287291526794,
"learning_rate": 5.868544600938968e-06,
"loss": 9.8046,
"step": 264
},
{
"epoch": 1.2432037415960246,
"grad_norm": 0.17956022918224335,
"learning_rate": 5.852895148669797e-06,
"loss": 9.2137,
"step": 265
},
{
"epoch": 1.2478807366267173,
"grad_norm": 0.15572352707386017,
"learning_rate": 5.837245696400627e-06,
"loss": 9.2382,
"step": 266
},
{
"epoch": 1.2525577316574101,
"grad_norm": 0.16768573224544525,
"learning_rate": 5.821596244131456e-06,
"loss": 10.1462,
"step": 267
},
{
"epoch": 1.257234726688103,
"grad_norm": 0.14606249332427979,
"learning_rate": 5.805946791862286e-06,
"loss": 9.6735,
"step": 268
},
{
"epoch": 1.2619117217187956,
"grad_norm": 0.20985975861549377,
"learning_rate": 5.790297339593115e-06,
"loss": 10.9061,
"step": 269
},
{
"epoch": 1.2665887167494885,
"grad_norm": 0.17635460197925568,
"learning_rate": 5.774647887323944e-06,
"loss": 9.1385,
"step": 270
},
{
"epoch": 1.271265711780181,
"grad_norm": 0.19080878794193268,
"learning_rate": 5.758998435054773e-06,
"loss": 9.8189,
"step": 271
},
{
"epoch": 1.275942706810874,
"grad_norm": 0.1511276364326477,
"learning_rate": 5.7433489827856025e-06,
"loss": 9.9191,
"step": 272
},
{
"epoch": 1.2806197018415668,
"grad_norm": 0.2525511085987091,
"learning_rate": 5.727699530516433e-06,
"loss": 8.7398,
"step": 273
},
{
"epoch": 1.2852966968722597,
"grad_norm": 0.18259669840335846,
"learning_rate": 5.712050078247262e-06,
"loss": 10.7875,
"step": 274
},
{
"epoch": 1.2899736919029523,
"grad_norm": 0.2251911461353302,
"learning_rate": 5.696400625978091e-06,
"loss": 8.9997,
"step": 275
},
{
"epoch": 1.2946506869336452,
"grad_norm": 0.17306119203567505,
"learning_rate": 5.6807511737089205e-06,
"loss": 10.0071,
"step": 276
},
{
"epoch": 1.2993276819643378,
"grad_norm": 0.23585619032382965,
"learning_rate": 5.66510172143975e-06,
"loss": 9.5575,
"step": 277
},
{
"epoch": 1.3040046769950306,
"grad_norm": 0.2100452035665512,
"learning_rate": 5.64945226917058e-06,
"loss": 9.6862,
"step": 278
},
{
"epoch": 1.3086816720257235,
"grad_norm": 0.19781209528446198,
"learning_rate": 5.633802816901409e-06,
"loss": 9.6712,
"step": 279
},
{
"epoch": 1.3133586670564164,
"grad_norm": 0.20990189909934998,
"learning_rate": 5.618153364632239e-06,
"loss": 9.1145,
"step": 280
},
{
"epoch": 1.318035662087109,
"grad_norm": 0.14471188187599182,
"learning_rate": 5.602503912363068e-06,
"loss": 10.0124,
"step": 281
},
{
"epoch": 1.3227126571178018,
"grad_norm": 0.181657612323761,
"learning_rate": 5.586854460093896e-06,
"loss": 8.5702,
"step": 282
},
{
"epoch": 1.3273896521484945,
"grad_norm": 0.28895941376686096,
"learning_rate": 5.571205007824727e-06,
"loss": 8.7288,
"step": 283
},
{
"epoch": 1.3320666471791873,
"grad_norm": 0.19658011198043823,
"learning_rate": 5.555555555555557e-06,
"loss": 10.2721,
"step": 284
},
{
"epoch": 1.3367436422098802,
"grad_norm": 0.1778428554534912,
"learning_rate": 5.539906103286385e-06,
"loss": 8.6042,
"step": 285
},
{
"epoch": 1.341420637240573,
"grad_norm": 0.1622474491596222,
"learning_rate": 5.5242566510172144e-06,
"loss": 9.0871,
"step": 286
},
{
"epoch": 1.3460976322712657,
"grad_norm": 0.17768928408622742,
"learning_rate": 5.508607198748044e-06,
"loss": 9.1438,
"step": 287
},
{
"epoch": 1.3507746273019585,
"grad_norm": 0.15472590923309326,
"learning_rate": 5.492957746478874e-06,
"loss": 8.1626,
"step": 288
},
{
"epoch": 1.3554516223326512,
"grad_norm": 0.151944100856781,
"learning_rate": 5.477308294209703e-06,
"loss": 10.6628,
"step": 289
},
{
"epoch": 1.360128617363344,
"grad_norm": 0.2412179410457611,
"learning_rate": 5.4616588419405325e-06,
"loss": 10.0811,
"step": 290
},
{
"epoch": 1.3648056123940369,
"grad_norm": 0.1254899650812149,
"learning_rate": 5.446009389671362e-06,
"loss": 8.7967,
"step": 291
},
{
"epoch": 1.3694826074247297,
"grad_norm": 0.1940433233976364,
"learning_rate": 5.430359937402191e-06,
"loss": 10.7896,
"step": 292
},
{
"epoch": 1.3741596024554223,
"grad_norm": 0.23099660873413086,
"learning_rate": 5.414710485133021e-06,
"loss": 10.3398,
"step": 293
},
{
"epoch": 1.3788365974861152,
"grad_norm": 0.14648781716823578,
"learning_rate": 5.3990610328638506e-06,
"loss": 9.3573,
"step": 294
},
{
"epoch": 1.3835135925168078,
"grad_norm": 0.18853303790092468,
"learning_rate": 5.38341158059468e-06,
"loss": 9.8656,
"step": 295
},
{
"epoch": 1.3881905875475007,
"grad_norm": 0.20366129279136658,
"learning_rate": 5.367762128325509e-06,
"loss": 10.2061,
"step": 296
},
{
"epoch": 1.3928675825781935,
"grad_norm": 0.18720601499080658,
"learning_rate": 5.352112676056338e-06,
"loss": 8.4737,
"step": 297
},
{
"epoch": 1.3975445776088864,
"grad_norm": 0.1396239697933197,
"learning_rate": 5.336463223787169e-06,
"loss": 9.3009,
"step": 298
},
{
"epoch": 1.402221572639579,
"grad_norm": 0.19741852581501007,
"learning_rate": 5.320813771517997e-06,
"loss": 9.7318,
"step": 299
},
{
"epoch": 1.4068985676702719,
"grad_norm": 0.1550920307636261,
"learning_rate": 5.305164319248826e-06,
"loss": 9.0948,
"step": 300
},
{
"epoch": 1.4115755627009645,
"grad_norm": 0.20845593512058258,
"learning_rate": 5.289514866979656e-06,
"loss": 8.555,
"step": 301
},
{
"epoch": 1.4162525577316574,
"grad_norm": 0.15616929531097412,
"learning_rate": 5.273865414710485e-06,
"loss": 9.293,
"step": 302
},
{
"epoch": 1.4209295527623502,
"grad_norm": 0.18581336736679077,
"learning_rate": 5.258215962441315e-06,
"loss": 8.6798,
"step": 303
},
{
"epoch": 1.425606547793043,
"grad_norm": 0.14762163162231445,
"learning_rate": 5.2425665101721445e-06,
"loss": 7.7574,
"step": 304
},
{
"epoch": 1.4302835428237357,
"grad_norm": 0.11617639660835266,
"learning_rate": 5.226917057902974e-06,
"loss": 9.9937,
"step": 305
},
{
"epoch": 1.4349605378544286,
"grad_norm": 0.12888303399085999,
"learning_rate": 5.211267605633803e-06,
"loss": 9.5393,
"step": 306
},
{
"epoch": 1.4396375328851212,
"grad_norm": 0.14450183510780334,
"learning_rate": 5.195618153364632e-06,
"loss": 10.9441,
"step": 307
},
{
"epoch": 1.444314527915814,
"grad_norm": 0.20856888592243195,
"learning_rate": 5.1799687010954625e-06,
"loss": 9.6833,
"step": 308
},
{
"epoch": 1.448991522946507,
"grad_norm": 0.23422713577747345,
"learning_rate": 5.164319248826292e-06,
"loss": 9.2532,
"step": 309
},
{
"epoch": 1.4536685179771998,
"grad_norm": 0.19145800173282623,
"learning_rate": 5.148669796557121e-06,
"loss": 9.7285,
"step": 310
},
{
"epoch": 1.4583455130078924,
"grad_norm": 0.1990247666835785,
"learning_rate": 5.1330203442879505e-06,
"loss": 7.6512,
"step": 311
},
{
"epoch": 1.4630225080385852,
"grad_norm": 0.17829596996307373,
"learning_rate": 5.117370892018779e-06,
"loss": 9.5529,
"step": 312
},
{
"epoch": 1.4676995030692779,
"grad_norm": 0.162981778383255,
"learning_rate": 5.10172143974961e-06,
"loss": 10.0274,
"step": 313
},
{
"epoch": 1.4723764980999707,
"grad_norm": 0.17965111136436462,
"learning_rate": 5.086071987480438e-06,
"loss": 9.2513,
"step": 314
},
{
"epoch": 1.4770534931306636,
"grad_norm": 0.28804492950439453,
"learning_rate": 5.070422535211268e-06,
"loss": 10.0194,
"step": 315
},
{
"epoch": 1.4817304881613564,
"grad_norm": 0.1571478545665741,
"learning_rate": 5.054773082942097e-06,
"loss": 10.0889,
"step": 316
},
{
"epoch": 1.486407483192049,
"grad_norm": 0.2101372927427292,
"learning_rate": 5.039123630672926e-06,
"loss": 8.6775,
"step": 317
},
{
"epoch": 1.491084478222742,
"grad_norm": 0.20323887467384338,
"learning_rate": 5.0234741784037565e-06,
"loss": 9.8082,
"step": 318
},
{
"epoch": 1.4957614732534346,
"grad_norm": 0.16192995011806488,
"learning_rate": 5.007824726134586e-06,
"loss": 8.0025,
"step": 319
},
{
"epoch": 1.5004384682841274,
"grad_norm": 0.16440463066101074,
"learning_rate": 4.992175273865415e-06,
"loss": 9.579,
"step": 320
},
{
"epoch": 1.5051154633148203,
"grad_norm": 0.19055482745170593,
"learning_rate": 4.976525821596244e-06,
"loss": 8.7398,
"step": 321
},
{
"epoch": 1.5097924583455131,
"grad_norm": 0.17318573594093323,
"learning_rate": 4.960876369327074e-06,
"loss": 9.7488,
"step": 322
},
{
"epoch": 1.5144694533762058,
"grad_norm": 0.24867770075798035,
"learning_rate": 4.945226917057903e-06,
"loss": 10.5706,
"step": 323
},
{
"epoch": 1.5191464484068986,
"grad_norm": 0.1796032041311264,
"learning_rate": 4.929577464788733e-06,
"loss": 9.4351,
"step": 324
},
{
"epoch": 1.5238234434375912,
"grad_norm": 0.21675661206245422,
"learning_rate": 4.9139280125195624e-06,
"loss": 10.6771,
"step": 325
},
{
"epoch": 1.528500438468284,
"grad_norm": 0.17892418801784515,
"learning_rate": 4.898278560250392e-06,
"loss": 7.6976,
"step": 326
},
{
"epoch": 1.533177433498977,
"grad_norm": 0.16854748129844666,
"learning_rate": 4.882629107981221e-06,
"loss": 9.0202,
"step": 327
},
{
"epoch": 1.5378544285296698,
"grad_norm": 0.20898739993572235,
"learning_rate": 4.86697965571205e-06,
"loss": 9.3772,
"step": 328
},
{
"epoch": 1.5425314235603624,
"grad_norm": 0.2980878949165344,
"learning_rate": 4.85133020344288e-06,
"loss": 10.5012,
"step": 329
},
{
"epoch": 1.5472084185910553,
"grad_norm": 0.12076615542173386,
"learning_rate": 4.835680751173709e-06,
"loss": 10.1389,
"step": 330
},
{
"epoch": 1.551885413621748,
"grad_norm": 0.1814320981502533,
"learning_rate": 4.820031298904539e-06,
"loss": 8.4015,
"step": 331
},
{
"epoch": 1.5565624086524408,
"grad_norm": 0.16422027349472046,
"learning_rate": 4.8043818466353684e-06,
"loss": 8.4772,
"step": 332
},
{
"epoch": 1.5612394036831336,
"grad_norm": 0.12222316116094589,
"learning_rate": 4.788732394366197e-06,
"loss": 8.7358,
"step": 333
},
{
"epoch": 1.5659163987138265,
"grad_norm": 0.20471377670764923,
"learning_rate": 4.773082942097027e-06,
"loss": 8.9805,
"step": 334
},
{
"epoch": 1.5705933937445191,
"grad_norm": 0.1602873057126999,
"learning_rate": 4.757433489827856e-06,
"loss": 7.7731,
"step": 335
},
{
"epoch": 1.575270388775212,
"grad_norm": 0.1620335578918457,
"learning_rate": 4.741784037558686e-06,
"loss": 8.5971,
"step": 336
},
{
"epoch": 1.5799473838059046,
"grad_norm": 0.14822766184806824,
"learning_rate": 4.726134585289515e-06,
"loss": 8.1521,
"step": 337
},
{
"epoch": 1.5846243788365975,
"grad_norm": 0.16832107305526733,
"learning_rate": 4.710485133020345e-06,
"loss": 9.0838,
"step": 338
},
{
"epoch": 1.5893013738672903,
"grad_norm": 0.1385219246149063,
"learning_rate": 4.694835680751174e-06,
"loss": 7.4367,
"step": 339
},
{
"epoch": 1.5939783688979832,
"grad_norm": 0.13664643466472626,
"learning_rate": 4.679186228482004e-06,
"loss": 8.5027,
"step": 340
},
{
"epoch": 1.5986553639286758,
"grad_norm": 0.18891537189483643,
"learning_rate": 4.663536776212833e-06,
"loss": 8.6301,
"step": 341
},
{
"epoch": 1.6033323589593687,
"grad_norm": 0.19962970912456512,
"learning_rate": 4.647887323943662e-06,
"loss": 10.6293,
"step": 342
},
{
"epoch": 1.6080093539900613,
"grad_norm": 0.18747878074645996,
"learning_rate": 4.632237871674492e-06,
"loss": 10.0322,
"step": 343
},
{
"epoch": 1.6126863490207541,
"grad_norm": 0.3010605573654175,
"learning_rate": 4.616588419405321e-06,
"loss": 9.1209,
"step": 344
},
{
"epoch": 1.617363344051447,
"grad_norm": 0.11245454847812653,
"learning_rate": 4.60093896713615e-06,
"loss": 8.0594,
"step": 345
},
{
"epoch": 1.6220403390821398,
"grad_norm": 0.20886649191379547,
"learning_rate": 4.58528951486698e-06,
"loss": 9.1715,
"step": 346
},
{
"epoch": 1.6267173341128325,
"grad_norm": 0.14630508422851562,
"learning_rate": 4.56964006259781e-06,
"loss": 8.7735,
"step": 347
},
{
"epoch": 1.6313943291435253,
"grad_norm": 0.21093368530273438,
"learning_rate": 4.553990610328639e-06,
"loss": 8.2183,
"step": 348
},
{
"epoch": 1.636071324174218,
"grad_norm": 0.22136329114437103,
"learning_rate": 4.538341158059468e-06,
"loss": 9.067,
"step": 349
},
{
"epoch": 1.6407483192049108,
"grad_norm": 0.15906454622745514,
"learning_rate": 4.522691705790298e-06,
"loss": 9.3209,
"step": 350
},
{
"epoch": 1.6454253142356037,
"grad_norm": 0.2312268763780594,
"learning_rate": 4.507042253521127e-06,
"loss": 9.2316,
"step": 351
},
{
"epoch": 1.6501023092662965,
"grad_norm": 0.24528440833091736,
"learning_rate": 4.491392801251956e-06,
"loss": 9.0482,
"step": 352
},
{
"epoch": 1.6547793042969892,
"grad_norm": 0.19777342677116394,
"learning_rate": 4.475743348982786e-06,
"loss": 10.1556,
"step": 353
},
{
"epoch": 1.659456299327682,
"grad_norm": 0.2033587247133255,
"learning_rate": 4.460093896713616e-06,
"loss": 8.9973,
"step": 354
},
{
"epoch": 1.6641332943583746,
"grad_norm": 0.16927585005760193,
"learning_rate": 4.444444444444444e-06,
"loss": 9.5144,
"step": 355
},
{
"epoch": 1.6688102893890675,
"grad_norm": 0.16959340870380402,
"learning_rate": 4.428794992175274e-06,
"loss": 9.5447,
"step": 356
},
{
"epoch": 1.6734872844197604,
"grad_norm": 0.18593505024909973,
"learning_rate": 4.413145539906104e-06,
"loss": 9.6471,
"step": 357
},
{
"epoch": 1.6781642794504532,
"grad_norm": 0.16945506632328033,
"learning_rate": 4.397496087636933e-06,
"loss": 8.5418,
"step": 358
},
{
"epoch": 1.6828412744811458,
"grad_norm": 0.16277293860912323,
"learning_rate": 4.381846635367762e-06,
"loss": 9.2884,
"step": 359
},
{
"epoch": 1.6875182695118387,
"grad_norm": 0.2155790776014328,
"learning_rate": 4.3661971830985915e-06,
"loss": 9.4547,
"step": 360
},
{
"epoch": 1.6921952645425313,
"grad_norm": 0.19257700443267822,
"learning_rate": 4.350547730829422e-06,
"loss": 8.7859,
"step": 361
},
{
"epoch": 1.6968722595732242,
"grad_norm": 0.21113352477550507,
"learning_rate": 4.334898278560251e-06,
"loss": 9.3654,
"step": 362
},
{
"epoch": 1.701549254603917,
"grad_norm": 0.17781415581703186,
"learning_rate": 4.31924882629108e-06,
"loss": 9.5482,
"step": 363
},
{
"epoch": 1.70622624963461,
"grad_norm": 0.14610658586025238,
"learning_rate": 4.30359937402191e-06,
"loss": 9.2182,
"step": 364
},
{
"epoch": 1.7109032446653025,
"grad_norm": 0.19297371804714203,
"learning_rate": 4.287949921752739e-06,
"loss": 8.5858,
"step": 365
},
{
"epoch": 1.7155802396959952,
"grad_norm": 0.16764657199382782,
"learning_rate": 4.272300469483568e-06,
"loss": 8.6679,
"step": 366
},
{
"epoch": 1.720257234726688,
"grad_norm": 0.1740255355834961,
"learning_rate": 4.2566510172143975e-06,
"loss": 8.3984,
"step": 367
},
{
"epoch": 1.7249342297573809,
"grad_norm": 0.2171589732170105,
"learning_rate": 4.241001564945228e-06,
"loss": 8.6767,
"step": 368
},
{
"epoch": 1.7296112247880737,
"grad_norm": 0.15334008634090424,
"learning_rate": 4.225352112676057e-06,
"loss": 9.0357,
"step": 369
},
{
"epoch": 1.7342882198187666,
"grad_norm": 0.1901715248823166,
"learning_rate": 4.209702660406886e-06,
"loss": 9.1397,
"step": 370
},
{
"epoch": 1.7389652148494592,
"grad_norm": 0.14479465782642365,
"learning_rate": 4.194053208137716e-06,
"loss": 8.0689,
"step": 371
},
{
"epoch": 1.7436422098801518,
"grad_norm": 0.13776177167892456,
"learning_rate": 4.178403755868545e-06,
"loss": 8.2216,
"step": 372
},
{
"epoch": 1.7483192049108447,
"grad_norm": 0.13980716466903687,
"learning_rate": 4.162754303599374e-06,
"loss": 10.2694,
"step": 373
},
{
"epoch": 1.7529961999415375,
"grad_norm": 0.15243536233901978,
"learning_rate": 4.1471048513302035e-06,
"loss": 8.4832,
"step": 374
},
{
"epoch": 1.7576731949722304,
"grad_norm": 0.1408737152814865,
"learning_rate": 4.131455399061034e-06,
"loss": 10.5995,
"step": 375
},
{
"epoch": 1.7623501900029233,
"grad_norm": 0.16743288934230804,
"learning_rate": 4.115805946791863e-06,
"loss": 9.0306,
"step": 376
},
{
"epoch": 1.7670271850336159,
"grad_norm": 0.13096289336681366,
"learning_rate": 4.100156494522691e-06,
"loss": 8.799,
"step": 377
},
{
"epoch": 1.7717041800643085,
"grad_norm": 0.18536189198493958,
"learning_rate": 4.0845070422535216e-06,
"loss": 8.6714,
"step": 378
},
{
"epoch": 1.7763811750950014,
"grad_norm": 0.21224500238895416,
"learning_rate": 4.068857589984351e-06,
"loss": 8.8822,
"step": 379
},
{
"epoch": 1.7810581701256942,
"grad_norm": 0.15303047001361847,
"learning_rate": 4.05320813771518e-06,
"loss": 8.8666,
"step": 380
},
{
"epoch": 1.785735165156387,
"grad_norm": 0.14419591426849365,
"learning_rate": 4.0375586854460095e-06,
"loss": 8.916,
"step": 381
},
{
"epoch": 1.79041216018708,
"grad_norm": 0.1363951712846756,
"learning_rate": 4.021909233176839e-06,
"loss": 8.3857,
"step": 382
},
{
"epoch": 1.7950891552177726,
"grad_norm": 0.20621058344841003,
"learning_rate": 4.006259780907669e-06,
"loss": 10.1237,
"step": 383
},
{
"epoch": 1.7997661502484652,
"grad_norm": 0.21105414628982544,
"learning_rate": 3.990610328638498e-06,
"loss": 9.5554,
"step": 384
},
{
"epoch": 1.804443145279158,
"grad_norm": 0.21915097534656525,
"learning_rate": 3.9749608763693276e-06,
"loss": 7.717,
"step": 385
},
{
"epoch": 1.809120140309851,
"grad_norm": 0.17555522918701172,
"learning_rate": 3.959311424100157e-06,
"loss": 9.1899,
"step": 386
},
{
"epoch": 1.8137971353405438,
"grad_norm": 0.1890765279531479,
"learning_rate": 3.943661971830986e-06,
"loss": 8.0672,
"step": 387
},
{
"epoch": 1.8184741303712366,
"grad_norm": 0.16451717913150787,
"learning_rate": 3.9280125195618155e-06,
"loss": 8.8205,
"step": 388
},
{
"epoch": 1.8231511254019293,
"grad_norm": 0.16023708879947662,
"learning_rate": 3.912363067292645e-06,
"loss": 9.319,
"step": 389
},
{
"epoch": 1.8278281204326219,
"grad_norm": 0.15548115968704224,
"learning_rate": 3.896713615023475e-06,
"loss": 8.2246,
"step": 390
},
{
"epoch": 1.8325051154633147,
"grad_norm": 0.21226494014263153,
"learning_rate": 3.881064162754304e-06,
"loss": 9.135,
"step": 391
},
{
"epoch": 1.8371821104940076,
"grad_norm": 0.14461496472358704,
"learning_rate": 3.865414710485133e-06,
"loss": 8.962,
"step": 392
},
{
"epoch": 1.8418591055247004,
"grad_norm": 0.20766492187976837,
"learning_rate": 3.849765258215963e-06,
"loss": 8.8991,
"step": 393
},
{
"epoch": 1.8465361005553933,
"grad_norm": 0.20327630639076233,
"learning_rate": 3.834115805946792e-06,
"loss": 9.1291,
"step": 394
},
{
"epoch": 1.851213095586086,
"grad_norm": 0.23052388429641724,
"learning_rate": 3.8184663536776215e-06,
"loss": 8.3602,
"step": 395
},
{
"epoch": 1.8558900906167786,
"grad_norm": 0.16140541434288025,
"learning_rate": 3.8028169014084508e-06,
"loss": 9.3176,
"step": 396
},
{
"epoch": 1.8605670856474714,
"grad_norm": 0.17049185931682587,
"learning_rate": 3.7871674491392805e-06,
"loss": 8.6602,
"step": 397
},
{
"epoch": 1.8652440806781643,
"grad_norm": 0.11496849358081818,
"learning_rate": 3.77151799687011e-06,
"loss": 10.3293,
"step": 398
},
{
"epoch": 1.8699210757088571,
"grad_norm": 0.1907191127538681,
"learning_rate": 3.755868544600939e-06,
"loss": 8.4035,
"step": 399
},
{
"epoch": 1.87459807073955,
"grad_norm": 0.16409359872341156,
"learning_rate": 3.740219092331769e-06,
"loss": 8.9062,
"step": 400
},
{
"epoch": 1.8792750657702426,
"grad_norm": 0.15642918646335602,
"learning_rate": 3.724569640062598e-06,
"loss": 8.8751,
"step": 401
},
{
"epoch": 1.8839520608009352,
"grad_norm": 0.1641726940870285,
"learning_rate": 3.708920187793428e-06,
"loss": 8.3851,
"step": 402
},
{
"epoch": 1.888629055831628,
"grad_norm": 0.15342937409877777,
"learning_rate": 3.693270735524257e-06,
"loss": 9.3965,
"step": 403
},
{
"epoch": 1.893306050862321,
"grad_norm": 0.15916384756565094,
"learning_rate": 3.677621283255086e-06,
"loss": 8.7446,
"step": 404
},
{
"epoch": 1.8979830458930138,
"grad_norm": 0.21401815116405487,
"learning_rate": 3.6619718309859158e-06,
"loss": 8.8994,
"step": 405
},
{
"epoch": 1.9026600409237067,
"grad_norm": 0.19148550927639008,
"learning_rate": 3.646322378716745e-06,
"loss": 8.5996,
"step": 406
},
{
"epoch": 1.9073370359543993,
"grad_norm": 0.1755845844745636,
"learning_rate": 3.630672926447575e-06,
"loss": 8.7611,
"step": 407
},
{
"epoch": 1.912014030985092,
"grad_norm": 0.17193089425563812,
"learning_rate": 3.615023474178404e-06,
"loss": 8.9488,
"step": 408
},
{
"epoch": 1.9166910260157848,
"grad_norm": 0.17173364758491516,
"learning_rate": 3.5993740219092334e-06,
"loss": 8.0517,
"step": 409
},
{
"epoch": 1.9213680210464776,
"grad_norm": 0.22657723724842072,
"learning_rate": 3.583724569640063e-06,
"loss": 8.7361,
"step": 410
},
{
"epoch": 1.9260450160771705,
"grad_norm": 0.21941417455673218,
"learning_rate": 3.568075117370892e-06,
"loss": 9.2343,
"step": 411
},
{
"epoch": 1.9307220111078633,
"grad_norm": 0.18514755368232727,
"learning_rate": 3.5524256651017218e-06,
"loss": 8.2767,
"step": 412
},
{
"epoch": 1.935399006138556,
"grad_norm": 0.13066066801548004,
"learning_rate": 3.536776212832551e-06,
"loss": 8.7371,
"step": 413
},
{
"epoch": 1.9400760011692486,
"grad_norm": 0.16903606057167053,
"learning_rate": 3.5211267605633804e-06,
"loss": 9.3067,
"step": 414
},
{
"epoch": 1.9447529961999415,
"grad_norm": 0.14286428689956665,
"learning_rate": 3.50547730829421e-06,
"loss": 7.8586,
"step": 415
},
{
"epoch": 1.9494299912306343,
"grad_norm": 0.1969095915555954,
"learning_rate": 3.4898278560250394e-06,
"loss": 9.6053,
"step": 416
},
{
"epoch": 1.9541069862613272,
"grad_norm": 0.1750202775001526,
"learning_rate": 3.474178403755869e-06,
"loss": 9.0714,
"step": 417
},
{
"epoch": 1.95878398129202,
"grad_norm": 0.21293002367019653,
"learning_rate": 3.458528951486698e-06,
"loss": 9.8726,
"step": 418
},
{
"epoch": 1.9634609763227127,
"grad_norm": 0.1672164648771286,
"learning_rate": 3.4428794992175273e-06,
"loss": 9.5275,
"step": 419
},
{
"epoch": 1.9681379713534053,
"grad_norm": 0.17561869323253632,
"learning_rate": 3.427230046948357e-06,
"loss": 7.2097,
"step": 420
},
{
"epoch": 1.9728149663840981,
"grad_norm": 0.16326965391635895,
"learning_rate": 3.4115805946791864e-06,
"loss": 9.3302,
"step": 421
},
{
"epoch": 1.977491961414791,
"grad_norm": 0.15163388848304749,
"learning_rate": 3.395931142410016e-06,
"loss": 9.1933,
"step": 422
},
{
"epoch": 1.9821689564454839,
"grad_norm": 0.16277414560317993,
"learning_rate": 3.3802816901408454e-06,
"loss": 8.3196,
"step": 423
},
{
"epoch": 1.9868459514761767,
"grad_norm": 0.18385657668113708,
"learning_rate": 3.364632237871675e-06,
"loss": 8.1472,
"step": 424
},
{
"epoch": 1.9915229465068693,
"grad_norm": 0.1768423169851303,
"learning_rate": 3.348982785602504e-06,
"loss": 8.3639,
"step": 425
},
{
"epoch": 1.996199941537562,
"grad_norm": 0.2325451821088791,
"learning_rate": 3.3333333333333333e-06,
"loss": 9.265,
"step": 426
},
{
"epoch": 2.004676995030693,
"grad_norm": 0.22825832664966583,
"learning_rate": 3.317683881064163e-06,
"loss": 10.0732,
"step": 427
},
{
"epoch": 2.0093539900613857,
"grad_norm": 0.16034899652004242,
"learning_rate": 3.3020344287949924e-06,
"loss": 7.7232,
"step": 428
},
{
"epoch": 2.014030985092078,
"grad_norm": 0.1737372726202011,
"learning_rate": 3.286384976525822e-06,
"loss": 8.0928,
"step": 429
},
{
"epoch": 2.018707980122771,
"grad_norm": 0.20644846558570862,
"learning_rate": 3.2707355242566514e-06,
"loss": 8.6956,
"step": 430
},
{
"epoch": 2.023384975153464,
"grad_norm": 0.3140431344509125,
"learning_rate": 3.2550860719874807e-06,
"loss": 9.0442,
"step": 431
},
{
"epoch": 2.0280619701841567,
"grad_norm": 0.2457619458436966,
"learning_rate": 3.2394366197183104e-06,
"loss": 8.6256,
"step": 432
},
{
"epoch": 2.0327389652148495,
"grad_norm": 0.2014688104391098,
"learning_rate": 3.2237871674491393e-06,
"loss": 9.7276,
"step": 433
},
{
"epoch": 2.0374159602455424,
"grad_norm": 0.1970800757408142,
"learning_rate": 3.208137715179969e-06,
"loss": 8.19,
"step": 434
},
{
"epoch": 2.0420929552762352,
"grad_norm": 0.12662629783153534,
"learning_rate": 3.1924882629107983e-06,
"loss": 9.235,
"step": 435
},
{
"epoch": 2.0467699503069277,
"grad_norm": 0.15353932976722717,
"learning_rate": 3.1768388106416277e-06,
"loss": 8.8255,
"step": 436
},
{
"epoch": 2.0514469453376205,
"grad_norm": 0.2180812507867813,
"learning_rate": 3.1611893583724574e-06,
"loss": 9.1142,
"step": 437
},
{
"epoch": 2.0561239403683134,
"grad_norm": 0.18303510546684265,
"learning_rate": 3.1455399061032867e-06,
"loss": 10.1061,
"step": 438
},
{
"epoch": 2.060800935399006,
"grad_norm": 0.15254124999046326,
"learning_rate": 3.1298904538341164e-06,
"loss": 8.5431,
"step": 439
},
{
"epoch": 2.065477930429699,
"grad_norm": 0.16063688695430756,
"learning_rate": 3.1142410015649453e-06,
"loss": 8.4382,
"step": 440
},
{
"epoch": 2.0701549254603915,
"grad_norm": 0.20583708584308624,
"learning_rate": 3.0985915492957746e-06,
"loss": 8.1778,
"step": 441
},
{
"epoch": 2.0748319204910843,
"grad_norm": 0.11699045449495316,
"learning_rate": 3.0829420970266043e-06,
"loss": 7.8459,
"step": 442
},
{
"epoch": 2.079508915521777,
"grad_norm": 0.1605014204978943,
"learning_rate": 3.0672926447574336e-06,
"loss": 8.4224,
"step": 443
},
{
"epoch": 2.08418591055247,
"grad_norm": 0.14405608177185059,
"learning_rate": 3.0516431924882634e-06,
"loss": 8.3442,
"step": 444
},
{
"epoch": 2.088862905583163,
"grad_norm": 0.17145852744579315,
"learning_rate": 3.0359937402190927e-06,
"loss": 8.7685,
"step": 445
},
{
"epoch": 2.0935399006138558,
"grad_norm": 0.14711640775203705,
"learning_rate": 3.0203442879499216e-06,
"loss": 7.3568,
"step": 446
},
{
"epoch": 2.098216895644548,
"grad_norm": 0.13734185695648193,
"learning_rate": 3.0046948356807513e-06,
"loss": 8.4425,
"step": 447
},
{
"epoch": 2.102893890675241,
"grad_norm": 0.1571117639541626,
"learning_rate": 2.9890453834115806e-06,
"loss": 7.6952,
"step": 448
},
{
"epoch": 2.107570885705934,
"grad_norm": 0.15319029986858368,
"learning_rate": 2.9733959311424103e-06,
"loss": 7.9937,
"step": 449
},
{
"epoch": 2.1122478807366267,
"grad_norm": 0.18363691866397858,
"learning_rate": 2.9577464788732396e-06,
"loss": 8.4406,
"step": 450
},
{
"epoch": 2.1169248757673196,
"grad_norm": 0.1433074176311493,
"learning_rate": 2.9420970266040694e-06,
"loss": 9.7219,
"step": 451
},
{
"epoch": 2.1216018707980124,
"grad_norm": 0.14841365814208984,
"learning_rate": 2.9264475743348987e-06,
"loss": 8.2459,
"step": 452
},
{
"epoch": 2.126278865828705,
"grad_norm": 0.18753403425216675,
"learning_rate": 2.910798122065728e-06,
"loss": 8.7057,
"step": 453
},
{
"epoch": 2.1309558608593977,
"grad_norm": 0.1748085618019104,
"learning_rate": 2.8951486697965577e-06,
"loss": 8.5651,
"step": 454
},
{
"epoch": 2.1356328558900906,
"grad_norm": 0.17874014377593994,
"learning_rate": 2.8794992175273866e-06,
"loss": 8.5838,
"step": 455
},
{
"epoch": 2.1403098509207834,
"grad_norm": 0.16495150327682495,
"learning_rate": 2.8638497652582163e-06,
"loss": 9.8249,
"step": 456
},
{
"epoch": 2.1449868459514763,
"grad_norm": 0.12347421795129776,
"learning_rate": 2.8482003129890456e-06,
"loss": 7.1875,
"step": 457
},
{
"epoch": 2.149663840982169,
"grad_norm": 0.1617746353149414,
"learning_rate": 2.832550860719875e-06,
"loss": 7.7209,
"step": 458
},
{
"epoch": 2.154340836012862,
"grad_norm": 0.160769984126091,
"learning_rate": 2.8169014084507046e-06,
"loss": 7.7851,
"step": 459
},
{
"epoch": 2.1590178310435544,
"grad_norm": 0.14725424349308014,
"learning_rate": 2.801251956181534e-06,
"loss": 7.8194,
"step": 460
},
{
"epoch": 2.1636948260742472,
"grad_norm": 0.11912764608860016,
"learning_rate": 2.7856025039123637e-06,
"loss": 7.7984,
"step": 461
},
{
"epoch": 2.16837182110494,
"grad_norm": 0.17748208343982697,
"learning_rate": 2.7699530516431926e-06,
"loss": 8.0672,
"step": 462
},
{
"epoch": 2.173048816135633,
"grad_norm": 0.1708259880542755,
"learning_rate": 2.754303599374022e-06,
"loss": 9.2099,
"step": 463
},
{
"epoch": 2.177725811166326,
"grad_norm": 0.15187622606754303,
"learning_rate": 2.7386541471048516e-06,
"loss": 8.3165,
"step": 464
},
{
"epoch": 2.182402806197018,
"grad_norm": 0.18263490498065948,
"learning_rate": 2.723004694835681e-06,
"loss": 9.9331,
"step": 465
},
{
"epoch": 2.187079801227711,
"grad_norm": 0.12427602708339691,
"learning_rate": 2.7073552425665106e-06,
"loss": 8.5229,
"step": 466
},
{
"epoch": 2.191756796258404,
"grad_norm": 0.13961510360240936,
"learning_rate": 2.69170579029734e-06,
"loss": 8.3661,
"step": 467
},
{
"epoch": 2.1964337912890968,
"grad_norm": 0.14999401569366455,
"learning_rate": 2.676056338028169e-06,
"loss": 7.2095,
"step": 468
},
{
"epoch": 2.2011107863197896,
"grad_norm": 0.14472222328186035,
"learning_rate": 2.6604068857589986e-06,
"loss": 8.6861,
"step": 469
},
{
"epoch": 2.2057877813504825,
"grad_norm": 0.14089444279670715,
"learning_rate": 2.644757433489828e-06,
"loss": 7.8008,
"step": 470
},
{
"epoch": 2.210464776381175,
"grad_norm": 0.13669133186340332,
"learning_rate": 2.6291079812206576e-06,
"loss": 8.5063,
"step": 471
},
{
"epoch": 2.2151417714118677,
"grad_norm": 0.1669352799654007,
"learning_rate": 2.613458528951487e-06,
"loss": 8.7907,
"step": 472
},
{
"epoch": 2.2198187664425606,
"grad_norm": 0.15821270644664764,
"learning_rate": 2.597809076682316e-06,
"loss": 8.0308,
"step": 473
},
{
"epoch": 2.2244957614732535,
"grad_norm": 0.21483926475048065,
"learning_rate": 2.582159624413146e-06,
"loss": 8.502,
"step": 474
},
{
"epoch": 2.2291727565039463,
"grad_norm": 0.18459928035736084,
"learning_rate": 2.5665101721439752e-06,
"loss": 9.5206,
"step": 475
},
{
"epoch": 2.233849751534639,
"grad_norm": 0.1487099826335907,
"learning_rate": 2.550860719874805e-06,
"loss": 6.9168,
"step": 476
},
{
"epoch": 2.2385267465653316,
"grad_norm": 0.2513448894023895,
"learning_rate": 2.535211267605634e-06,
"loss": 9.3783,
"step": 477
},
{
"epoch": 2.2432037415960244,
"grad_norm": 0.1873185932636261,
"learning_rate": 2.519561815336463e-06,
"loss": 8.2886,
"step": 478
},
{
"epoch": 2.2478807366267173,
"grad_norm": 0.19832056760787964,
"learning_rate": 2.503912363067293e-06,
"loss": 8.1959,
"step": 479
},
{
"epoch": 2.25255773165741,
"grad_norm": 0.20701546967029572,
"learning_rate": 2.488262910798122e-06,
"loss": 8.1702,
"step": 480
},
{
"epoch": 2.257234726688103,
"grad_norm": 0.12690390646457672,
"learning_rate": 2.4726134585289515e-06,
"loss": 8.7747,
"step": 481
},
{
"epoch": 2.261911721718796,
"grad_norm": 0.1636572629213333,
"learning_rate": 2.4569640062597812e-06,
"loss": 7.8555,
"step": 482
},
{
"epoch": 2.2665887167494887,
"grad_norm": 0.12632915377616882,
"learning_rate": 2.4413145539906105e-06,
"loss": 7.9758,
"step": 483
},
{
"epoch": 2.271265711780181,
"grad_norm": 0.16761943697929382,
"learning_rate": 2.42566510172144e-06,
"loss": 8.0032,
"step": 484
},
{
"epoch": 2.275942706810874,
"grad_norm": 0.15796944499015808,
"learning_rate": 2.4100156494522696e-06,
"loss": 8.8154,
"step": 485
},
{
"epoch": 2.280619701841567,
"grad_norm": 0.16528886556625366,
"learning_rate": 2.3943661971830984e-06,
"loss": 7.3999,
"step": 486
},
{
"epoch": 2.2852966968722597,
"grad_norm": 0.14766015112400055,
"learning_rate": 2.378716744913928e-06,
"loss": 7.8343,
"step": 487
},
{
"epoch": 2.2899736919029525,
"grad_norm": 0.12624794244766235,
"learning_rate": 2.3630672926447575e-06,
"loss": 8.0017,
"step": 488
},
{
"epoch": 2.294650686933645,
"grad_norm": 0.16594719886779785,
"learning_rate": 2.347417840375587e-06,
"loss": 7.7649,
"step": 489
},
{
"epoch": 2.299327681964338,
"grad_norm": 0.1574728637933731,
"learning_rate": 2.3317683881064165e-06,
"loss": 9.2884,
"step": 490
},
{
"epoch": 2.3040046769950306,
"grad_norm": 0.1298084557056427,
"learning_rate": 2.316118935837246e-06,
"loss": 8.4339,
"step": 491
},
{
"epoch": 2.3086816720257235,
"grad_norm": 0.15643304586410522,
"learning_rate": 2.300469483568075e-06,
"loss": 8.0997,
"step": 492
},
{
"epoch": 2.3133586670564164,
"grad_norm": 0.13263966143131256,
"learning_rate": 2.284820031298905e-06,
"loss": 8.109,
"step": 493
},
{
"epoch": 2.318035662087109,
"grad_norm": 0.21980319917201996,
"learning_rate": 2.269170579029734e-06,
"loss": 8.2741,
"step": 494
},
{
"epoch": 2.322712657117802,
"grad_norm": 0.13680629432201385,
"learning_rate": 2.2535211267605635e-06,
"loss": 8.5315,
"step": 495
},
{
"epoch": 2.3273896521484945,
"grad_norm": 0.1529272496700287,
"learning_rate": 2.237871674491393e-06,
"loss": 8.0531,
"step": 496
},
{
"epoch": 2.3320666471791873,
"grad_norm": 0.174594908952713,
"learning_rate": 2.222222222222222e-06,
"loss": 7.7507,
"step": 497
},
{
"epoch": 2.33674364220988,
"grad_norm": 0.17085200548171997,
"learning_rate": 2.206572769953052e-06,
"loss": 7.1328,
"step": 498
},
{
"epoch": 2.341420637240573,
"grad_norm": 0.14975635707378387,
"learning_rate": 2.190923317683881e-06,
"loss": 9.8064,
"step": 499
},
{
"epoch": 2.346097632271266,
"grad_norm": 0.15309952199459076,
"learning_rate": 2.175273865414711e-06,
"loss": 8.6898,
"step": 500
},
{
"epoch": 2.3507746273019583,
"grad_norm": 0.13084295392036438,
"learning_rate": 2.15962441314554e-06,
"loss": 8.1384,
"step": 501
},
{
"epoch": 2.355451622332651,
"grad_norm": 0.16496095061302185,
"learning_rate": 2.1439749608763695e-06,
"loss": 8.9057,
"step": 502
},
{
"epoch": 2.360128617363344,
"grad_norm": 0.157500758767128,
"learning_rate": 2.1283255086071988e-06,
"loss": 7.967,
"step": 503
},
{
"epoch": 2.364805612394037,
"grad_norm": 0.1988188475370407,
"learning_rate": 2.1126760563380285e-06,
"loss": 7.4129,
"step": 504
},
{
"epoch": 2.3694826074247297,
"grad_norm": 0.21104207634925842,
"learning_rate": 2.097026604068858e-06,
"loss": 7.5442,
"step": 505
},
{
"epoch": 2.3741596024554226,
"grad_norm": 0.20285457372665405,
"learning_rate": 2.081377151799687e-06,
"loss": 7.3523,
"step": 506
},
{
"epoch": 2.378836597486115,
"grad_norm": 0.24479469656944275,
"learning_rate": 2.065727699530517e-06,
"loss": 7.8577,
"step": 507
},
{
"epoch": 2.383513592516808,
"grad_norm": 0.150054469704628,
"learning_rate": 2.0500782472613457e-06,
"loss": 8.2585,
"step": 508
},
{
"epoch": 2.3881905875475007,
"grad_norm": 0.12602077424526215,
"learning_rate": 2.0344287949921754e-06,
"loss": 7.7554,
"step": 509
},
{
"epoch": 2.3928675825781935,
"grad_norm": 0.18626457452774048,
"learning_rate": 2.0187793427230047e-06,
"loss": 7.5464,
"step": 510
},
{
"epoch": 2.3975445776088864,
"grad_norm": 0.20931190252304077,
"learning_rate": 2.0031298904538345e-06,
"loss": 7.5159,
"step": 511
},
{
"epoch": 2.4022215726395793,
"grad_norm": 0.2555796802043915,
"learning_rate": 1.9874804381846638e-06,
"loss": 7.5645,
"step": 512
},
{
"epoch": 2.4068985676702717,
"grad_norm": 0.17398537695407867,
"learning_rate": 1.971830985915493e-06,
"loss": 8.211,
"step": 513
},
{
"epoch": 2.4115755627009645,
"grad_norm": 0.19993047416210175,
"learning_rate": 1.9561815336463224e-06,
"loss": 8.3602,
"step": 514
},
{
"epoch": 2.4162525577316574,
"grad_norm": 0.15980151295661926,
"learning_rate": 1.940532081377152e-06,
"loss": 7.6245,
"step": 515
},
{
"epoch": 2.4209295527623502,
"grad_norm": 0.16947968304157257,
"learning_rate": 1.9248826291079814e-06,
"loss": 8.2847,
"step": 516
},
{
"epoch": 2.425606547793043,
"grad_norm": 0.1670764982700348,
"learning_rate": 1.9092331768388107e-06,
"loss": 8.4169,
"step": 517
},
{
"epoch": 2.430283542823736,
"grad_norm": 0.17053499817848206,
"learning_rate": 1.8935837245696402e-06,
"loss": 8.8886,
"step": 518
},
{
"epoch": 2.4349605378544283,
"grad_norm": 0.16047680377960205,
"learning_rate": 1.8779342723004696e-06,
"loss": 7.9574,
"step": 519
},
{
"epoch": 2.439637532885121,
"grad_norm": 0.2619805932044983,
"learning_rate": 1.862284820031299e-06,
"loss": 8.203,
"step": 520
},
{
"epoch": 2.444314527915814,
"grad_norm": 0.2122809886932373,
"learning_rate": 1.8466353677621286e-06,
"loss": 7.8094,
"step": 521
},
{
"epoch": 2.448991522946507,
"grad_norm": 0.15507692098617554,
"learning_rate": 1.8309859154929579e-06,
"loss": 7.7085,
"step": 522
},
{
"epoch": 2.4536685179771998,
"grad_norm": 0.1406126171350479,
"learning_rate": 1.8153364632237874e-06,
"loss": 8.483,
"step": 523
},
{
"epoch": 2.4583455130078926,
"grad_norm": 0.19436419010162354,
"learning_rate": 1.7996870109546167e-06,
"loss": 8.271,
"step": 524
},
{
"epoch": 2.463022508038585,
"grad_norm": 0.17198602855205536,
"learning_rate": 1.784037558685446e-06,
"loss": 8.3665,
"step": 525
},
{
"epoch": 2.467699503069278,
"grad_norm": 0.28165027499198914,
"learning_rate": 1.7683881064162755e-06,
"loss": 7.8636,
"step": 526
},
{
"epoch": 2.4723764980999707,
"grad_norm": 0.2032092958688736,
"learning_rate": 1.752738654147105e-06,
"loss": 7.5732,
"step": 527
},
{
"epoch": 2.4770534931306636,
"grad_norm": 0.13977749645709991,
"learning_rate": 1.7370892018779346e-06,
"loss": 7.2479,
"step": 528
},
{
"epoch": 2.4817304881613564,
"grad_norm": 0.13071084022521973,
"learning_rate": 1.7214397496087637e-06,
"loss": 7.008,
"step": 529
},
{
"epoch": 2.4864074831920493,
"grad_norm": 0.15741536021232605,
"learning_rate": 1.7057902973395932e-06,
"loss": 8.0612,
"step": 530
},
{
"epoch": 2.4910844782227417,
"grad_norm": 0.16548508405685425,
"learning_rate": 1.6901408450704227e-06,
"loss": 8.0312,
"step": 531
},
{
"epoch": 2.4957614732534346,
"grad_norm": 0.16299135982990265,
"learning_rate": 1.674491392801252e-06,
"loss": 8.8502,
"step": 532
},
{
"epoch": 2.5004384682841274,
"grad_norm": 0.159685879945755,
"learning_rate": 1.6588419405320815e-06,
"loss": 9.5205,
"step": 533
},
{
"epoch": 2.5051154633148203,
"grad_norm": 0.1804819405078888,
"learning_rate": 1.643192488262911e-06,
"loss": 7.683,
"step": 534
},
{
"epoch": 2.509792458345513,
"grad_norm": 0.16809211671352386,
"learning_rate": 1.6275430359937403e-06,
"loss": 8.6418,
"step": 535
},
{
"epoch": 2.514469453376206,
"grad_norm": 0.17984607815742493,
"learning_rate": 1.6118935837245697e-06,
"loss": 7.68,
"step": 536
},
{
"epoch": 2.5191464484068984,
"grad_norm": 0.17649582028388977,
"learning_rate": 1.5962441314553992e-06,
"loss": 8.1753,
"step": 537
},
{
"epoch": 2.5238234434375912,
"grad_norm": 0.16467247903347015,
"learning_rate": 1.5805946791862287e-06,
"loss": 7.6117,
"step": 538
},
{
"epoch": 2.528500438468284,
"grad_norm": 0.17968781292438507,
"learning_rate": 1.5649452269170582e-06,
"loss": 8.549,
"step": 539
},
{
"epoch": 2.533177433498977,
"grad_norm": 0.15423156321048737,
"learning_rate": 1.5492957746478873e-06,
"loss": 8.7104,
"step": 540
},
{
"epoch": 2.53785442852967,
"grad_norm": 0.14077003300189972,
"learning_rate": 1.5336463223787168e-06,
"loss": 8.934,
"step": 541
},
{
"epoch": 2.542531423560362,
"grad_norm": 0.16637051105499268,
"learning_rate": 1.5179968701095463e-06,
"loss": 7.4252,
"step": 542
},
{
"epoch": 2.5472084185910555,
"grad_norm": 0.1724003106355667,
"learning_rate": 1.5023474178403756e-06,
"loss": 7.9955,
"step": 543
},
{
"epoch": 2.551885413621748,
"grad_norm": 0.19609539210796356,
"learning_rate": 1.4866979655712052e-06,
"loss": 8.3348,
"step": 544
},
{
"epoch": 2.5565624086524408,
"grad_norm": 0.12707825005054474,
"learning_rate": 1.4710485133020347e-06,
"loss": 8.0848,
"step": 545
},
{
"epoch": 2.5612394036831336,
"grad_norm": 0.2031966894865036,
"learning_rate": 1.455399061032864e-06,
"loss": 9.8729,
"step": 546
},
{
"epoch": 2.5659163987138265,
"grad_norm": 0.18515604734420776,
"learning_rate": 1.4397496087636933e-06,
"loss": 7.8293,
"step": 547
},
{
"epoch": 2.5705933937445193,
"grad_norm": 0.15621398389339447,
"learning_rate": 1.4241001564945228e-06,
"loss": 9.3478,
"step": 548
},
{
"epoch": 2.5752703887752117,
"grad_norm": 0.22210869193077087,
"learning_rate": 1.4084507042253523e-06,
"loss": 8.1303,
"step": 549
},
{
"epoch": 2.5799473838059046,
"grad_norm": 0.27393949031829834,
"learning_rate": 1.3928012519561818e-06,
"loss": 8.4729,
"step": 550
},
{
"epoch": 2.5846243788365975,
"grad_norm": 0.13042934238910675,
"learning_rate": 1.377151799687011e-06,
"loss": 8.2335,
"step": 551
},
{
"epoch": 2.5893013738672903,
"grad_norm": 0.207389697432518,
"learning_rate": 1.3615023474178405e-06,
"loss": 8.0168,
"step": 552
},
{
"epoch": 2.593978368897983,
"grad_norm": 0.14343053102493286,
"learning_rate": 1.34585289514867e-06,
"loss": 7.9552,
"step": 553
},
{
"epoch": 2.5986553639286756,
"grad_norm": 0.1722148060798645,
"learning_rate": 1.3302034428794993e-06,
"loss": 7.6877,
"step": 554
},
{
"epoch": 2.603332358959369,
"grad_norm": 0.18076814711093903,
"learning_rate": 1.3145539906103288e-06,
"loss": 8.0741,
"step": 555
},
{
"epoch": 2.6080093539900613,
"grad_norm": 0.14633478224277496,
"learning_rate": 1.298904538341158e-06,
"loss": 7.3683,
"step": 556
},
{
"epoch": 2.612686349020754,
"grad_norm": 0.14783795177936554,
"learning_rate": 1.2832550860719876e-06,
"loss": 7.8992,
"step": 557
},
{
"epoch": 2.617363344051447,
"grad_norm": 0.15360093116760254,
"learning_rate": 1.267605633802817e-06,
"loss": 8.8425,
"step": 558
},
{
"epoch": 2.62204033908214,
"grad_norm": 0.1691809445619583,
"learning_rate": 1.2519561815336464e-06,
"loss": 8.669,
"step": 559
},
{
"epoch": 2.6267173341128327,
"grad_norm": 0.16426807641983032,
"learning_rate": 1.2363067292644757e-06,
"loss": 9.1265,
"step": 560
},
{
"epoch": 2.631394329143525,
"grad_norm": 0.1331864446401596,
"learning_rate": 1.2206572769953053e-06,
"loss": 7.4892,
"step": 561
},
{
"epoch": 2.636071324174218,
"grad_norm": 0.1330748349428177,
"learning_rate": 1.2050078247261348e-06,
"loss": 8.6181,
"step": 562
},
{
"epoch": 2.640748319204911,
"grad_norm": 0.14942462742328644,
"learning_rate": 1.189358372456964e-06,
"loss": 7.8301,
"step": 563
},
{
"epoch": 2.6454253142356037,
"grad_norm": 0.16964685916900635,
"learning_rate": 1.1737089201877936e-06,
"loss": 7.1293,
"step": 564
},
{
"epoch": 2.6501023092662965,
"grad_norm": 0.1727379858493805,
"learning_rate": 1.158059467918623e-06,
"loss": 7.1773,
"step": 565
},
{
"epoch": 2.654779304296989,
"grad_norm": 0.14950168132781982,
"learning_rate": 1.1424100156494524e-06,
"loss": 7.5172,
"step": 566
},
{
"epoch": 2.6594562993276822,
"grad_norm": 0.16068300604820251,
"learning_rate": 1.1267605633802817e-06,
"loss": 8.7739,
"step": 567
},
{
"epoch": 2.6641332943583746,
"grad_norm": 0.18006567656993866,
"learning_rate": 1.111111111111111e-06,
"loss": 8.2067,
"step": 568
},
{
"epoch": 2.6688102893890675,
"grad_norm": 0.19861166179180145,
"learning_rate": 1.0954616588419406e-06,
"loss": 7.1208,
"step": 569
},
{
"epoch": 2.6734872844197604,
"grad_norm": 0.13374726474285126,
"learning_rate": 1.07981220657277e-06,
"loss": 7.8631,
"step": 570
},
{
"epoch": 2.678164279450453,
"grad_norm": 0.17814220488071442,
"learning_rate": 1.0641627543035994e-06,
"loss": 7.4765,
"step": 571
},
{
"epoch": 2.682841274481146,
"grad_norm": 0.22474409639835358,
"learning_rate": 1.048513302034429e-06,
"loss": 7.0754,
"step": 572
},
{
"epoch": 2.6875182695118385,
"grad_norm": 0.16655339300632477,
"learning_rate": 1.0328638497652584e-06,
"loss": 7.2505,
"step": 573
},
{
"epoch": 2.6921952645425313,
"grad_norm": 0.172933891415596,
"learning_rate": 1.0172143974960877e-06,
"loss": 8.0832,
"step": 574
},
{
"epoch": 2.696872259573224,
"grad_norm": 0.14097332954406738,
"learning_rate": 1.0015649452269172e-06,
"loss": 8.0197,
"step": 575
},
{
"epoch": 2.701549254603917,
"grad_norm": 0.1363203376531601,
"learning_rate": 9.859154929577465e-07,
"loss": 7.2466,
"step": 576
},
{
"epoch": 2.70622624963461,
"grad_norm": 0.17508287727832794,
"learning_rate": 9.70266040688576e-07,
"loss": 9.3567,
"step": 577
},
{
"epoch": 2.7109032446653023,
"grad_norm": 0.169004425406456,
"learning_rate": 9.546165884194054e-07,
"loss": 8.0132,
"step": 578
},
{
"epoch": 2.715580239695995,
"grad_norm": 0.14103683829307556,
"learning_rate": 9.389671361502348e-07,
"loss": 9.0192,
"step": 579
},
{
"epoch": 2.720257234726688,
"grad_norm": 0.197422057390213,
"learning_rate": 9.233176838810643e-07,
"loss": 7.2779,
"step": 580
},
{
"epoch": 2.724934229757381,
"grad_norm": 0.1950581669807434,
"learning_rate": 9.076682316118937e-07,
"loss": 7.6089,
"step": 581
},
{
"epoch": 2.7296112247880737,
"grad_norm": 0.23691439628601074,
"learning_rate": 8.92018779342723e-07,
"loss": 7.9658,
"step": 582
},
{
"epoch": 2.7342882198187666,
"grad_norm": 0.2558799684047699,
"learning_rate": 8.763693270735525e-07,
"loss": 8.4791,
"step": 583
},
{
"epoch": 2.7389652148494594,
"grad_norm": 0.17010214924812317,
"learning_rate": 8.607198748043818e-07,
"loss": 8.4854,
"step": 584
},
{
"epoch": 2.743642209880152,
"grad_norm": 0.13403132557868958,
"learning_rate": 8.450704225352114e-07,
"loss": 6.7996,
"step": 585
},
{
"epoch": 2.7483192049108447,
"grad_norm": 0.14201347529888153,
"learning_rate": 8.294209702660408e-07,
"loss": 8.3937,
"step": 586
},
{
"epoch": 2.7529961999415375,
"grad_norm": 0.28258565068244934,
"learning_rate": 8.137715179968702e-07,
"loss": 9.3709,
"step": 587
},
{
"epoch": 2.7576731949722304,
"grad_norm": 0.17337313294410706,
"learning_rate": 7.981220657276996e-07,
"loss": 7.6751,
"step": 588
},
{
"epoch": 2.7623501900029233,
"grad_norm": 0.1940070241689682,
"learning_rate": 7.824726134585291e-07,
"loss": 7.3925,
"step": 589
},
{
"epoch": 2.7670271850336157,
"grad_norm": 0.14429809153079987,
"learning_rate": 7.668231611893584e-07,
"loss": 7.4402,
"step": 590
},
{
"epoch": 2.7717041800643085,
"grad_norm": 0.17765949666500092,
"learning_rate": 7.511737089201878e-07,
"loss": 7.6954,
"step": 591
},
{
"epoch": 2.7763811750950014,
"grad_norm": 0.15836399793624878,
"learning_rate": 7.355242566510173e-07,
"loss": 7.6582,
"step": 592
},
{
"epoch": 2.7810581701256942,
"grad_norm": 0.14881688356399536,
"learning_rate": 7.198748043818466e-07,
"loss": 8.7099,
"step": 593
},
{
"epoch": 2.785735165156387,
"grad_norm": 0.21536029875278473,
"learning_rate": 7.042253521126762e-07,
"loss": 8.3015,
"step": 594
},
{
"epoch": 2.79041216018708,
"grad_norm": 0.14025098085403442,
"learning_rate": 6.885758998435055e-07,
"loss": 8.7512,
"step": 595
},
{
"epoch": 2.795089155217773,
"grad_norm": 0.13290052115917206,
"learning_rate": 6.72926447574335e-07,
"loss": 7.5792,
"step": 596
},
{
"epoch": 2.799766150248465,
"grad_norm": 0.3149656057357788,
"learning_rate": 6.572769953051644e-07,
"loss": 8.4982,
"step": 597
},
{
"epoch": 2.804443145279158,
"grad_norm": 0.16543497145175934,
"learning_rate": 6.416275430359938e-07,
"loss": 8.3097,
"step": 598
},
{
"epoch": 2.809120140309851,
"grad_norm": 0.17708784341812134,
"learning_rate": 6.259780907668232e-07,
"loss": 8.8047,
"step": 599
},
{
"epoch": 2.8137971353405438,
"grad_norm": 0.14560888707637787,
"learning_rate": 6.103286384976526e-07,
"loss": 8.0925,
"step": 600
},
{
"epoch": 2.8184741303712366,
"grad_norm": 0.1902446448802948,
"learning_rate": 5.94679186228482e-07,
"loss": 7.7629,
"step": 601
},
{
"epoch": 2.823151125401929,
"grad_norm": 0.1473388820886612,
"learning_rate": 5.790297339593115e-07,
"loss": 8.1077,
"step": 602
},
{
"epoch": 2.827828120432622,
"grad_norm": 0.16258402168750763,
"learning_rate": 5.633802816901409e-07,
"loss": 7.8582,
"step": 603
},
{
"epoch": 2.8325051154633147,
"grad_norm": 0.1769980639219284,
"learning_rate": 5.477308294209703e-07,
"loss": 7.1382,
"step": 604
},
{
"epoch": 2.8371821104940076,
"grad_norm": 0.1444021314382553,
"learning_rate": 5.320813771517997e-07,
"loss": 7.1383,
"step": 605
},
{
"epoch": 2.8418591055247004,
"grad_norm": 0.21616753935813904,
"learning_rate": 5.164319248826292e-07,
"loss": 7.605,
"step": 606
},
{
"epoch": 2.8465361005553933,
"grad_norm": 0.20384635031223297,
"learning_rate": 5.007824726134586e-07,
"loss": 8.8131,
"step": 607
},
{
"epoch": 2.851213095586086,
"grad_norm": 0.1579245626926422,
"learning_rate": 4.85133020344288e-07,
"loss": 7.8475,
"step": 608
},
{
"epoch": 2.8558900906167786,
"grad_norm": 0.20689930021762848,
"learning_rate": 4.694835680751174e-07,
"loss": 7.1765,
"step": 609
},
{
"epoch": 2.8605670856474714,
"grad_norm": 0.1589430868625641,
"learning_rate": 4.5383411580594685e-07,
"loss": 8.8879,
"step": 610
},
{
"epoch": 2.8652440806781643,
"grad_norm": 0.16509409248828888,
"learning_rate": 4.3818466353677626e-07,
"loss": 7.6589,
"step": 611
},
{
"epoch": 2.869921075708857,
"grad_norm": 0.1439896821975708,
"learning_rate": 4.225352112676057e-07,
"loss": 8.0296,
"step": 612
},
{
"epoch": 2.87459807073955,
"grad_norm": 0.19501394033432007,
"learning_rate": 4.068857589984351e-07,
"loss": 8.5349,
"step": 613
},
{
"epoch": 2.8792750657702424,
"grad_norm": 0.18828211724758148,
"learning_rate": 3.9123630672926455e-07,
"loss": 7.3593,
"step": 614
},
{
"epoch": 2.8839520608009352,
"grad_norm": 0.15072734653949738,
"learning_rate": 3.755868544600939e-07,
"loss": 8.2512,
"step": 615
},
{
"epoch": 2.888629055831628,
"grad_norm": 0.1598856896162033,
"learning_rate": 3.599374021909233e-07,
"loss": 7.3246,
"step": 616
},
{
"epoch": 2.893306050862321,
"grad_norm": 0.15382905304431915,
"learning_rate": 3.4428794992175273e-07,
"loss": 7.674,
"step": 617
},
{
"epoch": 2.897983045893014,
"grad_norm": 0.13851745426654816,
"learning_rate": 3.286384976525822e-07,
"loss": 7.6664,
"step": 618
},
{
"epoch": 2.9026600409237067,
"grad_norm": 0.12572415173053741,
"learning_rate": 3.129890453834116e-07,
"loss": 8.6634,
"step": 619
},
{
"epoch": 2.9073370359543995,
"grad_norm": 0.181121364235878,
"learning_rate": 2.97339593114241e-07,
"loss": 7.5301,
"step": 620
},
{
"epoch": 2.912014030985092,
"grad_norm": 0.18877944350242615,
"learning_rate": 2.8169014084507043e-07,
"loss": 7.4353,
"step": 621
},
{
"epoch": 2.916691026015785,
"grad_norm": 0.1800297349691391,
"learning_rate": 2.6604068857589984e-07,
"loss": 8.0261,
"step": 622
},
{
"epoch": 2.9213680210464776,
"grad_norm": 0.1459706425666809,
"learning_rate": 2.503912363067293e-07,
"loss": 8.5898,
"step": 623
},
{
"epoch": 2.9260450160771705,
"grad_norm": 0.19272330403327942,
"learning_rate": 2.347417840375587e-07,
"loss": 7.5655,
"step": 624
},
{
"epoch": 2.9307220111078633,
"grad_norm": 0.13995127379894257,
"learning_rate": 2.1909233176838813e-07,
"loss": 8.807,
"step": 625
},
{
"epoch": 2.9353990061385558,
"grad_norm": 0.19578878581523895,
"learning_rate": 2.0344287949921754e-07,
"loss": 8.2283,
"step": 626
},
{
"epoch": 2.9400760011692486,
"grad_norm": 0.18744409084320068,
"learning_rate": 1.8779342723004696e-07,
"loss": 7.8586,
"step": 627
},
{
"epoch": 2.9447529961999415,
"grad_norm": 0.18906202912330627,
"learning_rate": 1.7214397496087637e-07,
"loss": 8.9175,
"step": 628
},
{
"epoch": 2.9494299912306343,
"grad_norm": 0.2817856967449188,
"learning_rate": 1.564945226917058e-07,
"loss": 7.8464,
"step": 629
},
{
"epoch": 2.954106986261327,
"grad_norm": 0.1482636034488678,
"learning_rate": 1.4084507042253522e-07,
"loss": 8.0478,
"step": 630
},
{
"epoch": 2.95878398129202,
"grad_norm": 0.1729574054479599,
"learning_rate": 1.2519561815336465e-07,
"loss": 8.3514,
"step": 631
},
{
"epoch": 2.963460976322713,
"grad_norm": 0.23052264750003815,
"learning_rate": 1.0954616588419407e-07,
"loss": 8.397,
"step": 632
},
{
"epoch": 2.9681379713534053,
"grad_norm": 0.16747911274433136,
"learning_rate": 9.389671361502348e-08,
"loss": 8.0443,
"step": 633
},
{
"epoch": 2.972814966384098,
"grad_norm": 0.14860796928405762,
"learning_rate": 7.82472613458529e-08,
"loss": 7.317,
"step": 634
},
{
"epoch": 2.977491961414791,
"grad_norm": 0.13674141466617584,
"learning_rate": 6.259780907668233e-08,
"loss": 7.6038,
"step": 635
},
{
"epoch": 2.982168956445484,
"grad_norm": 0.163039892911911,
"learning_rate": 4.694835680751174e-08,
"loss": 7.4459,
"step": 636
},
{
"epoch": 2.9868459514761767,
"grad_norm": 0.1598978042602539,
"learning_rate": 3.1298904538341164e-08,
"loss": 9.9334,
"step": 637
},
{
"epoch": 2.991522946506869,
"grad_norm": 0.16937094926834106,
"learning_rate": 1.5649452269170582e-08,
"loss": 7.9632,
"step": 638
},
{
"epoch": 2.996199941537562,
"grad_norm": 0.11614558100700378,
"learning_rate": 0.0,
"loss": 7.3632,
"step": 639
},
{
"epoch": 2.996199941537562,
"step": 639,
"total_flos": 2.8450474856619704e+18,
"train_loss": 9.811768141524146,
"train_runtime": 60574.6064,
"train_samples_per_second": 1.355,
"train_steps_per_second": 0.011
}
],
"logging_steps": 1.0,
"max_steps": 639,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.8450474856619704e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}