mo13_sft_td_baseline_seed0 / trainer_state.json
jprivera44's picture
upload baseline_seed0/checkpoint-938 (full checkpoint incl. optimizer state)
eb46daf verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 938,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005330490405117271,
"grad_norm": 0.3734353482723236,
"learning_rate": 2e-05,
"loss": 1.7104,
"step": 5
},
{
"epoch": 0.010660980810234541,
"grad_norm": 0.2635837197303772,
"learning_rate": 2e-05,
"loss": 1.6538,
"step": 10
},
{
"epoch": 0.015991471215351813,
"grad_norm": 0.19964125752449036,
"learning_rate": 2e-05,
"loss": 1.4818,
"step": 15
},
{
"epoch": 0.021321961620469083,
"grad_norm": 0.2099446952342987,
"learning_rate": 2e-05,
"loss": 1.3886,
"step": 20
},
{
"epoch": 0.026652452025586353,
"grad_norm": 0.34373781085014343,
"learning_rate": 2e-05,
"loss": 1.3364,
"step": 25
},
{
"epoch": 0.031982942430703626,
"grad_norm": 0.18734169006347656,
"learning_rate": 2e-05,
"loss": 1.2899,
"step": 30
},
{
"epoch": 0.03731343283582089,
"grad_norm": 0.16565224528312683,
"learning_rate": 2e-05,
"loss": 1.2788,
"step": 35
},
{
"epoch": 0.042643923240938165,
"grad_norm": 0.20943662524223328,
"learning_rate": 2e-05,
"loss": 1.2335,
"step": 40
},
{
"epoch": 0.04797441364605544,
"grad_norm": 0.3209005892276764,
"learning_rate": 2e-05,
"loss": 1.1854,
"step": 45
},
{
"epoch": 0.053304904051172705,
"grad_norm": 0.3170112669467926,
"learning_rate": 2e-05,
"loss": 1.0846,
"step": 50
},
{
"epoch": 0.05863539445628998,
"grad_norm": 0.19068752229213715,
"learning_rate": 2e-05,
"loss": 1.0976,
"step": 55
},
{
"epoch": 0.06396588486140725,
"grad_norm": 0.19504626095294952,
"learning_rate": 2e-05,
"loss": 1.0597,
"step": 60
},
{
"epoch": 0.06929637526652452,
"grad_norm": 0.2150755077600479,
"learning_rate": 2e-05,
"loss": 1.0369,
"step": 65
},
{
"epoch": 0.07462686567164178,
"grad_norm": 0.3036380410194397,
"learning_rate": 2e-05,
"loss": 0.992,
"step": 70
},
{
"epoch": 0.07995735607675906,
"grad_norm": 0.3687366545200348,
"learning_rate": 2e-05,
"loss": 0.9768,
"step": 75
},
{
"epoch": 0.08528784648187633,
"grad_norm": 0.20827943086624146,
"learning_rate": 2e-05,
"loss": 1.0187,
"step": 80
},
{
"epoch": 0.0906183368869936,
"grad_norm": 0.19415774941444397,
"learning_rate": 2e-05,
"loss": 0.9108,
"step": 85
},
{
"epoch": 0.09594882729211088,
"grad_norm": 0.23162966966629028,
"learning_rate": 2e-05,
"loss": 0.9096,
"step": 90
},
{
"epoch": 0.10127931769722814,
"grad_norm": 0.30539438128471375,
"learning_rate": 2e-05,
"loss": 0.9119,
"step": 95
},
{
"epoch": 0.10660980810234541,
"grad_norm": 0.3614450693130493,
"learning_rate": 2e-05,
"loss": 0.9107,
"step": 100
},
{
"epoch": 0.11194029850746269,
"grad_norm": 0.20569603145122528,
"learning_rate": 2e-05,
"loss": 0.9069,
"step": 105
},
{
"epoch": 0.11727078891257996,
"grad_norm": 0.1889234036207199,
"learning_rate": 2e-05,
"loss": 0.867,
"step": 110
},
{
"epoch": 0.12260127931769722,
"grad_norm": 0.26535317301750183,
"learning_rate": 2e-05,
"loss": 0.9013,
"step": 115
},
{
"epoch": 0.1279317697228145,
"grad_norm": 0.32258930802345276,
"learning_rate": 2e-05,
"loss": 0.8905,
"step": 120
},
{
"epoch": 0.13326226012793177,
"grad_norm": 0.3419113755226135,
"learning_rate": 2e-05,
"loss": 0.8779,
"step": 125
},
{
"epoch": 0.13859275053304904,
"grad_norm": 0.23869968950748444,
"learning_rate": 2e-05,
"loss": 0.894,
"step": 130
},
{
"epoch": 0.1439232409381663,
"grad_norm": 0.2865942418575287,
"learning_rate": 2e-05,
"loss": 0.8571,
"step": 135
},
{
"epoch": 0.14925373134328357,
"grad_norm": 0.2303633689880371,
"learning_rate": 2e-05,
"loss": 0.8787,
"step": 140
},
{
"epoch": 0.15458422174840086,
"grad_norm": 0.3257729113101959,
"learning_rate": 2e-05,
"loss": 0.8533,
"step": 145
},
{
"epoch": 0.15991471215351813,
"grad_norm": 0.34180137515068054,
"learning_rate": 2e-05,
"loss": 0.8456,
"step": 150
},
{
"epoch": 0.1652452025586354,
"grad_norm": 0.22115589678287506,
"learning_rate": 2e-05,
"loss": 0.8752,
"step": 155
},
{
"epoch": 0.17057569296375266,
"grad_norm": 0.2092457115650177,
"learning_rate": 2e-05,
"loss": 0.8318,
"step": 160
},
{
"epoch": 0.17590618336886993,
"grad_norm": 0.24170993268489838,
"learning_rate": 2e-05,
"loss": 0.8441,
"step": 165
},
{
"epoch": 0.1812366737739872,
"grad_norm": 0.3601376414299011,
"learning_rate": 2e-05,
"loss": 0.8457,
"step": 170
},
{
"epoch": 0.1865671641791045,
"grad_norm": 0.4388696849346161,
"learning_rate": 2e-05,
"loss": 0.8473,
"step": 175
},
{
"epoch": 0.19189765458422176,
"grad_norm": 0.25066882371902466,
"learning_rate": 2e-05,
"loss": 0.8794,
"step": 180
},
{
"epoch": 0.19722814498933902,
"grad_norm": 0.2521555721759796,
"learning_rate": 2e-05,
"loss": 0.8257,
"step": 185
},
{
"epoch": 0.2025586353944563,
"grad_norm": 0.24965791404247284,
"learning_rate": 2e-05,
"loss": 0.836,
"step": 190
},
{
"epoch": 0.20788912579957355,
"grad_norm": 0.3060144782066345,
"learning_rate": 2e-05,
"loss": 0.8342,
"step": 195
},
{
"epoch": 0.21321961620469082,
"grad_norm": 0.29012420773506165,
"learning_rate": 2e-05,
"loss": 0.8282,
"step": 200
},
{
"epoch": 0.21855010660980811,
"grad_norm": 0.2453121393918991,
"learning_rate": 2e-05,
"loss": 0.8353,
"step": 205
},
{
"epoch": 0.22388059701492538,
"grad_norm": 0.23359909653663635,
"learning_rate": 2e-05,
"loss": 0.8301,
"step": 210
},
{
"epoch": 0.22921108742004265,
"grad_norm": 0.2844357192516327,
"learning_rate": 2e-05,
"loss": 0.8246,
"step": 215
},
{
"epoch": 0.2345415778251599,
"grad_norm": 0.33136624097824097,
"learning_rate": 2e-05,
"loss": 0.8207,
"step": 220
},
{
"epoch": 0.23987206823027718,
"grad_norm": 0.3795192837715149,
"learning_rate": 2e-05,
"loss": 0.827,
"step": 225
},
{
"epoch": 0.24520255863539445,
"grad_norm": 0.2862633168697357,
"learning_rate": 2e-05,
"loss": 0.813,
"step": 230
},
{
"epoch": 0.2505330490405117,
"grad_norm": 0.2595326602458954,
"learning_rate": 2e-05,
"loss": 0.7736,
"step": 235
},
{
"epoch": 0.255863539445629,
"grad_norm": 0.28593310713768005,
"learning_rate": 2e-05,
"loss": 0.7976,
"step": 240
},
{
"epoch": 0.26119402985074625,
"grad_norm": 0.3470572531223297,
"learning_rate": 2e-05,
"loss": 0.8094,
"step": 245
},
{
"epoch": 0.26652452025586354,
"grad_norm": 0.35488802194595337,
"learning_rate": 2e-05,
"loss": 0.8174,
"step": 250
},
{
"epoch": 0.27185501066098083,
"grad_norm": 0.28033536672592163,
"learning_rate": 2e-05,
"loss": 0.8377,
"step": 255
},
{
"epoch": 0.2771855010660981,
"grad_norm": 0.23253943026065826,
"learning_rate": 2e-05,
"loss": 0.7725,
"step": 260
},
{
"epoch": 0.28251599147121537,
"grad_norm": 0.2662801146507263,
"learning_rate": 2e-05,
"loss": 0.779,
"step": 265
},
{
"epoch": 0.2878464818763326,
"grad_norm": 0.33942538499832153,
"learning_rate": 2e-05,
"loss": 0.8234,
"step": 270
},
{
"epoch": 0.2931769722814499,
"grad_norm": 0.34840917587280273,
"learning_rate": 2e-05,
"loss": 0.8095,
"step": 275
},
{
"epoch": 0.29850746268656714,
"grad_norm": 0.28402140736579895,
"learning_rate": 2e-05,
"loss": 0.8321,
"step": 280
},
{
"epoch": 0.30383795309168443,
"grad_norm": 0.26083698868751526,
"learning_rate": 2e-05,
"loss": 0.761,
"step": 285
},
{
"epoch": 0.3091684434968017,
"grad_norm": 0.25820574164390564,
"learning_rate": 2e-05,
"loss": 0.8052,
"step": 290
},
{
"epoch": 0.31449893390191896,
"grad_norm": 0.3496425449848175,
"learning_rate": 2e-05,
"loss": 0.7961,
"step": 295
},
{
"epoch": 0.31982942430703626,
"grad_norm": 0.36911600828170776,
"learning_rate": 2e-05,
"loss": 0.8297,
"step": 300
},
{
"epoch": 0.3251599147121535,
"grad_norm": 0.2791857421398163,
"learning_rate": 2e-05,
"loss": 0.8117,
"step": 305
},
{
"epoch": 0.3304904051172708,
"grad_norm": 0.2672363817691803,
"learning_rate": 2e-05,
"loss": 0.7098,
"step": 310
},
{
"epoch": 0.3358208955223881,
"grad_norm": 0.2714046239852905,
"learning_rate": 2e-05,
"loss": 0.7758,
"step": 315
},
{
"epoch": 0.3411513859275053,
"grad_norm": 0.3329773247241974,
"learning_rate": 2e-05,
"loss": 0.7929,
"step": 320
},
{
"epoch": 0.3464818763326226,
"grad_norm": 0.3871755599975586,
"learning_rate": 2e-05,
"loss": 0.8303,
"step": 325
},
{
"epoch": 0.35181236673773986,
"grad_norm": 0.2991976737976074,
"learning_rate": 2e-05,
"loss": 0.8265,
"step": 330
},
{
"epoch": 0.35714285714285715,
"grad_norm": 0.2702910304069519,
"learning_rate": 2e-05,
"loss": 0.7564,
"step": 335
},
{
"epoch": 0.3624733475479744,
"grad_norm": 0.24136558175086975,
"learning_rate": 2e-05,
"loss": 0.7688,
"step": 340
},
{
"epoch": 0.3678038379530917,
"grad_norm": 0.3107840418815613,
"learning_rate": 2e-05,
"loss": 0.7912,
"step": 345
},
{
"epoch": 0.373134328358209,
"grad_norm": 0.3215864598751068,
"learning_rate": 2e-05,
"loss": 0.8324,
"step": 350
},
{
"epoch": 0.3784648187633262,
"grad_norm": 0.31696921586990356,
"learning_rate": 2e-05,
"loss": 0.8032,
"step": 355
},
{
"epoch": 0.3837953091684435,
"grad_norm": 0.2812045216560364,
"learning_rate": 2e-05,
"loss": 0.7391,
"step": 360
},
{
"epoch": 0.38912579957356075,
"grad_norm": 0.2580372393131256,
"learning_rate": 2e-05,
"loss": 0.7912,
"step": 365
},
{
"epoch": 0.39445628997867804,
"grad_norm": 0.32673174142837524,
"learning_rate": 2e-05,
"loss": 0.802,
"step": 370
},
{
"epoch": 0.3997867803837953,
"grad_norm": 0.36080583930015564,
"learning_rate": 2e-05,
"loss": 0.8197,
"step": 375
},
{
"epoch": 0.4051172707889126,
"grad_norm": 0.32174256443977356,
"learning_rate": 2e-05,
"loss": 0.7906,
"step": 380
},
{
"epoch": 0.41044776119402987,
"grad_norm": 0.288492888212204,
"learning_rate": 2e-05,
"loss": 0.7471,
"step": 385
},
{
"epoch": 0.4157782515991471,
"grad_norm": 0.2705792188644409,
"learning_rate": 2e-05,
"loss": 0.7552,
"step": 390
},
{
"epoch": 0.4211087420042644,
"grad_norm": 0.35100996494293213,
"learning_rate": 2e-05,
"loss": 0.8007,
"step": 395
},
{
"epoch": 0.42643923240938164,
"grad_norm": 0.3444612920284271,
"learning_rate": 2e-05,
"loss": 0.8095,
"step": 400
},
{
"epoch": 0.43176972281449894,
"grad_norm": 0.3082815706729889,
"learning_rate": 2e-05,
"loss": 0.7817,
"step": 405
},
{
"epoch": 0.43710021321961623,
"grad_norm": 0.27543190121650696,
"learning_rate": 2e-05,
"loss": 0.7575,
"step": 410
},
{
"epoch": 0.44243070362473347,
"grad_norm": 0.26944637298583984,
"learning_rate": 2e-05,
"loss": 0.7727,
"step": 415
},
{
"epoch": 0.44776119402985076,
"grad_norm": 0.3868255019187927,
"learning_rate": 2e-05,
"loss": 0.779,
"step": 420
},
{
"epoch": 0.453091684434968,
"grad_norm": 0.3912637233734131,
"learning_rate": 2e-05,
"loss": 0.7755,
"step": 425
},
{
"epoch": 0.4584221748400853,
"grad_norm": 0.29420873522758484,
"learning_rate": 2e-05,
"loss": 0.7715,
"step": 430
},
{
"epoch": 0.46375266524520253,
"grad_norm": 0.2589830160140991,
"learning_rate": 2e-05,
"loss": 0.7388,
"step": 435
},
{
"epoch": 0.4690831556503198,
"grad_norm": 0.2840547561645508,
"learning_rate": 2e-05,
"loss": 0.748,
"step": 440
},
{
"epoch": 0.4744136460554371,
"grad_norm": 0.30651238560676575,
"learning_rate": 2e-05,
"loss": 0.7478,
"step": 445
},
{
"epoch": 0.47974413646055436,
"grad_norm": 0.36977389454841614,
"learning_rate": 2e-05,
"loss": 0.8058,
"step": 450
},
{
"epoch": 0.48507462686567165,
"grad_norm": 0.28890499472618103,
"learning_rate": 2e-05,
"loss": 0.7683,
"step": 455
},
{
"epoch": 0.4904051172707889,
"grad_norm": 0.2896074652671814,
"learning_rate": 2e-05,
"loss": 0.7317,
"step": 460
},
{
"epoch": 0.4957356076759062,
"grad_norm": 0.3064037263393402,
"learning_rate": 2e-05,
"loss": 0.7531,
"step": 465
},
{
"epoch": 0.5010660980810234,
"grad_norm": 0.41676151752471924,
"learning_rate": 2e-05,
"loss": 0.7766,
"step": 470
},
{
"epoch": 0.5063965884861408,
"grad_norm": 0.3909554183483124,
"learning_rate": 2e-05,
"loss": 0.7852,
"step": 475
},
{
"epoch": 0.511727078891258,
"grad_norm": 0.3161725103855133,
"learning_rate": 2e-05,
"loss": 0.7755,
"step": 480
},
{
"epoch": 0.5170575692963753,
"grad_norm": 0.2737087607383728,
"learning_rate": 2e-05,
"loss": 0.7484,
"step": 485
},
{
"epoch": 0.5223880597014925,
"grad_norm": 0.3294726610183716,
"learning_rate": 2e-05,
"loss": 0.7421,
"step": 490
},
{
"epoch": 0.5277185501066098,
"grad_norm": 0.38085660338401794,
"learning_rate": 2e-05,
"loss": 0.8126,
"step": 495
},
{
"epoch": 0.5330490405117271,
"grad_norm": 0.3341747522354126,
"learning_rate": 2e-05,
"loss": 0.7578,
"step": 500
},
{
"epoch": 0.5383795309168443,
"grad_norm": 0.34995269775390625,
"learning_rate": 2e-05,
"loss": 0.7874,
"step": 505
},
{
"epoch": 0.5437100213219617,
"grad_norm": 0.28949394822120667,
"learning_rate": 2e-05,
"loss": 0.7031,
"step": 510
},
{
"epoch": 0.5490405117270789,
"grad_norm": 0.5440807342529297,
"learning_rate": 2e-05,
"loss": 0.7623,
"step": 515
},
{
"epoch": 0.5543710021321961,
"grad_norm": 0.3526187837123871,
"learning_rate": 2e-05,
"loss": 0.7682,
"step": 520
},
{
"epoch": 0.5597014925373134,
"grad_norm": 0.4043067991733551,
"learning_rate": 2e-05,
"loss": 0.7551,
"step": 525
},
{
"epoch": 0.5650319829424307,
"grad_norm": 0.30657297372817993,
"learning_rate": 2e-05,
"loss": 0.7768,
"step": 530
},
{
"epoch": 0.570362473347548,
"grad_norm": 0.27761197090148926,
"learning_rate": 2e-05,
"loss": 0.7453,
"step": 535
},
{
"epoch": 0.5756929637526652,
"grad_norm": 0.30321934819221497,
"learning_rate": 2e-05,
"loss": 0.7569,
"step": 540
},
{
"epoch": 0.5810234541577826,
"grad_norm": 0.3802403509616852,
"learning_rate": 2e-05,
"loss": 0.7485,
"step": 545
},
{
"epoch": 0.5863539445628998,
"grad_norm": 0.4318316876888275,
"learning_rate": 2e-05,
"loss": 0.7839,
"step": 550
},
{
"epoch": 0.591684434968017,
"grad_norm": 0.31432363390922546,
"learning_rate": 2e-05,
"loss": 0.7571,
"step": 555
},
{
"epoch": 0.5970149253731343,
"grad_norm": 0.2873448431491852,
"learning_rate": 2e-05,
"loss": 0.721,
"step": 560
},
{
"epoch": 0.6023454157782516,
"grad_norm": 0.3069778084754944,
"learning_rate": 2e-05,
"loss": 0.7283,
"step": 565
},
{
"epoch": 0.6076759061833689,
"grad_norm": 0.35373619198799133,
"learning_rate": 2e-05,
"loss": 0.7622,
"step": 570
},
{
"epoch": 0.6130063965884861,
"grad_norm": 0.35318872332572937,
"learning_rate": 2e-05,
"loss": 0.7821,
"step": 575
},
{
"epoch": 0.6183368869936035,
"grad_norm": 0.30835840106010437,
"learning_rate": 2e-05,
"loss": 0.7511,
"step": 580
},
{
"epoch": 0.6236673773987207,
"grad_norm": 0.27320486307144165,
"learning_rate": 2e-05,
"loss": 0.7308,
"step": 585
},
{
"epoch": 0.6289978678038379,
"grad_norm": 0.3529856503009796,
"learning_rate": 2e-05,
"loss": 0.7652,
"step": 590
},
{
"epoch": 0.6343283582089553,
"grad_norm": 0.33610275387763977,
"learning_rate": 2e-05,
"loss": 0.741,
"step": 595
},
{
"epoch": 0.6396588486140725,
"grad_norm": 0.3909617066383362,
"learning_rate": 2e-05,
"loss": 0.7445,
"step": 600
},
{
"epoch": 0.6449893390191898,
"grad_norm": 0.3135911226272583,
"learning_rate": 2e-05,
"loss": 0.7638,
"step": 605
},
{
"epoch": 0.650319829424307,
"grad_norm": 0.2903372347354889,
"learning_rate": 2e-05,
"loss": 0.7371,
"step": 610
},
{
"epoch": 0.6556503198294243,
"grad_norm": 0.3075706958770752,
"learning_rate": 2e-05,
"loss": 0.7528,
"step": 615
},
{
"epoch": 0.6609808102345416,
"grad_norm": 0.3804391622543335,
"learning_rate": 2e-05,
"loss": 0.7421,
"step": 620
},
{
"epoch": 0.6663113006396588,
"grad_norm": 0.3764164447784424,
"learning_rate": 2e-05,
"loss": 0.7793,
"step": 625
},
{
"epoch": 0.6716417910447762,
"grad_norm": 0.3079053461551666,
"learning_rate": 2e-05,
"loss": 0.7762,
"step": 630
},
{
"epoch": 0.6769722814498934,
"grad_norm": 0.2808702886104584,
"learning_rate": 2e-05,
"loss": 0.7294,
"step": 635
},
{
"epoch": 0.6823027718550106,
"grad_norm": 0.3023492395877838,
"learning_rate": 2e-05,
"loss": 0.7608,
"step": 640
},
{
"epoch": 0.6876332622601279,
"grad_norm": 0.3514968752861023,
"learning_rate": 2e-05,
"loss": 0.7506,
"step": 645
},
{
"epoch": 0.6929637526652452,
"grad_norm": 0.3713417947292328,
"learning_rate": 2e-05,
"loss": 0.7403,
"step": 650
},
{
"epoch": 0.6982942430703625,
"grad_norm": 0.30928298830986023,
"learning_rate": 2e-05,
"loss": 0.7598,
"step": 655
},
{
"epoch": 0.7036247334754797,
"grad_norm": 0.30625951290130615,
"learning_rate": 2e-05,
"loss": 0.7462,
"step": 660
},
{
"epoch": 0.7089552238805971,
"grad_norm": 0.3103967010974884,
"learning_rate": 2e-05,
"loss": 0.7442,
"step": 665
},
{
"epoch": 0.7142857142857143,
"grad_norm": 0.3370122015476227,
"learning_rate": 2e-05,
"loss": 0.7794,
"step": 670
},
{
"epoch": 0.7196162046908315,
"grad_norm": 0.39418521523475647,
"learning_rate": 2e-05,
"loss": 0.7613,
"step": 675
},
{
"epoch": 0.7249466950959488,
"grad_norm": 0.3453653156757355,
"learning_rate": 2e-05,
"loss": 0.7648,
"step": 680
},
{
"epoch": 0.7302771855010661,
"grad_norm": 0.33969759941101074,
"learning_rate": 2e-05,
"loss": 0.6901,
"step": 685
},
{
"epoch": 0.7356076759061834,
"grad_norm": 0.29864346981048584,
"learning_rate": 2e-05,
"loss": 0.735,
"step": 690
},
{
"epoch": 0.7409381663113006,
"grad_norm": 0.417368620634079,
"learning_rate": 2e-05,
"loss": 0.7609,
"step": 695
},
{
"epoch": 0.746268656716418,
"grad_norm": 0.3895966708660126,
"learning_rate": 2e-05,
"loss": 0.773,
"step": 700
},
{
"epoch": 0.7515991471215352,
"grad_norm": 0.33772552013397217,
"learning_rate": 2e-05,
"loss": 0.746,
"step": 705
},
{
"epoch": 0.7569296375266524,
"grad_norm": 0.29536184668540955,
"learning_rate": 2e-05,
"loss": 0.7263,
"step": 710
},
{
"epoch": 0.7622601279317697,
"grad_norm": 0.2753921449184418,
"learning_rate": 2e-05,
"loss": 0.7659,
"step": 715
},
{
"epoch": 0.767590618336887,
"grad_norm": 0.34762871265411377,
"learning_rate": 2e-05,
"loss": 0.7584,
"step": 720
},
{
"epoch": 0.7729211087420043,
"grad_norm": 0.3880026340484619,
"learning_rate": 2e-05,
"loss": 0.7844,
"step": 725
},
{
"epoch": 0.7782515991471215,
"grad_norm": 0.3299189507961273,
"learning_rate": 2e-05,
"loss": 0.7381,
"step": 730
},
{
"epoch": 0.7835820895522388,
"grad_norm": 0.30803337693214417,
"learning_rate": 2e-05,
"loss": 0.7058,
"step": 735
},
{
"epoch": 0.7889125799573561,
"grad_norm": 0.3036564290523529,
"learning_rate": 2e-05,
"loss": 0.7196,
"step": 740
},
{
"epoch": 0.7942430703624733,
"grad_norm": 0.40265294909477234,
"learning_rate": 2e-05,
"loss": 0.7381,
"step": 745
},
{
"epoch": 0.7995735607675906,
"grad_norm": 0.4096594452857971,
"learning_rate": 2e-05,
"loss": 0.7541,
"step": 750
},
{
"epoch": 0.8049040511727079,
"grad_norm": 0.36740148067474365,
"learning_rate": 2e-05,
"loss": 0.7608,
"step": 755
},
{
"epoch": 0.8102345415778252,
"grad_norm": 0.32650887966156006,
"learning_rate": 2e-05,
"loss": 0.7351,
"step": 760
},
{
"epoch": 0.8155650319829424,
"grad_norm": 0.3586374521255493,
"learning_rate": 2e-05,
"loss": 0.7604,
"step": 765
},
{
"epoch": 0.8208955223880597,
"grad_norm": 0.3661261796951294,
"learning_rate": 2e-05,
"loss": 0.7452,
"step": 770
},
{
"epoch": 0.826226012793177,
"grad_norm": 0.35194748640060425,
"learning_rate": 2e-05,
"loss": 0.7327,
"step": 775
},
{
"epoch": 0.8315565031982942,
"grad_norm": 0.3226638734340668,
"learning_rate": 2e-05,
"loss": 0.758,
"step": 780
},
{
"epoch": 0.8368869936034116,
"grad_norm": 0.3084862530231476,
"learning_rate": 2e-05,
"loss": 0.7265,
"step": 785
},
{
"epoch": 0.8422174840085288,
"grad_norm": 0.3141063451766968,
"learning_rate": 2e-05,
"loss": 0.7373,
"step": 790
},
{
"epoch": 0.847547974413646,
"grad_norm": 0.48754021525382996,
"learning_rate": 2e-05,
"loss": 0.7344,
"step": 795
},
{
"epoch": 0.8528784648187633,
"grad_norm": 0.36847469210624695,
"learning_rate": 2e-05,
"loss": 0.742,
"step": 800
},
{
"epoch": 0.8582089552238806,
"grad_norm": 0.31983086466789246,
"learning_rate": 2e-05,
"loss": 0.7499,
"step": 805
},
{
"epoch": 0.8635394456289979,
"grad_norm": 0.2929735481739044,
"learning_rate": 2e-05,
"loss": 0.7025,
"step": 810
},
{
"epoch": 0.8688699360341151,
"grad_norm": 0.33163875341415405,
"learning_rate": 2e-05,
"loss": 0.7158,
"step": 815
},
{
"epoch": 0.8742004264392325,
"grad_norm": 0.3640444874763489,
"learning_rate": 2e-05,
"loss": 0.7089,
"step": 820
},
{
"epoch": 0.8795309168443497,
"grad_norm": 0.37700313329696655,
"learning_rate": 2e-05,
"loss": 0.7389,
"step": 825
},
{
"epoch": 0.8848614072494669,
"grad_norm": 0.3186069130897522,
"learning_rate": 2e-05,
"loss": 0.721,
"step": 830
},
{
"epoch": 0.8901918976545842,
"grad_norm": 0.30727624893188477,
"learning_rate": 2e-05,
"loss": 0.719,
"step": 835
},
{
"epoch": 0.8955223880597015,
"grad_norm": 0.3146567642688751,
"learning_rate": 2e-05,
"loss": 0.7147,
"step": 840
},
{
"epoch": 0.9008528784648188,
"grad_norm": 0.3437662720680237,
"learning_rate": 2e-05,
"loss": 0.7312,
"step": 845
},
{
"epoch": 0.906183368869936,
"grad_norm": 0.3754054009914398,
"learning_rate": 2e-05,
"loss": 0.743,
"step": 850
},
{
"epoch": 0.9115138592750534,
"grad_norm": 0.30614492297172546,
"learning_rate": 2e-05,
"loss": 0.7415,
"step": 855
},
{
"epoch": 0.9168443496801706,
"grad_norm": 0.293458491563797,
"learning_rate": 2e-05,
"loss": 0.7,
"step": 860
},
{
"epoch": 0.9221748400852878,
"grad_norm": 0.2879101037979126,
"learning_rate": 2e-05,
"loss": 0.7572,
"step": 865
},
{
"epoch": 0.9275053304904051,
"grad_norm": 0.30240631103515625,
"learning_rate": 2e-05,
"loss": 0.736,
"step": 870
},
{
"epoch": 0.9328358208955224,
"grad_norm": 0.40850868821144104,
"learning_rate": 2e-05,
"loss": 0.744,
"step": 875
},
{
"epoch": 0.9381663113006397,
"grad_norm": 0.3220716416835785,
"learning_rate": 2e-05,
"loss": 0.7444,
"step": 880
},
{
"epoch": 0.9434968017057569,
"grad_norm": 0.3097483813762665,
"learning_rate": 2e-05,
"loss": 0.7366,
"step": 885
},
{
"epoch": 0.9488272921108742,
"grad_norm": 0.3266868591308594,
"learning_rate": 2e-05,
"loss": 0.7318,
"step": 890
},
{
"epoch": 0.9541577825159915,
"grad_norm": 0.33244094252586365,
"learning_rate": 2e-05,
"loss": 0.7311,
"step": 895
},
{
"epoch": 0.9594882729211087,
"grad_norm": 0.37434569001197815,
"learning_rate": 2e-05,
"loss": 0.7468,
"step": 900
},
{
"epoch": 0.964818763326226,
"grad_norm": 0.33542153239250183,
"learning_rate": 2e-05,
"loss": 0.7464,
"step": 905
},
{
"epoch": 0.9701492537313433,
"grad_norm": 0.3000001907348633,
"learning_rate": 2e-05,
"loss": 0.6872,
"step": 910
},
{
"epoch": 0.9754797441364605,
"grad_norm": 0.3260563313961029,
"learning_rate": 2e-05,
"loss": 0.7265,
"step": 915
},
{
"epoch": 0.9808102345415778,
"grad_norm": 0.33576592803001404,
"learning_rate": 2e-05,
"loss": 0.7254,
"step": 920
},
{
"epoch": 0.9861407249466951,
"grad_norm": 0.41705191135406494,
"learning_rate": 2e-05,
"loss": 0.734,
"step": 925
},
{
"epoch": 0.9914712153518124,
"grad_norm": 0.3626074194908142,
"learning_rate": 2e-05,
"loss": 0.749,
"step": 930
},
{
"epoch": 0.9968017057569296,
"grad_norm": 0.3002621531486511,
"learning_rate": 2e-05,
"loss": 0.7157,
"step": 935
}
],
"logging_steps": 5,
"max_steps": 938,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 99999,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.798520507990016e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}