Safetensors
OGPSA / qwen_OGPSA /sft /trainer_state.json
long2333's picture
Upload 33 files
7d0cb16 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 237,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.06389776357827476,
"grad_norm": 17.501014709472656,
"learning_rate": 9.992973140107996e-07,
"loss": 1.3507,
"num_input_tokens_seen": 93984,
"step": 5,
"train_runtime": 556.6226,
"train_tokens_per_second": 168.847
},
{
"epoch": 0.12779552715654952,
"grad_norm": 11.144261360168457,
"learning_rate": 9.964460368509865e-07,
"loss": 1.0678,
"num_input_tokens_seen": 187200,
"step": 10,
"train_runtime": 1095.0218,
"train_tokens_per_second": 170.956
},
{
"epoch": 0.19169329073482427,
"grad_norm": 5.366312026977539,
"learning_rate": 9.914147615517526e-07,
"loss": 0.8995,
"num_input_tokens_seen": 282560,
"step": 15,
"train_runtime": 1632.6431,
"train_tokens_per_second": 173.069
},
{
"epoch": 0.25559105431309903,
"grad_norm": 5.131296157836914,
"learning_rate": 9.842255814927944e-07,
"loss": 0.8231,
"num_input_tokens_seen": 376064,
"step": 20,
"train_runtime": 2172.7033,
"train_tokens_per_second": 173.086
},
{
"epoch": 0.3194888178913738,
"grad_norm": 3.8358519077301025,
"learning_rate": 9.749100658638914e-07,
"loss": 0.7679,
"num_input_tokens_seen": 471616,
"step": 25,
"train_runtime": 2712.1273,
"train_tokens_per_second": 173.892
},
{
"epoch": 0.38338658146964855,
"grad_norm": 3.4662442207336426,
"learning_rate": 9.63509121038005e-07,
"loss": 0.7301,
"num_input_tokens_seen": 564352,
"step": 30,
"train_runtime": 3636.2616,
"train_tokens_per_second": 155.201
},
{
"epoch": 0.4472843450479233,
"grad_norm": 3.0572352409362793,
"learning_rate": 9.500728109428603e-07,
"loss": 0.7017,
"num_input_tokens_seen": 657568,
"step": 35,
"train_runtime": 3832.9972,
"train_tokens_per_second": 171.555
},
{
"epoch": 0.5111821086261981,
"grad_norm": 3.1490511894226074,
"learning_rate": 9.346601372197913e-07,
"loss": 0.7058,
"num_input_tokens_seen": 750720,
"step": 40,
"train_runtime": 4012.6034,
"train_tokens_per_second": 187.091
},
{
"epoch": 0.5750798722044729,
"grad_norm": 3.2475409507751465,
"learning_rate": 9.17338780135223e-07,
"loss": 0.6841,
"num_input_tokens_seen": 842784,
"step": 45,
"train_runtime": 4192.5527,
"train_tokens_per_second": 201.019
},
{
"epoch": 0.6389776357827476,
"grad_norm": 3.1415064334869385,
"learning_rate": 8.981848013824993e-07,
"loss": 0.6738,
"num_input_tokens_seen": 936128,
"step": 50,
"train_runtime": 4379.3543,
"train_tokens_per_second": 213.759
},
{
"epoch": 0.7028753993610224,
"grad_norm": 3.2890915870666504,
"learning_rate": 8.77282310079115e-07,
"loss": 0.6643,
"num_input_tokens_seen": 1028160,
"step": 55,
"train_runtime": 4592.6801,
"train_tokens_per_second": 223.869
},
{
"epoch": 0.7667731629392971,
"grad_norm": 2.9755921363830566,
"learning_rate": 8.547230934260311e-07,
"loss": 0.6449,
"num_input_tokens_seen": 1119552,
"step": 60,
"train_runtime": 5103.7665,
"train_tokens_per_second": 219.358
},
{
"epoch": 0.8306709265175719,
"grad_norm": 3.0064377784729004,
"learning_rate": 8.306062136509219e-07,
"loss": 0.6547,
"num_input_tokens_seen": 1212032,
"step": 65,
"train_runtime": 5281.6563,
"train_tokens_per_second": 229.48
},
{
"epoch": 0.8945686900958466,
"grad_norm": 3.0753512382507324,
"learning_rate": 8.050375730052621e-07,
"loss": 0.6543,
"num_input_tokens_seen": 1306368,
"step": 70,
"train_runtime": 5458.516,
"train_tokens_per_second": 239.327
},
{
"epoch": 0.9584664536741214,
"grad_norm": 2.9098427295684814,
"learning_rate": 7.781294487254435e-07,
"loss": 0.6579,
"num_input_tokens_seen": 1400576,
"step": 75,
"train_runtime": 5636.7125,
"train_tokens_per_second": 248.474
},
{
"epoch": 1.012779552715655,
"grad_norm": 3.033903121948242,
"learning_rate": 7.5e-07,
"loss": 0.6344,
"num_input_tokens_seen": 1481248,
"step": 80,
"train_runtime": 5831.8252,
"train_tokens_per_second": 253.994
},
{
"epoch": 1.0766773162939298,
"grad_norm": 2.6656150817871094,
"learning_rate": 7.207727491079559e-07,
"loss": 0.6292,
"num_input_tokens_seen": 1575104,
"step": 85,
"train_runtime": 6012.044,
"train_tokens_per_second": 261.991
},
{
"epoch": 1.1405750798722045,
"grad_norm": 2.7004282474517822,
"learning_rate": 6.905760390067234e-07,
"loss": 0.6239,
"num_input_tokens_seen": 1668064,
"step": 90,
"train_runtime": 6510.4742,
"train_tokens_per_second": 256.212
},
{
"epoch": 1.2044728434504792,
"grad_norm": 2.72955060005188,
"learning_rate": 6.595424697513963e-07,
"loss": 0.6157,
"num_input_tokens_seen": 1764128,
"step": 95,
"train_runtime": 6697.2327,
"train_tokens_per_second": 263.411
},
{
"epoch": 1.268370607028754,
"grad_norm": 2.819629192352295,
"learning_rate": 6.278083162202373e-07,
"loss": 0.6096,
"num_input_tokens_seen": 1858912,
"step": 100,
"train_runtime": 6880.9117,
"train_tokens_per_second": 270.155
},
{
"epoch": 1.3322683706070286,
"grad_norm": 2.837791919708252,
"learning_rate": 5.955129297032538e-07,
"loss": 0.5967,
"num_input_tokens_seen": 1952640,
"step": 105,
"train_runtime": 7062.8237,
"train_tokens_per_second": 276.467
},
{
"epoch": 1.3961661341853036,
"grad_norm": 2.6342546939849854,
"learning_rate": 5.62798125981604e-07,
"loss": 0.6051,
"num_input_tokens_seen": 2045792,
"step": 110,
"train_runtime": 7245.471,
"train_tokens_per_second": 282.355
},
{
"epoch": 1.4600638977635783,
"grad_norm": 2.5401482582092285,
"learning_rate": 5.298075625849099e-07,
"loss": 0.5899,
"num_input_tokens_seen": 2140736,
"step": 115,
"train_runtime": 7427.2464,
"train_tokens_per_second": 288.227
},
{
"epoch": 1.5239616613418532,
"grad_norm": 2.6414806842803955,
"learning_rate": 4.966861079610687e-07,
"loss": 0.5901,
"num_input_tokens_seen": 2233280,
"step": 120,
"train_runtime": 7929.4515,
"train_tokens_per_second": 281.644
},
{
"epoch": 1.5878594249201279,
"grad_norm": 2.817983865737915,
"learning_rate": 4.6357920532866816e-07,
"loss": 0.6011,
"num_input_tokens_seen": 2326144,
"step": 125,
"train_runtime": 8118.8041,
"train_tokens_per_second": 286.513
},
{
"epoch": 1.6517571884984026,
"grad_norm": 2.9443130493164062,
"learning_rate": 4.306322340054659e-07,
"loss": 0.5969,
"num_input_tokens_seen": 2418592,
"step": 130,
"train_runtime": 8306.8501,
"train_tokens_per_second": 291.156
},
{
"epoch": 1.7156549520766773,
"grad_norm": 2.630876302719116,
"learning_rate": 3.979898710174677e-07,
"loss": 0.5948,
"num_input_tokens_seen": 2512320,
"step": 135,
"train_runtime": 8501.9493,
"train_tokens_per_second": 295.499
},
{
"epoch": 1.779552715654952,
"grad_norm": 2.6901042461395264,
"learning_rate": 3.657954557919183e-07,
"loss": 0.598,
"num_input_tokens_seen": 2606112,
"step": 140,
"train_runtime": 8694.019,
"train_tokens_per_second": 299.759
},
{
"epoch": 1.8434504792332267,
"grad_norm": 2.8361966609954834,
"learning_rate": 3.3419036072396614e-07,
"loss": 0.5902,
"num_input_tokens_seen": 2699936,
"step": 145,
"train_runtime": 8889.9576,
"train_tokens_per_second": 303.706
},
{
"epoch": 1.9073482428115016,
"grad_norm": 2.87080979347229,
"learning_rate": 3.033133703809759e-07,
"loss": 0.5978,
"num_input_tokens_seen": 2795136,
"step": 150,
"train_runtime": 9406.664,
"train_tokens_per_second": 297.144
},
{
"epoch": 1.9712460063897763,
"grad_norm": 2.7429561614990234,
"learning_rate": 2.7330007207053406e-07,
"loss": 0.5946,
"num_input_tokens_seen": 2888960,
"step": 155,
"train_runtime": 9589.8224,
"train_tokens_per_second": 301.253
},
{
"epoch": 2.02555910543131,
"grad_norm": 2.6952402591705322,
"learning_rate": 2.442822604482889e-07,
"loss": 0.5918,
"num_input_tokens_seen": 2968224,
"step": 160,
"train_runtime": 9773.1495,
"train_tokens_per_second": 303.712
},
{
"epoch": 2.0894568690095845,
"grad_norm": 2.673067569732666,
"learning_rate": 2.16387358780116e-07,
"loss": 0.5663,
"num_input_tokens_seen": 3062400,
"step": 165,
"train_runtime": 9959.8127,
"train_tokens_per_second": 307.476
},
{
"epoch": 2.1533546325878596,
"grad_norm": 2.7801618576049805,
"learning_rate": 1.8973785939996927e-07,
"loss": 0.5791,
"num_input_tokens_seen": 3155520,
"step": 170,
"train_runtime": 10148.3528,
"train_tokens_per_second": 310.939
},
{
"epoch": 2.2172523961661343,
"grad_norm": 2.5863192081451416,
"learning_rate": 1.6445078582048154e-07,
"loss": 0.5695,
"num_input_tokens_seen": 3250496,
"step": 175,
"train_runtime": 10338.7555,
"train_tokens_per_second": 314.399
},
{
"epoch": 2.281150159744409,
"grad_norm": 2.7501046657562256,
"learning_rate": 1.4063717885830373e-07,
"loss": 0.5675,
"num_input_tokens_seen": 3344672,
"step": 180,
"train_runtime": 10847.5534,
"train_tokens_per_second": 308.334
},
{
"epoch": 2.3450479233226837,
"grad_norm": 2.7307002544403076,
"learning_rate": 1.184016090307059e-07,
"loss": 0.5657,
"num_input_tokens_seen": 3438784,
"step": 185,
"train_runtime": 11033.7768,
"train_tokens_per_second": 311.66
},
{
"epoch": 2.4089456869009584,
"grad_norm": 2.663017988204956,
"learning_rate": 9.78417173646176e-08,
"loss": 0.5745,
"num_input_tokens_seen": 3531840,
"step": 190,
"train_runtime": 11217.6988,
"train_tokens_per_second": 314.845
},
{
"epoch": 2.472843450479233,
"grad_norm": 2.6426873207092285,
"learning_rate": 7.904778663450323e-08,
"loss": 0.5885,
"num_input_tokens_seen": 3625984,
"step": 195,
"train_runtime": 11403.1929,
"train_tokens_per_second": 317.98
},
{
"epoch": 2.536741214057508,
"grad_norm": 2.7548089027404785,
"learning_rate": 6.210234491186079e-08,
"loss": 0.5748,
"num_input_tokens_seen": 3720352,
"step": 200,
"train_runtime": 11603.6555,
"train_tokens_per_second": 320.619
},
{
"epoch": 2.600638977635783,
"grad_norm": 2.6678242683410645,
"learning_rate": 4.7079803167238366e-08,
"loss": 0.5741,
"num_input_tokens_seen": 3812992,
"step": 205,
"train_runtime": 11793.3317,
"train_tokens_per_second": 323.318
},
{
"epoch": 2.6645367412140573,
"grad_norm": 2.8868303298950195,
"learning_rate": 3.4046128516136754e-08,
"loss": 0.5642,
"num_input_tokens_seen": 3905280,
"step": 210,
"train_runtime": 12299.9488,
"train_tokens_per_second": 317.504
},
{
"epoch": 2.7284345047923324,
"grad_norm": 2.5737545490264893,
"learning_rate": 2.3058554543638698e-08,
"loss": 0.5741,
"num_input_tokens_seen": 3999680,
"step": 215,
"train_runtime": 12494.1775,
"train_tokens_per_second": 320.124
},
{
"epoch": 2.792332268370607,
"grad_norm": 2.635117292404175,
"learning_rate": 1.4165329979794971e-08,
"loss": 0.5805,
"num_input_tokens_seen": 4094720,
"step": 220,
"train_runtime": 12687.8777,
"train_tokens_per_second": 322.727
},
{
"epoch": 2.856230031948882,
"grad_norm": 2.665903329849243,
"learning_rate": 7.405506829382735e-09,
"loss": 0.5779,
"num_input_tokens_seen": 4189248,
"step": 225,
"train_runtime": 12880.2851,
"train_tokens_per_second": 325.245
},
{
"epoch": 2.9201277955271565,
"grad_norm": 2.6100857257843018,
"learning_rate": 2.808768886403301e-09,
"loss": 0.5671,
"num_input_tokens_seen": 4282208,
"step": 230,
"train_runtime": 13076.8487,
"train_tokens_per_second": 327.465
},
{
"epoch": 2.984025559105431,
"grad_norm": 2.5199291706085205,
"learning_rate": 3.9530138634907837e-10,
"loss": 0.5685,
"num_input_tokens_seen": 4376096,
"step": 235,
"train_runtime": 13275.1963,
"train_tokens_per_second": 329.645
},
{
"epoch": 3.0,
"num_input_tokens_seen": 4400096,
"step": 237,
"total_flos": 1.8666841676395315e+17,
"train_loss": 0.6493978349468376,
"train_runtime": 13351.2629,
"train_samples_per_second": 2.247,
"train_steps_per_second": 0.018
}
],
"logging_steps": 5,
"max_steps": 237,
"num_input_tokens_seen": 4400096,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.8666841676395315e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}