qwen-fbdd-7b / checkpoint-396 /trainer_state.json
Cluna80's picture
Upload folder using huggingface_hub
77e639d verified
{
"best_global_step": 396,
"best_metric": 0.24572625756263733,
"best_model_checkpoint": "./qwen-fbdd-finetuned/checkpoint-396",
"epoch": 3.0,
"eval_steps": 100,
"global_step": 396,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.1193337190896273,
"epoch": 0.0761904761904762,
"grad_norm": 0.7966287136077881,
"learning_rate": 9e-06,
"loss": 2.9907556533813477,
"mean_token_accuracy": 0.499012922309339,
"num_tokens": 39817.0,
"step": 10
},
{
"entropy": 1.1413242232054472,
"epoch": 0.1523809523809524,
"grad_norm": 1.1645318269729614,
"learning_rate": 1.9e-05,
"loss": 3.014940643310547,
"mean_token_accuracy": 0.49516682866960765,
"num_tokens": 79143.0,
"step": 20
},
{
"entropy": 1.1685865793377161,
"epoch": 0.22857142857142856,
"grad_norm": 1.1971632242202759,
"learning_rate": 2.9e-05,
"loss": 2.969620704650879,
"mean_token_accuracy": 0.49454432763159273,
"num_tokens": 118882.0,
"step": 30
},
{
"entropy": 1.1996186520904302,
"epoch": 0.3047619047619048,
"grad_norm": 1.4842681884765625,
"learning_rate": 3.9000000000000006e-05,
"loss": 2.808257484436035,
"mean_token_accuracy": 0.5064258672297001,
"num_tokens": 158369.0,
"step": 40
},
{
"entropy": 1.3778786644339562,
"epoch": 0.38095238095238093,
"grad_norm": 1.4645507335662842,
"learning_rate": 4.9e-05,
"loss": 2.6315568923950194,
"mean_token_accuracy": 0.5036305475980043,
"num_tokens": 195393.0,
"step": 50
},
{
"entropy": 1.5997230507433415,
"epoch": 0.45714285714285713,
"grad_norm": 0.9199417233467102,
"learning_rate": 4.869942196531792e-05,
"loss": 2.2722213745117186,
"mean_token_accuracy": 0.5217761812731624,
"num_tokens": 234242.0,
"step": 60
},
{
"entropy": 1.7654182992875576,
"epoch": 0.5333333333333333,
"grad_norm": 0.7629300951957703,
"learning_rate": 4.7254335260115614e-05,
"loss": 2.0799581527709963,
"mean_token_accuracy": 0.5340151842683554,
"num_tokens": 271101.0,
"step": 70
},
{
"entropy": 1.8141176998615265,
"epoch": 0.6095238095238096,
"grad_norm": 0.6566410660743713,
"learning_rate": 4.58092485549133e-05,
"loss": 1.8678638458251953,
"mean_token_accuracy": 0.5625626968219877,
"num_tokens": 310931.0,
"step": 80
},
{
"entropy": 1.8015068300068378,
"epoch": 0.6857142857142857,
"grad_norm": 0.6888961791992188,
"learning_rate": 4.4364161849710985e-05,
"loss": 1.7109407424926757,
"mean_token_accuracy": 0.5945755124092102,
"num_tokens": 352666.0,
"step": 90
},
{
"entropy": 1.6939984746277332,
"epoch": 0.7619047619047619,
"grad_norm": 1.0066399574279785,
"learning_rate": 4.291907514450868e-05,
"loss": 1.5721519470214844,
"mean_token_accuracy": 0.6281625108793378,
"num_tokens": 391127.0,
"step": 100
},
{
"epoch": 0.7619047619047619,
"eval_entropy": 1.6317288610670302,
"eval_loss": 1.5727437734603882,
"eval_mean_token_accuracy": 0.6416771459019083,
"eval_num_tokens": 391127.0,
"eval_runtime": 8.0884,
"eval_samples_per_second": 28.93,
"eval_steps_per_second": 28.93,
"step": 100
},
{
"entropy": 1.5300732851028442,
"epoch": 0.8380952380952381,
"grad_norm": 0.8622047305107117,
"learning_rate": 4.147398843930636e-05,
"loss": 1.4249468803405763,
"mean_token_accuracy": 0.6621094869449735,
"num_tokens": 429779.0,
"step": 110
},
{
"entropy": 1.3978409506380558,
"epoch": 0.9142857142857143,
"grad_norm": 0.846666693687439,
"learning_rate": 4.002890173410404e-05,
"loss": 1.2797026634216309,
"mean_token_accuracy": 0.700611076131463,
"num_tokens": 468862.0,
"step": 120
},
{
"entropy": 1.2408478770405054,
"epoch": 0.9904761904761905,
"grad_norm": 0.9868558049201965,
"learning_rate": 3.8583815028901736e-05,
"loss": 1.1070871353149414,
"mean_token_accuracy": 0.7445570107549428,
"num_tokens": 508425.0,
"step": 130
},
{
"entropy": 1.0399991414836935,
"epoch": 1.0609523809523809,
"grad_norm": 0.8608207106590271,
"learning_rate": 3.713872832369942e-05,
"loss": 0.8918219566345215,
"mean_token_accuracy": 0.792083133716841,
"num_tokens": 544910.0,
"step": 140
},
{
"entropy": 0.9168793668970465,
"epoch": 1.1371428571428572,
"grad_norm": 0.9900131821632385,
"learning_rate": 3.569364161849711e-05,
"loss": 0.7435701847076416,
"mean_token_accuracy": 0.8109283685684204,
"num_tokens": 584485.0,
"step": 150
},
{
"entropy": 0.841719158180058,
"epoch": 1.2133333333333334,
"grad_norm": 1.0477681159973145,
"learning_rate": 3.42485549132948e-05,
"loss": 0.6946741580963135,
"mean_token_accuracy": 0.8230816710740327,
"num_tokens": 623281.0,
"step": 160
},
{
"entropy": 0.7224269095808268,
"epoch": 1.2895238095238095,
"grad_norm": 0.9945750832557678,
"learning_rate": 3.2803468208092486e-05,
"loss": 0.5712966442108154,
"mean_token_accuracy": 0.8562004685401916,
"num_tokens": 660421.0,
"step": 170
},
{
"entropy": 0.7375645495951175,
"epoch": 1.3657142857142857,
"grad_norm": 0.9197537899017334,
"learning_rate": 3.135838150289018e-05,
"loss": 0.622592830657959,
"mean_token_accuracy": 0.8469895273447037,
"num_tokens": 699568.0,
"step": 180
},
{
"entropy": 0.6055924735963345,
"epoch": 1.441904761904762,
"grad_norm": 0.931404173374176,
"learning_rate": 2.9913294797687864e-05,
"loss": 0.5101625919342041,
"mean_token_accuracy": 0.8736397925764322,
"num_tokens": 739234.0,
"step": 190
},
{
"entropy": 0.569295346736908,
"epoch": 1.518095238095238,
"grad_norm": 0.8699201941490173,
"learning_rate": 2.846820809248555e-05,
"loss": 0.477140474319458,
"mean_token_accuracy": 0.8829405516386032,
"num_tokens": 777743.0,
"step": 200
},
{
"epoch": 1.518095238095238,
"eval_entropy": 0.537933240716274,
"eval_loss": 0.4319809377193451,
"eval_mean_token_accuracy": 0.8860002267055023,
"eval_num_tokens": 777743.0,
"eval_runtime": 8.0156,
"eval_samples_per_second": 29.193,
"eval_steps_per_second": 29.193,
"step": 200
},
{
"entropy": 0.554788151010871,
"epoch": 1.5942857142857143,
"grad_norm": 1.1020101308822632,
"learning_rate": 2.702312138728324e-05,
"loss": 0.5022446155548096,
"mean_token_accuracy": 0.8869980745017528,
"num_tokens": 817800.0,
"step": 210
},
{
"entropy": 0.5009425025433302,
"epoch": 1.6704761904761904,
"grad_norm": 0.9859704971313477,
"learning_rate": 2.5578034682080925e-05,
"loss": 0.4281635761260986,
"mean_token_accuracy": 0.8912746794521809,
"num_tokens": 859457.0,
"step": 220
},
{
"entropy": 0.46139501575380565,
"epoch": 1.7466666666666666,
"grad_norm": 0.7826982140541077,
"learning_rate": 2.4132947976878615e-05,
"loss": 0.3664123296737671,
"mean_token_accuracy": 0.8995421338826418,
"num_tokens": 899431.0,
"step": 230
},
{
"entropy": 0.4695171698927879,
"epoch": 1.822857142857143,
"grad_norm": 0.8453237414360046,
"learning_rate": 2.2687861271676304e-05,
"loss": 0.37414577007293703,
"mean_token_accuracy": 0.9007753636687994,
"num_tokens": 937747.0,
"step": 240
},
{
"entropy": 0.44786179112270474,
"epoch": 1.899047619047619,
"grad_norm": 1.1034295558929443,
"learning_rate": 2.124277456647399e-05,
"loss": 0.352729344367981,
"mean_token_accuracy": 0.9072672612965107,
"num_tokens": 977048.0,
"step": 250
},
{
"entropy": 0.42663233568891884,
"epoch": 1.9752380952380952,
"grad_norm": 0.9720745086669922,
"learning_rate": 1.9797687861271676e-05,
"loss": 0.33823652267456056,
"mean_token_accuracy": 0.9057052366435527,
"num_tokens": 1015130.0,
"step": 260
},
{
"entropy": 0.41624320489732,
"epoch": 2.045714285714286,
"grad_norm": 1.063590168952942,
"learning_rate": 1.8352601156069365e-05,
"loss": 0.3131232261657715,
"mean_token_accuracy": 0.9075767901298162,
"num_tokens": 1050223.0,
"step": 270
},
{
"entropy": 0.39982422441244125,
"epoch": 2.1219047619047617,
"grad_norm": 1.0073680877685547,
"learning_rate": 1.6907514450867054e-05,
"loss": 0.3176624298095703,
"mean_token_accuracy": 0.913105733692646,
"num_tokens": 1089337.0,
"step": 280
},
{
"entropy": 0.3832275261171162,
"epoch": 2.198095238095238,
"grad_norm": 1.0207056999206543,
"learning_rate": 1.546242774566474e-05,
"loss": 0.2810999631881714,
"mean_token_accuracy": 0.9169781133532524,
"num_tokens": 1128066.0,
"step": 290
},
{
"entropy": 0.37502991100773214,
"epoch": 2.2742857142857145,
"grad_norm": 1.1781491041183472,
"learning_rate": 1.4017341040462428e-05,
"loss": 0.2836958646774292,
"mean_token_accuracy": 0.9159112725406885,
"num_tokens": 1166578.0,
"step": 300
},
{
"epoch": 2.2742857142857145,
"eval_entropy": 0.3737119477656152,
"eval_loss": 0.2801544964313507,
"eval_mean_token_accuracy": 0.9141584352040902,
"eval_num_tokens": 1166578.0,
"eval_runtime": 8.0772,
"eval_samples_per_second": 28.971,
"eval_steps_per_second": 28.971,
"step": 300
},
{
"entropy": 0.37731272270902994,
"epoch": 2.3504761904761904,
"grad_norm": 0.933684229850769,
"learning_rate": 1.2572254335260117e-05,
"loss": 0.2837996482849121,
"mean_token_accuracy": 0.917362405359745,
"num_tokens": 1206830.0,
"step": 310
},
{
"entropy": 0.36399535620585083,
"epoch": 2.4266666666666667,
"grad_norm": 1.1197153329849243,
"learning_rate": 1.1127167630057805e-05,
"loss": 0.27221682071685793,
"mean_token_accuracy": 0.9155685339123011,
"num_tokens": 1245749.0,
"step": 320
},
{
"entropy": 0.35384900290519,
"epoch": 2.5028571428571427,
"grad_norm": 1.0109339952468872,
"learning_rate": 9.68208092485549e-06,
"loss": 0.2685784101486206,
"mean_token_accuracy": 0.9206566758453846,
"num_tokens": 1287709.0,
"step": 330
},
{
"entropy": 0.3556200794875622,
"epoch": 2.579047619047619,
"grad_norm": 1.366537094116211,
"learning_rate": 8.23699421965318e-06,
"loss": 0.26150500774383545,
"mean_token_accuracy": 0.9224125389009714,
"num_tokens": 1324926.0,
"step": 340
},
{
"entropy": 0.3399674018844962,
"epoch": 2.6552380952380954,
"grad_norm": 0.766735851764679,
"learning_rate": 6.791907514450866e-06,
"loss": 0.24372787475585939,
"mean_token_accuracy": 0.9262156378477812,
"num_tokens": 1361824.0,
"step": 350
},
{
"entropy": 0.34854550352320074,
"epoch": 2.7314285714285713,
"grad_norm": 0.7423695921897888,
"learning_rate": 5.346820809248556e-06,
"loss": 0.2581783771514893,
"mean_token_accuracy": 0.9233053136616945,
"num_tokens": 1401454.0,
"step": 360
},
{
"entropy": 0.34303188025951387,
"epoch": 2.8076190476190477,
"grad_norm": 1.0954190492630005,
"learning_rate": 3.901734104046243e-06,
"loss": 0.2694044351577759,
"mean_token_accuracy": 0.9223306879401207,
"num_tokens": 1442263.0,
"step": 370
},
{
"entropy": 0.3252540229819715,
"epoch": 2.883809523809524,
"grad_norm": 1.0154286623001099,
"learning_rate": 2.456647398843931e-06,
"loss": 0.23521080017089843,
"mean_token_accuracy": 0.9274044901132583,
"num_tokens": 1479797.0,
"step": 380
},
{
"entropy": 0.3254506874829531,
"epoch": 2.96,
"grad_norm": 0.8258033990859985,
"learning_rate": 1.0115606936416186e-06,
"loss": 0.24233431816101075,
"mean_token_accuracy": 0.9235792737454176,
"num_tokens": 1518093.0,
"step": 390
},
{
"epoch": 3.0,
"eval_entropy": 0.33179805516941935,
"eval_loss": 0.24572625756263733,
"eval_mean_token_accuracy": 0.9225198356514304,
"eval_num_tokens": 1539813.0,
"eval_runtime": 8.1336,
"eval_samples_per_second": 28.769,
"eval_steps_per_second": 28.769,
"step": 396
}
],
"logging_steps": 10,
"max_steps": 396,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.534789852765082e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}