{ "best_global_step": 396, "best_metric": 0.24572625756263733, "best_model_checkpoint": "./qwen-fbdd-finetuned/checkpoint-396", "epoch": 3.0, "eval_steps": 100, "global_step": 396, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.1193337190896273, "epoch": 0.0761904761904762, "grad_norm": 0.7966287136077881, "learning_rate": 9e-06, "loss": 2.9907556533813477, "mean_token_accuracy": 0.499012922309339, "num_tokens": 39817.0, "step": 10 }, { "entropy": 1.1413242232054472, "epoch": 0.1523809523809524, "grad_norm": 1.1645318269729614, "learning_rate": 1.9e-05, "loss": 3.014940643310547, "mean_token_accuracy": 0.49516682866960765, "num_tokens": 79143.0, "step": 20 }, { "entropy": 1.1685865793377161, "epoch": 0.22857142857142856, "grad_norm": 1.1971632242202759, "learning_rate": 2.9e-05, "loss": 2.969620704650879, "mean_token_accuracy": 0.49454432763159273, "num_tokens": 118882.0, "step": 30 }, { "entropy": 1.1996186520904302, "epoch": 0.3047619047619048, "grad_norm": 1.4842681884765625, "learning_rate": 3.9000000000000006e-05, "loss": 2.808257484436035, "mean_token_accuracy": 0.5064258672297001, "num_tokens": 158369.0, "step": 40 }, { "entropy": 1.3778786644339562, "epoch": 0.38095238095238093, "grad_norm": 1.4645507335662842, "learning_rate": 4.9e-05, "loss": 2.6315568923950194, "mean_token_accuracy": 0.5036305475980043, "num_tokens": 195393.0, "step": 50 }, { "entropy": 1.5997230507433415, "epoch": 0.45714285714285713, "grad_norm": 0.9199417233467102, "learning_rate": 4.869942196531792e-05, "loss": 2.2722213745117186, "mean_token_accuracy": 0.5217761812731624, "num_tokens": 234242.0, "step": 60 }, { "entropy": 1.7654182992875576, "epoch": 0.5333333333333333, "grad_norm": 0.7629300951957703, "learning_rate": 4.7254335260115614e-05, "loss": 2.0799581527709963, "mean_token_accuracy": 0.5340151842683554, "num_tokens": 271101.0, "step": 70 }, { "entropy": 1.8141176998615265, "epoch": 0.6095238095238096, "grad_norm": 0.6566410660743713, "learning_rate": 4.58092485549133e-05, "loss": 1.8678638458251953, "mean_token_accuracy": 0.5625626968219877, "num_tokens": 310931.0, "step": 80 }, { "entropy": 1.8015068300068378, "epoch": 0.6857142857142857, "grad_norm": 0.6888961791992188, "learning_rate": 4.4364161849710985e-05, "loss": 1.7109407424926757, "mean_token_accuracy": 0.5945755124092102, "num_tokens": 352666.0, "step": 90 }, { "entropy": 1.6939984746277332, "epoch": 0.7619047619047619, "grad_norm": 1.0066399574279785, "learning_rate": 4.291907514450868e-05, "loss": 1.5721519470214844, "mean_token_accuracy": 0.6281625108793378, "num_tokens": 391127.0, "step": 100 }, { "epoch": 0.7619047619047619, "eval_entropy": 1.6317288610670302, "eval_loss": 1.5727437734603882, "eval_mean_token_accuracy": 0.6416771459019083, "eval_num_tokens": 391127.0, "eval_runtime": 8.0884, "eval_samples_per_second": 28.93, "eval_steps_per_second": 28.93, "step": 100 }, { "entropy": 1.5300732851028442, "epoch": 0.8380952380952381, "grad_norm": 0.8622047305107117, "learning_rate": 4.147398843930636e-05, "loss": 1.4249468803405763, "mean_token_accuracy": 0.6621094869449735, "num_tokens": 429779.0, "step": 110 }, { "entropy": 1.3978409506380558, "epoch": 0.9142857142857143, "grad_norm": 0.846666693687439, "learning_rate": 4.002890173410404e-05, "loss": 1.2797026634216309, "mean_token_accuracy": 0.700611076131463, "num_tokens": 468862.0, "step": 120 }, { "entropy": 1.2408478770405054, "epoch": 0.9904761904761905, "grad_norm": 0.9868558049201965, "learning_rate": 3.8583815028901736e-05, "loss": 1.1070871353149414, "mean_token_accuracy": 0.7445570107549428, "num_tokens": 508425.0, "step": 130 }, { "entropy": 1.0399991414836935, "epoch": 1.0609523809523809, "grad_norm": 0.8608207106590271, "learning_rate": 3.713872832369942e-05, "loss": 0.8918219566345215, "mean_token_accuracy": 0.792083133716841, "num_tokens": 544910.0, "step": 140 }, { "entropy": 0.9168793668970465, "epoch": 1.1371428571428572, "grad_norm": 0.9900131821632385, "learning_rate": 3.569364161849711e-05, "loss": 0.7435701847076416, "mean_token_accuracy": 0.8109283685684204, "num_tokens": 584485.0, "step": 150 }, { "entropy": 0.841719158180058, "epoch": 1.2133333333333334, "grad_norm": 1.0477681159973145, "learning_rate": 3.42485549132948e-05, "loss": 0.6946741580963135, "mean_token_accuracy": 0.8230816710740327, "num_tokens": 623281.0, "step": 160 }, { "entropy": 0.7224269095808268, "epoch": 1.2895238095238095, "grad_norm": 0.9945750832557678, "learning_rate": 3.2803468208092486e-05, "loss": 0.5712966442108154, "mean_token_accuracy": 0.8562004685401916, "num_tokens": 660421.0, "step": 170 }, { "entropy": 0.7375645495951175, "epoch": 1.3657142857142857, "grad_norm": 0.9197537899017334, "learning_rate": 3.135838150289018e-05, "loss": 0.622592830657959, "mean_token_accuracy": 0.8469895273447037, "num_tokens": 699568.0, "step": 180 }, { "entropy": 0.6055924735963345, "epoch": 1.441904761904762, "grad_norm": 0.931404173374176, "learning_rate": 2.9913294797687864e-05, "loss": 0.5101625919342041, "mean_token_accuracy": 0.8736397925764322, "num_tokens": 739234.0, "step": 190 }, { "entropy": 0.569295346736908, "epoch": 1.518095238095238, "grad_norm": 0.8699201941490173, "learning_rate": 2.846820809248555e-05, "loss": 0.477140474319458, "mean_token_accuracy": 0.8829405516386032, "num_tokens": 777743.0, "step": 200 }, { "epoch": 1.518095238095238, "eval_entropy": 0.537933240716274, "eval_loss": 0.4319809377193451, "eval_mean_token_accuracy": 0.8860002267055023, "eval_num_tokens": 777743.0, "eval_runtime": 8.0156, "eval_samples_per_second": 29.193, "eval_steps_per_second": 29.193, "step": 200 }, { "entropy": 0.554788151010871, "epoch": 1.5942857142857143, "grad_norm": 1.1020101308822632, "learning_rate": 2.702312138728324e-05, "loss": 0.5022446155548096, "mean_token_accuracy": 0.8869980745017528, "num_tokens": 817800.0, "step": 210 }, { "entropy": 0.5009425025433302, "epoch": 1.6704761904761904, "grad_norm": 0.9859704971313477, "learning_rate": 2.5578034682080925e-05, "loss": 0.4281635761260986, "mean_token_accuracy": 0.8912746794521809, "num_tokens": 859457.0, "step": 220 }, { "entropy": 0.46139501575380565, "epoch": 1.7466666666666666, "grad_norm": 0.7826982140541077, "learning_rate": 2.4132947976878615e-05, "loss": 0.3664123296737671, "mean_token_accuracy": 0.8995421338826418, "num_tokens": 899431.0, "step": 230 }, { "entropy": 0.4695171698927879, "epoch": 1.822857142857143, "grad_norm": 0.8453237414360046, "learning_rate": 2.2687861271676304e-05, "loss": 0.37414577007293703, "mean_token_accuracy": 0.9007753636687994, "num_tokens": 937747.0, "step": 240 }, { "entropy": 0.44786179112270474, "epoch": 1.899047619047619, "grad_norm": 1.1034295558929443, "learning_rate": 2.124277456647399e-05, "loss": 0.352729344367981, "mean_token_accuracy": 0.9072672612965107, "num_tokens": 977048.0, "step": 250 }, { "entropy": 0.42663233568891884, "epoch": 1.9752380952380952, "grad_norm": 0.9720745086669922, "learning_rate": 1.9797687861271676e-05, "loss": 0.33823652267456056, "mean_token_accuracy": 0.9057052366435527, "num_tokens": 1015130.0, "step": 260 }, { "entropy": 0.41624320489732, "epoch": 2.045714285714286, "grad_norm": 1.063590168952942, "learning_rate": 1.8352601156069365e-05, "loss": 0.3131232261657715, "mean_token_accuracy": 0.9075767901298162, "num_tokens": 1050223.0, "step": 270 }, { "entropy": 0.39982422441244125, "epoch": 2.1219047619047617, "grad_norm": 1.0073680877685547, "learning_rate": 1.6907514450867054e-05, "loss": 0.3176624298095703, "mean_token_accuracy": 0.913105733692646, "num_tokens": 1089337.0, "step": 280 }, { "entropy": 0.3832275261171162, "epoch": 2.198095238095238, "grad_norm": 1.0207056999206543, "learning_rate": 1.546242774566474e-05, "loss": 0.2810999631881714, "mean_token_accuracy": 0.9169781133532524, "num_tokens": 1128066.0, "step": 290 }, { "entropy": 0.37502991100773214, "epoch": 2.2742857142857145, "grad_norm": 1.1781491041183472, "learning_rate": 1.4017341040462428e-05, "loss": 0.2836958646774292, "mean_token_accuracy": 0.9159112725406885, "num_tokens": 1166578.0, "step": 300 }, { "epoch": 2.2742857142857145, "eval_entropy": 0.3737119477656152, "eval_loss": 0.2801544964313507, "eval_mean_token_accuracy": 0.9141584352040902, "eval_num_tokens": 1166578.0, "eval_runtime": 8.0772, "eval_samples_per_second": 28.971, "eval_steps_per_second": 28.971, "step": 300 }, { "entropy": 0.37731272270902994, "epoch": 2.3504761904761904, "grad_norm": 0.933684229850769, "learning_rate": 1.2572254335260117e-05, "loss": 0.2837996482849121, "mean_token_accuracy": 0.917362405359745, "num_tokens": 1206830.0, "step": 310 }, { "entropy": 0.36399535620585083, "epoch": 2.4266666666666667, "grad_norm": 1.1197153329849243, "learning_rate": 1.1127167630057805e-05, "loss": 0.27221682071685793, "mean_token_accuracy": 0.9155685339123011, "num_tokens": 1245749.0, "step": 320 }, { "entropy": 0.35384900290519, "epoch": 2.5028571428571427, "grad_norm": 1.0109339952468872, "learning_rate": 9.68208092485549e-06, "loss": 0.2685784101486206, "mean_token_accuracy": 0.9206566758453846, "num_tokens": 1287709.0, "step": 330 }, { "entropy": 0.3556200794875622, "epoch": 2.579047619047619, "grad_norm": 1.366537094116211, "learning_rate": 8.23699421965318e-06, "loss": 0.26150500774383545, "mean_token_accuracy": 0.9224125389009714, "num_tokens": 1324926.0, "step": 340 }, { "entropy": 0.3399674018844962, "epoch": 2.6552380952380954, "grad_norm": 0.766735851764679, "learning_rate": 6.791907514450866e-06, "loss": 0.24372787475585939, "mean_token_accuracy": 0.9262156378477812, "num_tokens": 1361824.0, "step": 350 }, { "entropy": 0.34854550352320074, "epoch": 2.7314285714285713, "grad_norm": 0.7423695921897888, "learning_rate": 5.346820809248556e-06, "loss": 0.2581783771514893, "mean_token_accuracy": 0.9233053136616945, "num_tokens": 1401454.0, "step": 360 }, { "entropy": 0.34303188025951387, "epoch": 2.8076190476190477, "grad_norm": 1.0954190492630005, "learning_rate": 3.901734104046243e-06, "loss": 0.2694044351577759, "mean_token_accuracy": 0.9223306879401207, "num_tokens": 1442263.0, "step": 370 }, { "entropy": 0.3252540229819715, "epoch": 2.883809523809524, "grad_norm": 1.0154286623001099, "learning_rate": 2.456647398843931e-06, "loss": 0.23521080017089843, "mean_token_accuracy": 0.9274044901132583, "num_tokens": 1479797.0, "step": 380 }, { "entropy": 0.3254506874829531, "epoch": 2.96, "grad_norm": 0.8258033990859985, "learning_rate": 1.0115606936416186e-06, "loss": 0.24233431816101075, "mean_token_accuracy": 0.9235792737454176, "num_tokens": 1518093.0, "step": 390 }, { "epoch": 3.0, "eval_entropy": 0.33179805516941935, "eval_loss": 0.24572625756263733, "eval_mean_token_accuracy": 0.9225198356514304, "eval_num_tokens": 1539813.0, "eval_runtime": 8.1336, "eval_samples_per_second": 28.769, "eval_steps_per_second": 28.769, "step": 396 } ], "logging_steps": 10, "max_steps": 396, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.534789852765082e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }