{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8764721993974254, "eval_steps": 200, "global_step": 800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.021911804984935633, "grad_norm": 0.14338380098342896, "learning_rate": 7.6e-05, "loss": 0.578131914138794, "step": 20 }, { "epoch": 0.043823609969871266, "grad_norm": 0.043006837368011475, "learning_rate": 0.00015600000000000002, "loss": 0.3712817430496216, "step": 40 }, { "epoch": 0.0657354149548069, "grad_norm": 0.05703941732645035, "learning_rate": 0.00019994633460515538, "loss": 0.3367295265197754, "step": 60 }, { "epoch": 0.08764721993974253, "grad_norm": 0.04901430010795593, "learning_rate": 0.00019944327493951773, "loss": 0.3087886571884155, "step": 80 }, { "epoch": 0.10955902492467817, "grad_norm": 0.06868838518857956, "learning_rate": 0.00019841332314076855, "loss": 0.3139586210250854, "step": 100 }, { "epoch": 0.1314708299096138, "grad_norm": 0.07925613969564438, "learning_rate": 0.00019686193632504338, "loss": 0.28915910720825194, "step": 120 }, { "epoch": 0.15338263489454945, "grad_norm": 0.07879550755023956, "learning_rate": 0.00019479733438965667, "loss": 0.2889900207519531, "step": 140 }, { "epoch": 0.17529443987948506, "grad_norm": 0.06741970777511597, "learning_rate": 0.00019223045646064212, "loss": 0.27687640190124513, "step": 160 }, { "epoch": 0.1972062448644207, "grad_norm": 0.062402546405792236, "learning_rate": 0.00018917490293267973, "loss": 0.2929875135421753, "step": 180 }, { "epoch": 0.21911804984935634, "grad_norm": 0.07164579629898071, "learning_rate": 0.00018564686340850708, "loss": 0.2802800893783569, "step": 200 }, { "epoch": 0.21911804984935634, "eval_loss": 0.2665560841560364, "eval_runtime": 78.0807, "eval_samples_per_second": 4.278, "eval_steps_per_second": 1.076, "step": 200 }, { "epoch": 0.24102985483429198, "grad_norm": 0.06281758099794388, "learning_rate": 0.0001816650309196209, "loss": 0.2850670576095581, "step": 220 }, { "epoch": 0.2629416598192276, "grad_norm": 0.05580520257353783, "learning_rate": 0.000177250502882765, "loss": 0.2707890510559082, "step": 240 }, { "epoch": 0.28485346480416324, "grad_norm": 0.0578630194067955, "learning_rate": 0.0001724266693169772, "loss": 0.2684861898422241, "step": 260 }, { "epoch": 0.3067652697890989, "grad_norm": 0.056395046412944794, "learning_rate": 0.0001672190889134691, "loss": 0.27494494915008544, "step": 280 }, { "epoch": 0.3286770747740345, "grad_norm": 0.06928149610757828, "learning_rate": 0.00016165535361497218, "loss": 0.27622313499450685, "step": 300 }, { "epoch": 0.35058887975897013, "grad_norm": 0.06063492223620415, "learning_rate": 0.00015576494242206508, "loss": 0.26791768074035643, "step": 320 }, { "epoch": 0.3725006847439058, "grad_norm": 0.052748970687389374, "learning_rate": 0.00014957906520107845, "loss": 0.2828014373779297, "step": 340 }, { "epoch": 0.3944124897288414, "grad_norm": 0.05597842484712601, "learning_rate": 0.00014313049732114715, "loss": 0.27218098640441896, "step": 360 }, { "epoch": 0.416324294713777, "grad_norm": 0.054700642824172974, "learning_rate": 0.0001364534059965735, "loss": 0.258621072769165, "step": 380 }, { "epoch": 0.4382360996987127, "grad_norm": 0.06639006733894348, "learning_rate": 0.00012958316925461085, "loss": 0.26321959495544434, "step": 400 }, { "epoch": 0.4382360996987127, "eval_loss": 0.25228193402290344, "eval_runtime": 76.7593, "eval_samples_per_second": 4.351, "eval_steps_per_second": 1.094, "step": 400 }, { "epoch": 0.4601479046836483, "grad_norm": 0.04828259348869324, "learning_rate": 0.00012255618848785378, "loss": 0.2582087993621826, "step": 420 }, { "epoch": 0.48205970966858397, "grad_norm": 0.09414570778608322, "learning_rate": 0.0001154096955844091, "loss": 0.26802127361297606, "step": 440 }, { "epoch": 0.5039715146535196, "grad_norm": 0.06418801099061966, "learning_rate": 0.00010818155565775443, "loss": 0.2587978601455688, "step": 460 }, { "epoch": 0.5258833196384552, "grad_norm": 0.059086013585329056, "learning_rate": 0.0001009100664215028, "loss": 0.26389484405517577, "step": 480 }, { "epoch": 0.5477951246233909, "grad_norm": 0.05814081430435181, "learning_rate": 9.363375527207111e-05, "loss": 0.2642557144165039, "step": 500 }, { "epoch": 0.5697069296083265, "grad_norm": 0.060968417674303055, "learning_rate": 8.639117515439248e-05, "loss": 0.2558141708374023, "step": 520 }, { "epoch": 0.5916187345932621, "grad_norm": 0.04676595702767372, "learning_rate": 7.92207002922618e-05, "loss": 0.25071308612823484, "step": 540 }, { "epoch": 0.6135305395781978, "grad_norm": 0.055148206651210785, "learning_rate": 7.216032286562122e-05, "loss": 0.25150718688964846, "step": 560 }, { "epoch": 0.6354423445631334, "grad_norm": 0.05283847823739052, "learning_rate": 6.524745171207339e-05, "loss": 0.2621228933334351, "step": 580 }, { "epoch": 0.657354149548069, "grad_norm": 0.05972014740109444, "learning_rate": 5.851871411918743e-05, "loss": 0.262609076499939, "step": 600 }, { "epoch": 0.657354149548069, "eval_loss": 0.24315646290779114, "eval_runtime": 76.7442, "eval_samples_per_second": 4.352, "eval_steps_per_second": 1.095, "step": 600 }, { "epoch": 0.6792659545330046, "grad_norm": 0.0542420968413353, "learning_rate": 5.20097617577839e-05, "loss": 0.24464335441589355, "step": 620 }, { "epoch": 0.7011777595179403, "grad_norm": 0.05391804128885269, "learning_rate": 4.5755081784446306e-05, "loss": 0.23991961479187013, "step": 640 }, { "epoch": 0.7230895645028759, "grad_norm": 0.067460797727108, "learning_rate": 3.978781411411705e-05, "loss": 0.25521044731140136, "step": 660 }, { "epoch": 0.7450013694878116, "grad_norm": 0.060182541608810425, "learning_rate": 3.413957583094358e-05, "loss": 0.25919690132141116, "step": 680 }, { "epoch": 0.7669131744727472, "grad_norm": 0.05970674753189087, "learning_rate": 2.8840293667720653e-05, "loss": 0.25497357845306395, "step": 700 }, { "epoch": 0.7888249794576828, "grad_norm": 0.05807078629732132, "learning_rate": 2.3918045441521718e-05, "loss": 0.2616193532943726, "step": 720 }, { "epoch": 0.8107367844426184, "grad_norm": 0.07185934484004974, "learning_rate": 1.9398911285660816e-05, "loss": 0.25238714218139646, "step": 740 }, { "epoch": 0.832648589427554, "grad_norm": 0.05691298842430115, "learning_rate": 1.5306835466219738e-05, "loss": 0.25438358783721926, "step": 760 }, { "epoch": 0.8545603944124898, "grad_norm": 0.05606130138039589, "learning_rate": 1.1663499515294762e-05, "loss": 0.23820433616638184, "step": 780 }, { "epoch": 0.8764721993974254, "grad_norm": 0.05140916630625725, "learning_rate": 8.488207353155986e-06, "loss": 0.26239197254180907, "step": 800 }, { "epoch": 0.8764721993974254, "eval_loss": 0.23945921659469604, "eval_runtime": 76.7615, "eval_samples_per_second": 4.351, "eval_steps_per_second": 1.094, "step": 800 } ], "logging_steps": 20, "max_steps": 913, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.475205602097205e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }