{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 325, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.8578038491308688, "epoch": 0.07714561234329798, "grad_norm": 0.0174560546875, "learning_rate": 0.00019902680687415705, "loss": 1.9434934997558593, "mean_token_accuracy": 0.5780243317037821, "num_tokens": 296483.0, "step": 25 }, { "entropy": 1.5701066633313894, "epoch": 0.15429122468659595, "grad_norm": 0.04052734375, "learning_rate": 0.00019253043004739968, "loss": 1.5351368713378906, "mean_token_accuracy": 0.6528984536230564, "num_tokens": 595349.0, "step": 50 }, { "entropy": 1.1479615284875035, "epoch": 0.23143683702989393, "grad_norm": 0.037109375, "learning_rate": 0.00018031146921373018, "loss": 1.0957810974121094, "mean_token_accuracy": 0.7462421279400587, "num_tokens": 896327.0, "step": 75 }, { "entropy": 0.8900932836066932, "epoch": 0.3085824493731919, "grad_norm": 0.058837890625, "learning_rate": 0.0001631256112300239, "loss": 0.8309718322753906, "mean_token_accuracy": 0.8106287607550621, "num_tokens": 1190579.0, "step": 100 }, { "entropy": 0.722694746227935, "epoch": 0.3857280617164899, "grad_norm": 0.04541015625, "learning_rate": 0.00014203572283095657, "loss": 0.6702272796630859, "mean_token_accuracy": 0.8465786771476269, "num_tokens": 1483157.0, "step": 125 }, { "entropy": 0.6168222531164065, "epoch": 0.46287367405978785, "grad_norm": 0.0419921875, "learning_rate": 0.00011834611718137824, "loss": 0.5611977386474609, "mean_token_accuracy": 0.869293844178319, "num_tokens": 1776533.0, "step": 150 }, { "entropy": 0.5858749843109399, "epoch": 0.5400192864030858, "grad_norm": 0.037353515625, "learning_rate": 9.352188807098481e-05, "loss": 0.5254201889038086, "mean_token_accuracy": 0.8756095879524947, "num_tokens": 2065558.0, "step": 175 }, { "entropy": 0.5470695828087628, "epoch": 0.6171648987463838, "grad_norm": 0.0299072265625, "learning_rate": 6.909830056250527e-05, "loss": 0.5220871353149414, "mean_token_accuracy": 0.8878432418406009, "num_tokens": 2361335.0, "step": 200 }, { "entropy": 0.5989037967612967, "epoch": 0.6943105110896818, "grad_norm": 0.025390625, "learning_rate": 4.658584186750713e-05, "loss": 0.5457814025878907, "mean_token_accuracy": 0.880027602687478, "num_tokens": 2655008.0, "step": 225 }, { "entropy": 0.502092773411423, "epoch": 0.7714561234329798, "grad_norm": 0.01806640625, "learning_rate": 2.7376804619000707e-05, "loss": 0.4714463043212891, "mean_token_accuracy": 0.8987671569734812, "num_tokens": 2957052.0, "step": 250 }, { "entropy": 0.49828358624130487, "epoch": 0.8486017357762777, "grad_norm": 0.022216796875, "learning_rate": 1.2659179938287035e-05, "loss": 0.44431716918945313, "mean_token_accuracy": 0.9000337335467339, "num_tokens": 3252025.0, "step": 275 }, { "entropy": 0.49207511749817057, "epoch": 0.9257473481195757, "grad_norm": 0.026123046875, "learning_rate": 3.3431856161452835e-06, "loss": 0.4675511932373047, "mean_token_accuracy": 0.9023960041999817, "num_tokens": 3544940.0, "step": 300 }, { "entropy": 0.5535661403912229, "epoch": 1.0, "grad_norm": 0.0186767578125, "learning_rate": 4.973304405697654e-09, "loss": 0.49514381408691405, "mean_token_accuracy": 0.8885030556034732, "num_tokens": 3826435.0, "step": 325 } ], "logging_steps": 25, "max_steps": 325, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.6325843032837632e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }