{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 473, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.6201617665588857, "epoch": 0.05291005291005291, "grad_norm": 0.107421875, "learning_rate": 0.00019980950399213344, "loss": 1.665807342529297, "mean_token_accuracy": 0.6513840228319168, "num_tokens": 159158.0, "step": 25 }, { "entropy": 1.1972093664109706, "epoch": 0.10582010582010581, "grad_norm": 0.06494140625, "learning_rate": 0.0001972927568670583, "loss": 1.1417469024658202, "mean_token_accuracy": 0.7515610492974519, "num_tokens": 318771.0, "step": 50 }, { "entropy": 0.7925181396957487, "epoch": 0.15873015873015872, "grad_norm": 0.06494140625, "learning_rate": 0.0001919219386594007, "loss": 0.7702044677734375, "mean_token_accuracy": 0.8282645887136459, "num_tokens": 478916.0, "step": 75 }, { "entropy": 0.8690306516364217, "epoch": 0.21164021164021163, "grad_norm": 0.049560546875, "learning_rate": 0.00018385460166215638, "loss": 0.852333984375, "mean_token_accuracy": 0.8131964223086834, "num_tokens": 640794.0, "step": 100 }, { "entropy": 0.8248864999366924, "epoch": 0.26455026455026454, "grad_norm": 0.05419921875, "learning_rate": 0.00017332740021608722, "loss": 0.8277694702148437, "mean_token_accuracy": 0.8324337004125119, "num_tokens": 801553.0, "step": 125 }, { "entropy": 0.6974356794031337, "epoch": 0.31746031746031744, "grad_norm": 0.07177734375, "learning_rate": 0.0001606491484836782, "loss": 0.6716735076904297, "mean_token_accuracy": 0.8575316358357667, "num_tokens": 956091.0, "step": 150 }, { "entropy": 0.5518482267530636, "epoch": 0.37037037037037035, "grad_norm": 0.043701171875, "learning_rate": 0.00014619176142357935, "loss": 0.532108039855957, "mean_token_accuracy": 0.8848500949889422, "num_tokens": 1113595.0, "step": 175 }, { "entropy": 0.4954574690805748, "epoch": 0.42328042328042326, "grad_norm": 0.052734375, "learning_rate": 0.00013037934471093682, "loss": 0.48598655700683596, "mean_token_accuracy": 0.8925073320418596, "num_tokens": 1270329.0, "step": 200 }, { "entropy": 0.5833652867842466, "epoch": 0.47619047619047616, "grad_norm": 0.095703125, "learning_rate": 0.00011367575364946006, "loss": 0.5821831130981445, "mean_token_accuracy": 0.877345666885376, "num_tokens": 1428957.0, "step": 225 }, { "entropy": 0.5967767992173322, "epoch": 0.5291005291005291, "grad_norm": 0.04296875, "learning_rate": 9.657098603301346e-05, "loss": 0.6138711929321289, "mean_token_accuracy": 0.8787138384580612, "num_tokens": 1587655.0, "step": 250 }, { "entropy": 0.5595623605395668, "epoch": 0.582010582010582, "grad_norm": 0.03125, "learning_rate": 7.95668081204676e-05, "loss": 0.5828312683105469, "mean_token_accuracy": 0.884635460972786, "num_tokens": 1747061.0, "step": 275 }, { "entropy": 0.586139135141857, "epoch": 0.6349206349206349, "grad_norm": 0.042724609375, "learning_rate": 6.316203538407397e-05, "loss": 0.5686460113525391, "mean_token_accuracy": 0.8748830965906381, "num_tokens": 1903918.0, "step": 300 }, { "entropy": 0.6374190147127956, "epoch": 0.6878306878306878, "grad_norm": 0.05419921875, "learning_rate": 4.783789981880267e-05, "loss": 0.6625697326660156, "mean_token_accuracy": 0.8703962732851506, "num_tokens": 2060556.0, "step": 325 }, { "entropy": 0.6380186561308802, "epoch": 0.7407407407407407, "grad_norm": 0.0390625, "learning_rate": 3.4043933060828605e-05, "loss": 0.6768576049804688, "mean_token_accuracy": 0.8745240803062916, "num_tokens": 2221905.0, "step": 350 }, { "entropy": 0.5965662833326496, "epoch": 0.7936507936507936, "grad_norm": 0.072265625, "learning_rate": 2.218477943215229e-05, "loss": 0.6222876358032227, "mean_token_accuracy": 0.877472730949521, "num_tokens": 2385717.0, "step": 375 }, { "entropy": 0.6784971536113881, "epoch": 0.8465608465608465, "grad_norm": 0.0390625, "learning_rate": 1.2608325749073591e-05, "loss": 0.716466064453125, "mean_token_accuracy": 0.8638603837788105, "num_tokens": 2550816.0, "step": 400 }, { "entropy": 0.6590864680474624, "epoch": 0.8994708994708994, "grad_norm": 0.05126953125, "learning_rate": 5.5954961051291384e-06, "loss": 0.6626347351074219, "mean_token_accuracy": 0.8649837756156922, "num_tokens": 2713029.0, "step": 425 }, { "entropy": 0.6229005157970824, "epoch": 0.9523809523809523, "grad_norm": 0.0125732421875, "learning_rate": 1.3520109972846917e-06, "loss": 0.6455535125732422, "mean_token_accuracy": 0.8719264762848615, "num_tokens": 2871773.0, "step": 450 } ], "logging_steps": 25, "max_steps": 473, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.286492311248937e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }