| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 500, |
| "global_step": 473, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 1.6201617665588857, |
| "epoch": 0.05291005291005291, |
| "grad_norm": 0.107421875, |
| "learning_rate": 0.00019980950399213344, |
| "loss": 1.665807342529297, |
| "mean_token_accuracy": 0.6513840228319168, |
| "num_tokens": 159158.0, |
| "step": 25 |
| }, |
| { |
| "entropy": 1.1972093664109706, |
| "epoch": 0.10582010582010581, |
| "grad_norm": 0.06494140625, |
| "learning_rate": 0.0001972927568670583, |
| "loss": 1.1417469024658202, |
| "mean_token_accuracy": 0.7515610492974519, |
| "num_tokens": 318771.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 0.7925181396957487, |
| "epoch": 0.15873015873015872, |
| "grad_norm": 0.06494140625, |
| "learning_rate": 0.0001919219386594007, |
| "loss": 0.7702044677734375, |
| "mean_token_accuracy": 0.8282645887136459, |
| "num_tokens": 478916.0, |
| "step": 75 |
| }, |
| { |
| "entropy": 0.8690306516364217, |
| "epoch": 0.21164021164021163, |
| "grad_norm": 0.049560546875, |
| "learning_rate": 0.00018385460166215638, |
| "loss": 0.852333984375, |
| "mean_token_accuracy": 0.8131964223086834, |
| "num_tokens": 640794.0, |
| "step": 100 |
| }, |
| { |
| "entropy": 0.8248864999366924, |
| "epoch": 0.26455026455026454, |
| "grad_norm": 0.05419921875, |
| "learning_rate": 0.00017332740021608722, |
| "loss": 0.8277694702148437, |
| "mean_token_accuracy": 0.8324337004125119, |
| "num_tokens": 801553.0, |
| "step": 125 |
| }, |
| { |
| "entropy": 0.6974356794031337, |
| "epoch": 0.31746031746031744, |
| "grad_norm": 0.07177734375, |
| "learning_rate": 0.0001606491484836782, |
| "loss": 0.6716735076904297, |
| "mean_token_accuracy": 0.8575316358357667, |
| "num_tokens": 956091.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 0.5518482267530636, |
| "epoch": 0.37037037037037035, |
| "grad_norm": 0.043701171875, |
| "learning_rate": 0.00014619176142357935, |
| "loss": 0.532108039855957, |
| "mean_token_accuracy": 0.8848500949889422, |
| "num_tokens": 1113595.0, |
| "step": 175 |
| }, |
| { |
| "entropy": 0.4954574690805748, |
| "epoch": 0.42328042328042326, |
| "grad_norm": 0.052734375, |
| "learning_rate": 0.00013037934471093682, |
| "loss": 0.48598655700683596, |
| "mean_token_accuracy": 0.8925073320418596, |
| "num_tokens": 1270329.0, |
| "step": 200 |
| }, |
| { |
| "entropy": 0.5833652867842466, |
| "epoch": 0.47619047619047616, |
| "grad_norm": 0.095703125, |
| "learning_rate": 0.00011367575364946006, |
| "loss": 0.5821831130981445, |
| "mean_token_accuracy": 0.877345666885376, |
| "num_tokens": 1428957.0, |
| "step": 225 |
| }, |
| { |
| "entropy": 0.5967767992173322, |
| "epoch": 0.5291005291005291, |
| "grad_norm": 0.04296875, |
| "learning_rate": 9.657098603301346e-05, |
| "loss": 0.6138711929321289, |
| "mean_token_accuracy": 0.8787138384580612, |
| "num_tokens": 1587655.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 0.5595623605395668, |
| "epoch": 0.582010582010582, |
| "grad_norm": 0.03125, |
| "learning_rate": 7.95668081204676e-05, |
| "loss": 0.5828312683105469, |
| "mean_token_accuracy": 0.884635460972786, |
| "num_tokens": 1747061.0, |
| "step": 275 |
| }, |
| { |
| "entropy": 0.586139135141857, |
| "epoch": 0.6349206349206349, |
| "grad_norm": 0.042724609375, |
| "learning_rate": 6.316203538407397e-05, |
| "loss": 0.5686460113525391, |
| "mean_token_accuracy": 0.8748830965906381, |
| "num_tokens": 1903918.0, |
| "step": 300 |
| }, |
| { |
| "entropy": 0.6374190147127956, |
| "epoch": 0.6878306878306878, |
| "grad_norm": 0.05419921875, |
| "learning_rate": 4.783789981880267e-05, |
| "loss": 0.6625697326660156, |
| "mean_token_accuracy": 0.8703962732851506, |
| "num_tokens": 2060556.0, |
| "step": 325 |
| }, |
| { |
| "entropy": 0.6380186561308802, |
| "epoch": 0.7407407407407407, |
| "grad_norm": 0.0390625, |
| "learning_rate": 3.4043933060828605e-05, |
| "loss": 0.6768576049804688, |
| "mean_token_accuracy": 0.8745240803062916, |
| "num_tokens": 2221905.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 0.5965662833326496, |
| "epoch": 0.7936507936507936, |
| "grad_norm": 0.072265625, |
| "learning_rate": 2.218477943215229e-05, |
| "loss": 0.6222876358032227, |
| "mean_token_accuracy": 0.877472730949521, |
| "num_tokens": 2385717.0, |
| "step": 375 |
| }, |
| { |
| "entropy": 0.6784971536113881, |
| "epoch": 0.8465608465608465, |
| "grad_norm": 0.0390625, |
| "learning_rate": 1.2608325749073591e-05, |
| "loss": 0.716466064453125, |
| "mean_token_accuracy": 0.8638603837788105, |
| "num_tokens": 2550816.0, |
| "step": 400 |
| }, |
| { |
| "entropy": 0.6590864680474624, |
| "epoch": 0.8994708994708994, |
| "grad_norm": 0.05126953125, |
| "learning_rate": 5.5954961051291384e-06, |
| "loss": 0.6626347351074219, |
| "mean_token_accuracy": 0.8649837756156922, |
| "num_tokens": 2713029.0, |
| "step": 425 |
| }, |
| { |
| "entropy": 0.6229005157970824, |
| "epoch": 0.9523809523809523, |
| "grad_norm": 0.0125732421875, |
| "learning_rate": 1.3520109972846917e-06, |
| "loss": 0.6455535125732422, |
| "mean_token_accuracy": 0.8719264762848615, |
| "num_tokens": 2871773.0, |
| "step": 450 |
| } |
| ], |
| "logging_steps": 25, |
| "max_steps": 473, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.286492311248937e+17, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|