{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 425, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 2.060568468645215, "epoch": 0.05889281507656066, "grad_norm": 0.037353515625, "learning_rate": 0.00019964843466009714, "loss": 2.17614990234375, "mean_token_accuracy": 0.5645758168399334, "num_tokens": 149693.0, "step": 25 }, { "entropy": 1.852722297757864, "epoch": 0.11778563015312132, "grad_norm": 0.058349609375, "learning_rate": 0.0001962558656223516, "loss": 1.888441619873047, "mean_token_accuracy": 0.6032173852622509, "num_tokens": 299610.0, "step": 50 }, { "entropy": 1.887919376939535, "epoch": 0.17667844522968199, "grad_norm": 0.05322265625, "learning_rate": 0.00018937593526353096, "loss": 1.9618568420410156, "mean_token_accuracy": 0.5999528316408396, "num_tokens": 448232.0, "step": 75 }, { "entropy": 1.7166480829566717, "epoch": 0.23557126030624265, "grad_norm": 0.087890625, "learning_rate": 0.00017925790426889235, "loss": 1.740384063720703, "mean_token_accuracy": 0.6356044755131006, "num_tokens": 599766.0, "step": 100 }, { "entropy": 1.5865243578515946, "epoch": 0.2944640753828033, "grad_norm": 0.09716796875, "learning_rate": 0.00016626835009018892, "loss": 1.6446778869628906, "mean_token_accuracy": 0.6714080322533846, "num_tokens": 757139.0, "step": 125 }, { "entropy": 1.6081184281408787, "epoch": 0.35335689045936397, "grad_norm": 0.1083984375, "learning_rate": 0.00015087788580152206, "loss": 1.70118408203125, "mean_token_accuracy": 0.6645687958598137, "num_tokens": 911646.0, "step": 150 }, { "entropy": 1.43047407111153, "epoch": 0.4122497055359246, "grad_norm": 0.087890625, "learning_rate": 0.00013364410973237185, "loss": 1.5111715698242187, "mean_token_accuracy": 0.7009587688744068, "num_tokens": 1061331.0, "step": 175 }, { "entropy": 1.3645561857521533, "epoch": 0.4711425206124853, "grad_norm": 0.0966796875, "learning_rate": 0.00011519140361460509, "loss": 1.3484107971191406, "mean_token_accuracy": 0.7158325979858637, "num_tokens": 1220495.0, "step": 200 }, { "entropy": 1.1696428343467415, "epoch": 0.5300353356890459, "grad_norm": 0.1591796875, "learning_rate": 9.618831115896815e-05, "loss": 1.168991241455078, "mean_token_accuracy": 0.7582907542586327, "num_tokens": 1370421.0, "step": 225 }, { "entropy": 1.2050547137390821, "epoch": 0.5889281507656066, "grad_norm": 0.1611328125, "learning_rate": 7.732331663789592e-05, "loss": 1.2300393676757813, "mean_token_accuracy": 0.7527728863805533, "num_tokens": 1523518.0, "step": 250 }, { "entropy": 1.2382783625554294, "epoch": 0.6478209658421673, "grad_norm": 0.080078125, "learning_rate": 5.927990101942828e-05, "loss": 1.2677821350097656, "mean_token_accuracy": 0.7519316013902426, "num_tokens": 1674285.0, "step": 275 }, { "entropy": 1.091745662111789, "epoch": 0.7067137809187279, "grad_norm": 0.099609375, "learning_rate": 4.271177937143245e-05, "loss": 1.1259383392333984, "mean_token_accuracy": 0.782644914239645, "num_tokens": 1821921.0, "step": 300 }, { "entropy": 0.9836642376985401, "epoch": 0.7656065959952886, "grad_norm": 0.1669921875, "learning_rate": 2.821921668788571e-05, "loss": 0.9803294372558594, "mean_token_accuracy": 0.803754693493247, "num_tokens": 1969852.0, "step": 325 }, { "entropy": 0.9987670464720577, "epoch": 0.8244994110718492, "grad_norm": 0.12060546875, "learning_rate": 1.6327280217615792e-05, "loss": 0.9915267181396484, "mean_token_accuracy": 0.8082605458050967, "num_tokens": 2117931.0, "step": 350 }, { "entropy": 0.9711884427443147, "epoch": 0.8833922261484098, "grad_norm": 0.1044921875, "learning_rate": 7.46681621618297e-06, "loss": 0.9647718048095704, "mean_token_accuracy": 0.8138997189700603, "num_tokens": 2269751.0, "step": 375 }, { "entropy": 0.9663474693335593, "epoch": 0.9422850412249706, "grad_norm": 0.076171875, "learning_rate": 1.9588403354188325e-06, "loss": 0.996950454711914, "mean_token_accuracy": 0.809981662556529, "num_tokens": 2419781.0, "step": 400 }, { "entropy": 0.9816382658591836, "epoch": 1.0, "grad_norm": 0.185546875, "learning_rate": 2.907188642786718e-09, "loss": 1.0153296661376954, "mean_token_accuracy": 0.807481124997139, "num_tokens": 2569294.0, "step": 425 } ], "logging_steps": 25, "max_steps": 425, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0962133304031437e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }