| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 500, |
| "global_step": 425, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 2.060568468645215, |
| "epoch": 0.05889281507656066, |
| "grad_norm": 0.037353515625, |
| "learning_rate": 0.00019964843466009714, |
| "loss": 2.17614990234375, |
| "mean_token_accuracy": 0.5645758168399334, |
| "num_tokens": 149693.0, |
| "step": 25 |
| }, |
| { |
| "entropy": 1.852722297757864, |
| "epoch": 0.11778563015312132, |
| "grad_norm": 0.058349609375, |
| "learning_rate": 0.0001962558656223516, |
| "loss": 1.888441619873047, |
| "mean_token_accuracy": 0.6032173852622509, |
| "num_tokens": 299610.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 1.887919376939535, |
| "epoch": 0.17667844522968199, |
| "grad_norm": 0.05322265625, |
| "learning_rate": 0.00018937593526353096, |
| "loss": 1.9618568420410156, |
| "mean_token_accuracy": 0.5999528316408396, |
| "num_tokens": 448232.0, |
| "step": 75 |
| }, |
| { |
| "entropy": 1.7166480829566717, |
| "epoch": 0.23557126030624265, |
| "grad_norm": 0.087890625, |
| "learning_rate": 0.00017925790426889235, |
| "loss": 1.740384063720703, |
| "mean_token_accuracy": 0.6356044755131006, |
| "num_tokens": 599766.0, |
| "step": 100 |
| }, |
| { |
| "entropy": 1.5865243578515946, |
| "epoch": 0.2944640753828033, |
| "grad_norm": 0.09716796875, |
| "learning_rate": 0.00016626835009018892, |
| "loss": 1.6446778869628906, |
| "mean_token_accuracy": 0.6714080322533846, |
| "num_tokens": 757139.0, |
| "step": 125 |
| }, |
| { |
| "entropy": 1.6081184281408787, |
| "epoch": 0.35335689045936397, |
| "grad_norm": 0.1083984375, |
| "learning_rate": 0.00015087788580152206, |
| "loss": 1.70118408203125, |
| "mean_token_accuracy": 0.6645687958598137, |
| "num_tokens": 911646.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 1.43047407111153, |
| "epoch": 0.4122497055359246, |
| "grad_norm": 0.087890625, |
| "learning_rate": 0.00013364410973237185, |
| "loss": 1.5111715698242187, |
| "mean_token_accuracy": 0.7009587688744068, |
| "num_tokens": 1061331.0, |
| "step": 175 |
| }, |
| { |
| "entropy": 1.3645561857521533, |
| "epoch": 0.4711425206124853, |
| "grad_norm": 0.0966796875, |
| "learning_rate": 0.00011519140361460509, |
| "loss": 1.3484107971191406, |
| "mean_token_accuracy": 0.7158325979858637, |
| "num_tokens": 1220495.0, |
| "step": 200 |
| }, |
| { |
| "entropy": 1.1696428343467415, |
| "epoch": 0.5300353356890459, |
| "grad_norm": 0.1591796875, |
| "learning_rate": 9.618831115896815e-05, |
| "loss": 1.168991241455078, |
| "mean_token_accuracy": 0.7582907542586327, |
| "num_tokens": 1370421.0, |
| "step": 225 |
| }, |
| { |
| "entropy": 1.2050547137390821, |
| "epoch": 0.5889281507656066, |
| "grad_norm": 0.1611328125, |
| "learning_rate": 7.732331663789592e-05, |
| "loss": 1.2300393676757813, |
| "mean_token_accuracy": 0.7527728863805533, |
| "num_tokens": 1523518.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 1.2382783625554294, |
| "epoch": 0.6478209658421673, |
| "grad_norm": 0.080078125, |
| "learning_rate": 5.927990101942828e-05, |
| "loss": 1.2677821350097656, |
| "mean_token_accuracy": 0.7519316013902426, |
| "num_tokens": 1674285.0, |
| "step": 275 |
| }, |
| { |
| "entropy": 1.091745662111789, |
| "epoch": 0.7067137809187279, |
| "grad_norm": 0.099609375, |
| "learning_rate": 4.271177937143245e-05, |
| "loss": 1.1259383392333984, |
| "mean_token_accuracy": 0.782644914239645, |
| "num_tokens": 1821921.0, |
| "step": 300 |
| }, |
| { |
| "entropy": 0.9836642376985401, |
| "epoch": 0.7656065959952886, |
| "grad_norm": 0.1669921875, |
| "learning_rate": 2.821921668788571e-05, |
| "loss": 0.9803294372558594, |
| "mean_token_accuracy": 0.803754693493247, |
| "num_tokens": 1969852.0, |
| "step": 325 |
| }, |
| { |
| "entropy": 0.9987670464720577, |
| "epoch": 0.8244994110718492, |
| "grad_norm": 0.12060546875, |
| "learning_rate": 1.6327280217615792e-05, |
| "loss": 0.9915267181396484, |
| "mean_token_accuracy": 0.8082605458050967, |
| "num_tokens": 2117931.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 0.9711884427443147, |
| "epoch": 0.8833922261484098, |
| "grad_norm": 0.1044921875, |
| "learning_rate": 7.46681621618297e-06, |
| "loss": 0.9647718048095704, |
| "mean_token_accuracy": 0.8138997189700603, |
| "num_tokens": 2269751.0, |
| "step": 375 |
| }, |
| { |
| "entropy": 0.9663474693335593, |
| "epoch": 0.9422850412249706, |
| "grad_norm": 0.076171875, |
| "learning_rate": 1.9588403354188325e-06, |
| "loss": 0.996950454711914, |
| "mean_token_accuracy": 0.809981662556529, |
| "num_tokens": 2419781.0, |
| "step": 400 |
| }, |
| { |
| "entropy": 0.9816382658591836, |
| "epoch": 1.0, |
| "grad_norm": 0.185546875, |
| "learning_rate": 2.907188642786718e-09, |
| "loss": 1.0153296661376954, |
| "mean_token_accuracy": 0.807481124997139, |
| "num_tokens": 2569294.0, |
| "step": 425 |
| } |
| ], |
| "logging_steps": 25, |
| "max_steps": 425, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.0962133304031437e+17, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|