{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.12, "eval_steps": 500, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004, "grad_norm": 2.131298542022705, "learning_rate": 0.00019962000000000002, "loss": 2.0121, "mean_token_accuracy": 0.6703190118074417, "num_tokens": 2348.0, "step": 20 }, { "epoch": 0.008, "grad_norm": 1.2890418767929077, "learning_rate": 0.00019922, "loss": 0.2751, "mean_token_accuracy": 0.9120438575744629, "num_tokens": 4697.0, "step": 40 }, { "epoch": 0.012, "grad_norm": 0.8126867413520813, "learning_rate": 0.00019882, "loss": 0.1966, "mean_token_accuracy": 0.9203487157821655, "num_tokens": 7014.0, "step": 60 }, { "epoch": 0.016, "grad_norm": 0.6051881313323975, "learning_rate": 0.00019842000000000001, "loss": 0.1851, "mean_token_accuracy": 0.9291799515485764, "num_tokens": 9327.0, "step": 80 }, { "epoch": 0.02, "grad_norm": 0.6378348469734192, "learning_rate": 0.00019802, "loss": 0.1766, "mean_token_accuracy": 0.9286984890699387, "num_tokens": 11670.0, "step": 100 }, { "epoch": 0.024, "grad_norm": 0.624138593673706, "learning_rate": 0.00019762, "loss": 0.1784, "mean_token_accuracy": 0.9264282643795013, "num_tokens": 14000.0, "step": 120 }, { "epoch": 0.028, "grad_norm": 0.2111046463251114, "learning_rate": 0.00019722, "loss": 0.1702, "mean_token_accuracy": 0.9321970880031586, "num_tokens": 16329.0, "step": 140 }, { "epoch": 0.032, "grad_norm": 0.5350440740585327, "learning_rate": 0.00019682, "loss": 0.171, "mean_token_accuracy": 0.9311463803052902, "num_tokens": 18667.0, "step": 160 }, { "epoch": 0.036, "grad_norm": 0.19237647950649261, "learning_rate": 0.00019642, "loss": 0.167, "mean_token_accuracy": 0.9345656305551528, "num_tokens": 20985.0, "step": 180 }, { "epoch": 0.04, "grad_norm": 0.28153756260871887, "learning_rate": 0.00019602, "loss": 0.1674, "mean_token_accuracy": 0.9329667061567306, "num_tokens": 23325.0, "step": 200 }, { "epoch": 0.044, "grad_norm": 0.8545331954956055, "learning_rate": 0.00019562, "loss": 0.166, "mean_token_accuracy": 0.9344212204217911, "num_tokens": 25670.0, "step": 220 }, { "epoch": 0.048, "grad_norm": 0.24941129982471466, "learning_rate": 0.00019522, "loss": 0.1661, "mean_token_accuracy": 0.934859549999237, "num_tokens": 28016.0, "step": 240 }, { "epoch": 0.052, "grad_norm": 0.29549548029899597, "learning_rate": 0.00019482, "loss": 0.1707, "mean_token_accuracy": 0.9345517784357071, "num_tokens": 30345.0, "step": 260 }, { "epoch": 0.056, "grad_norm": 0.20388178527355194, "learning_rate": 0.00019442, "loss": 0.1673, "mean_token_accuracy": 0.9353183209896088, "num_tokens": 32691.0, "step": 280 }, { "epoch": 0.06, "grad_norm": 0.10762794315814972, "learning_rate": 0.00019402, "loss": 0.1642, "mean_token_accuracy": 0.9325390756130219, "num_tokens": 35030.0, "step": 300 }, { "epoch": 0.064, "grad_norm": 0.07676753401756287, "learning_rate": 0.00019362, "loss": 0.1633, "mean_token_accuracy": 0.9348760217428207, "num_tokens": 37359.0, "step": 320 }, { "epoch": 0.068, "grad_norm": 0.06781225651502609, "learning_rate": 0.00019322, "loss": 0.1589, "mean_token_accuracy": 0.936154904961586, "num_tokens": 39707.0, "step": 340 }, { "epoch": 0.072, "grad_norm": 0.10010460019111633, "learning_rate": 0.00019282000000000001, "loss": 0.1583, "mean_token_accuracy": 0.9410124599933625, "num_tokens": 42071.0, "step": 360 }, { "epoch": 0.076, "grad_norm": 0.07932794839143753, "learning_rate": 0.00019242, "loss": 0.1608, "mean_token_accuracy": 0.9380002528429031, "num_tokens": 44404.0, "step": 380 }, { "epoch": 0.08, "grad_norm": 0.06678586453199387, "learning_rate": 0.00019202, "loss": 0.1633, "mean_token_accuracy": 0.9347729980945587, "num_tokens": 46715.0, "step": 400 }, { "epoch": 0.084, "grad_norm": 0.05118393525481224, "learning_rate": 0.00019162, "loss": 0.1621, "mean_token_accuracy": 0.9356200367212295, "num_tokens": 49030.0, "step": 420 }, { "epoch": 0.088, "grad_norm": 0.07563836127519608, "learning_rate": 0.00019122, "loss": 0.1603, "mean_token_accuracy": 0.9362942427396774, "num_tokens": 51366.0, "step": 440 }, { "epoch": 0.092, "grad_norm": 0.053388580679893494, "learning_rate": 0.00019082, "loss": 0.1585, "mean_token_accuracy": 0.9377258807420731, "num_tokens": 53706.0, "step": 460 }, { "epoch": 0.096, "grad_norm": 0.05659119412302971, "learning_rate": 0.00019042, "loss": 0.1575, "mean_token_accuracy": 0.937972965836525, "num_tokens": 56052.0, "step": 480 }, { "epoch": 0.1, "grad_norm": 0.04934714362025261, "learning_rate": 0.00019002, "loss": 0.1606, "mean_token_accuracy": 0.9356311202049256, "num_tokens": 58374.0, "step": 500 }, { "epoch": 0.104, "grad_norm": 0.05647804215550423, "learning_rate": 0.00018962000000000002, "loss": 0.1587, "mean_token_accuracy": 0.9353525519371033, "num_tokens": 60706.0, "step": 520 }, { "epoch": 0.108, "grad_norm": 0.058523017913103104, "learning_rate": 0.00018922, "loss": 0.1595, "mean_token_accuracy": 0.9371987581253052, "num_tokens": 63021.0, "step": 540 }, { "epoch": 0.112, "grad_norm": 0.03793497756123543, "learning_rate": 0.00018882000000000003, "loss": 0.1584, "mean_token_accuracy": 0.9341553807258606, "num_tokens": 65369.0, "step": 560 }, { "epoch": 0.116, "grad_norm": 0.04743633046746254, "learning_rate": 0.00018842000000000002, "loss": 0.1586, "mean_token_accuracy": 0.936157900094986, "num_tokens": 67700.0, "step": 580 }, { "epoch": 0.12, "grad_norm": 0.05463261529803276, "learning_rate": 0.00018802, "loss": 0.1578, "mean_token_accuracy": 0.937694975733757, "num_tokens": 70038.0, "step": 600 } ], "logging_steps": 20, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 847458860064768.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }