{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 779, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.769014136493206, "epoch": 0.03209757663296421, "grad_norm": 0.0654296875, "learning_rate": 0.0002, "loss": 1.927232666015625, "mean_token_accuracy": 0.5845547493547201, "num_tokens": 147388.0, "step": 25 }, { "entropy": 1.5947067447006702, "epoch": 0.06419515326592842, "grad_norm": 0.0703125, "learning_rate": 0.00019945941475610623, "loss": 1.5756130981445313, "mean_token_accuracy": 0.6429479970037937, "num_tokens": 291902.0, "step": 50 }, { "entropy": 1.4042670665308834, "epoch": 0.09629272989889263, "grad_norm": 0.083984375, "learning_rate": 0.00019784350367254322, "loss": 1.3896011352539062, "mean_token_accuracy": 0.6840905399620533, "num_tokens": 436739.0, "step": 75 }, { "entropy": 1.2599401247128845, "epoch": 0.12839030653185685, "grad_norm": 0.07470703125, "learning_rate": 0.00019516973750305532, "loss": 1.247992630004883, "mean_token_accuracy": 0.7115654723346233, "num_tokens": 578709.0, "step": 100 }, { "entropy": 1.1777591889351606, "epoch": 0.16048788316482104, "grad_norm": 0.0859375, "learning_rate": 0.0001914670242183795, "loss": 1.1679031372070312, "mean_token_accuracy": 0.7358129184693099, "num_tokens": 723238.0, "step": 125 }, { "entropy": 1.0801194526255131, "epoch": 0.19258545979778527, "grad_norm": 0.1064453125, "learning_rate": 0.00018677539646179707, "loss": 1.0448343658447266, "mean_token_accuracy": 0.7570553781092166, "num_tokens": 866427.0, "step": 150 }, { "entropy": 1.0227313496544956, "epoch": 0.22468303643074947, "grad_norm": 0.080078125, "learning_rate": 0.00018114557872800905, "loss": 1.023138198852539, "mean_token_accuracy": 0.7729750864207745, "num_tokens": 1012131.0, "step": 175 }, { "entropy": 0.8637316187471151, "epoch": 0.2567806130637137, "grad_norm": 0.09033203125, "learning_rate": 0.00017463843894486937, "loss": 0.8490833282470703, "mean_token_accuracy": 0.8087801007926464, "num_tokens": 1157299.0, "step": 200 }, { "entropy": 0.8652995108067989, "epoch": 0.2888781896966779, "grad_norm": 0.07373046875, "learning_rate": 0.00016732433038731242, "loss": 0.8502191162109375, "mean_token_accuracy": 0.8063283850252628, "num_tokens": 1299656.0, "step": 225 }, { "entropy": 0.7840015215892344, "epoch": 0.3209757663296421, "grad_norm": 0.087890625, "learning_rate": 0.0001592823310385073, "loss": 0.7785140228271484, "mean_token_accuracy": 0.8246484726667405, "num_tokens": 1441547.0, "step": 250 }, { "entropy": 0.8248297751229257, "epoch": 0.35307334296260634, "grad_norm": 0.1279296875, "learning_rate": 0.00015059938862204127, "loss": 0.8086457824707032, "mean_token_accuracy": 0.8171890932321548, "num_tokens": 1582972.0, "step": 275 }, { "entropy": 0.770206793518737, "epoch": 0.38517091959557054, "grad_norm": 0.12890625, "learning_rate": 0.00014136938054879283, "loss": 0.7686289978027344, "mean_token_accuracy": 0.8326107160747052, "num_tokens": 1731903.0, "step": 300 }, { "entropy": 0.8494727142900228, "epoch": 0.41726849622853474, "grad_norm": 0.072265625, "learning_rate": 0.0001316920989420703, "loss": 0.842440185546875, "mean_token_accuracy": 0.812195960432291, "num_tokens": 1877046.0, "step": 325 }, { "entropy": 0.7691456920653582, "epoch": 0.44936607286149893, "grad_norm": 0.08984375, "learning_rate": 0.00012167217171462566, "loss": 0.7498253631591797, "mean_token_accuracy": 0.8340029817819595, "num_tokens": 2020060.0, "step": 350 }, { "entropy": 0.6002831929549575, "epoch": 0.4814636494944632, "grad_norm": 0.09716796875, "learning_rate": 0.00011141793136253986, "loss": 0.574959716796875, "mean_token_accuracy": 0.8719502376019954, "num_tokens": 2163196.0, "step": 375 }, { "entropy": 0.5365751892980188, "epoch": 0.5135612261274274, "grad_norm": 0.060546875, "learning_rate": 0.00010104024370624644, "loss": 0.5327357482910157, "mean_token_accuracy": 0.8794836418330669, "num_tokens": 2306310.0, "step": 400 }, { "entropy": 0.6227946990681812, "epoch": 0.5456588027603916, "grad_norm": 0.08837890625, "learning_rate": 9.065130924199998e-05, "loss": 0.6111849975585938, "mean_token_accuracy": 0.8625345961749553, "num_tokens": 2451606.0, "step": 425 }, { "entropy": 0.5804533056542277, "epoch": 0.5777563793933558, "grad_norm": 0.0654296875, "learning_rate": 8.036345006322359e-05, "loss": 0.5718699645996094, "mean_token_accuracy": 0.8725813579559326, "num_tokens": 2597270.0, "step": 450 }, { "entropy": 0.6302943672612309, "epoch": 0.60985395602632, "grad_norm": 0.06884765625, "learning_rate": 7.028789546718326e-05, "loss": 0.6353359985351562, "mean_token_accuracy": 0.8635560862720013, "num_tokens": 2742612.0, "step": 475 }, { "entropy": 0.5299606989277527, "epoch": 0.6419515326592842, "grad_norm": 0.08935546875, "learning_rate": 6.053357937665237e-05, "loss": 0.5236670303344727, "mean_token_accuracy": 0.8828775423765183, "num_tokens": 2887935.0, "step": 500 }, { "entropy": 0.6732268288638443, "epoch": 0.6740491092922485, "grad_norm": 0.056640625, "learning_rate": 5.1205962578487155e-05, "loss": 0.6722711181640625, "mean_token_accuracy": 0.8543066729605198, "num_tokens": 3035549.0, "step": 525 }, { "entropy": 0.4612098385160789, "epoch": 0.7061466859252127, "grad_norm": 0.041748046875, "learning_rate": 4.240589251272342e-05, "loss": 0.4541243743896484, "mean_token_accuracy": 0.9010929284989834, "num_tokens": 3180083.0, "step": 550 }, { "entropy": 0.5990941160498187, "epoch": 0.7382442625581769, "grad_norm": 0.08935546875, "learning_rate": 3.422851293981676e-05, "loss": 0.6069795989990234, "mean_token_accuracy": 0.8644757103919983, "num_tokens": 3325528.0, "step": 575 }, { "entropy": 0.5454245122382417, "epoch": 0.7703418391911411, "grad_norm": 0.06787109375, "learning_rate": 2.6762235274383772e-05, "loss": 0.5289861679077148, "mean_token_accuracy": 0.8816468404233455, "num_tokens": 3469264.0, "step": 600 }, { "entropy": 0.5093659822596237, "epoch": 0.8024394158241053, "grad_norm": 0.060302734375, "learning_rate": 2.008778270707944e-05, "loss": 0.503768424987793, "mean_token_accuracy": 0.8897754719853401, "num_tokens": 3614662.0, "step": 625 }, { "entropy": 0.5269298075186089, "epoch": 0.8345369924570695, "grad_norm": 0.051025390625, "learning_rate": 1.4277317449282834e-05, "loss": 0.5159123229980469, "mean_token_accuracy": 0.8846879424154759, "num_tokens": 3757746.0, "step": 650 }, { "entropy": 0.6054773937677964, "epoch": 0.8666345690900337, "grad_norm": 0.068359375, "learning_rate": 9.393660536564408e-06, "loss": 0.5959170532226562, "mean_token_accuracy": 0.8715771237760782, "num_tokens": 3904293.0, "step": 675 }, { "entropy": 0.4945011454587802, "epoch": 0.8987321457229979, "grad_norm": 0.06787109375, "learning_rate": 5.489612626189245e-06, "loss": 0.4754390335083008, "mean_token_accuracy": 0.8915398253500462, "num_tokens": 4043791.0, "step": 700 }, { "entropy": 0.5794008405366913, "epoch": 0.9308297223559622, "grad_norm": 0.0703125, "learning_rate": 2.607383131993424e-06, "loss": 0.5824679565429688, "mean_token_accuracy": 0.8722238828241825, "num_tokens": 4189274.0, "step": 725 }, { "entropy": 0.5977670075232163, "epoch": 0.9629272989889264, "grad_norm": 0.053955078125, "learning_rate": 7.781338686584927e-07, "loss": 0.5879628372192383, "mean_token_accuracy": 0.8674677063524723, "num_tokens": 4333425.0, "step": 750 }, { "entropy": 0.5161540248058736, "epoch": 0.9950248756218906, "grad_norm": 0.052978515625, "learning_rate": 2.164213936770576e-08, "loss": 0.497758674621582, "mean_token_accuracy": 0.8871022827923298, "num_tokens": 4479844.0, "step": 775 } ], "logging_steps": 25, "max_steps": 779, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.9211835200717107e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }