| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 500, |
| "global_step": 779, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 1.769014136493206, |
| "epoch": 0.03209757663296421, |
| "grad_norm": 0.0654296875, |
| "learning_rate": 0.0002, |
| "loss": 1.927232666015625, |
| "mean_token_accuracy": 0.5845547493547201, |
| "num_tokens": 147388.0, |
| "step": 25 |
| }, |
| { |
| "entropy": 1.5947067447006702, |
| "epoch": 0.06419515326592842, |
| "grad_norm": 0.0703125, |
| "learning_rate": 0.00019945941475610623, |
| "loss": 1.5756130981445313, |
| "mean_token_accuracy": 0.6429479970037937, |
| "num_tokens": 291902.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 1.4042670665308834, |
| "epoch": 0.09629272989889263, |
| "grad_norm": 0.083984375, |
| "learning_rate": 0.00019784350367254322, |
| "loss": 1.3896011352539062, |
| "mean_token_accuracy": 0.6840905399620533, |
| "num_tokens": 436739.0, |
| "step": 75 |
| }, |
| { |
| "entropy": 1.2599401247128845, |
| "epoch": 0.12839030653185685, |
| "grad_norm": 0.07470703125, |
| "learning_rate": 0.00019516973750305532, |
| "loss": 1.247992630004883, |
| "mean_token_accuracy": 0.7115654723346233, |
| "num_tokens": 578709.0, |
| "step": 100 |
| }, |
| { |
| "entropy": 1.1777591889351606, |
| "epoch": 0.16048788316482104, |
| "grad_norm": 0.0859375, |
| "learning_rate": 0.0001914670242183795, |
| "loss": 1.1679031372070312, |
| "mean_token_accuracy": 0.7358129184693099, |
| "num_tokens": 723238.0, |
| "step": 125 |
| }, |
| { |
| "entropy": 1.0801194526255131, |
| "epoch": 0.19258545979778527, |
| "grad_norm": 0.1064453125, |
| "learning_rate": 0.00018677539646179707, |
| "loss": 1.0448343658447266, |
| "mean_token_accuracy": 0.7570553781092166, |
| "num_tokens": 866427.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 1.0227313496544956, |
| "epoch": 0.22468303643074947, |
| "grad_norm": 0.080078125, |
| "learning_rate": 0.00018114557872800905, |
| "loss": 1.023138198852539, |
| "mean_token_accuracy": 0.7729750864207745, |
| "num_tokens": 1012131.0, |
| "step": 175 |
| }, |
| { |
| "entropy": 0.8637316187471151, |
| "epoch": 0.2567806130637137, |
| "grad_norm": 0.09033203125, |
| "learning_rate": 0.00017463843894486937, |
| "loss": 0.8490833282470703, |
| "mean_token_accuracy": 0.8087801007926464, |
| "num_tokens": 1157299.0, |
| "step": 200 |
| }, |
| { |
| "entropy": 0.8652995108067989, |
| "epoch": 0.2888781896966779, |
| "grad_norm": 0.07373046875, |
| "learning_rate": 0.00016732433038731242, |
| "loss": 0.8502191162109375, |
| "mean_token_accuracy": 0.8063283850252628, |
| "num_tokens": 1299656.0, |
| "step": 225 |
| }, |
| { |
| "entropy": 0.7840015215892344, |
| "epoch": 0.3209757663296421, |
| "grad_norm": 0.087890625, |
| "learning_rate": 0.0001592823310385073, |
| "loss": 0.7785140228271484, |
| "mean_token_accuracy": 0.8246484726667405, |
| "num_tokens": 1441547.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 0.8248297751229257, |
| "epoch": 0.35307334296260634, |
| "grad_norm": 0.1279296875, |
| "learning_rate": 0.00015059938862204127, |
| "loss": 0.8086457824707032, |
| "mean_token_accuracy": 0.8171890932321548, |
| "num_tokens": 1582972.0, |
| "step": 275 |
| }, |
| { |
| "entropy": 0.770206793518737, |
| "epoch": 0.38517091959557054, |
| "grad_norm": 0.12890625, |
| "learning_rate": 0.00014136938054879283, |
| "loss": 0.7686289978027344, |
| "mean_token_accuracy": 0.8326107160747052, |
| "num_tokens": 1731903.0, |
| "step": 300 |
| }, |
| { |
| "entropy": 0.8494727142900228, |
| "epoch": 0.41726849622853474, |
| "grad_norm": 0.072265625, |
| "learning_rate": 0.0001316920989420703, |
| "loss": 0.842440185546875, |
| "mean_token_accuracy": 0.812195960432291, |
| "num_tokens": 1877046.0, |
| "step": 325 |
| }, |
| { |
| "entropy": 0.7691456920653582, |
| "epoch": 0.44936607286149893, |
| "grad_norm": 0.08984375, |
| "learning_rate": 0.00012167217171462566, |
| "loss": 0.7498253631591797, |
| "mean_token_accuracy": 0.8340029817819595, |
| "num_tokens": 2020060.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 0.6002831929549575, |
| "epoch": 0.4814636494944632, |
| "grad_norm": 0.09716796875, |
| "learning_rate": 0.00011141793136253986, |
| "loss": 0.574959716796875, |
| "mean_token_accuracy": 0.8719502376019954, |
| "num_tokens": 2163196.0, |
| "step": 375 |
| }, |
| { |
| "entropy": 0.5365751892980188, |
| "epoch": 0.5135612261274274, |
| "grad_norm": 0.060546875, |
| "learning_rate": 0.00010104024370624644, |
| "loss": 0.5327357482910157, |
| "mean_token_accuracy": 0.8794836418330669, |
| "num_tokens": 2306310.0, |
| "step": 400 |
| }, |
| { |
| "entropy": 0.6227946990681812, |
| "epoch": 0.5456588027603916, |
| "grad_norm": 0.08837890625, |
| "learning_rate": 9.065130924199998e-05, |
| "loss": 0.6111849975585938, |
| "mean_token_accuracy": 0.8625345961749553, |
| "num_tokens": 2451606.0, |
| "step": 425 |
| }, |
| { |
| "entropy": 0.5804533056542277, |
| "epoch": 0.5777563793933558, |
| "grad_norm": 0.0654296875, |
| "learning_rate": 8.036345006322359e-05, |
| "loss": 0.5718699645996094, |
| "mean_token_accuracy": 0.8725813579559326, |
| "num_tokens": 2597270.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 0.6302943672612309, |
| "epoch": 0.60985395602632, |
| "grad_norm": 0.06884765625, |
| "learning_rate": 7.028789546718326e-05, |
| "loss": 0.6353359985351562, |
| "mean_token_accuracy": 0.8635560862720013, |
| "num_tokens": 2742612.0, |
| "step": 475 |
| }, |
| { |
| "entropy": 0.5299606989277527, |
| "epoch": 0.6419515326592842, |
| "grad_norm": 0.08935546875, |
| "learning_rate": 6.053357937665237e-05, |
| "loss": 0.5236670303344727, |
| "mean_token_accuracy": 0.8828775423765183, |
| "num_tokens": 2887935.0, |
| "step": 500 |
| }, |
| { |
| "entropy": 0.6732268288638443, |
| "epoch": 0.6740491092922485, |
| "grad_norm": 0.056640625, |
| "learning_rate": 5.1205962578487155e-05, |
| "loss": 0.6722711181640625, |
| "mean_token_accuracy": 0.8543066729605198, |
| "num_tokens": 3035549.0, |
| "step": 525 |
| }, |
| { |
| "entropy": 0.4612098385160789, |
| "epoch": 0.7061466859252127, |
| "grad_norm": 0.041748046875, |
| "learning_rate": 4.240589251272342e-05, |
| "loss": 0.4541243743896484, |
| "mean_token_accuracy": 0.9010929284989834, |
| "num_tokens": 3180083.0, |
| "step": 550 |
| }, |
| { |
| "entropy": 0.5990941160498187, |
| "epoch": 0.7382442625581769, |
| "grad_norm": 0.08935546875, |
| "learning_rate": 3.422851293981676e-05, |
| "loss": 0.6069795989990234, |
| "mean_token_accuracy": 0.8644757103919983, |
| "num_tokens": 3325528.0, |
| "step": 575 |
| }, |
| { |
| "entropy": 0.5454245122382417, |
| "epoch": 0.7703418391911411, |
| "grad_norm": 0.06787109375, |
| "learning_rate": 2.6762235274383772e-05, |
| "loss": 0.5289861679077148, |
| "mean_token_accuracy": 0.8816468404233455, |
| "num_tokens": 3469264.0, |
| "step": 600 |
| }, |
| { |
| "entropy": 0.5093659822596237, |
| "epoch": 0.8024394158241053, |
| "grad_norm": 0.060302734375, |
| "learning_rate": 2.008778270707944e-05, |
| "loss": 0.503768424987793, |
| "mean_token_accuracy": 0.8897754719853401, |
| "num_tokens": 3614662.0, |
| "step": 625 |
| }, |
| { |
| "entropy": 0.5269298075186089, |
| "epoch": 0.8345369924570695, |
| "grad_norm": 0.051025390625, |
| "learning_rate": 1.4277317449282834e-05, |
| "loss": 0.5159123229980469, |
| "mean_token_accuracy": 0.8846879424154759, |
| "num_tokens": 3757746.0, |
| "step": 650 |
| }, |
| { |
| "entropy": 0.6054773937677964, |
| "epoch": 0.8666345690900337, |
| "grad_norm": 0.068359375, |
| "learning_rate": 9.393660536564408e-06, |
| "loss": 0.5959170532226562, |
| "mean_token_accuracy": 0.8715771237760782, |
| "num_tokens": 3904293.0, |
| "step": 675 |
| }, |
| { |
| "entropy": 0.4945011454587802, |
| "epoch": 0.8987321457229979, |
| "grad_norm": 0.06787109375, |
| "learning_rate": 5.489612626189245e-06, |
| "loss": 0.4754390335083008, |
| "mean_token_accuracy": 0.8915398253500462, |
| "num_tokens": 4043791.0, |
| "step": 700 |
| }, |
| { |
| "entropy": 0.5794008405366913, |
| "epoch": 0.9308297223559622, |
| "grad_norm": 0.0703125, |
| "learning_rate": 2.607383131993424e-06, |
| "loss": 0.5824679565429688, |
| "mean_token_accuracy": 0.8722238828241825, |
| "num_tokens": 4189274.0, |
| "step": 725 |
| }, |
| { |
| "entropy": 0.5977670075232163, |
| "epoch": 0.9629272989889264, |
| "grad_norm": 0.053955078125, |
| "learning_rate": 7.781338686584927e-07, |
| "loss": 0.5879628372192383, |
| "mean_token_accuracy": 0.8674677063524723, |
| "num_tokens": 4333425.0, |
| "step": 750 |
| }, |
| { |
| "entropy": 0.5161540248058736, |
| "epoch": 0.9950248756218906, |
| "grad_norm": 0.052978515625, |
| "learning_rate": 2.164213936770576e-08, |
| "loss": 0.497758674621582, |
| "mean_token_accuracy": 0.8871022827923298, |
| "num_tokens": 4479844.0, |
| "step": 775 |
| } |
| ], |
| "logging_steps": 25, |
| "max_steps": 779, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.9211835200717107e+17, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|