{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.7986821744122199,
  "eval_steps": 500,
  "global_step": 1000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "entropy": 2.306920379027724,
      "epoch": 0.019967054360305495,
      "grad_norm": 0.03515625,
      "learning_rate": 0.0001263157894736842,
      "loss": 2.448968811035156,
      "mean_token_accuracy": 0.48531667422503233,
      "num_tokens": 122713.0,
      "step": 25
    },
    {
      "entropy": 2.053416675031185,
      "epoch": 0.03993410872061099,
      "grad_norm": 0.0302734375,
      "learning_rate": 0.0001999595542133758,
      "loss": 2.042926483154297,
      "mean_token_accuracy": 0.5536270077899098,
      "num_tokens": 245834.0,
      "step": 50
    },
    {
      "entropy": 2.019237674474716,
      "epoch": 0.05990116308091649,
      "grad_norm": 0.0245361328125,
      "learning_rate": 0.00019956707906498044,
      "loss": 2.0004498291015627,
      "mean_token_accuracy": 0.562225787602365,
      "num_tokens": 367174.0,
      "step": 75
    },
    {
      "entropy": 2.014568568766117,
      "epoch": 0.07986821744122198,
      "grad_norm": 0.029052734375,
      "learning_rate": 0.00019875870121543717,
      "loss": 1.9949961853027345,
      "mean_token_accuracy": 0.5627686691284179,
      "num_tokens": 487834.0,
      "step": 100
    },
    {
      "entropy": 1.9470337933301927,
      "epoch": 0.09983527180152749,
      "grad_norm": 0.0279541015625,
      "learning_rate": 0.00019753779734842827,
      "loss": 1.9355754089355468,
      "mean_token_accuracy": 0.5725972962379455,
      "num_tokens": 611352.0,
      "step": 125
    },
    {
      "entropy": 1.9996179219335317,
      "epoch": 0.11980232616183298,
      "grad_norm": 0.0286865234375,
      "learning_rate": 0.0001959094673144354,
      "loss": 1.9678744506835937,
      "mean_token_accuracy": 0.5630620580911636,
      "num_tokens": 735053.0,
      "step": 150
    },
    {
      "entropy": 2.0072034925222395,
      "epoch": 0.13976938052213847,
      "grad_norm": 0.0301513671875,
      "learning_rate": 0.00019388051282810022,
      "loss": 1.9737957763671874,
      "mean_token_accuracy": 0.5624778818339109,
      "num_tokens": 855860.0,
      "step": 175
    },
    {
      "entropy": 1.9725533105432986,
      "epoch": 0.15973643488244396,
      "grad_norm": 0.029296875,
      "learning_rate": 0.0001914594090567099,
      "loss": 1.9277491760253906,
      "mean_token_accuracy": 0.5647629579156637,
      "num_tokens": 977334.0,
      "step": 200
    },
    {
      "entropy": 1.9535374976694584,
      "epoch": 0.17970348924274945,
      "grad_norm": 0.033447265625,
      "learning_rate": 0.00018865626921848615,
      "loss": 1.9351695251464844,
      "mean_token_accuracy": 0.5722655826061964,
      "num_tokens": 1092583.0,
      "step": 225
    },
    {
      "entropy": 1.9489152195304633,
      "epoch": 0.19967054360305497,
      "grad_norm": 0.025634765625,
      "learning_rate": 0.0001854828023385541,
      "loss": 1.9120469665527344,
      "mean_token_accuracy": 0.5757150813564658,
      "num_tokens": 1212652.0,
      "step": 250
    },
    {
      "entropy": 1.997425957247615,
      "epoch": 0.21963759796336046,
      "grad_norm": 0.0235595703125,
      "learning_rate": 0.00018195226433904957,
      "loss": 1.9783148193359374,
      "mean_token_accuracy": 0.566706589795649,
      "num_tokens": 1332055.0,
      "step": 275
    },
    {
      "entropy": 1.9235536295175553,
      "epoch": 0.23960465232366596,
      "grad_norm": 0.032470703125,
      "learning_rate": 0.00017807940266766593,
      "loss": 1.9035797119140625,
      "mean_token_accuracy": 0.5777761967480183,
      "num_tokens": 1452841.0,
      "step": 300
    },
    {
      "entropy": 1.9555407621711494,
      "epoch": 0.25957170668397145,
      "grad_norm": 0.0247802734375,
      "learning_rate": 0.00017388039469593428,
      "loss": 1.922522735595703,
      "mean_token_accuracy": 0.573435662984848,
      "num_tokens": 1572803.0,
      "step": 325
    },
    {
      "entropy": 1.9522953194752335,
      "epoch": 0.27953876104427694,
      "grad_norm": 0.02978515625,
      "learning_rate": 0.00016937278014455336,
      "loss": 1.914853057861328,
      "mean_token_accuracy": 0.5744891692698002,
      "num_tokens": 1697419.0,
      "step": 350
    },
    {
      "entropy": 1.895245919264853,
      "epoch": 0.29950581540458243,
      "grad_norm": 0.026123046875,
      "learning_rate": 0.00016457538781803623,
      "loss": 1.8297265625,
      "mean_token_accuracy": 0.5865043254941702,
      "num_tokens": 1819231.0,
      "step": 375
    },
    {
      "entropy": 1.9276008826121689,
      "epoch": 0.3194728697648879,
      "grad_norm": 0.0302734375,
      "learning_rate": 0.00015950825695471146,
      "loss": 1.8969316101074218,
      "mean_token_accuracy": 0.579356978982687,
      "num_tokens": 1941170.0,
      "step": 400
    },
    {
      "entropy": 1.9401621558889746,
      "epoch": 0.3394399241251934,
      "grad_norm": 0.0291748046875,
      "learning_rate": 0.0001541925535206084,
      "loss": 1.9100968933105469,
      "mean_token_accuracy": 0.5764700850099325,
      "num_tokens": 2065882.0,
      "step": 425
    },
    {
      "entropy": 1.9133401766419411,
      "epoch": 0.3594069784854989,
      "grad_norm": 0.0303955078125,
      "learning_rate": 0.000148650481796876,
      "loss": 1.8422721862792968,
      "mean_token_accuracy": 0.5874501725286245,
      "num_tokens": 2185024.0,
      "step": 450
    },
    {
      "entropy": 1.9126300086826087,
      "epoch": 0.3793740328458044,
      "grad_norm": 0.03271484375,
      "learning_rate": 0.00014290519163004495,
      "loss": 1.8789381408691406,
      "mean_token_accuracy": 0.5848022982478142,
      "num_tokens": 2299587.0,
      "step": 475
    },
    {
      "entropy": 1.909918104019016,
      "epoch": 0.39934108720610995,
      "grad_norm": 0.032470703125,
      "learning_rate": 0.0001369806817325581,
      "loss": 1.8951301574707031,
      "mean_token_accuracy": 0.582027070298791,
      "num_tokens": 2421897.0,
      "step": 500
    },
    {
      "entropy": 1.9182585052400827,
      "epoch": 0.41930814156641544,
      "grad_norm": 0.031494140625,
      "learning_rate": 0.00013090169943749476,
      "loss": 1.8786566162109375,
      "mean_token_accuracy": 0.5797786585241557,
      "num_tokens": 2544369.0,
      "step": 525
    },
    {
      "entropy": 1.8837098168581725,
      "epoch": 0.43927519592672093,
      "grad_norm": 0.0301513671875,
      "learning_rate": 0.00012469363732622296,
      "loss": 1.8448243713378907,
      "mean_token_accuracy": 0.5849873025715351,
      "num_tokens": 2671654.0,
      "step": 550
    },
    {
      "entropy": 1.8790070757828652,
      "epoch": 0.4592422502870264,
      "grad_norm": 0.0289306640625,
      "learning_rate": 0.00011838242716077917,
      "loss": 1.8447459411621094,
      "mean_token_accuracy": 0.5876961750537157,
      "num_tokens": 2796033.0,
      "step": 575
    },
    {
      "entropy": 1.8867588526010513,
      "epoch": 0.4792093046473319,
      "grad_norm": 0.033447265625,
      "learning_rate": 0.00011199443156402998,
      "loss": 1.835140380859375,
      "mean_token_accuracy": 0.5846484461426735,
      "num_tokens": 2917243.0,
      "step": 600
    },
    {
      "entropy": 1.9055894463136793,
      "epoch": 0.4991763590076374,
      "grad_norm": 0.0283203125,
      "learning_rate": 0.00010555633390008086,
      "loss": 1.858441619873047,
      "mean_token_accuracy": 0.5845886848121882,
      "num_tokens": 3042703.0,
      "step": 625
    },
    {
      "entropy": 1.9573378081992268,
      "epoch": 0.5191434133679429,
      "grad_norm": 0.03173828125,
      "learning_rate": 9.909502681491316e-05,
      "loss": 1.8958790588378907,
      "mean_token_accuracy": 0.5772438555955887,
      "num_tokens": 3160162.0,
      "step": 650
    },
    {
      "entropy": 1.8727275183051824,
      "epoch": 0.5391104677282484,
      "grad_norm": 0.03173828125,
      "learning_rate": 9.263749990282754e-05,
      "loss": 1.8269801330566406,
      "mean_token_accuracy": 0.5881718883663416,
      "num_tokens": 3285704.0,
      "step": 675
    },
    {
      "entropy": 1.871655127387494,
      "epoch": 0.5590775220885539,
      "grad_norm": 0.0289306640625,
      "learning_rate": 8.621072696792363e-05,
      "loss": 1.8388119506835938,
      "mean_token_accuracy": 0.5867326802760363,
      "num_tokens": 3410119.0,
      "step": 700
    },
    {
      "entropy": 1.8627979960665106,
      "epoch": 0.5790445764488594,
      "grad_norm": 0.035888671875,
      "learning_rate": 7.984155335153711e-05,
      "loss": 1.799385986328125,
      "mean_token_accuracy": 0.5873606249690055,
      "num_tokens": 3533765.0,
      "step": 725
    },
    {
      "entropy": 1.8317018933594227,
      "epoch": 0.5990116308091649,
      "grad_norm": 0.0281982421875,
      "learning_rate": 7.35565837962798e-05,
      "loss": 1.7772984313964844,
      "mean_token_accuracy": 0.5965435421466827,
      "num_tokens": 3657321.0,
      "step": 750
    },
    {
      "entropy": 1.8882549648359417,
      "epoch": 0.6189786851694704,
      "grad_norm": 0.03076171875,
      "learning_rate": 6.738207131508735e-05,
      "loss": 1.8385765075683593,
      "mean_token_accuracy": 0.59088682141155,
      "num_tokens": 3779575.0,
      "step": 775
    },
    {
      "entropy": 1.8342267361842095,
      "epoch": 0.6389457395297758,
      "grad_norm": 0.032958984375,
      "learning_rate": 6.134380752948085e-05,
      "loss": 1.800379180908203,
      "mean_token_accuracy": 0.5952950984984636,
      "num_tokens": 3896543.0,
      "step": 800
    },
    {
      "entropy": 1.873757717087865,
      "epoch": 0.6589127938900814,
      "grad_norm": 0.03564453125,
      "learning_rate": 5.546701493511106e-05,
      "loss": 1.8183651733398438,
      "mean_token_accuracy": 0.5902918418496848,
      "num_tokens": 4018098.0,
      "step": 825
    },
    {
      "entropy": 1.83077443132177,
      "epoch": 0.6788798482503868,
      "grad_norm": 0.032470703125,
      "learning_rate": 4.977624154460464e-05,
      "loss": 1.7461175537109375,
      "mean_token_accuracy": 0.598720720410347,
      "num_tokens": 4141174.0,
      "step": 850
    },
    {
      "entropy": 1.918299620486796,
      "epoch": 0.6988469026106924,
      "grad_norm": 0.0301513671875,
      "learning_rate": 4.42952583478004e-05,
      "loss": 1.9020709228515624,
      "mean_token_accuracy": 0.5798033401742577,
      "num_tokens": 4259185.0,
      "step": 875
    },
    {
      "entropy": 1.8444363391213119,
      "epoch": 0.7188139569709978,
      "grad_norm": 0.0341796875,
      "learning_rate": 3.904696001769571e-05,
      "loss": 1.7869160461425782,
      "mean_token_accuracy": 0.5964432079344988,
      "num_tokens": 4377724.0,
      "step": 900
    },
    {
      "entropy": 1.8295260372944175,
      "epoch": 0.7387810113313034,
      "grad_norm": 0.031982421875,
      "learning_rate": 3.4053269276865285e-05,
      "loss": 1.778699951171875,
      "mean_token_accuracy": 0.6018728485703468,
      "num_tokens": 4497691.0,
      "step": 925
    },
    {
      "entropy": 1.8099911727011204,
      "epoch": 0.7587480656916088,
      "grad_norm": 0.03271484375,
      "learning_rate": 2.9335045323824496e-05,
      "loss": 1.7513389587402344,
      "mean_token_accuracy": 0.6039503507316113,
      "num_tokens": 4616131.0,
      "step": 950
    },
    {
      "entropy": 1.8742087873071431,
      "epoch": 0.7787151200519143,
      "grad_norm": 0.033203125,
      "learning_rate": 2.491199670185008e-05,
      "loss": 1.7982223510742188,
      "mean_token_accuracy": 0.5908738762140274,
      "num_tokens": 4737443.0,
      "step": 975
    },
    {
      "entropy": 1.869517589211464,
      "epoch": 0.7986821744122199,
      "grad_norm": 0.032958984375,
      "learning_rate": 2.0802598974215226e-05,
      "loss": 1.8264849853515626,
      "mean_token_accuracy": 0.5888447714596987,
      "num_tokens": 4858100.0,
      "step": 1000
    }
  ],
  "logging_steps": 25,
  "max_steps": 1253,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.10805585396224e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}
|
|