{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7986821744122199, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 2.306920379027724, "epoch": 0.019967054360305495, "grad_norm": 0.03515625, "learning_rate": 0.0001263157894736842, "loss": 2.448968811035156, "mean_token_accuracy": 0.48531667422503233, "num_tokens": 122713.0, "step": 25 }, { "entropy": 2.053416675031185, "epoch": 0.03993410872061099, "grad_norm": 0.0302734375, "learning_rate": 0.0001999595542133758, "loss": 2.042926483154297, "mean_token_accuracy": 0.5536270077899098, "num_tokens": 245834.0, "step": 50 }, { "entropy": 2.019237674474716, "epoch": 0.05990116308091649, "grad_norm": 0.0245361328125, "learning_rate": 0.00019956707906498044, "loss": 2.0004498291015627, "mean_token_accuracy": 0.562225787602365, "num_tokens": 367174.0, "step": 75 }, { "entropy": 2.014568568766117, "epoch": 0.07986821744122198, "grad_norm": 0.029052734375, "learning_rate": 0.00019875870121543717, "loss": 1.9949961853027345, "mean_token_accuracy": 0.5627686691284179, "num_tokens": 487834.0, "step": 100 }, { "entropy": 1.9470337933301927, "epoch": 0.09983527180152749, "grad_norm": 0.0279541015625, "learning_rate": 0.00019753779734842827, "loss": 1.9355754089355468, "mean_token_accuracy": 0.5725972962379455, "num_tokens": 611352.0, "step": 125 }, { "entropy": 1.9996179219335317, "epoch": 0.11980232616183298, "grad_norm": 0.0286865234375, "learning_rate": 0.0001959094673144354, "loss": 1.9678744506835937, "mean_token_accuracy": 0.5630620580911636, "num_tokens": 735053.0, "step": 150 }, { "entropy": 2.0072034925222395, "epoch": 0.13976938052213847, "grad_norm": 0.0301513671875, "learning_rate": 0.00019388051282810022, "loss": 1.9737957763671874, "mean_token_accuracy": 0.5624778818339109, "num_tokens": 855860.0, "step": 175 }, { "entropy": 1.9725533105432986, "epoch": 0.15973643488244396, "grad_norm": 0.029296875, "learning_rate": 0.0001914594090567099, "loss": 1.9277491760253906, "mean_token_accuracy": 0.5647629579156637, "num_tokens": 977334.0, "step": 200 }, { "entropy": 1.9535374976694584, "epoch": 0.17970348924274945, "grad_norm": 0.033447265625, "learning_rate": 0.00018865626921848615, "loss": 1.9351695251464844, "mean_token_accuracy": 0.5722655826061964, "num_tokens": 1092583.0, "step": 225 }, { "entropy": 1.9489152195304633, "epoch": 0.19967054360305497, "grad_norm": 0.025634765625, "learning_rate": 0.0001854828023385541, "loss": 1.9120469665527344, "mean_token_accuracy": 0.5757150813564658, "num_tokens": 1212652.0, "step": 250 }, { "entropy": 1.997425957247615, "epoch": 0.21963759796336046, "grad_norm": 0.0235595703125, "learning_rate": 0.00018195226433904957, "loss": 1.9783148193359374, "mean_token_accuracy": 0.566706589795649, "num_tokens": 1332055.0, "step": 275 }, { "entropy": 1.9235536295175553, "epoch": 0.23960465232366596, "grad_norm": 0.032470703125, "learning_rate": 0.00017807940266766593, "loss": 1.9035797119140625, "mean_token_accuracy": 0.5777761967480183, "num_tokens": 1452841.0, "step": 300 }, { "entropy": 1.9555407621711494, "epoch": 0.25957170668397145, "grad_norm": 0.0247802734375, "learning_rate": 0.00017388039469593428, "loss": 1.922522735595703, "mean_token_accuracy": 0.573435662984848, "num_tokens": 1572803.0, "step": 325 }, { "entropy": 1.9522953194752335, "epoch": 0.27953876104427694, "grad_norm": 0.02978515625, "learning_rate": 0.00016937278014455336, "loss": 1.914853057861328, "mean_token_accuracy": 0.5744891692698002, "num_tokens": 1697419.0, "step": 350 }, { "entropy": 1.895245919264853, "epoch": 0.29950581540458243, "grad_norm": 0.026123046875, "learning_rate": 0.00016457538781803623, "loss": 1.8297265625, "mean_token_accuracy": 0.5865043254941702, "num_tokens": 1819231.0, "step": 375 }, { "entropy": 1.9276008826121689, "epoch": 0.3194728697648879, "grad_norm": 0.0302734375, "learning_rate": 0.00015950825695471146, "loss": 1.8969316101074218, "mean_token_accuracy": 0.579356978982687, "num_tokens": 1941170.0, "step": 400 }, { "entropy": 1.9401621558889746, "epoch": 0.3394399241251934, "grad_norm": 0.0291748046875, "learning_rate": 0.0001541925535206084, "loss": 1.9100968933105469, "mean_token_accuracy": 0.5764700850099325, "num_tokens": 2065882.0, "step": 425 }, { "entropy": 1.9133401766419411, "epoch": 0.3594069784854989, "grad_norm": 0.0303955078125, "learning_rate": 0.000148650481796876, "loss": 1.8422721862792968, "mean_token_accuracy": 0.5874501725286245, "num_tokens": 2185024.0, "step": 450 }, { "entropy": 1.9126300086826087, "epoch": 0.3793740328458044, "grad_norm": 0.03271484375, "learning_rate": 0.00014290519163004495, "loss": 1.8789381408691406, "mean_token_accuracy": 0.5848022982478142, "num_tokens": 2299587.0, "step": 475 }, { "entropy": 1.909918104019016, "epoch": 0.39934108720610995, "grad_norm": 0.032470703125, "learning_rate": 0.0001369806817325581, "loss": 1.8951301574707031, "mean_token_accuracy": 0.582027070298791, "num_tokens": 2421897.0, "step": 500 }, { "entropy": 1.9182585052400827, "epoch": 0.41930814156641544, "grad_norm": 0.031494140625, "learning_rate": 0.00013090169943749476, "loss": 1.8786566162109375, "mean_token_accuracy": 0.5797786585241557, "num_tokens": 2544369.0, "step": 525 }, { "entropy": 1.8837098168581725, "epoch": 0.43927519592672093, "grad_norm": 0.0301513671875, "learning_rate": 0.00012469363732622296, "loss": 1.8448243713378907, "mean_token_accuracy": 0.5849873025715351, "num_tokens": 2671654.0, "step": 550 }, { "entropy": 1.8790070757828652, "epoch": 0.4592422502870264, "grad_norm": 0.0289306640625, "learning_rate": 0.00011838242716077917, "loss": 1.8447459411621094, "mean_token_accuracy": 0.5876961750537157, "num_tokens": 2796033.0, "step": 575 }, { "entropy": 1.8867588526010513, "epoch": 0.4792093046473319, "grad_norm": 0.033447265625, "learning_rate": 0.00011199443156402998, "loss": 1.835140380859375, "mean_token_accuracy": 0.5846484461426735, "num_tokens": 2917243.0, "step": 600 }, { "entropy": 1.9055894463136793, "epoch": 0.4991763590076374, "grad_norm": 0.0283203125, "learning_rate": 0.00010555633390008086, "loss": 1.858441619873047, "mean_token_accuracy": 0.5845886848121882, "num_tokens": 3042703.0, "step": 625 }, { "entropy": 1.9573378081992268, "epoch": 0.5191434133679429, "grad_norm": 0.03173828125, "learning_rate": 9.909502681491316e-05, "loss": 1.8958790588378907, "mean_token_accuracy": 0.5772438555955887, "num_tokens": 3160162.0, "step": 650 }, { "entropy": 1.8727275183051824, "epoch": 0.5391104677282484, "grad_norm": 0.03173828125, "learning_rate": 9.263749990282754e-05, "loss": 1.8269801330566406, "mean_token_accuracy": 0.5881718883663416, "num_tokens": 3285704.0, "step": 675 }, { "entropy": 1.871655127387494, "epoch": 0.5590775220885539, "grad_norm": 0.0289306640625, "learning_rate": 8.621072696792363e-05, "loss": 1.8388119506835938, "mean_token_accuracy": 0.5867326802760363, "num_tokens": 3410119.0, "step": 700 }, { "entropy": 1.8627979960665106, "epoch": 0.5790445764488594, "grad_norm": 0.035888671875, "learning_rate": 7.984155335153711e-05, "loss": 1.799385986328125, "mean_token_accuracy": 0.5873606249690055, "num_tokens": 3533765.0, "step": 725 }, { "entropy": 1.8317018933594227, "epoch": 0.5990116308091649, "grad_norm": 0.0281982421875, "learning_rate": 7.35565837962798e-05, "loss": 1.7772984313964844, "mean_token_accuracy": 0.5965435421466827, "num_tokens": 3657321.0, "step": 750 }, { "entropy": 1.8882549648359417, "epoch": 0.6189786851694704, "grad_norm": 0.03076171875, "learning_rate": 6.738207131508735e-05, "loss": 1.8385765075683593, "mean_token_accuracy": 0.59088682141155, "num_tokens": 3779575.0, "step": 775 }, { "entropy": 1.8342267361842095, "epoch": 0.6389457395297758, "grad_norm": 0.032958984375, "learning_rate": 6.134380752948085e-05, "loss": 1.800379180908203, "mean_token_accuracy": 0.5952950984984636, "num_tokens": 3896543.0, "step": 800 }, { "entropy": 1.873757717087865, "epoch": 0.6589127938900814, "grad_norm": 0.03564453125, "learning_rate": 5.546701493511106e-05, "loss": 1.8183651733398438, "mean_token_accuracy": 0.5902918418496848, "num_tokens": 4018098.0, "step": 825 }, { "entropy": 1.83077443132177, "epoch": 0.6788798482503868, "grad_norm": 0.032470703125, "learning_rate": 4.977624154460464e-05, "loss": 1.7461175537109375, "mean_token_accuracy": 0.598720720410347, "num_tokens": 4141174.0, "step": 850 }, { "entropy": 1.918299620486796, "epoch": 0.6988469026106924, "grad_norm": 0.0301513671875, "learning_rate": 4.42952583478004e-05, "loss": 1.9020709228515624, "mean_token_accuracy": 0.5798033401742577, "num_tokens": 4259185.0, "step": 875 }, { "entropy": 1.8444363391213119, "epoch": 0.7188139569709978, "grad_norm": 0.0341796875, "learning_rate": 3.904696001769571e-05, "loss": 1.7869160461425782, "mean_token_accuracy": 0.5964432079344988, "num_tokens": 4377724.0, "step": 900 }, { "entropy": 1.8295260372944175, "epoch": 0.7387810113313034, "grad_norm": 0.031982421875, "learning_rate": 3.4053269276865285e-05, "loss": 1.778699951171875, "mean_token_accuracy": 0.6018728485703468, "num_tokens": 4497691.0, "step": 925 }, { "entropy": 1.8099911727011204, "epoch": 0.7587480656916088, "grad_norm": 0.03271484375, "learning_rate": 2.9335045323824496e-05, "loss": 1.7513389587402344, "mean_token_accuracy": 0.6039503507316113, "num_tokens": 4616131.0, "step": 950 }, { "entropy": 1.8742087873071431, "epoch": 0.7787151200519143, "grad_norm": 0.033203125, "learning_rate": 2.491199670185008e-05, "loss": 1.7982223510742188, "mean_token_accuracy": 0.5908738762140274, "num_tokens": 4737443.0, "step": 975 }, { "entropy": 1.869517589211464, "epoch": 0.7986821744122199, "grad_norm": 0.032958984375, "learning_rate": 2.0802598974215226e-05, "loss": 1.8264849853515626, "mean_token_accuracy": 0.5888447714596987, "num_tokens": 4858100.0, "step": 1000 } ], "logging_steps": 25, "max_steps": 1253, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.10805585396224e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }