Instructions to use FrontiersMind/Nandi-Mini-150M-Instruct with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use FrontiersMind/Nandi-Mini-150M-Instruct with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="FrontiersMind/Nandi-Mini-150M-Instruct", trust_remote_code=True)
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```
```python
# Load model directly
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("FrontiersMind/Nandi-Mini-150M-Instruct", trust_remote_code=True, dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use FrontiersMind/Nandi-Mini-150M-Instruct with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "FrontiersMind/Nandi-Mini-150M-Instruct"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "FrontiersMind/Nandi-Mini-150M-Instruct",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker
docker model run hf.co/FrontiersMind/Nandi-Mini-150M-Instruct
- SGLang
How to use FrontiersMind/Nandi-Mini-150M-Instruct with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "FrontiersMind/Nandi-Mini-150M-Instruct" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "FrontiersMind/Nandi-Mini-150M-Instruct",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "FrontiersMind/Nandi-Mini-150M-Instruct" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "FrontiersMind/Nandi-Mini-150M-Instruct",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
- Docker Model Runner
How to use FrontiersMind/Nandi-Mini-150M-Instruct with Docker Model Runner:
docker model run hf.co/FrontiersMind/Nandi-Mini-150M-Instruct
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.7217610970768675, | |
| "eval_steps": 500, | |
| "global_step": 1000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 1.6324901580810547, | |
| "epoch": 0.0007217610970768675, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1449674367904663, | |
| "mean_token_accuracy": 0.730022668838501, | |
| "num_tokens": 281424.0, | |
| "step": 1 | |
| }, | |
| { | |
| "entropy": 1.6011863350868225, | |
| "epoch": 0.001443522194153735, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1221046447753906, | |
| "mean_token_accuracy": 0.735398530960083, | |
| "num_tokens": 592658.0, | |
| "step": 2 | |
| }, | |
| { | |
| "entropy": 1.6689565181732178, | |
| "epoch": 0.0021652832912306026, | |
| "grad_norm": 0.240234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1469786167144775, | |
| "mean_token_accuracy": 0.7297760546207428, | |
| "num_tokens": 881596.0, | |
| "step": 3 | |
| }, | |
| { | |
| "entropy": 1.601054608821869, | |
| "epoch": 0.00288704438830747, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.116991639137268, | |
| "mean_token_accuracy": 0.7344618439674377, | |
| "num_tokens": 1177307.0, | |
| "step": 4 | |
| }, | |
| { | |
| "entropy": 1.6265177726745605, | |
| "epoch": 0.003608805485384338, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.163395643234253, | |
| "mean_token_accuracy": 0.7276344895362854, | |
| "num_tokens": 1475796.0, | |
| "step": 5 | |
| }, | |
| { | |
| "entropy": 1.6690555810928345, | |
| "epoch": 0.004330566582461205, | |
| "grad_norm": 0.244140625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.144697904586792, | |
| "mean_token_accuracy": 0.7269938886165619, | |
| "num_tokens": 1750011.0, | |
| "step": 6 | |
| }, | |
| { | |
| "entropy": 1.598021388053894, | |
| "epoch": 0.005052327679538073, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1182944774627686, | |
| "mean_token_accuracy": 0.7358252704143524, | |
| "num_tokens": 2055329.0, | |
| "step": 7 | |
| }, | |
| { | |
| "entropy": 1.6165382862091064, | |
| "epoch": 0.00577408877661494, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1353559494018555, | |
| "mean_token_accuracy": 0.7317765951156616, | |
| "num_tokens": 2348224.0, | |
| "step": 8 | |
| }, | |
| { | |
| "entropy": 1.5607696771621704, | |
| "epoch": 0.006495849873691808, | |
| "grad_norm": 0.240234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.071103811264038, | |
| "mean_token_accuracy": 0.7462583184242249, | |
| "num_tokens": 2637011.0, | |
| "step": 9 | |
| }, | |
| { | |
| "entropy": 1.6212881803512573, | |
| "epoch": 0.007217610970768676, | |
| "grad_norm": 0.2578125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1488314867019653, | |
| "mean_token_accuracy": 0.7294544279575348, | |
| "num_tokens": 2910347.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 1.5019596815109253, | |
| "epoch": 0.007939372067845544, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0116041898727417, | |
| "mean_token_accuracy": 0.7550928890705109, | |
| "num_tokens": 3206546.0, | |
| "step": 11 | |
| }, | |
| { | |
| "entropy": 1.6398207545280457, | |
| "epoch": 0.00866113316492241, | |
| "grad_norm": 0.2431640625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.178581953048706, | |
| "mean_token_accuracy": 0.7239179015159607, | |
| "num_tokens": 3494761.0, | |
| "step": 12 | |
| }, | |
| { | |
| "entropy": 1.678354799747467, | |
| "epoch": 0.009382894261999277, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1759941577911377, | |
| "mean_token_accuracy": 0.7249243557453156, | |
| "num_tokens": 3775372.0, | |
| "step": 13 | |
| }, | |
| { | |
| "entropy": 1.6341375708580017, | |
| "epoch": 0.010104655359076146, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.116431713104248, | |
| "mean_token_accuracy": 0.7346004545688629, | |
| "num_tokens": 4060240.0, | |
| "step": 14 | |
| }, | |
| { | |
| "entropy": 1.625670075416565, | |
| "epoch": 0.010826416456153013, | |
| "grad_norm": 0.2431640625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1036977767944336, | |
| "mean_token_accuracy": 0.7378830313682556, | |
| "num_tokens": 4346767.0, | |
| "step": 15 | |
| }, | |
| { | |
| "entropy": 1.5567855834960938, | |
| "epoch": 0.01154817755322988, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0634684562683105, | |
| "mean_token_accuracy": 0.7465156018733978, | |
| "num_tokens": 4643903.0, | |
| "step": 16 | |
| }, | |
| { | |
| "entropy": 1.5365270376205444, | |
| "epoch": 0.012269938650306749, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0452733039855957, | |
| "mean_token_accuracy": 0.7487658560276031, | |
| "num_tokens": 4948508.0, | |
| "step": 17 | |
| }, | |
| { | |
| "entropy": 1.6101551055908203, | |
| "epoch": 0.012991699747383616, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.174149990081787, | |
| "mean_token_accuracy": 0.7236275970935822, | |
| "num_tokens": 5240673.0, | |
| "step": 18 | |
| }, | |
| { | |
| "entropy": 1.582105040550232, | |
| "epoch": 0.013713460844460484, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0822830200195312, | |
| "mean_token_accuracy": 0.740031510591507, | |
| "num_tokens": 5529403.0, | |
| "step": 19 | |
| }, | |
| { | |
| "entropy": 1.5892573595046997, | |
| "epoch": 0.014435221941537351, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1302001476287842, | |
| "mean_token_accuracy": 0.7349365949630737, | |
| "num_tokens": 5817564.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 1.6413474678993225, | |
| "epoch": 0.015156983038614218, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.157022476196289, | |
| "mean_token_accuracy": 0.7278459370136261, | |
| "num_tokens": 6108588.0, | |
| "step": 21 | |
| }, | |
| { | |
| "entropy": 1.6144480109214783, | |
| "epoch": 0.015878744135691087, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1256628036499023, | |
| "mean_token_accuracy": 0.7353854775428772, | |
| "num_tokens": 6411583.0, | |
| "step": 22 | |
| }, | |
| { | |
| "entropy": 1.6616796255111694, | |
| "epoch": 0.016600505232767952, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.168656587600708, | |
| "mean_token_accuracy": 0.7236612141132355, | |
| "num_tokens": 6698086.0, | |
| "step": 23 | |
| }, | |
| { | |
| "entropy": 1.643827199935913, | |
| "epoch": 0.01732226632984482, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1655962467193604, | |
| "mean_token_accuracy": 0.7271822988986969, | |
| "num_tokens": 7000131.0, | |
| "step": 24 | |
| }, | |
| { | |
| "entropy": 1.5970368385314941, | |
| "epoch": 0.01804402742692169, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0894560813903809, | |
| "mean_token_accuracy": 0.7399492561817169, | |
| "num_tokens": 7291836.0, | |
| "step": 25 | |
| }, | |
| { | |
| "entropy": 1.503839135169983, | |
| "epoch": 0.018765788523998555, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0344961881637573, | |
| "mean_token_accuracy": 0.7478281855583191, | |
| "num_tokens": 7585491.0, | |
| "step": 26 | |
| }, | |
| { | |
| "entropy": 1.6090006828308105, | |
| "epoch": 0.019487549621075424, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1419615745544434, | |
| "mean_token_accuracy": 0.7301072478294373, | |
| "num_tokens": 7879630.0, | |
| "step": 27 | |
| }, | |
| { | |
| "entropy": 1.6687936782836914, | |
| "epoch": 0.020209310718152292, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.185781717300415, | |
| "mean_token_accuracy": 0.722177267074585, | |
| "num_tokens": 8169307.0, | |
| "step": 28 | |
| }, | |
| { | |
| "entropy": 1.6137508153915405, | |
| "epoch": 0.020931071815229157, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.139797329902649, | |
| "mean_token_accuracy": 0.731590211391449, | |
| "num_tokens": 8454246.0, | |
| "step": 29 | |
| }, | |
| { | |
| "entropy": 1.6452692747116089, | |
| "epoch": 0.021652832912306026, | |
| "grad_norm": 0.2421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.144111156463623, | |
| "mean_token_accuracy": 0.7287810444831848, | |
| "num_tokens": 8729575.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 1.5694754719734192, | |
| "epoch": 0.022374594009382895, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1091818809509277, | |
| "mean_token_accuracy": 0.7397504150867462, | |
| "num_tokens": 9041189.0, | |
| "step": 31 | |
| }, | |
| { | |
| "entropy": 1.5972363352775574, | |
| "epoch": 0.02309635510645976, | |
| "grad_norm": 0.2392578125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0982288122177124, | |
| "mean_token_accuracy": 0.737795889377594, | |
| "num_tokens": 9326397.0, | |
| "step": 32 | |
| }, | |
| { | |
| "entropy": 1.5754931569099426, | |
| "epoch": 0.02381811620353663, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1044893264770508, | |
| "mean_token_accuracy": 0.7347338497638702, | |
| "num_tokens": 9610401.0, | |
| "step": 33 | |
| }, | |
| { | |
| "entropy": 1.6175318360328674, | |
| "epoch": 0.024539877300613498, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1177648305892944, | |
| "mean_token_accuracy": 0.7350936233997345, | |
| "num_tokens": 9905930.0, | |
| "step": 34 | |
| }, | |
| { | |
| "entropy": 1.6128783226013184, | |
| "epoch": 0.025261638397690366, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1447393894195557, | |
| "mean_token_accuracy": 0.7303915917873383, | |
| "num_tokens": 10194872.0, | |
| "step": 35 | |
| }, | |
| { | |
| "entropy": 1.550290584564209, | |
| "epoch": 0.02598339949476723, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0588897466659546, | |
| "mean_token_accuracy": 0.7457749843597412, | |
| "num_tokens": 10489033.0, | |
| "step": 36 | |
| }, | |
| { | |
| "entropy": 1.6118902564048767, | |
| "epoch": 0.0267051605918441, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.129485845565796, | |
| "mean_token_accuracy": 0.7316156923770905, | |
| "num_tokens": 10785394.0, | |
| "step": 37 | |
| }, | |
| { | |
| "entropy": 1.6288201808929443, | |
| "epoch": 0.02742692168892097, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.132784366607666, | |
| "mean_token_accuracy": 0.7318132221698761, | |
| "num_tokens": 11065027.0, | |
| "step": 38 | |
| }, | |
| { | |
| "entropy": 1.6048038005828857, | |
| "epoch": 0.028148682785997834, | |
| "grad_norm": 0.2431640625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1273961067199707, | |
| "mean_token_accuracy": 0.7312443256378174, | |
| "num_tokens": 11343945.0, | |
| "step": 39 | |
| }, | |
| { | |
| "entropy": 1.6598325967788696, | |
| "epoch": 0.028870443883074703, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1481671333312988, | |
| "mean_token_accuracy": 0.7304234504699707, | |
| "num_tokens": 11628441.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 1.5797964334487915, | |
| "epoch": 0.02959220498015157, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1009016036987305, | |
| "mean_token_accuracy": 0.7380237579345703, | |
| "num_tokens": 11921495.0, | |
| "step": 41 | |
| }, | |
| { | |
| "entropy": 1.6039154529571533, | |
| "epoch": 0.030313966077228437, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.14406156539917, | |
| "mean_token_accuracy": 0.731726735830307, | |
| "num_tokens": 12223908.0, | |
| "step": 42 | |
| }, | |
| { | |
| "entropy": 1.5778825879096985, | |
| "epoch": 0.031035727174305305, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1219489574432373, | |
| "mean_token_accuracy": 0.7335020303726196, | |
| "num_tokens": 12520484.0, | |
| "step": 43 | |
| }, | |
| { | |
| "entropy": 1.5796723365783691, | |
| "epoch": 0.031757488271382174, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1150004863739014, | |
| "mean_token_accuracy": 0.736197829246521, | |
| "num_tokens": 12814280.0, | |
| "step": 44 | |
| }, | |
| { | |
| "entropy": 1.6669464111328125, | |
| "epoch": 0.03247924936845904, | |
| "grad_norm": 0.2412109375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1724576950073242, | |
| "mean_token_accuracy": 0.7243635356426239, | |
| "num_tokens": 13102325.0, | |
| "step": 45 | |
| }, | |
| { | |
| "entropy": 1.5914201140403748, | |
| "epoch": 0.033201010465535905, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1153526306152344, | |
| "mean_token_accuracy": 0.7354307174682617, | |
| "num_tokens": 13391088.0, | |
| "step": 46 | |
| }, | |
| { | |
| "entropy": 1.5858140587806702, | |
| "epoch": 0.03392277156261277, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1172500848770142, | |
| "mean_token_accuracy": 0.7344636917114258, | |
| "num_tokens": 13687168.0, | |
| "step": 47 | |
| }, | |
| { | |
| "entropy": 1.6306005716323853, | |
| "epoch": 0.03464453265968964, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0948805809020996, | |
| "mean_token_accuracy": 0.736759752035141, | |
| "num_tokens": 13967538.0, | |
| "step": 48 | |
| }, | |
| { | |
| "entropy": 1.5531249642372131, | |
| "epoch": 0.03536629375676651, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0810805559158325, | |
| "mean_token_accuracy": 0.7413678765296936, | |
| "num_tokens": 14276436.0, | |
| "step": 49 | |
| }, | |
| { | |
| "entropy": 1.5869917273521423, | |
| "epoch": 0.03608805485384338, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0666441917419434, | |
| "mean_token_accuracy": 0.7437665462493896, | |
| "num_tokens": 14573767.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 1.6448417901992798, | |
| "epoch": 0.03680981595092025, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1699206829071045, | |
| "mean_token_accuracy": 0.725558340549469, | |
| "num_tokens": 14858053.0, | |
| "step": 51 | |
| }, | |
| { | |
| "entropy": 1.5271926522254944, | |
| "epoch": 0.03753157704799711, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.054903268814087, | |
| "mean_token_accuracy": 0.744969516992569, | |
| "num_tokens": 15155281.0, | |
| "step": 52 | |
| }, | |
| { | |
| "entropy": 1.6412206888198853, | |
| "epoch": 0.03825333814507398, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1678251028060913, | |
| "mean_token_accuracy": 0.7259888350963593, | |
| "num_tokens": 15455184.0, | |
| "step": 53 | |
| }, | |
| { | |
| "entropy": 1.6215389966964722, | |
| "epoch": 0.03897509924215085, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1824066638946533, | |
| "mean_token_accuracy": 0.7280870676040649, | |
| "num_tokens": 15743012.0, | |
| "step": 54 | |
| }, | |
| { | |
| "entropy": 1.5804604291915894, | |
| "epoch": 0.039696860339227716, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1091406345367432, | |
| "mean_token_accuracy": 0.737342357635498, | |
| "num_tokens": 16033511.0, | |
| "step": 55 | |
| }, | |
| { | |
| "entropy": 1.5461040139198303, | |
| "epoch": 0.040418621436304585, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0643203258514404, | |
| "mean_token_accuracy": 0.7467976808547974, | |
| "num_tokens": 16332396.0, | |
| "step": 56 | |
| }, | |
| { | |
| "entropy": 1.651955246925354, | |
| "epoch": 0.04114038253338145, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1751530170440674, | |
| "mean_token_accuracy": 0.7242036163806915, | |
| "num_tokens": 16612800.0, | |
| "step": 57 | |
| }, | |
| { | |
| "entropy": 1.6684359312057495, | |
| "epoch": 0.041862143630458315, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1958093643188477, | |
| "mean_token_accuracy": 0.7191943228244781, | |
| "num_tokens": 16896720.0, | |
| "step": 58 | |
| }, | |
| { | |
| "entropy": 1.6259081363677979, | |
| "epoch": 0.042583904727535184, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1620521545410156, | |
| "mean_token_accuracy": 0.7249590158462524, | |
| "num_tokens": 17192420.0, | |
| "step": 59 | |
| }, | |
| { | |
| "entropy": 1.6183463335037231, | |
| "epoch": 0.04330566582461205, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1168042421340942, | |
| "mean_token_accuracy": 0.7350777387619019, | |
| "num_tokens": 17484538.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 1.658570647239685, | |
| "epoch": 0.04402742692168892, | |
| "grad_norm": 0.234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1338030099868774, | |
| "mean_token_accuracy": 0.7320903241634369, | |
| "num_tokens": 17773308.0, | |
| "step": 61 | |
| }, | |
| { | |
| "entropy": 1.5996414422988892, | |
| "epoch": 0.04474918801876579, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0847420692443848, | |
| "mean_token_accuracy": 0.7392292320728302, | |
| "num_tokens": 18058017.0, | |
| "step": 62 | |
| }, | |
| { | |
| "entropy": 1.5665233135223389, | |
| "epoch": 0.04547094911584266, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0845024585723877, | |
| "mean_token_accuracy": 0.7437696754932404, | |
| "num_tokens": 18339688.0, | |
| "step": 63 | |
| }, | |
| { | |
| "entropy": 1.5260943174362183, | |
| "epoch": 0.04619271021291952, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0538274049758911, | |
| "mean_token_accuracy": 0.7461147308349609, | |
| "num_tokens": 18641640.0, | |
| "step": 64 | |
| }, | |
| { | |
| "entropy": 1.5681921243667603, | |
| "epoch": 0.04691447130999639, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0877799987792969, | |
| "mean_token_accuracy": 0.7396962344646454, | |
| "num_tokens": 18939497.0, | |
| "step": 65 | |
| }, | |
| { | |
| "entropy": 1.5850136876106262, | |
| "epoch": 0.04763623240707326, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0904234647750854, | |
| "mean_token_accuracy": 0.7401041090488434, | |
| "num_tokens": 19231331.0, | |
| "step": 66 | |
| }, | |
| { | |
| "entropy": 1.575853705406189, | |
| "epoch": 0.048357993504150126, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0784809589385986, | |
| "mean_token_accuracy": 0.740749180316925, | |
| "num_tokens": 19506369.0, | |
| "step": 67 | |
| }, | |
| { | |
| "entropy": 1.5586780905723572, | |
| "epoch": 0.049079754601226995, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0770025253295898, | |
| "mean_token_accuracy": 0.7410430610179901, | |
| "num_tokens": 19811802.0, | |
| "step": 68 | |
| }, | |
| { | |
| "entropy": 1.5827566385269165, | |
| "epoch": 0.049801515698303864, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0753060579299927, | |
| "mean_token_accuracy": 0.7423750758171082, | |
| "num_tokens": 20100711.0, | |
| "step": 69 | |
| }, | |
| { | |
| "entropy": 1.5529567003250122, | |
| "epoch": 0.05052327679538073, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0601444244384766, | |
| "mean_token_accuracy": 0.7456181049346924, | |
| "num_tokens": 20405489.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 1.6224084496498108, | |
| "epoch": 0.051245037892457594, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.125450611114502, | |
| "mean_token_accuracy": 0.7337780594825745, | |
| "num_tokens": 20687770.0, | |
| "step": 71 | |
| }, | |
| { | |
| "entropy": 1.7074684500694275, | |
| "epoch": 0.05196679898953446, | |
| "grad_norm": 0.234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.238325595855713, | |
| "mean_token_accuracy": 0.7112078368663788, | |
| "num_tokens": 20981731.0, | |
| "step": 72 | |
| }, | |
| { | |
| "entropy": 1.5249865055084229, | |
| "epoch": 0.05268856008661133, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.043988585472107, | |
| "mean_token_accuracy": 0.7478486001491547, | |
| "num_tokens": 21272401.0, | |
| "step": 73 | |
| }, | |
| { | |
| "entropy": 1.5727348923683167, | |
| "epoch": 0.0534103211836882, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.100590467453003, | |
| "mean_token_accuracy": 0.7388165593147278, | |
| "num_tokens": 21571928.0, | |
| "step": 74 | |
| }, | |
| { | |
| "entropy": 1.5993722677230835, | |
| "epoch": 0.05413208228076507, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1405231952667236, | |
| "mean_token_accuracy": 0.7291399836540222, | |
| "num_tokens": 21870798.0, | |
| "step": 75 | |
| }, | |
| { | |
| "entropy": 1.5938835740089417, | |
| "epoch": 0.05485384337784194, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1131370067596436, | |
| "mean_token_accuracy": 0.737176388502121, | |
| "num_tokens": 22162788.0, | |
| "step": 76 | |
| }, | |
| { | |
| "entropy": 1.5931457877159119, | |
| "epoch": 0.0555756044749188, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0873945951461792, | |
| "mean_token_accuracy": 0.7404476404190063, | |
| "num_tokens": 22444932.0, | |
| "step": 77 | |
| }, | |
| { | |
| "entropy": 1.5987274646759033, | |
| "epoch": 0.05629736557199567, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1396456956863403, | |
| "mean_token_accuracy": 0.730906754732132, | |
| "num_tokens": 22754808.0, | |
| "step": 78 | |
| }, | |
| { | |
| "entropy": 1.559260368347168, | |
| "epoch": 0.05701912666907254, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0608243942260742, | |
| "mean_token_accuracy": 0.7463341951370239, | |
| "num_tokens": 23055255.0, | |
| "step": 79 | |
| }, | |
| { | |
| "entropy": 1.554847240447998, | |
| "epoch": 0.057740887766149405, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0778532028198242, | |
| "mean_token_accuracy": 0.741190105676651, | |
| "num_tokens": 23349351.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 1.5926709175109863, | |
| "epoch": 0.058462648863226274, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1319208145141602, | |
| "mean_token_accuracy": 0.7340517938137054, | |
| "num_tokens": 23658735.0, | |
| "step": 81 | |
| }, | |
| { | |
| "entropy": 1.594882607460022, | |
| "epoch": 0.05918440996030314, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.101097583770752, | |
| "mean_token_accuracy": 0.7379715144634247, | |
| "num_tokens": 23946168.0, | |
| "step": 82 | |
| }, | |
| { | |
| "entropy": 1.5877431631088257, | |
| "epoch": 0.059906171057380005, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1063778400421143, | |
| "mean_token_accuracy": 0.7368501722812653, | |
| "num_tokens": 24253073.0, | |
| "step": 83 | |
| }, | |
| { | |
| "entropy": 1.6259468793869019, | |
| "epoch": 0.06062793215445687, | |
| "grad_norm": 0.2451171875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.145182490348816, | |
| "mean_token_accuracy": 0.729894608259201, | |
| "num_tokens": 24527107.0, | |
| "step": 84 | |
| }, | |
| { | |
| "entropy": 1.6315274834632874, | |
| "epoch": 0.06134969325153374, | |
| "grad_norm": 0.234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1483168601989746, | |
| "mean_token_accuracy": 0.7306644916534424, | |
| "num_tokens": 24809767.0, | |
| "step": 85 | |
| }, | |
| { | |
| "entropy": 1.6348057985305786, | |
| "epoch": 0.06207145434861061, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1626529693603516, | |
| "mean_token_accuracy": 0.7261816561222076, | |
| "num_tokens": 25110854.0, | |
| "step": 86 | |
| }, | |
| { | |
| "entropy": 1.5630750060081482, | |
| "epoch": 0.06279321544568747, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0739233493804932, | |
| "mean_token_accuracy": 0.7445209324359894, | |
| "num_tokens": 25408317.0, | |
| "step": 87 | |
| }, | |
| { | |
| "entropy": 1.5765607357025146, | |
| "epoch": 0.06351497654276435, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0702788829803467, | |
| "mean_token_accuracy": 0.7435714304447174, | |
| "num_tokens": 25685505.0, | |
| "step": 88 | |
| }, | |
| { | |
| "entropy": 1.6107383966445923, | |
| "epoch": 0.06423673763984121, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.144263505935669, | |
| "mean_token_accuracy": 0.7299089133739471, | |
| "num_tokens": 25979705.0, | |
| "step": 89 | |
| }, | |
| { | |
| "entropy": 1.5659379363059998, | |
| "epoch": 0.06495849873691809, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1233690977096558, | |
| "mean_token_accuracy": 0.7367878556251526, | |
| "num_tokens": 26286475.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 1.5758460760116577, | |
| "epoch": 0.06568025983399495, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1166677474975586, | |
| "mean_token_accuracy": 0.7332916557788849, | |
| "num_tokens": 26593964.0, | |
| "step": 91 | |
| }, | |
| { | |
| "entropy": 1.5544713139533997, | |
| "epoch": 0.06640202093107181, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.068277359008789, | |
| "mean_token_accuracy": 0.7450745403766632, | |
| "num_tokens": 26898035.0, | |
| "step": 92 | |
| }, | |
| { | |
| "entropy": 1.6286870241165161, | |
| "epoch": 0.06712378202814868, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1468693017959595, | |
| "mean_token_accuracy": 0.729914516210556, | |
| "num_tokens": 27186248.0, | |
| "step": 93 | |
| }, | |
| { | |
| "entropy": 1.600565254688263, | |
| "epoch": 0.06784554312522555, | |
| "grad_norm": 0.23828125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.103116512298584, | |
| "mean_token_accuracy": 0.7376452386379242, | |
| "num_tokens": 27463349.0, | |
| "step": 94 | |
| }, | |
| { | |
| "entropy": 1.654672384262085, | |
| "epoch": 0.06856730422230242, | |
| "grad_norm": 0.2451171875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1753764152526855, | |
| "mean_token_accuracy": 0.7205879390239716, | |
| "num_tokens": 27741040.0, | |
| "step": 95 | |
| }, | |
| { | |
| "entropy": 1.677686333656311, | |
| "epoch": 0.06928906531937928, | |
| "grad_norm": 0.25390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1765377521514893, | |
| "mean_token_accuracy": 0.7225688397884369, | |
| "num_tokens": 27998041.0, | |
| "step": 96 | |
| }, | |
| { | |
| "entropy": 1.557913601398468, | |
| "epoch": 0.07001082641645616, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1140339374542236, | |
| "mean_token_accuracy": 0.7374096214771271, | |
| "num_tokens": 28307531.0, | |
| "step": 97 | |
| }, | |
| { | |
| "entropy": 1.557511806488037, | |
| "epoch": 0.07073258751353302, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1007049083709717, | |
| "mean_token_accuracy": 0.7391440272331238, | |
| "num_tokens": 28605805.0, | |
| "step": 98 | |
| }, | |
| { | |
| "entropy": 1.6885855793952942, | |
| "epoch": 0.07145434861060988, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.2102177143096924, | |
| "mean_token_accuracy": 0.7192183136940002, | |
| "num_tokens": 28887326.0, | |
| "step": 99 | |
| }, | |
| { | |
| "entropy": 1.6391792297363281, | |
| "epoch": 0.07217610970768676, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1536040306091309, | |
| "mean_token_accuracy": 0.7297197878360748, | |
| "num_tokens": 29178989.0, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 1.6849936246871948, | |
| "epoch": 0.07289787080476362, | |
| "grad_norm": 0.2412109375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.204404592514038, | |
| "mean_token_accuracy": 0.7192646861076355, | |
| "num_tokens": 29454691.0, | |
| "step": 101 | |
| }, | |
| { | |
| "entropy": 1.612755537033081, | |
| "epoch": 0.0736196319018405, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1387853622436523, | |
| "mean_token_accuracy": 0.7301047444343567, | |
| "num_tokens": 29741223.0, | |
| "step": 102 | |
| }, | |
| { | |
| "entropy": 1.552790880203247, | |
| "epoch": 0.07434139299891736, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.060426950454712, | |
| "mean_token_accuracy": 0.7446425557136536, | |
| "num_tokens": 30037906.0, | |
| "step": 103 | |
| }, | |
| { | |
| "entropy": 1.694750726222992, | |
| "epoch": 0.07506315409599422, | |
| "grad_norm": 0.2470703125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.2107906341552734, | |
| "mean_token_accuracy": 0.7195940315723419, | |
| "num_tokens": 30312568.0, | |
| "step": 104 | |
| }, | |
| { | |
| "entropy": 1.6190918684005737, | |
| "epoch": 0.0757849151930711, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1408714056015015, | |
| "mean_token_accuracy": 0.7299633920192719, | |
| "num_tokens": 30610392.0, | |
| "step": 105 | |
| }, | |
| { | |
| "entropy": 1.6252397894859314, | |
| "epoch": 0.07650667629014796, | |
| "grad_norm": 0.240234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1361322402954102, | |
| "mean_token_accuracy": 0.7305669784545898, | |
| "num_tokens": 30881477.0, | |
| "step": 106 | |
| }, | |
| { | |
| "entropy": 1.6025376319885254, | |
| "epoch": 0.07722843738722483, | |
| "grad_norm": 0.2412109375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0834941864013672, | |
| "mean_token_accuracy": 0.7420128285884857, | |
| "num_tokens": 31162472.0, | |
| "step": 107 | |
| }, | |
| { | |
| "entropy": 1.599751591682434, | |
| "epoch": 0.0779501984843017, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1158347129821777, | |
| "mean_token_accuracy": 0.7352641224861145, | |
| "num_tokens": 31449196.0, | |
| "step": 108 | |
| }, | |
| { | |
| "entropy": 1.587877333164215, | |
| "epoch": 0.07867195958137857, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1317908763885498, | |
| "mean_token_accuracy": 0.7313638925552368, | |
| "num_tokens": 31758945.0, | |
| "step": 109 | |
| }, | |
| { | |
| "entropy": 1.6531342267990112, | |
| "epoch": 0.07939372067845543, | |
| "grad_norm": 0.23828125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1566457748413086, | |
| "mean_token_accuracy": 0.726512610912323, | |
| "num_tokens": 32045731.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 1.5650938749313354, | |
| "epoch": 0.0801154817755323, | |
| "grad_norm": 0.212890625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0553148984909058, | |
| "mean_token_accuracy": 0.7471771240234375, | |
| "num_tokens": 32352858.0, | |
| "step": 111 | |
| }, | |
| { | |
| "entropy": 1.5255182981491089, | |
| "epoch": 0.08083724287260917, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0547511577606201, | |
| "mean_token_accuracy": 0.7459031641483307, | |
| "num_tokens": 32663218.0, | |
| "step": 112 | |
| }, | |
| { | |
| "entropy": 1.6216821074485779, | |
| "epoch": 0.08155900396968603, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1458425521850586, | |
| "mean_token_accuracy": 0.7294427752494812, | |
| "num_tokens": 32945475.0, | |
| "step": 113 | |
| }, | |
| { | |
| "entropy": 1.5940646529197693, | |
| "epoch": 0.0822807650667629, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1227571964263916, | |
| "mean_token_accuracy": 0.733859658241272, | |
| "num_tokens": 33235586.0, | |
| "step": 114 | |
| }, | |
| { | |
| "entropy": 1.6727558374404907, | |
| "epoch": 0.08300252616383977, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1951568126678467, | |
| "mean_token_accuracy": 0.7208792269229889, | |
| "num_tokens": 33523553.0, | |
| "step": 115 | |
| }, | |
| { | |
| "entropy": 1.5916215181350708, | |
| "epoch": 0.08372428726091663, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.10416841506958, | |
| "mean_token_accuracy": 0.7365945875644684, | |
| "num_tokens": 33818876.0, | |
| "step": 116 | |
| }, | |
| { | |
| "entropy": 1.665009319782257, | |
| "epoch": 0.0844460483579935, | |
| "grad_norm": 0.23828125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.192238450050354, | |
| "mean_token_accuracy": 0.7212351262569427, | |
| "num_tokens": 34101367.0, | |
| "step": 117 | |
| }, | |
| { | |
| "entropy": 1.6264830827713013, | |
| "epoch": 0.08516780945507037, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1303846836090088, | |
| "mean_token_accuracy": 0.7318049073219299, | |
| "num_tokens": 34370369.0, | |
| "step": 118 | |
| }, | |
| { | |
| "entropy": 1.5952454209327698, | |
| "epoch": 0.08588957055214724, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.113147497177124, | |
| "mean_token_accuracy": 0.7369092106819153, | |
| "num_tokens": 34658530.0, | |
| "step": 119 | |
| }, | |
| { | |
| "entropy": 1.5981455445289612, | |
| "epoch": 0.0866113316492241, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1117277145385742, | |
| "mean_token_accuracy": 0.7370716035366058, | |
| "num_tokens": 34956868.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 1.5606166124343872, | |
| "epoch": 0.08733309274630098, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0918508768081665, | |
| "mean_token_accuracy": 0.7394787967205048, | |
| "num_tokens": 35244017.0, | |
| "step": 121 | |
| }, | |
| { | |
| "entropy": 1.6160239577293396, | |
| "epoch": 0.08805485384337784, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1329431533813477, | |
| "mean_token_accuracy": 0.7326081395149231, | |
| "num_tokens": 35535423.0, | |
| "step": 122 | |
| }, | |
| { | |
| "entropy": 1.5834547877311707, | |
| "epoch": 0.0887766149404547, | |
| "grad_norm": 0.2431640625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.094203233718872, | |
| "mean_token_accuracy": 0.7399301528930664, | |
| "num_tokens": 35823301.0, | |
| "step": 123 | |
| }, | |
| { | |
| "entropy": 1.6251248121261597, | |
| "epoch": 0.08949837603753158, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1025975942611694, | |
| "mean_token_accuracy": 0.7363601326942444, | |
| "num_tokens": 36100249.0, | |
| "step": 124 | |
| }, | |
| { | |
| "entropy": 1.6510186195373535, | |
| "epoch": 0.09022013713460844, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1776456832885742, | |
| "mean_token_accuracy": 0.7233887016773224, | |
| "num_tokens": 36379625.0, | |
| "step": 125 | |
| }, | |
| { | |
| "entropy": 1.5958117842674255, | |
| "epoch": 0.09094189823168532, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1187841892242432, | |
| "mean_token_accuracy": 0.732407957315445, | |
| "num_tokens": 36665912.0, | |
| "step": 126 | |
| }, | |
| { | |
| "entropy": 1.5842035412788391, | |
| "epoch": 0.09166365932876218, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0788733959197998, | |
| "mean_token_accuracy": 0.7431860268115997, | |
| "num_tokens": 36956505.0, | |
| "step": 127 | |
| }, | |
| { | |
| "entropy": 1.5923900604248047, | |
| "epoch": 0.09238542042583904, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.109296202659607, | |
| "mean_token_accuracy": 0.7371940612792969, | |
| "num_tokens": 37251431.0, | |
| "step": 128 | |
| }, | |
| { | |
| "entropy": 1.5973605513572693, | |
| "epoch": 0.09310718152291592, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0678976774215698, | |
| "mean_token_accuracy": 0.7456335425376892, | |
| "num_tokens": 37548807.0, | |
| "step": 129 | |
| }, | |
| { | |
| "entropy": 1.6780987977981567, | |
| "epoch": 0.09382894261999278, | |
| "grad_norm": 0.240234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.211254596710205, | |
| "mean_token_accuracy": 0.7181490659713745, | |
| "num_tokens": 37829723.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 1.616599440574646, | |
| "epoch": 0.09455070371706965, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1332050561904907, | |
| "mean_token_accuracy": 0.7317441999912262, | |
| "num_tokens": 38126810.0, | |
| "step": 131 | |
| }, | |
| { | |
| "entropy": 1.6254015564918518, | |
| "epoch": 0.09527246481414652, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1633479595184326, | |
| "mean_token_accuracy": 0.72529536485672, | |
| "num_tokens": 38424748.0, | |
| "step": 132 | |
| }, | |
| { | |
| "entropy": 1.533314287662506, | |
| "epoch": 0.09599422591122339, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0424294471740723, | |
| "mean_token_accuracy": 0.7479520738124847, | |
| "num_tokens": 38745412.0, | |
| "step": 133 | |
| }, | |
| { | |
| "entropy": 1.651694416999817, | |
| "epoch": 0.09671598700830025, | |
| "grad_norm": 0.234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1980639696121216, | |
| "mean_token_accuracy": 0.7201534509658813, | |
| "num_tokens": 39032874.0, | |
| "step": 134 | |
| }, | |
| { | |
| "entropy": 1.6101423501968384, | |
| "epoch": 0.09743774810537711, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1173202991485596, | |
| "mean_token_accuracy": 0.7320694625377655, | |
| "num_tokens": 39334849.0, | |
| "step": 135 | |
| }, | |
| { | |
| "entropy": 1.6213688254356384, | |
| "epoch": 0.09815950920245399, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1414477825164795, | |
| "mean_token_accuracy": 0.7313390374183655, | |
| "num_tokens": 39623521.0, | |
| "step": 136 | |
| }, | |
| { | |
| "entropy": 1.5909383893013, | |
| "epoch": 0.09888127029953085, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1475062370300293, | |
| "mean_token_accuracy": 0.7306889295578003, | |
| "num_tokens": 39921847.0, | |
| "step": 137 | |
| }, | |
| { | |
| "entropy": 1.6342026591300964, | |
| "epoch": 0.09960303139660773, | |
| "grad_norm": 0.2421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1249594688415527, | |
| "mean_token_accuracy": 0.7333996295928955, | |
| "num_tokens": 40205430.0, | |
| "step": 138 | |
| }, | |
| { | |
| "entropy": 1.534162998199463, | |
| "epoch": 0.10032479249368459, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0463987588882446, | |
| "mean_token_accuracy": 0.7481896579265594, | |
| "num_tokens": 40498391.0, | |
| "step": 139 | |
| }, | |
| { | |
| "entropy": 1.582057237625122, | |
| "epoch": 0.10104655359076146, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.087873101234436, | |
| "mean_token_accuracy": 0.7401741147041321, | |
| "num_tokens": 40784952.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 1.5767365097999573, | |
| "epoch": 0.10176831468783833, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0600736141204834, | |
| "mean_token_accuracy": 0.743198812007904, | |
| "num_tokens": 41075890.0, | |
| "step": 141 | |
| }, | |
| { | |
| "entropy": 1.5812320709228516, | |
| "epoch": 0.10249007578491519, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0936431884765625, | |
| "mean_token_accuracy": 0.739851325750351, | |
| "num_tokens": 41367809.0, | |
| "step": 142 | |
| }, | |
| { | |
| "entropy": 1.6068961024284363, | |
| "epoch": 0.10321183688199206, | |
| "grad_norm": 0.2158203125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1377291679382324, | |
| "mean_token_accuracy": 0.7343553006649017, | |
| "num_tokens": 41681860.0, | |
| "step": 143 | |
| }, | |
| { | |
| "entropy": 1.653354823589325, | |
| "epoch": 0.10393359797906893, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1678221225738525, | |
| "mean_token_accuracy": 0.7247883081436157, | |
| "num_tokens": 41975302.0, | |
| "step": 144 | |
| }, | |
| { | |
| "entropy": 1.5417386293411255, | |
| "epoch": 0.1046553590761458, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0448545217514038, | |
| "mean_token_accuracy": 0.7471159398555756, | |
| "num_tokens": 42271798.0, | |
| "step": 145 | |
| }, | |
| { | |
| "entropy": 1.6259198784828186, | |
| "epoch": 0.10537712017322266, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1437795162200928, | |
| "mean_token_accuracy": 0.729465514421463, | |
| "num_tokens": 42559987.0, | |
| "step": 146 | |
| }, | |
| { | |
| "entropy": 1.5697144865989685, | |
| "epoch": 0.10609888127029952, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0444915294647217, | |
| "mean_token_accuracy": 0.7469617426395416, | |
| "num_tokens": 42869890.0, | |
| "step": 147 | |
| }, | |
| { | |
| "entropy": 1.6129130125045776, | |
| "epoch": 0.1068206423673764, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0998916625976562, | |
| "mean_token_accuracy": 0.7378185391426086, | |
| "num_tokens": 43150453.0, | |
| "step": 148 | |
| }, | |
| { | |
| "entropy": 1.5707987546920776, | |
| "epoch": 0.10754240346445326, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.095113754272461, | |
| "mean_token_accuracy": 0.7406199872493744, | |
| "num_tokens": 43456505.0, | |
| "step": 149 | |
| }, | |
| { | |
| "entropy": 1.6888920068740845, | |
| "epoch": 0.10826416456153014, | |
| "grad_norm": 0.2421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1995296478271484, | |
| "mean_token_accuracy": 0.7208355665206909, | |
| "num_tokens": 43744893.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 1.610870599746704, | |
| "epoch": 0.108985925658607, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1402409076690674, | |
| "mean_token_accuracy": 0.7321662306785583, | |
| "num_tokens": 44042982.0, | |
| "step": 151 | |
| }, | |
| { | |
| "entropy": 1.5969061255455017, | |
| "epoch": 0.10970768675568388, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1209485530853271, | |
| "mean_token_accuracy": 0.7340458929538727, | |
| "num_tokens": 44348068.0, | |
| "step": 152 | |
| }, | |
| { | |
| "entropy": 1.5557780265808105, | |
| "epoch": 0.11042944785276074, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0866204500198364, | |
| "mean_token_accuracy": 0.7392199635505676, | |
| "num_tokens": 44641468.0, | |
| "step": 153 | |
| }, | |
| { | |
| "entropy": 1.540208101272583, | |
| "epoch": 0.1111512089498376, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0612618923187256, | |
| "mean_token_accuracy": 0.7472435534000397, | |
| "num_tokens": 44951345.0, | |
| "step": 154 | |
| }, | |
| { | |
| "entropy": 1.6566722989082336, | |
| "epoch": 0.11187297004691447, | |
| "grad_norm": 0.240234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1554898023605347, | |
| "mean_token_accuracy": 0.7262442111968994, | |
| "num_tokens": 45241886.0, | |
| "step": 155 | |
| }, | |
| { | |
| "entropy": 1.6134308576583862, | |
| "epoch": 0.11259473114399134, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1307741403579712, | |
| "mean_token_accuracy": 0.7296045422554016, | |
| "num_tokens": 45524820.0, | |
| "step": 156 | |
| }, | |
| { | |
| "entropy": 1.556853175163269, | |
| "epoch": 0.11331649224106821, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0568945407867432, | |
| "mean_token_accuracy": 0.7467330694198608, | |
| "num_tokens": 45823705.0, | |
| "step": 157 | |
| }, | |
| { | |
| "entropy": 1.6447445750236511, | |
| "epoch": 0.11403825333814507, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.182278037071228, | |
| "mean_token_accuracy": 0.7233797311782837, | |
| "num_tokens": 46120487.0, | |
| "step": 158 | |
| }, | |
| { | |
| "entropy": 1.6017093658447266, | |
| "epoch": 0.11476001443522194, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.090350866317749, | |
| "mean_token_accuracy": 0.7409092485904694, | |
| "num_tokens": 46420975.0, | |
| "step": 159 | |
| }, | |
| { | |
| "entropy": 1.588232398033142, | |
| "epoch": 0.11548177553229881, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1047959327697754, | |
| "mean_token_accuracy": 0.7373028099536896, | |
| "num_tokens": 46713947.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 1.5794617533683777, | |
| "epoch": 0.11620353662937567, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1010756492614746, | |
| "mean_token_accuracy": 0.7385070323944092, | |
| "num_tokens": 47011149.0, | |
| "step": 161 | |
| }, | |
| { | |
| "entropy": 1.5960445404052734, | |
| "epoch": 0.11692529772645255, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1273193359375, | |
| "mean_token_accuracy": 0.7358591258525848, | |
| "num_tokens": 47307350.0, | |
| "step": 162 | |
| }, | |
| { | |
| "entropy": 1.6217103004455566, | |
| "epoch": 0.11764705882352941, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1224124431610107, | |
| "mean_token_accuracy": 0.7337493896484375, | |
| "num_tokens": 47608324.0, | |
| "step": 163 | |
| }, | |
| { | |
| "entropy": 1.6066383123397827, | |
| "epoch": 0.11836881992060629, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0966877937316895, | |
| "mean_token_accuracy": 0.7392807304859161, | |
| "num_tokens": 47890347.0, | |
| "step": 164 | |
| }, | |
| { | |
| "entropy": 1.616932213306427, | |
| "epoch": 0.11909058101768315, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1433804035186768, | |
| "mean_token_accuracy": 0.7306821346282959, | |
| "num_tokens": 48197987.0, | |
| "step": 165 | |
| }, | |
| { | |
| "entropy": 1.630455732345581, | |
| "epoch": 0.11981234211476001, | |
| "grad_norm": 0.23828125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1604477167129517, | |
| "mean_token_accuracy": 0.7266038060188293, | |
| "num_tokens": 48478643.0, | |
| "step": 166 | |
| }, | |
| { | |
| "entropy": 1.620269775390625, | |
| "epoch": 0.12053410321183688, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1414644718170166, | |
| "mean_token_accuracy": 0.7304779589176178, | |
| "num_tokens": 48773729.0, | |
| "step": 167 | |
| }, | |
| { | |
| "entropy": 1.549348771572113, | |
| "epoch": 0.12125586430891375, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0744342803955078, | |
| "mean_token_accuracy": 0.7440922558307648, | |
| "num_tokens": 49073180.0, | |
| "step": 168 | |
| }, | |
| { | |
| "entropy": 1.6178706288337708, | |
| "epoch": 0.12197762540599062, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1490850448608398, | |
| "mean_token_accuracy": 0.7291074395179749, | |
| "num_tokens": 49358880.0, | |
| "step": 169 | |
| }, | |
| { | |
| "entropy": 1.6120877861976624, | |
| "epoch": 0.12269938650306748, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1199241876602173, | |
| "mean_token_accuracy": 0.7333821356296539, | |
| "num_tokens": 49649223.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 1.616333782672882, | |
| "epoch": 0.12342114760014435, | |
| "grad_norm": 0.234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1361584663391113, | |
| "mean_token_accuracy": 0.7322733998298645, | |
| "num_tokens": 49946365.0, | |
| "step": 171 | |
| }, | |
| { | |
| "entropy": 1.6349468231201172, | |
| "epoch": 0.12414290869722122, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1455856561660767, | |
| "mean_token_accuracy": 0.729077011346817, | |
| "num_tokens": 50231685.0, | |
| "step": 172 | |
| }, | |
| { | |
| "entropy": 1.5861142873764038, | |
| "epoch": 0.12486466979429808, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0667896270751953, | |
| "mean_token_accuracy": 0.7429926097393036, | |
| "num_tokens": 50529548.0, | |
| "step": 173 | |
| }, | |
| { | |
| "entropy": 1.697783648967743, | |
| "epoch": 0.12558643089137494, | |
| "grad_norm": 0.240234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.223460078239441, | |
| "mean_token_accuracy": 0.7153672575950623, | |
| "num_tokens": 50805752.0, | |
| "step": 174 | |
| }, | |
| { | |
| "entropy": 1.6097814440727234, | |
| "epoch": 0.12630819198845183, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.117940068244934, | |
| "mean_token_accuracy": 0.7336777746677399, | |
| "num_tokens": 51092908.0, | |
| "step": 175 | |
| }, | |
| { | |
| "entropy": 1.5442038178443909, | |
| "epoch": 0.1270299530855287, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0618278980255127, | |
| "mean_token_accuracy": 0.7468374967575073, | |
| "num_tokens": 51392606.0, | |
| "step": 176 | |
| }, | |
| { | |
| "entropy": 1.596940517425537, | |
| "epoch": 0.12775171418260556, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.102985143661499, | |
| "mean_token_accuracy": 0.7383373975753784, | |
| "num_tokens": 51686179.0, | |
| "step": 177 | |
| }, | |
| { | |
| "entropy": 1.6225216388702393, | |
| "epoch": 0.12847347527968242, | |
| "grad_norm": 0.244140625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1477952003479004, | |
| "mean_token_accuracy": 0.7299016118049622, | |
| "num_tokens": 51977932.0, | |
| "step": 178 | |
| }, | |
| { | |
| "entropy": 1.5693591833114624, | |
| "epoch": 0.12919523637675928, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.051598072052002, | |
| "mean_token_accuracy": 0.7483044266700745, | |
| "num_tokens": 52275537.0, | |
| "step": 179 | |
| }, | |
| { | |
| "entropy": 1.6091846823692322, | |
| "epoch": 0.12991699747383617, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1196022033691406, | |
| "mean_token_accuracy": 0.7327866554260254, | |
| "num_tokens": 52570765.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 1.6507315635681152, | |
| "epoch": 0.13063875857091303, | |
| "grad_norm": 0.2431640625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1470608711242676, | |
| "mean_token_accuracy": 0.7305409610271454, | |
| "num_tokens": 52857807.0, | |
| "step": 181 | |
| }, | |
| { | |
| "entropy": 1.5862771272659302, | |
| "epoch": 0.1313605196679899, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1336809396743774, | |
| "mean_token_accuracy": 0.7325917780399323, | |
| "num_tokens": 53162785.0, | |
| "step": 182 | |
| }, | |
| { | |
| "entropy": 1.5938412547111511, | |
| "epoch": 0.13208228076506676, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0940954685211182, | |
| "mean_token_accuracy": 0.7407487332820892, | |
| "num_tokens": 53459998.0, | |
| "step": 183 | |
| }, | |
| { | |
| "entropy": 1.5617841482162476, | |
| "epoch": 0.13280404186214362, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0803189277648926, | |
| "mean_token_accuracy": 0.7419928312301636, | |
| "num_tokens": 53767680.0, | |
| "step": 184 | |
| }, | |
| { | |
| "entropy": 1.5950895547866821, | |
| "epoch": 0.1335258029592205, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.140985369682312, | |
| "mean_token_accuracy": 0.7313254773616791, | |
| "num_tokens": 54065041.0, | |
| "step": 185 | |
| }, | |
| { | |
| "entropy": 1.6319683194160461, | |
| "epoch": 0.13424756405629737, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1067640781402588, | |
| "mean_token_accuracy": 0.7361359894275665, | |
| "num_tokens": 54359630.0, | |
| "step": 186 | |
| }, | |
| { | |
| "entropy": 1.5261160731315613, | |
| "epoch": 0.13496932515337423, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0494418144226074, | |
| "mean_token_accuracy": 0.7505187392234802, | |
| "num_tokens": 54663539.0, | |
| "step": 187 | |
| }, | |
| { | |
| "entropy": 1.6055362224578857, | |
| "epoch": 0.1356910862504511, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.112131118774414, | |
| "mean_token_accuracy": 0.7344719469547272, | |
| "num_tokens": 54953177.0, | |
| "step": 188 | |
| }, | |
| { | |
| "entropy": 1.5803040862083435, | |
| "epoch": 0.13641284734752795, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1010226011276245, | |
| "mean_token_accuracy": 0.7370550334453583, | |
| "num_tokens": 55260532.0, | |
| "step": 189 | |
| }, | |
| { | |
| "entropy": 1.5833672881126404, | |
| "epoch": 0.13713460844460484, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.099800944328308, | |
| "mean_token_accuracy": 0.7389223277568817, | |
| "num_tokens": 55549848.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 1.5840274691581726, | |
| "epoch": 0.1378563695416817, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1308026313781738, | |
| "mean_token_accuracy": 0.7336736619472504, | |
| "num_tokens": 55834039.0, | |
| "step": 191 | |
| }, | |
| { | |
| "entropy": 1.5514160990715027, | |
| "epoch": 0.13857813063875857, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0642787218093872, | |
| "mean_token_accuracy": 0.7461328506469727, | |
| "num_tokens": 56121586.0, | |
| "step": 192 | |
| }, | |
| { | |
| "entropy": 1.5501348972320557, | |
| "epoch": 0.13929989173583543, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.072887659072876, | |
| "mean_token_accuracy": 0.7418433427810669, | |
| "num_tokens": 56408407.0, | |
| "step": 193 | |
| }, | |
| { | |
| "entropy": 1.553143560886383, | |
| "epoch": 0.14002165283291232, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1032896041870117, | |
| "mean_token_accuracy": 0.7381362020969391, | |
| "num_tokens": 56715305.0, | |
| "step": 194 | |
| }, | |
| { | |
| "entropy": 1.7000082731246948, | |
| "epoch": 0.14074341392998918, | |
| "grad_norm": 0.2431640625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.239403486251831, | |
| "mean_token_accuracy": 0.7113346457481384, | |
| "num_tokens": 56992113.0, | |
| "step": 195 | |
| }, | |
| { | |
| "entropy": 1.7120900750160217, | |
| "epoch": 0.14146517502706604, | |
| "grad_norm": 0.244140625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.2463223934173584, | |
| "mean_token_accuracy": 0.7116808891296387, | |
| "num_tokens": 57252307.0, | |
| "step": 196 | |
| }, | |
| { | |
| "entropy": 1.5754520297050476, | |
| "epoch": 0.1421869361241429, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1114786863327026, | |
| "mean_token_accuracy": 0.7377319633960724, | |
| "num_tokens": 57543182.0, | |
| "step": 197 | |
| }, | |
| { | |
| "entropy": 1.5696571469306946, | |
| "epoch": 0.14290869722121977, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1020081043243408, | |
| "mean_token_accuracy": 0.7378035187721252, | |
| "num_tokens": 57842029.0, | |
| "step": 198 | |
| }, | |
| { | |
| "entropy": 1.6717844009399414, | |
| "epoch": 0.14363045831829666, | |
| "grad_norm": 0.240234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.2202887535095215, | |
| "mean_token_accuracy": 0.7136721909046173, | |
| "num_tokens": 58128283.0, | |
| "step": 199 | |
| }, | |
| { | |
| "entropy": 1.5761234164237976, | |
| "epoch": 0.14435221941537352, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0957626104354858, | |
| "mean_token_accuracy": 0.740239292383194, | |
| "num_tokens": 58435843.0, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 1.6337082982063293, | |
| "epoch": 0.14507398051245038, | |
| "grad_norm": 0.2392578125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1427903175354004, | |
| "mean_token_accuracy": 0.7304844856262207, | |
| "num_tokens": 58724269.0, | |
| "step": 201 | |
| }, | |
| { | |
| "entropy": 1.604508101940155, | |
| "epoch": 0.14579574160952724, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.126597285270691, | |
| "mean_token_accuracy": 0.7315808534622192, | |
| "num_tokens": 59013334.0, | |
| "step": 202 | |
| }, | |
| { | |
| "entropy": 1.5763038396835327, | |
| "epoch": 0.1465175027066041, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0807019472122192, | |
| "mean_token_accuracy": 0.7419225871562958, | |
| "num_tokens": 59297623.0, | |
| "step": 203 | |
| }, | |
| { | |
| "entropy": 1.600170075893402, | |
| "epoch": 0.147239263803681, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1386055946350098, | |
| "mean_token_accuracy": 0.7297577261924744, | |
| "num_tokens": 59590616.0, | |
| "step": 204 | |
| }, | |
| { | |
| "entropy": 1.595987617969513, | |
| "epoch": 0.14796102490075785, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.136359453201294, | |
| "mean_token_accuracy": 0.7322618365287781, | |
| "num_tokens": 59883661.0, | |
| "step": 205 | |
| }, | |
| { | |
| "entropy": 1.5879700183868408, | |
| "epoch": 0.14868278599783472, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1032295227050781, | |
| "mean_token_accuracy": 0.7377834022045135, | |
| "num_tokens": 60175961.0, | |
| "step": 206 | |
| }, | |
| { | |
| "entropy": 1.6706086993217468, | |
| "epoch": 0.14940454709491158, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.155031681060791, | |
| "mean_token_accuracy": 0.7300650477409363, | |
| "num_tokens": 60474578.0, | |
| "step": 207 | |
| }, | |
| { | |
| "entropy": 1.5186784863471985, | |
| "epoch": 0.15012630819198844, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0484704971313477, | |
| "mean_token_accuracy": 0.7492237687110901, | |
| "num_tokens": 60771512.0, | |
| "step": 208 | |
| }, | |
| { | |
| "entropy": 1.5801656246185303, | |
| "epoch": 0.15084806928906533, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.098115086555481, | |
| "mean_token_accuracy": 0.7389200925827026, | |
| "num_tokens": 61061842.0, | |
| "step": 209 | |
| }, | |
| { | |
| "entropy": 1.5672805905342102, | |
| "epoch": 0.1515698303861422, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0838905572891235, | |
| "mean_token_accuracy": 0.7417920827865601, | |
| "num_tokens": 61360326.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 1.579549491405487, | |
| "epoch": 0.15229159148321905, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0529100894927979, | |
| "mean_token_accuracy": 0.7480099499225616, | |
| "num_tokens": 61658449.0, | |
| "step": 211 | |
| }, | |
| { | |
| "entropy": 1.5598174333572388, | |
| "epoch": 0.1530133525802959, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0727403163909912, | |
| "mean_token_accuracy": 0.7430621087551117, | |
| "num_tokens": 61951239.0, | |
| "step": 212 | |
| }, | |
| { | |
| "entropy": 1.5908202528953552, | |
| "epoch": 0.1537351136773728, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1253775358200073, | |
| "mean_token_accuracy": 0.7318282127380371, | |
| "num_tokens": 62242751.0, | |
| "step": 213 | |
| }, | |
| { | |
| "entropy": 1.5687437057495117, | |
| "epoch": 0.15445687477444967, | |
| "grad_norm": 0.2392578125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1026842594146729, | |
| "mean_token_accuracy": 0.7367376983165741, | |
| "num_tokens": 62536601.0, | |
| "step": 214 | |
| }, | |
| { | |
| "entropy": 1.6023658514022827, | |
| "epoch": 0.15517863587152653, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1378355026245117, | |
| "mean_token_accuracy": 0.732462614774704, | |
| "num_tokens": 62819578.0, | |
| "step": 215 | |
| }, | |
| { | |
| "entropy": 1.6167571544647217, | |
| "epoch": 0.1559003969686034, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1253905296325684, | |
| "mean_token_accuracy": 0.7334583401679993, | |
| "num_tokens": 63108958.0, | |
| "step": 216 | |
| }, | |
| { | |
| "entropy": 1.6414763927459717, | |
| "epoch": 0.15662215806568025, | |
| "grad_norm": 0.25, | |
| "learning_rate": 1e-05, | |
| "loss": 1.17036771774292, | |
| "mean_token_accuracy": 0.7267955243587494, | |
| "num_tokens": 63381264.0, | |
| "step": 217 | |
| }, | |
| { | |
| "entropy": 1.5237603187561035, | |
| "epoch": 0.15734391916275714, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0499615669250488, | |
| "mean_token_accuracy": 0.7472123801708221, | |
| "num_tokens": 63682847.0, | |
| "step": 218 | |
| }, | |
| { | |
| "entropy": 1.5988153219223022, | |
| "epoch": 0.158065680259834, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.115735650062561, | |
| "mean_token_accuracy": 0.7355196475982666, | |
| "num_tokens": 63972004.0, | |
| "step": 219 | |
| }, | |
| { | |
| "entropy": 1.56021386384964, | |
| "epoch": 0.15878744135691086, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.084505319595337, | |
| "mean_token_accuracy": 0.7400890588760376, | |
| "num_tokens": 64268089.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 1.6091080904006958, | |
| "epoch": 0.15950920245398773, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1299200057983398, | |
| "mean_token_accuracy": 0.7308341264724731, | |
| "num_tokens": 64556329.0, | |
| "step": 221 | |
| }, | |
| { | |
| "entropy": 1.6860791444778442, | |
| "epoch": 0.1602309635510646, | |
| "grad_norm": 0.2392578125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.2056512832641602, | |
| "mean_token_accuracy": 0.7196942269802094, | |
| "num_tokens": 64830678.0, | |
| "step": 222 | |
| }, | |
| { | |
| "entropy": 1.5704833269119263, | |
| "epoch": 0.16095272464814148, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1010000705718994, | |
| "mean_token_accuracy": 0.7390528619289398, | |
| "num_tokens": 65125088.0, | |
| "step": 223 | |
| }, | |
| { | |
| "entropy": 1.564166247844696, | |
| "epoch": 0.16167448574521834, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0788018703460693, | |
| "mean_token_accuracy": 0.7453125715255737, | |
| "num_tokens": 65423252.0, | |
| "step": 224 | |
| }, | |
| { | |
| "entropy": 1.608834445476532, | |
| "epoch": 0.1623962468422952, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1163859367370605, | |
| "mean_token_accuracy": 0.7352893948554993, | |
| "num_tokens": 65716679.0, | |
| "step": 225 | |
| }, | |
| { | |
| "entropy": 1.5942700505256653, | |
| "epoch": 0.16311800793937206, | |
| "grad_norm": 0.234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0865188837051392, | |
| "mean_token_accuracy": 0.7389642298221588, | |
| "num_tokens": 66012003.0, | |
| "step": 226 | |
| }, | |
| { | |
| "entropy": 1.6661954522132874, | |
| "epoch": 0.16383976903644892, | |
| "grad_norm": 0.244140625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1896711587905884, | |
| "mean_token_accuracy": 0.7227528095245361, | |
| "num_tokens": 66285377.0, | |
| "step": 227 | |
| }, | |
| { | |
| "entropy": 1.5607267618179321, | |
| "epoch": 0.1645615301335258, | |
| "grad_norm": 0.2451171875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0866575241088867, | |
| "mean_token_accuracy": 0.7414681017398834, | |
| "num_tokens": 66566339.0, | |
| "step": 228 | |
| }, | |
| { | |
| "entropy": 1.5533599257469177, | |
| "epoch": 0.16528329123060267, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.047506332397461, | |
| "mean_token_accuracy": 0.7488828003406525, | |
| "num_tokens": 66862378.0, | |
| "step": 229 | |
| }, | |
| { | |
| "entropy": 1.573257029056549, | |
| "epoch": 0.16600505232767954, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1059901714324951, | |
| "mean_token_accuracy": 0.7356294393539429, | |
| "num_tokens": 67159780.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 1.5941730737686157, | |
| "epoch": 0.1667268134247564, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0761213302612305, | |
| "mean_token_accuracy": 0.7448401749134064, | |
| "num_tokens": 67444225.0, | |
| "step": 231 | |
| }, | |
| { | |
| "entropy": 1.6607208847999573, | |
| "epoch": 0.16744857452183326, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1537370681762695, | |
| "mean_token_accuracy": 0.7284188568592072, | |
| "num_tokens": 67718903.0, | |
| "step": 232 | |
| }, | |
| { | |
| "entropy": 1.5397337675094604, | |
| "epoch": 0.16817033561891015, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0568033456802368, | |
| "mean_token_accuracy": 0.7469969987869263, | |
| "num_tokens": 68015146.0, | |
| "step": 233 | |
| }, | |
| { | |
| "entropy": 1.5377466678619385, | |
| "epoch": 0.168892096715987, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0745047330856323, | |
| "mean_token_accuracy": 0.7433846592903137, | |
| "num_tokens": 68306835.0, | |
| "step": 234 | |
| }, | |
| { | |
| "entropy": 1.579733431339264, | |
| "epoch": 0.16961385781306387, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0858261585235596, | |
| "mean_token_accuracy": 0.7429001033306122, | |
| "num_tokens": 68602122.0, | |
| "step": 235 | |
| }, | |
| { | |
| "entropy": 1.5555682182312012, | |
| "epoch": 0.17033561891014073, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0346972942352295, | |
| "mean_token_accuracy": 0.750887006521225, | |
| "num_tokens": 68893611.0, | |
| "step": 236 | |
| }, | |
| { | |
| "entropy": 1.5493282079696655, | |
| "epoch": 0.17105738000721762, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0876100063323975, | |
| "mean_token_accuracy": 0.7397695779800415, | |
| "num_tokens": 69186774.0, | |
| "step": 237 | |
| }, | |
| { | |
| "entropy": 1.5804590582847595, | |
| "epoch": 0.17177914110429449, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.080456018447876, | |
| "mean_token_accuracy": 0.7402198612689972, | |
| "num_tokens": 69492654.0, | |
| "step": 238 | |
| }, | |
| { | |
| "entropy": 1.5888137221336365, | |
| "epoch": 0.17250090220137135, | |
| "grad_norm": 0.2412109375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.082033395767212, | |
| "mean_token_accuracy": 0.7414085268974304, | |
| "num_tokens": 69776475.0, | |
| "step": 239 | |
| }, | |
| { | |
| "entropy": 1.6491570472717285, | |
| "epoch": 0.1732226632984482, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1173239946365356, | |
| "mean_token_accuracy": 0.7356294989585876, | |
| "num_tokens": 70057080.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 1.5879584550857544, | |
| "epoch": 0.17394442439552507, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1218128204345703, | |
| "mean_token_accuracy": 0.7341985702514648, | |
| "num_tokens": 70354806.0, | |
| "step": 241 | |
| }, | |
| { | |
| "entropy": 1.6168740391731262, | |
| "epoch": 0.17466618549260196, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1141493320465088, | |
| "mean_token_accuracy": 0.7372621595859528, | |
| "num_tokens": 70636352.0, | |
| "step": 242 | |
| }, | |
| { | |
| "entropy": 1.5495715141296387, | |
| "epoch": 0.17538794658967882, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0647706985473633, | |
| "mean_token_accuracy": 0.7462162375450134, | |
| "num_tokens": 70919749.0, | |
| "step": 243 | |
| }, | |
| { | |
| "entropy": 1.6378893852233887, | |
| "epoch": 0.17610970768675568, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1448150873184204, | |
| "mean_token_accuracy": 0.730687826871872, | |
| "num_tokens": 71203503.0, | |
| "step": 244 | |
| }, | |
| { | |
| "entropy": 1.5832432508468628, | |
| "epoch": 0.17683146878383255, | |
| "grad_norm": 0.2421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1058218479156494, | |
| "mean_token_accuracy": 0.7381950318813324, | |
| "num_tokens": 71493217.0, | |
| "step": 245 | |
| }, | |
| { | |
| "entropy": 1.598574161529541, | |
| "epoch": 0.1775532298809094, | |
| "grad_norm": 0.234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.122831106185913, | |
| "mean_token_accuracy": 0.7311376929283142, | |
| "num_tokens": 71787650.0, | |
| "step": 246 | |
| }, | |
| { | |
| "entropy": 1.5889981985092163, | |
| "epoch": 0.1782749909779863, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1141576766967773, | |
| "mean_token_accuracy": 0.7340268492698669, | |
| "num_tokens": 72079734.0, | |
| "step": 247 | |
| }, | |
| { | |
| "entropy": 1.6443175077438354, | |
| "epoch": 0.17899675207506316, | |
| "grad_norm": 0.24609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1060137748718262, | |
| "mean_token_accuracy": 0.7362356781959534, | |
| "num_tokens": 72350849.0, | |
| "step": 248 | |
| }, | |
| { | |
| "entropy": 1.6835674047470093, | |
| "epoch": 0.17971851317214002, | |
| "grad_norm": 0.2421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.217130422592163, | |
| "mean_token_accuracy": 0.7220622003078461, | |
| "num_tokens": 72639592.0, | |
| "step": 249 | |
| }, | |
| { | |
| "entropy": 1.5901318788528442, | |
| "epoch": 0.18044027426921688, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1281319856643677, | |
| "mean_token_accuracy": 0.7342364490032196, | |
| "num_tokens": 72940263.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 1.5831700563430786, | |
| "epoch": 0.18116203536629374, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1345491409301758, | |
| "mean_token_accuracy": 0.7317695617675781, | |
| "num_tokens": 73235882.0, | |
| "step": 251 | |
| }, | |
| { | |
| "entropy": 1.6259974241256714, | |
| "epoch": 0.18188379646337063, | |
| "grad_norm": 0.23828125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1235109567642212, | |
| "mean_token_accuracy": 0.733851820230484, | |
| "num_tokens": 73518612.0, | |
| "step": 252 | |
| }, | |
| { | |
| "entropy": 1.6486980319023132, | |
| "epoch": 0.1826055575604475, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1762523651123047, | |
| "mean_token_accuracy": 0.7238500118255615, | |
| "num_tokens": 73816929.0, | |
| "step": 253 | |
| }, | |
| { | |
| "entropy": 1.5457316040992737, | |
| "epoch": 0.18332731865752436, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0686001777648926, | |
| "mean_token_accuracy": 0.7457734048366547, | |
| "num_tokens": 74115446.0, | |
| "step": 254 | |
| }, | |
| { | |
| "entropy": 1.6474520564079285, | |
| "epoch": 0.18404907975460122, | |
| "grad_norm": 0.2470703125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.137544870376587, | |
| "mean_token_accuracy": 0.7279047667980194, | |
| "num_tokens": 74400689.0, | |
| "step": 255 | |
| }, | |
| { | |
| "entropy": 1.6248449087142944, | |
| "epoch": 0.18477084085167808, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.166927456855774, | |
| "mean_token_accuracy": 0.7259635329246521, | |
| "num_tokens": 74696852.0, | |
| "step": 256 | |
| }, | |
| { | |
| "entropy": 1.549107015132904, | |
| "epoch": 0.18549260194875497, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0575287342071533, | |
| "mean_token_accuracy": 0.7450879514217377, | |
| "num_tokens": 74990708.0, | |
| "step": 257 | |
| }, | |
| { | |
| "entropy": 1.5585582256317139, | |
| "epoch": 0.18621436304583183, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0599555969238281, | |
| "mean_token_accuracy": 0.745728075504303, | |
| "num_tokens": 75288349.0, | |
| "step": 258 | |
| }, | |
| { | |
| "entropy": 1.6173226833343506, | |
| "epoch": 0.1869361241429087, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1172890663146973, | |
| "mean_token_accuracy": 0.7338734269142151, | |
| "num_tokens": 75575619.0, | |
| "step": 259 | |
| }, | |
| { | |
| "entropy": 1.5381511449813843, | |
| "epoch": 0.18765788523998556, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0807030200958252, | |
| "mean_token_accuracy": 0.7421856224536896, | |
| "num_tokens": 75874826.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 1.5512885451316833, | |
| "epoch": 0.18837964633706245, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0178349018096924, | |
| "mean_token_accuracy": 0.7539387345314026, | |
| "num_tokens": 76147754.0, | |
| "step": 261 | |
| }, | |
| { | |
| "entropy": 1.6587774157524109, | |
| "epoch": 0.1891014074341393, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.179133415222168, | |
| "mean_token_accuracy": 0.7226833999156952, | |
| "num_tokens": 76431544.0, | |
| "step": 262 | |
| }, | |
| { | |
| "entropy": 1.6380688548088074, | |
| "epoch": 0.18982316853121617, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.143570065498352, | |
| "mean_token_accuracy": 0.730453372001648, | |
| "num_tokens": 76719372.0, | |
| "step": 263 | |
| }, | |
| { | |
| "entropy": 1.617394506931305, | |
| "epoch": 0.19054492962829303, | |
| "grad_norm": 0.240234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.116464376449585, | |
| "mean_token_accuracy": 0.7363521158695221, | |
| "num_tokens": 76993195.0, | |
| "step": 264 | |
| }, | |
| { | |
| "entropy": 1.5907557606697083, | |
| "epoch": 0.1912666907253699, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.079745888710022, | |
| "mean_token_accuracy": 0.7418153882026672, | |
| "num_tokens": 77288134.0, | |
| "step": 265 | |
| }, | |
| { | |
| "entropy": 1.551349401473999, | |
| "epoch": 0.19198845182244678, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0483108758926392, | |
| "mean_token_accuracy": 0.7472957074642181, | |
| "num_tokens": 77579504.0, | |
| "step": 266 | |
| }, | |
| { | |
| "entropy": 1.634832739830017, | |
| "epoch": 0.19271021291952364, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1494357585906982, | |
| "mean_token_accuracy": 0.7278711795806885, | |
| "num_tokens": 77867989.0, | |
| "step": 267 | |
| }, | |
| { | |
| "entropy": 1.5733810067176819, | |
| "epoch": 0.1934319740166005, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0633783340454102, | |
| "mean_token_accuracy": 0.7452329695224762, | |
| "num_tokens": 78161998.0, | |
| "step": 268 | |
| }, | |
| { | |
| "entropy": 1.550721824169159, | |
| "epoch": 0.19415373511367737, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0748469829559326, | |
| "mean_token_accuracy": 0.7441512942314148, | |
| "num_tokens": 78464748.0, | |
| "step": 269 | |
| }, | |
| { | |
| "entropy": 1.5564887523651123, | |
| "epoch": 0.19487549621075423, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0867667198181152, | |
| "mean_token_accuracy": 0.738989531993866, | |
| "num_tokens": 78749250.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 1.654929518699646, | |
| "epoch": 0.19559725730783112, | |
| "grad_norm": 0.2412109375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.161920428276062, | |
| "mean_token_accuracy": 0.7269494533538818, | |
| "num_tokens": 79028963.0, | |
| "step": 271 | |
| }, | |
| { | |
| "entropy": 1.6154484748840332, | |
| "epoch": 0.19631901840490798, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1183388233184814, | |
| "mean_token_accuracy": 0.7369323670864105, | |
| "num_tokens": 79311328.0, | |
| "step": 272 | |
| }, | |
| { | |
| "entropy": 1.6132763028144836, | |
| "epoch": 0.19704077950198484, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0926940441131592, | |
| "mean_token_accuracy": 0.7412376403808594, | |
| "num_tokens": 79604811.0, | |
| "step": 273 | |
| }, | |
| { | |
| "entropy": 1.6336576342582703, | |
| "epoch": 0.1977625405990617, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1127973794937134, | |
| "mean_token_accuracy": 0.7360274195671082, | |
| "num_tokens": 79885611.0, | |
| "step": 274 | |
| }, | |
| { | |
| "entropy": 1.6097277402877808, | |
| "epoch": 0.19848430169613857, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.087917447090149, | |
| "mean_token_accuracy": 0.7404178082942963, | |
| "num_tokens": 80176286.0, | |
| "step": 275 | |
| }, | |
| { | |
| "entropy": 1.5881891250610352, | |
| "epoch": 0.19920606279321545, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1032476425170898, | |
| "mean_token_accuracy": 0.7352743744850159, | |
| "num_tokens": 80469540.0, | |
| "step": 276 | |
| }, | |
| { | |
| "entropy": 1.6285476684570312, | |
| "epoch": 0.19992782389029232, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.148996114730835, | |
| "mean_token_accuracy": 0.729354053735733, | |
| "num_tokens": 80766125.0, | |
| "step": 277 | |
| }, | |
| { | |
| "entropy": 1.559075951576233, | |
| "epoch": 0.20064958498736918, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0810372829437256, | |
| "mean_token_accuracy": 0.7454397082328796, | |
| "num_tokens": 81060927.0, | |
| "step": 278 | |
| }, | |
| { | |
| "entropy": 1.6498482823371887, | |
| "epoch": 0.20137134608444604, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1383163928985596, | |
| "mean_token_accuracy": 0.7311033010482788, | |
| "num_tokens": 81342818.0, | |
| "step": 279 | |
| }, | |
| { | |
| "entropy": 1.6028773188591003, | |
| "epoch": 0.20209310718152293, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1319961547851562, | |
| "mean_token_accuracy": 0.733493447303772, | |
| "num_tokens": 81642266.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 1.5628262162208557, | |
| "epoch": 0.2028148682785998, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0875414609909058, | |
| "mean_token_accuracy": 0.7420202493667603, | |
| "num_tokens": 81934778.0, | |
| "step": 281 | |
| }, | |
| { | |
| "entropy": 1.5558908581733704, | |
| "epoch": 0.20353662937567665, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0800241231918335, | |
| "mean_token_accuracy": 0.7427114546298981, | |
| "num_tokens": 82250510.0, | |
| "step": 282 | |
| }, | |
| { | |
| "entropy": 1.5173797011375427, | |
| "epoch": 0.20425839047275351, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0458990335464478, | |
| "mean_token_accuracy": 0.7488558888435364, | |
| "num_tokens": 82549742.0, | |
| "step": 283 | |
| }, | |
| { | |
| "entropy": 1.5814528465270996, | |
| "epoch": 0.20498015156983038, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1072943210601807, | |
| "mean_token_accuracy": 0.7369115948677063, | |
| "num_tokens": 82847821.0, | |
| "step": 284 | |
| }, | |
| { | |
| "entropy": 1.6451613903045654, | |
| "epoch": 0.20570191266690727, | |
| "grad_norm": 0.234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1332635879516602, | |
| "mean_token_accuracy": 0.7314316928386688, | |
| "num_tokens": 83129689.0, | |
| "step": 285 | |
| }, | |
| { | |
| "entropy": 1.5618088245391846, | |
| "epoch": 0.20642367376398413, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0615777969360352, | |
| "mean_token_accuracy": 0.7434300184249878, | |
| "num_tokens": 83430211.0, | |
| "step": 286 | |
| }, | |
| { | |
| "entropy": 1.5673176646232605, | |
| "epoch": 0.207145434861061, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0742175579071045, | |
| "mean_token_accuracy": 0.7424734532833099, | |
| "num_tokens": 83724983.0, | |
| "step": 287 | |
| }, | |
| { | |
| "entropy": 1.5345569252967834, | |
| "epoch": 0.20786719595813785, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.065215826034546, | |
| "mean_token_accuracy": 0.7459068596363068, | |
| "num_tokens": 84030639.0, | |
| "step": 288 | |
| }, | |
| { | |
| "entropy": 1.6994251012802124, | |
| "epoch": 0.2085889570552147, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.196901798248291, | |
| "mean_token_accuracy": 0.7201257050037384, | |
| "num_tokens": 84315192.0, | |
| "step": 289 | |
| }, | |
| { | |
| "entropy": 1.5231322646141052, | |
| "epoch": 0.2093107181522916, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.053889274597168, | |
| "mean_token_accuracy": 0.7466905117034912, | |
| "num_tokens": 84606369.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 1.6492601037025452, | |
| "epoch": 0.21003247924936846, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1400116682052612, | |
| "mean_token_accuracy": 0.7312108874320984, | |
| "num_tokens": 84895850.0, | |
| "step": 291 | |
| }, | |
| { | |
| "entropy": 1.5784066915512085, | |
| "epoch": 0.21075424034644533, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.102087140083313, | |
| "mean_token_accuracy": 0.7349793910980225, | |
| "num_tokens": 85183171.0, | |
| "step": 292 | |
| }, | |
| { | |
| "entropy": 1.5913513898849487, | |
| "epoch": 0.2114760014435222, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0935804843902588, | |
| "mean_token_accuracy": 0.7383421361446381, | |
| "num_tokens": 85474059.0, | |
| "step": 293 | |
| }, | |
| { | |
| "entropy": 1.6026569604873657, | |
| "epoch": 0.21219776254059905, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0891571044921875, | |
| "mean_token_accuracy": 0.7408276498317719, | |
| "num_tokens": 85768105.0, | |
| "step": 294 | |
| }, | |
| { | |
| "entropy": 1.5164355039596558, | |
| "epoch": 0.21291952363767594, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0366206169128418, | |
| "mean_token_accuracy": 0.7495356202125549, | |
| "num_tokens": 86069230.0, | |
| "step": 295 | |
| }, | |
| { | |
| "entropy": 1.6361202001571655, | |
| "epoch": 0.2136412847347528, | |
| "grad_norm": 0.234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.140409231185913, | |
| "mean_token_accuracy": 0.732457160949707, | |
| "num_tokens": 86353691.0, | |
| "step": 296 | |
| }, | |
| { | |
| "entropy": 1.58295476436615, | |
| "epoch": 0.21436304583182966, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1056493520736694, | |
| "mean_token_accuracy": 0.7381295561790466, | |
| "num_tokens": 86653177.0, | |
| "step": 297 | |
| }, | |
| { | |
| "entropy": 1.6271981000900269, | |
| "epoch": 0.21508480692890652, | |
| "grad_norm": 0.2451171875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1379573345184326, | |
| "mean_token_accuracy": 0.73199063539505, | |
| "num_tokens": 86938210.0, | |
| "step": 298 | |
| }, | |
| { | |
| "entropy": 1.5393192768096924, | |
| "epoch": 0.2158065680259834, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0658035278320312, | |
| "mean_token_accuracy": 0.7423197031021118, | |
| "num_tokens": 87235202.0, | |
| "step": 299 | |
| }, | |
| { | |
| "entropy": 1.6467152833938599, | |
| "epoch": 0.21652832912306028, | |
| "grad_norm": 0.2392578125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1680774688720703, | |
| "mean_token_accuracy": 0.7262611985206604, | |
| "num_tokens": 87528216.0, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 1.5831130146980286, | |
| "epoch": 0.21725009022013714, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0638785362243652, | |
| "mean_token_accuracy": 0.7456824779510498, | |
| "num_tokens": 87825631.0, | |
| "step": 301 | |
| }, | |
| { | |
| "entropy": 1.616044521331787, | |
| "epoch": 0.217971851317214, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1280238628387451, | |
| "mean_token_accuracy": 0.7316821217536926, | |
| "num_tokens": 88123002.0, | |
| "step": 302 | |
| }, | |
| { | |
| "entropy": 1.6430625319480896, | |
| "epoch": 0.21869361241429086, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1628177165985107, | |
| "mean_token_accuracy": 0.7282373607158661, | |
| "num_tokens": 88421530.0, | |
| "step": 303 | |
| }, | |
| { | |
| "entropy": 1.6133695840835571, | |
| "epoch": 0.21941537351136775, | |
| "grad_norm": 0.234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1160101890563965, | |
| "mean_token_accuracy": 0.7356510162353516, | |
| "num_tokens": 88707601.0, | |
| "step": 304 | |
| }, | |
| { | |
| "entropy": 1.5547195672988892, | |
| "epoch": 0.2201371346084446, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0515682697296143, | |
| "mean_token_accuracy": 0.7457767724990845, | |
| "num_tokens": 89007658.0, | |
| "step": 305 | |
| }, | |
| { | |
| "entropy": 1.6346923112869263, | |
| "epoch": 0.22085889570552147, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1421478986740112, | |
| "mean_token_accuracy": 0.7313405871391296, | |
| "num_tokens": 89288459.0, | |
| "step": 306 | |
| }, | |
| { | |
| "entropy": 1.608325481414795, | |
| "epoch": 0.22158065680259834, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1340771913528442, | |
| "mean_token_accuracy": 0.7317951321601868, | |
| "num_tokens": 89565751.0, | |
| "step": 307 | |
| }, | |
| { | |
| "entropy": 1.5642022490501404, | |
| "epoch": 0.2223024178996752, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0933425426483154, | |
| "mean_token_accuracy": 0.7372935116291046, | |
| "num_tokens": 89864669.0, | |
| "step": 308 | |
| }, | |
| { | |
| "entropy": 1.6126330494880676, | |
| "epoch": 0.2230241789967521, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1483060121536255, | |
| "mean_token_accuracy": 0.7313107848167419, | |
| "num_tokens": 90167200.0, | |
| "step": 309 | |
| }, | |
| { | |
| "entropy": 1.5908660888671875, | |
| "epoch": 0.22374594009382895, | |
| "grad_norm": 0.2470703125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0998247861862183, | |
| "mean_token_accuracy": 0.7387517094612122, | |
| "num_tokens": 90440018.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 1.6193674802780151, | |
| "epoch": 0.2244677011909058, | |
| "grad_norm": 0.24609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1553058624267578, | |
| "mean_token_accuracy": 0.728503406047821, | |
| "num_tokens": 90720592.0, | |
| "step": 311 | |
| }, | |
| { | |
| "entropy": 1.6450672149658203, | |
| "epoch": 0.22518946228798267, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.183763027191162, | |
| "mean_token_accuracy": 0.7229969501495361, | |
| "num_tokens": 91006905.0, | |
| "step": 312 | |
| }, | |
| { | |
| "entropy": 1.6283334493637085, | |
| "epoch": 0.22591122338505953, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1556572914123535, | |
| "mean_token_accuracy": 0.7277766168117523, | |
| "num_tokens": 91303086.0, | |
| "step": 313 | |
| }, | |
| { | |
| "entropy": 1.5947458148002625, | |
| "epoch": 0.22663298448213642, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1113152503967285, | |
| "mean_token_accuracy": 0.7374732494354248, | |
| "num_tokens": 91598453.0, | |
| "step": 314 | |
| }, | |
| { | |
| "entropy": 1.5744538307189941, | |
| "epoch": 0.22735474557921329, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0567553043365479, | |
| "mean_token_accuracy": 0.7447769343852997, | |
| "num_tokens": 91881390.0, | |
| "step": 315 | |
| }, | |
| { | |
| "entropy": 1.5623655915260315, | |
| "epoch": 0.22807650667629015, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.062221884727478, | |
| "mean_token_accuracy": 0.7462704479694366, | |
| "num_tokens": 92173971.0, | |
| "step": 316 | |
| }, | |
| { | |
| "entropy": 1.610425055027008, | |
| "epoch": 0.228798267773367, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.111588716506958, | |
| "mean_token_accuracy": 0.7339754700660706, | |
| "num_tokens": 92456616.0, | |
| "step": 317 | |
| }, | |
| { | |
| "entropy": 1.5818783640861511, | |
| "epoch": 0.22952002887044387, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0868871212005615, | |
| "mean_token_accuracy": 0.7412054240703583, | |
| "num_tokens": 92763665.0, | |
| "step": 318 | |
| }, | |
| { | |
| "entropy": 1.672781527042389, | |
| "epoch": 0.23024178996752076, | |
| "grad_norm": 0.234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.203850269317627, | |
| "mean_token_accuracy": 0.7183066606521606, | |
| "num_tokens": 93044918.0, | |
| "step": 319 | |
| }, | |
| { | |
| "entropy": 1.5833418369293213, | |
| "epoch": 0.23096355106459762, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0991994142532349, | |
| "mean_token_accuracy": 0.7396262884140015, | |
| "num_tokens": 93353242.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 1.5691972374916077, | |
| "epoch": 0.23168531216167448, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0766429901123047, | |
| "mean_token_accuracy": 0.7417539358139038, | |
| "num_tokens": 93637475.0, | |
| "step": 321 | |
| }, | |
| { | |
| "entropy": 1.6654770970344543, | |
| "epoch": 0.23240707325875135, | |
| "grad_norm": 0.2431640625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1741797924041748, | |
| "mean_token_accuracy": 0.7250281572341919, | |
| "num_tokens": 93915523.0, | |
| "step": 322 | |
| }, | |
| { | |
| "entropy": 1.6544506549835205, | |
| "epoch": 0.2331288343558282, | |
| "grad_norm": 0.25390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.2018771171569824, | |
| "mean_token_accuracy": 0.7165911197662354, | |
| "num_tokens": 94198902.0, | |
| "step": 323 | |
| }, | |
| { | |
| "entropy": 1.6063737273216248, | |
| "epoch": 0.2338505954529051, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1512327194213867, | |
| "mean_token_accuracy": 0.7309001088142395, | |
| "num_tokens": 94490643.0, | |
| "step": 324 | |
| }, | |
| { | |
| "entropy": 1.6191027760505676, | |
| "epoch": 0.23457235654998196, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.162632703781128, | |
| "mean_token_accuracy": 0.727214515209198, | |
| "num_tokens": 94782189.0, | |
| "step": 325 | |
| }, | |
| { | |
| "entropy": 1.5912017226219177, | |
| "epoch": 0.23529411764705882, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1059738397598267, | |
| "mean_token_accuracy": 0.737178772687912, | |
| "num_tokens": 95077021.0, | |
| "step": 326 | |
| }, | |
| { | |
| "entropy": 1.5529471039772034, | |
| "epoch": 0.23601587874413568, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0771679878234863, | |
| "mean_token_accuracy": 0.7431033849716187, | |
| "num_tokens": 95366535.0, | |
| "step": 327 | |
| }, | |
| { | |
| "entropy": 1.466816782951355, | |
| "epoch": 0.23673763984121257, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.9721765518188477, | |
| "mean_token_accuracy": 0.7615460455417633, | |
| "num_tokens": 95673375.0, | |
| "step": 328 | |
| }, | |
| { | |
| "entropy": 1.5658625364303589, | |
| "epoch": 0.23745940093828943, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0420087575912476, | |
| "mean_token_accuracy": 0.7472658455371857, | |
| "num_tokens": 95973948.0, | |
| "step": 329 | |
| }, | |
| { | |
| "entropy": 1.5709980726242065, | |
| "epoch": 0.2381811620353663, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0947855710983276, | |
| "mean_token_accuracy": 0.7376574575901031, | |
| "num_tokens": 96276029.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 1.6148045063018799, | |
| "epoch": 0.23890292313244316, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1201262474060059, | |
| "mean_token_accuracy": 0.7361614406108856, | |
| "num_tokens": 96571377.0, | |
| "step": 331 | |
| }, | |
| { | |
| "entropy": 1.6669683456420898, | |
| "epoch": 0.23962468422952002, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1661792993545532, | |
| "mean_token_accuracy": 0.7251894474029541, | |
| "num_tokens": 96861214.0, | |
| "step": 332 | |
| }, | |
| { | |
| "entropy": 1.6179706454277039, | |
| "epoch": 0.2403464453265969, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1275434494018555, | |
| "mean_token_accuracy": 0.7350395321846008, | |
| "num_tokens": 97174161.0, | |
| "step": 333 | |
| }, | |
| { | |
| "entropy": 1.6105008125305176, | |
| "epoch": 0.24106820642367377, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.137376308441162, | |
| "mean_token_accuracy": 0.7327196896076202, | |
| "num_tokens": 97468749.0, | |
| "step": 334 | |
| }, | |
| { | |
| "entropy": 1.566885769367218, | |
| "epoch": 0.24178996752075063, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0893937349319458, | |
| "mean_token_accuracy": 0.7376500964164734, | |
| "num_tokens": 97760613.0, | |
| "step": 335 | |
| }, | |
| { | |
| "entropy": 1.5833361148834229, | |
| "epoch": 0.2425117286178275, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.103290319442749, | |
| "mean_token_accuracy": 0.7371641397476196, | |
| "num_tokens": 98051031.0, | |
| "step": 336 | |
| }, | |
| { | |
| "entropy": 1.6214879751205444, | |
| "epoch": 0.24323348971490436, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1597447395324707, | |
| "mean_token_accuracy": 0.7275690734386444, | |
| "num_tokens": 98346849.0, | |
| "step": 337 | |
| }, | |
| { | |
| "entropy": 1.635483741760254, | |
| "epoch": 0.24395525081198124, | |
| "grad_norm": 0.23828125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1274981498718262, | |
| "mean_token_accuracy": 0.7341068983078003, | |
| "num_tokens": 98630349.0, | |
| "step": 338 | |
| }, | |
| { | |
| "entropy": 1.606435239315033, | |
| "epoch": 0.2446770119090581, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1290794610977173, | |
| "mean_token_accuracy": 0.7340087592601776, | |
| "num_tokens": 98914926.0, | |
| "step": 339 | |
| }, | |
| { | |
| "entropy": 1.636328935623169, | |
| "epoch": 0.24539877300613497, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1395862102508545, | |
| "mean_token_accuracy": 0.7321614921092987, | |
| "num_tokens": 99206730.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 1.566875696182251, | |
| "epoch": 0.24612053410321183, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.078291893005371, | |
| "mean_token_accuracy": 0.7421151399612427, | |
| "num_tokens": 99509074.0, | |
| "step": 341 | |
| }, | |
| { | |
| "entropy": 1.5805391669273376, | |
| "epoch": 0.2468422952002887, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0995038747787476, | |
| "mean_token_accuracy": 0.7372367680072784, | |
| "num_tokens": 99813266.0, | |
| "step": 342 | |
| }, | |
| { | |
| "entropy": 1.6048060655593872, | |
| "epoch": 0.24756405629736558, | |
| "grad_norm": 0.2392578125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1160517930984497, | |
| "mean_token_accuracy": 0.7343011498451233, | |
| "num_tokens": 100092623.0, | |
| "step": 343 | |
| }, | |
| { | |
| "entropy": 1.6094213128089905, | |
| "epoch": 0.24828581739444244, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1342377662658691, | |
| "mean_token_accuracy": 0.7326390743255615, | |
| "num_tokens": 100390200.0, | |
| "step": 344 | |
| }, | |
| { | |
| "entropy": 1.6181209683418274, | |
| "epoch": 0.2490075784915193, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1213080883026123, | |
| "mean_token_accuracy": 0.7341360747814178, | |
| "num_tokens": 100673812.0, | |
| "step": 345 | |
| }, | |
| { | |
| "entropy": 1.571325957775116, | |
| "epoch": 0.24972933958859617, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0748450756072998, | |
| "mean_token_accuracy": 0.7429514825344086, | |
| "num_tokens": 100983568.0, | |
| "step": 346 | |
| }, | |
| { | |
| "entropy": 1.5778157711029053, | |
| "epoch": 0.25045110068567306, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0720490217208862, | |
| "mean_token_accuracy": 0.742848813533783, | |
| "num_tokens": 101281558.0, | |
| "step": 347 | |
| }, | |
| { | |
| "entropy": 1.6091675162315369, | |
| "epoch": 0.2511728617827499, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1079795360565186, | |
| "mean_token_accuracy": 0.7359097301959991, | |
| "num_tokens": 101574482.0, | |
| "step": 348 | |
| }, | |
| { | |
| "entropy": 1.659930169582367, | |
| "epoch": 0.2518946228798268, | |
| "grad_norm": 0.24609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.161527395248413, | |
| "mean_token_accuracy": 0.7259911298751831, | |
| "num_tokens": 101848234.0, | |
| "step": 349 | |
| }, | |
| { | |
| "entropy": 1.6411542892456055, | |
| "epoch": 0.25261638397690367, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.191873550415039, | |
| "mean_token_accuracy": 0.7206378877162933, | |
| "num_tokens": 102130348.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 1.5794683694839478, | |
| "epoch": 0.2533381450739805, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.076074242591858, | |
| "mean_token_accuracy": 0.74205082654953, | |
| "num_tokens": 102433331.0, | |
| "step": 351 | |
| }, | |
| { | |
| "entropy": 1.548401951789856, | |
| "epoch": 0.2540599061710574, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0553632974624634, | |
| "mean_token_accuracy": 0.7475990355014801, | |
| "num_tokens": 102730834.0, | |
| "step": 352 | |
| }, | |
| { | |
| "entropy": 1.603158414363861, | |
| "epoch": 0.2547816672681342, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1156569719314575, | |
| "mean_token_accuracy": 0.7362101376056671, | |
| "num_tokens": 103019642.0, | |
| "step": 353 | |
| }, | |
| { | |
| "entropy": 1.63592129945755, | |
| "epoch": 0.2555034283652111, | |
| "grad_norm": 0.2421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1644940376281738, | |
| "mean_token_accuracy": 0.726719081401825, | |
| "num_tokens": 103304650.0, | |
| "step": 354 | |
| }, | |
| { | |
| "entropy": 1.633001446723938, | |
| "epoch": 0.256225189462288, | |
| "grad_norm": 0.2412109375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1229968070983887, | |
| "mean_token_accuracy": 0.7319926619529724, | |
| "num_tokens": 103587081.0, | |
| "step": 355 | |
| }, | |
| { | |
| "entropy": 1.6421423554420471, | |
| "epoch": 0.25694695055936484, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1512815952301025, | |
| "mean_token_accuracy": 0.7283817231655121, | |
| "num_tokens": 103871593.0, | |
| "step": 356 | |
| }, | |
| { | |
| "entropy": 1.582658290863037, | |
| "epoch": 0.25766871165644173, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1154148578643799, | |
| "mean_token_accuracy": 0.7344390451908112, | |
| "num_tokens": 104156755.0, | |
| "step": 357 | |
| }, | |
| { | |
| "entropy": 1.630010724067688, | |
| "epoch": 0.25839047275351856, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1764171123504639, | |
| "mean_token_accuracy": 0.7254190146923065, | |
| "num_tokens": 104454102.0, | |
| "step": 358 | |
| }, | |
| { | |
| "entropy": 1.596200406551361, | |
| "epoch": 0.25911223385059545, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1375744342803955, | |
| "mean_token_accuracy": 0.7316363751888275, | |
| "num_tokens": 104753234.0, | |
| "step": 359 | |
| }, | |
| { | |
| "entropy": 1.5771100521087646, | |
| "epoch": 0.25983399494767234, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0960545539855957, | |
| "mean_token_accuracy": 0.7377767860889435, | |
| "num_tokens": 105048228.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 1.5988454818725586, | |
| "epoch": 0.2605557560447492, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0996403694152832, | |
| "mean_token_accuracy": 0.7435691952705383, | |
| "num_tokens": 105334256.0, | |
| "step": 361 | |
| }, | |
| { | |
| "entropy": 1.5978621244430542, | |
| "epoch": 0.26127751714182607, | |
| "grad_norm": 0.2392578125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0966078042984009, | |
| "mean_token_accuracy": 0.7385125458240509, | |
| "num_tokens": 105616215.0, | |
| "step": 362 | |
| }, | |
| { | |
| "entropy": 1.6075668931007385, | |
| "epoch": 0.2619992782389029, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1161551475524902, | |
| "mean_token_accuracy": 0.7361478209495544, | |
| "num_tokens": 105916879.0, | |
| "step": 363 | |
| }, | |
| { | |
| "entropy": 1.6536166071891785, | |
| "epoch": 0.2627210393359798, | |
| "grad_norm": 0.2392578125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1494966745376587, | |
| "mean_token_accuracy": 0.7290840446949005, | |
| "num_tokens": 106190489.0, | |
| "step": 364 | |
| }, | |
| { | |
| "entropy": 1.588008999824524, | |
| "epoch": 0.2634428004330567, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0926753282546997, | |
| "mean_token_accuracy": 0.7393996119499207, | |
| "num_tokens": 106489485.0, | |
| "step": 365 | |
| }, | |
| { | |
| "entropy": 1.594562292098999, | |
| "epoch": 0.2641645615301335, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1209088563919067, | |
| "mean_token_accuracy": 0.7329501509666443, | |
| "num_tokens": 106781615.0, | |
| "step": 366 | |
| }, | |
| { | |
| "entropy": 1.577095866203308, | |
| "epoch": 0.2648863226272104, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1085336208343506, | |
| "mean_token_accuracy": 0.7375071942806244, | |
| "num_tokens": 107086646.0, | |
| "step": 367 | |
| }, | |
| { | |
| "entropy": 1.600272297859192, | |
| "epoch": 0.26560808372428724, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.097273826599121, | |
| "mean_token_accuracy": 0.7393884658813477, | |
| "num_tokens": 107379619.0, | |
| "step": 368 | |
| }, | |
| { | |
| "entropy": 1.635534942150116, | |
| "epoch": 0.2663298448213641, | |
| "grad_norm": 0.2421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1447855234146118, | |
| "mean_token_accuracy": 0.7306419312953949, | |
| "num_tokens": 107667160.0, | |
| "step": 369 | |
| }, | |
| { | |
| "entropy": 1.591282844543457, | |
| "epoch": 0.267051605918441, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0821408033370972, | |
| "mean_token_accuracy": 0.7417797744274139, | |
| "num_tokens": 107951740.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 1.5657145977020264, | |
| "epoch": 0.26777336701551785, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0968927145004272, | |
| "mean_token_accuracy": 0.7399294376373291, | |
| "num_tokens": 108258765.0, | |
| "step": 371 | |
| }, | |
| { | |
| "entropy": 1.6584608554840088, | |
| "epoch": 0.26849512811259474, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1994308233261108, | |
| "mean_token_accuracy": 0.71898153424263, | |
| "num_tokens": 108547310.0, | |
| "step": 372 | |
| }, | |
| { | |
| "entropy": 1.6167534589767456, | |
| "epoch": 0.2692168892096716, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0867693424224854, | |
| "mean_token_accuracy": 0.7407282590866089, | |
| "num_tokens": 108847386.0, | |
| "step": 373 | |
| }, | |
| { | |
| "entropy": 1.6013047099113464, | |
| "epoch": 0.26993865030674846, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.15181303024292, | |
| "mean_token_accuracy": 0.7297594249248505, | |
| "num_tokens": 109138934.0, | |
| "step": 374 | |
| }, | |
| { | |
| "entropy": 1.616756558418274, | |
| "epoch": 0.27066041140382535, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1475341320037842, | |
| "mean_token_accuracy": 0.7298292219638824, | |
| "num_tokens": 109432724.0, | |
| "step": 375 | |
| }, | |
| { | |
| "entropy": 1.5823055505752563, | |
| "epoch": 0.2713821725009022, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0748032331466675, | |
| "mean_token_accuracy": 0.7434148192405701, | |
| "num_tokens": 109711458.0, | |
| "step": 376 | |
| }, | |
| { | |
| "entropy": 1.6488802433013916, | |
| "epoch": 0.2721039335979791, | |
| "grad_norm": 0.240234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1251047849655151, | |
| "mean_token_accuracy": 0.7332720160484314, | |
| "num_tokens": 109995098.0, | |
| "step": 377 | |
| }, | |
| { | |
| "entropy": 1.56565260887146, | |
| "epoch": 0.2728256946950559, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0607374906539917, | |
| "mean_token_accuracy": 0.745273768901825, | |
| "num_tokens": 110296661.0, | |
| "step": 378 | |
| }, | |
| { | |
| "entropy": 1.606612205505371, | |
| "epoch": 0.2735474557921328, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1090481281280518, | |
| "mean_token_accuracy": 0.7364325225353241, | |
| "num_tokens": 110592997.0, | |
| "step": 379 | |
| }, | |
| { | |
| "entropy": 1.6048057079315186, | |
| "epoch": 0.2742692168892097, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.08303701877594, | |
| "mean_token_accuracy": 0.7388699352741241, | |
| "num_tokens": 110880944.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 1.6027131080627441, | |
| "epoch": 0.2749909779862865, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1006629467010498, | |
| "mean_token_accuracy": 0.7390763759613037, | |
| "num_tokens": 111172500.0, | |
| "step": 381 | |
| }, | |
| { | |
| "entropy": 1.6541957259178162, | |
| "epoch": 0.2757127390833634, | |
| "grad_norm": 0.234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.164910912513733, | |
| "mean_token_accuracy": 0.7267945408821106, | |
| "num_tokens": 111455351.0, | |
| "step": 382 | |
| }, | |
| { | |
| "entropy": 1.5504454374313354, | |
| "epoch": 0.2764345001804403, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.055530309677124, | |
| "mean_token_accuracy": 0.7469462156295776, | |
| "num_tokens": 111749346.0, | |
| "step": 383 | |
| }, | |
| { | |
| "entropy": 1.5682101845741272, | |
| "epoch": 0.27715626127751714, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0994484424591064, | |
| "mean_token_accuracy": 0.7375427782535553, | |
| "num_tokens": 112037914.0, | |
| "step": 384 | |
| }, | |
| { | |
| "entropy": 1.5960384011268616, | |
| "epoch": 0.277878022374594, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1066681146621704, | |
| "mean_token_accuracy": 0.7366850078105927, | |
| "num_tokens": 112337291.0, | |
| "step": 385 | |
| }, | |
| { | |
| "entropy": 1.6709198951721191, | |
| "epoch": 0.27859978347167086, | |
| "grad_norm": 0.23828125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.170462727546692, | |
| "mean_token_accuracy": 0.7253004312515259, | |
| "num_tokens": 112614482.0, | |
| "step": 386 | |
| }, | |
| { | |
| "entropy": 1.602424442768097, | |
| "epoch": 0.27932154456874775, | |
| "grad_norm": 0.23828125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1215298175811768, | |
| "mean_token_accuracy": 0.7319803833961487, | |
| "num_tokens": 112897081.0, | |
| "step": 387 | |
| }, | |
| { | |
| "entropy": 1.58213472366333, | |
| "epoch": 0.28004330566582464, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0810825824737549, | |
| "mean_token_accuracy": 0.7436984181404114, | |
| "num_tokens": 113197747.0, | |
| "step": 388 | |
| }, | |
| { | |
| "entropy": 1.5367575287818909, | |
| "epoch": 0.28076506676290147, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0336525440216064, | |
| "mean_token_accuracy": 0.7494478523731232, | |
| "num_tokens": 113491798.0, | |
| "step": 389 | |
| }, | |
| { | |
| "entropy": 1.6005035638809204, | |
| "epoch": 0.28148682785997836, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0992530584335327, | |
| "mean_token_accuracy": 0.7401552200317383, | |
| "num_tokens": 113788144.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 1.6207069754600525, | |
| "epoch": 0.2822085889570552, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1467058658599854, | |
| "mean_token_accuracy": 0.7290371060371399, | |
| "num_tokens": 114084826.0, | |
| "step": 391 | |
| }, | |
| { | |
| "entropy": 1.5789991617202759, | |
| "epoch": 0.2829303500541321, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.102229356765747, | |
| "mean_token_accuracy": 0.7374478280544281, | |
| "num_tokens": 114377292.0, | |
| "step": 392 | |
| }, | |
| { | |
| "entropy": 1.6652973294258118, | |
| "epoch": 0.283652111151209, | |
| "grad_norm": 0.23828125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.194149374961853, | |
| "mean_token_accuracy": 0.7183903157711029, | |
| "num_tokens": 114649164.0, | |
| "step": 393 | |
| }, | |
| { | |
| "entropy": 1.6017484664916992, | |
| "epoch": 0.2843738722482858, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1297574043273926, | |
| "mean_token_accuracy": 0.735154777765274, | |
| "num_tokens": 114966493.0, | |
| "step": 394 | |
| }, | |
| { | |
| "entropy": 1.5885826349258423, | |
| "epoch": 0.2850956333453627, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0762158632278442, | |
| "mean_token_accuracy": 0.7431436479091644, | |
| "num_tokens": 115251006.0, | |
| "step": 395 | |
| }, | |
| { | |
| "entropy": 1.640641212463379, | |
| "epoch": 0.28581739444243953, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1321057081222534, | |
| "mean_token_accuracy": 0.731113076210022, | |
| "num_tokens": 115527153.0, | |
| "step": 396 | |
| }, | |
| { | |
| "entropy": 1.630197286605835, | |
| "epoch": 0.2865391555395164, | |
| "grad_norm": 0.2412109375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1196831464767456, | |
| "mean_token_accuracy": 0.7339911162853241, | |
| "num_tokens": 115804661.0, | |
| "step": 397 | |
| }, | |
| { | |
| "entropy": 1.6231989860534668, | |
| "epoch": 0.2872609166365933, | |
| "grad_norm": 0.2392578125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1364479064941406, | |
| "mean_token_accuracy": 0.7309294044971466, | |
| "num_tokens": 116089692.0, | |
| "step": 398 | |
| }, | |
| { | |
| "entropy": 1.5791901350021362, | |
| "epoch": 0.28798267773367014, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0699180364608765, | |
| "mean_token_accuracy": 0.7435059249401093, | |
| "num_tokens": 116393591.0, | |
| "step": 399 | |
| }, | |
| { | |
| "entropy": 1.6184316873550415, | |
| "epoch": 0.28870443883074703, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1489251852035522, | |
| "mean_token_accuracy": 0.727633148431778, | |
| "num_tokens": 116674519.0, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 1.6062357425689697, | |
| "epoch": 0.28942619992782387, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1328123807907104, | |
| "mean_token_accuracy": 0.7309476733207703, | |
| "num_tokens": 116958916.0, | |
| "step": 401 | |
| }, | |
| { | |
| "entropy": 1.5487224459648132, | |
| "epoch": 0.29014796102490076, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.063873291015625, | |
| "mean_token_accuracy": 0.7443958520889282, | |
| "num_tokens": 117248814.0, | |
| "step": 402 | |
| }, | |
| { | |
| "entropy": 1.6010112762451172, | |
| "epoch": 0.29086972212197765, | |
| "grad_norm": 0.23828125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0671896934509277, | |
| "mean_token_accuracy": 0.7450003921985626, | |
| "num_tokens": 117533947.0, | |
| "step": 403 | |
| }, | |
| { | |
| "entropy": 1.579895794391632, | |
| "epoch": 0.2915914832190545, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1394544839859009, | |
| "mean_token_accuracy": 0.7301437854766846, | |
| "num_tokens": 117845069.0, | |
| "step": 404 | |
| }, | |
| { | |
| "entropy": 1.5849671363830566, | |
| "epoch": 0.29231324431613137, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0695340633392334, | |
| "mean_token_accuracy": 0.7431894242763519, | |
| "num_tokens": 118131334.0, | |
| "step": 405 | |
| }, | |
| { | |
| "entropy": 1.546720564365387, | |
| "epoch": 0.2930350054132082, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0756176710128784, | |
| "mean_token_accuracy": 0.7445120215415955, | |
| "num_tokens": 118437316.0, | |
| "step": 406 | |
| }, | |
| { | |
| "entropy": 1.59204763174057, | |
| "epoch": 0.2937567665102851, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1205204725265503, | |
| "mean_token_accuracy": 0.7364739179611206, | |
| "num_tokens": 118739859.0, | |
| "step": 407 | |
| }, | |
| { | |
| "entropy": 1.5447207689285278, | |
| "epoch": 0.294478527607362, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0381128787994385, | |
| "mean_token_accuracy": 0.7481309175491333, | |
| "num_tokens": 119031195.0, | |
| "step": 408 | |
| }, | |
| { | |
| "entropy": 1.5859974026679993, | |
| "epoch": 0.2952002887044388, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0857067108154297, | |
| "mean_token_accuracy": 0.7424412071704865, | |
| "num_tokens": 119332258.0, | |
| "step": 409 | |
| }, | |
| { | |
| "entropy": 1.5652551054954529, | |
| "epoch": 0.2959220498015157, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0672721862792969, | |
| "mean_token_accuracy": 0.7446885704994202, | |
| "num_tokens": 119630062.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 1.6346011757850647, | |
| "epoch": 0.29664381089859254, | |
| "grad_norm": 0.2451171875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1486409902572632, | |
| "mean_token_accuracy": 0.7281367778778076, | |
| "num_tokens": 119915133.0, | |
| "step": 411 | |
| }, | |
| { | |
| "entropy": 1.5910564661026, | |
| "epoch": 0.29736557199566943, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1041498184204102, | |
| "mean_token_accuracy": 0.7374801933765411, | |
| "num_tokens": 120203628.0, | |
| "step": 412 | |
| }, | |
| { | |
| "entropy": 1.527147889137268, | |
| "epoch": 0.2980873330927463, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0515824556350708, | |
| "mean_token_accuracy": 0.7464866042137146, | |
| "num_tokens": 120509109.0, | |
| "step": 413 | |
| }, | |
| { | |
| "entropy": 1.5581042170524597, | |
| "epoch": 0.29880909418982315, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0540176630020142, | |
| "mean_token_accuracy": 0.7475551962852478, | |
| "num_tokens": 120814781.0, | |
| "step": 414 | |
| }, | |
| { | |
| "entropy": 1.5346571207046509, | |
| "epoch": 0.29953085528690004, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0804376602172852, | |
| "mean_token_accuracy": 0.7423532903194427, | |
| "num_tokens": 121125800.0, | |
| "step": 415 | |
| }, | |
| { | |
| "entropy": 1.567983329296112, | |
| "epoch": 0.3002526163839769, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.071397066116333, | |
| "mean_token_accuracy": 0.7454961836338043, | |
| "num_tokens": 121421929.0, | |
| "step": 416 | |
| }, | |
| { | |
| "entropy": 1.5205201506614685, | |
| "epoch": 0.30097437748105377, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0548027753829956, | |
| "mean_token_accuracy": 0.74764683842659, | |
| "num_tokens": 121729745.0, | |
| "step": 417 | |
| }, | |
| { | |
| "entropy": 1.6438958644866943, | |
| "epoch": 0.30169613857813066, | |
| "grad_norm": 0.240234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.160313367843628, | |
| "mean_token_accuracy": 0.7243135571479797, | |
| "num_tokens": 122015791.0, | |
| "step": 418 | |
| }, | |
| { | |
| "entropy": 1.6002413034439087, | |
| "epoch": 0.3024178996752075, | |
| "grad_norm": 0.2421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.121572732925415, | |
| "mean_token_accuracy": 0.7328309416770935, | |
| "num_tokens": 122299206.0, | |
| "step": 419 | |
| }, | |
| { | |
| "entropy": 1.6129282712936401, | |
| "epoch": 0.3031396607722844, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1413319110870361, | |
| "mean_token_accuracy": 0.7306722700595856, | |
| "num_tokens": 122604959.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 1.6229748725891113, | |
| "epoch": 0.3038614218693612, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1095683574676514, | |
| "mean_token_accuracy": 0.7389085292816162, | |
| "num_tokens": 122875773.0, | |
| "step": 421 | |
| }, | |
| { | |
| "entropy": 1.6748819947242737, | |
| "epoch": 0.3045831829664381, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1822125911712646, | |
| "mean_token_accuracy": 0.7234383523464203, | |
| "num_tokens": 123161602.0, | |
| "step": 422 | |
| }, | |
| { | |
| "entropy": 1.6030778288841248, | |
| "epoch": 0.305304944063515, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1511262655258179, | |
| "mean_token_accuracy": 0.7288818955421448, | |
| "num_tokens": 123455104.0, | |
| "step": 423 | |
| }, | |
| { | |
| "entropy": 1.668658196926117, | |
| "epoch": 0.3060267051605918, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1879892349243164, | |
| "mean_token_accuracy": 0.7217608988285065, | |
| "num_tokens": 123740248.0, | |
| "step": 424 | |
| }, | |
| { | |
| "entropy": 1.687535047531128, | |
| "epoch": 0.3067484662576687, | |
| "grad_norm": 0.2412109375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.204169750213623, | |
| "mean_token_accuracy": 0.7203571796417236, | |
| "num_tokens": 124012678.0, | |
| "step": 425 | |
| }, | |
| { | |
| "entropy": 1.5997314453125, | |
| "epoch": 0.3074702273547456, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0892412662506104, | |
| "mean_token_accuracy": 0.7403751909732819, | |
| "num_tokens": 124313467.0, | |
| "step": 426 | |
| }, | |
| { | |
| "entropy": 1.5840814113616943, | |
| "epoch": 0.30819198845182244, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0812218189239502, | |
| "mean_token_accuracy": 0.7419284284114838, | |
| "num_tokens": 124604799.0, | |
| "step": 427 | |
| }, | |
| { | |
| "entropy": 1.645096242427826, | |
| "epoch": 0.30891374954889933, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1761665344238281, | |
| "mean_token_accuracy": 0.725730836391449, | |
| "num_tokens": 124899891.0, | |
| "step": 428 | |
| }, | |
| { | |
| "entropy": 1.6193301677703857, | |
| "epoch": 0.30963551064597616, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1154736280441284, | |
| "mean_token_accuracy": 0.736019492149353, | |
| "num_tokens": 125186004.0, | |
| "step": 429 | |
| }, | |
| { | |
| "entropy": 1.6325101852416992, | |
| "epoch": 0.31035727174305305, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1436183452606201, | |
| "mean_token_accuracy": 0.7297486364841461, | |
| "num_tokens": 125475199.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 1.6604394316673279, | |
| "epoch": 0.31107903284012994, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1491129398345947, | |
| "mean_token_accuracy": 0.73002690076828, | |
| "num_tokens": 125760793.0, | |
| "step": 431 | |
| }, | |
| { | |
| "entropy": 1.5758911967277527, | |
| "epoch": 0.3118007939372068, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1309423446655273, | |
| "mean_token_accuracy": 0.7351952791213989, | |
| "num_tokens": 126068288.0, | |
| "step": 432 | |
| }, | |
| { | |
| "entropy": 1.6220659017562866, | |
| "epoch": 0.31252255503428367, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.121761679649353, | |
| "mean_token_accuracy": 0.7342638373374939, | |
| "num_tokens": 126348146.0, | |
| "step": 433 | |
| }, | |
| { | |
| "entropy": 1.56497061252594, | |
| "epoch": 0.3132443161313605, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.109041690826416, | |
| "mean_token_accuracy": 0.7383081316947937, | |
| "num_tokens": 126657079.0, | |
| "step": 434 | |
| }, | |
| { | |
| "entropy": 1.6391427516937256, | |
| "epoch": 0.3139660772284374, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1200578212738037, | |
| "mean_token_accuracy": 0.7342452108860016, | |
| "num_tokens": 126941741.0, | |
| "step": 435 | |
| }, | |
| { | |
| "entropy": 1.575550377368927, | |
| "epoch": 0.3146878383255143, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.087411642074585, | |
| "mean_token_accuracy": 0.7420357763767242, | |
| "num_tokens": 127234958.0, | |
| "step": 436 | |
| }, | |
| { | |
| "entropy": 1.5465736389160156, | |
| "epoch": 0.3154095994225911, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.07535982131958, | |
| "mean_token_accuracy": 0.7410696148872375, | |
| "num_tokens": 127537347.0, | |
| "step": 437 | |
| }, | |
| { | |
| "entropy": 1.5248339772224426, | |
| "epoch": 0.316131360519668, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0640586614608765, | |
| "mean_token_accuracy": 0.7473861277103424, | |
| "num_tokens": 127837999.0, | |
| "step": 438 | |
| }, | |
| { | |
| "entropy": 1.6124067306518555, | |
| "epoch": 0.31685312161674484, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0720772743225098, | |
| "mean_token_accuracy": 0.7425844967365265, | |
| "num_tokens": 128112520.0, | |
| "step": 439 | |
| }, | |
| { | |
| "entropy": 1.5651718974113464, | |
| "epoch": 0.3175748827138217, | |
| "grad_norm": 0.2421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0646960735321045, | |
| "mean_token_accuracy": 0.7459940314292908, | |
| "num_tokens": 128400958.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 1.637646496295929, | |
| "epoch": 0.3182966438108986, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1567387580871582, | |
| "mean_token_accuracy": 0.7288875877857208, | |
| "num_tokens": 128690151.0, | |
| "step": 441 | |
| }, | |
| { | |
| "entropy": 1.5976744294166565, | |
| "epoch": 0.31901840490797545, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1042414903640747, | |
| "mean_token_accuracy": 0.7392546236515045, | |
| "num_tokens": 128986179.0, | |
| "step": 442 | |
| }, | |
| { | |
| "entropy": 1.5358789563179016, | |
| "epoch": 0.31974016600505234, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0574394464492798, | |
| "mean_token_accuracy": 0.7455575466156006, | |
| "num_tokens": 129281997.0, | |
| "step": 443 | |
| }, | |
| { | |
| "entropy": 1.6041855216026306, | |
| "epoch": 0.3204619271021292, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1089904308319092, | |
| "mean_token_accuracy": 0.7357610464096069, | |
| "num_tokens": 129562780.0, | |
| "step": 444 | |
| }, | |
| { | |
| "entropy": 1.608000636100769, | |
| "epoch": 0.32118368819920606, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1251460313796997, | |
| "mean_token_accuracy": 0.7312854528427124, | |
| "num_tokens": 129856429.0, | |
| "step": 445 | |
| }, | |
| { | |
| "entropy": 1.6611520051956177, | |
| "epoch": 0.32190544929628295, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1829650402069092, | |
| "mean_token_accuracy": 0.7249109148979187, | |
| "num_tokens": 130150262.0, | |
| "step": 446 | |
| }, | |
| { | |
| "entropy": 1.6667674779891968, | |
| "epoch": 0.3226272103933598, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.181103229522705, | |
| "mean_token_accuracy": 0.7272431552410126, | |
| "num_tokens": 130449072.0, | |
| "step": 447 | |
| }, | |
| { | |
| "entropy": 1.6586183309555054, | |
| "epoch": 0.3233489714904367, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1884377002716064, | |
| "mean_token_accuracy": 0.7232946455478668, | |
| "num_tokens": 130740173.0, | |
| "step": 448 | |
| }, | |
| { | |
| "entropy": 1.5931247472763062, | |
| "epoch": 0.3240707325875135, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1147750616073608, | |
| "mean_token_accuracy": 0.7380184829235077, | |
| "num_tokens": 131037645.0, | |
| "step": 449 | |
| }, | |
| { | |
| "entropy": 1.6648858189582825, | |
| "epoch": 0.3247924936845904, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.2026402950286865, | |
| "mean_token_accuracy": 0.7213493883609772, | |
| "num_tokens": 131335228.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 1.6087056398391724, | |
| "epoch": 0.3255142547816673, | |
| "grad_norm": 0.234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.095062017440796, | |
| "mean_token_accuracy": 0.7391766011714935, | |
| "num_tokens": 131622522.0, | |
| "step": 451 | |
| }, | |
| { | |
| "entropy": 1.6324971318244934, | |
| "epoch": 0.3262360158787441, | |
| "grad_norm": 0.234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1624951362609863, | |
| "mean_token_accuracy": 0.728905439376831, | |
| "num_tokens": 131917918.0, | |
| "step": 452 | |
| }, | |
| { | |
| "entropy": 1.6124878525733948, | |
| "epoch": 0.326957776975821, | |
| "grad_norm": 0.240234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1487758159637451, | |
| "mean_token_accuracy": 0.7288512587547302, | |
| "num_tokens": 132205196.0, | |
| "step": 453 | |
| }, | |
| { | |
| "entropy": 1.59714275598526, | |
| "epoch": 0.32767953807289785, | |
| "grad_norm": 0.2392578125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0881301164627075, | |
| "mean_token_accuracy": 0.7396830022335052, | |
| "num_tokens": 132488488.0, | |
| "step": 454 | |
| }, | |
| { | |
| "entropy": 1.635063886642456, | |
| "epoch": 0.32840129916997474, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.180572748184204, | |
| "mean_token_accuracy": 0.7232075333595276, | |
| "num_tokens": 132775431.0, | |
| "step": 455 | |
| }, | |
| { | |
| "entropy": 1.5686790943145752, | |
| "epoch": 0.3291230602670516, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0860564708709717, | |
| "mean_token_accuracy": 0.7422479689121246, | |
| "num_tokens": 133072415.0, | |
| "step": 456 | |
| }, | |
| { | |
| "entropy": 1.556956171989441, | |
| "epoch": 0.32984482136412846, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0811514854431152, | |
| "mean_token_accuracy": 0.741037517786026, | |
| "num_tokens": 133375718.0, | |
| "step": 457 | |
| }, | |
| { | |
| "entropy": 1.620395541191101, | |
| "epoch": 0.33056658246120535, | |
| "grad_norm": 0.240234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.148855447769165, | |
| "mean_token_accuracy": 0.72932568192482, | |
| "num_tokens": 133672339.0, | |
| "step": 458 | |
| }, | |
| { | |
| "entropy": 1.636777400970459, | |
| "epoch": 0.3312883435582822, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1659705638885498, | |
| "mean_token_accuracy": 0.72730952501297, | |
| "num_tokens": 133959006.0, | |
| "step": 459 | |
| }, | |
| { | |
| "entropy": 1.5831456780433655, | |
| "epoch": 0.3320101046553591, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0621190071105957, | |
| "mean_token_accuracy": 0.7466300129890442, | |
| "num_tokens": 134262734.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 1.6329560279846191, | |
| "epoch": 0.33273186575243596, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1542713642120361, | |
| "mean_token_accuracy": 0.726966917514801, | |
| "num_tokens": 134542713.0, | |
| "step": 461 | |
| }, | |
| { | |
| "entropy": 1.573943316936493, | |
| "epoch": 0.3334536268495128, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0954251289367676, | |
| "mean_token_accuracy": 0.7401972115039825, | |
| "num_tokens": 134844952.0, | |
| "step": 462 | |
| }, | |
| { | |
| "entropy": 1.5713019967079163, | |
| "epoch": 0.3341753879465897, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0914303064346313, | |
| "mean_token_accuracy": 0.7400886118412018, | |
| "num_tokens": 135152180.0, | |
| "step": 463 | |
| }, | |
| { | |
| "entropy": 1.5598775744438171, | |
| "epoch": 0.3348971490436665, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0884873867034912, | |
| "mean_token_accuracy": 0.7402353882789612, | |
| "num_tokens": 135442693.0, | |
| "step": 464 | |
| }, | |
| { | |
| "entropy": 1.554007112979889, | |
| "epoch": 0.3356189101407434, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0843267440795898, | |
| "mean_token_accuracy": 0.74355748295784, | |
| "num_tokens": 135748037.0, | |
| "step": 465 | |
| }, | |
| { | |
| "entropy": 1.6000927090644836, | |
| "epoch": 0.3363406712378203, | |
| "grad_norm": 0.240234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.120779275894165, | |
| "mean_token_accuracy": 0.7342838048934937, | |
| "num_tokens": 136042263.0, | |
| "step": 466 | |
| }, | |
| { | |
| "entropy": 1.6250577569007874, | |
| "epoch": 0.33706243233489713, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1517987251281738, | |
| "mean_token_accuracy": 0.727559506893158, | |
| "num_tokens": 136338230.0, | |
| "step": 467 | |
| }, | |
| { | |
| "entropy": 1.6429235935211182, | |
| "epoch": 0.337784193431974, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.163648009300232, | |
| "mean_token_accuracy": 0.7238691449165344, | |
| "num_tokens": 136618908.0, | |
| "step": 468 | |
| }, | |
| { | |
| "entropy": 1.5953425168991089, | |
| "epoch": 0.33850595452905086, | |
| "grad_norm": 0.25, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1051502227783203, | |
| "mean_token_accuracy": 0.7386227548122406, | |
| "num_tokens": 136914568.0, | |
| "step": 469 | |
| }, | |
| { | |
| "entropy": 1.5803745985031128, | |
| "epoch": 0.33922771562612775, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1067967414855957, | |
| "mean_token_accuracy": 0.7373368442058563, | |
| "num_tokens": 137207168.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 1.5293388366699219, | |
| "epoch": 0.33994947672320464, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0488719940185547, | |
| "mean_token_accuracy": 0.7486203908920288, | |
| "num_tokens": 137507313.0, | |
| "step": 471 | |
| }, | |
| { | |
| "entropy": 1.6503487825393677, | |
| "epoch": 0.34067123782028147, | |
| "grad_norm": 0.2392578125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.176361322402954, | |
| "mean_token_accuracy": 0.7250353693962097, | |
| "num_tokens": 137796949.0, | |
| "step": 472 | |
| }, | |
| { | |
| "entropy": 1.6222673654556274, | |
| "epoch": 0.34139299891735836, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1528205871582031, | |
| "mean_token_accuracy": 0.7326656877994537, | |
| "num_tokens": 138099985.0, | |
| "step": 473 | |
| }, | |
| { | |
| "entropy": 1.6981632113456726, | |
| "epoch": 0.34211476001443525, | |
| "grad_norm": 0.23828125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.2220008373260498, | |
| "mean_token_accuracy": 0.7169374227523804, | |
| "num_tokens": 138394664.0, | |
| "step": 474 | |
| }, | |
| { | |
| "entropy": 1.5425214171409607, | |
| "epoch": 0.3428365211115121, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0560615062713623, | |
| "mean_token_accuracy": 0.7462894022464752, | |
| "num_tokens": 138707125.0, | |
| "step": 475 | |
| }, | |
| { | |
| "entropy": 1.6574317812919617, | |
| "epoch": 0.34355828220858897, | |
| "grad_norm": 0.2392578125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1729273796081543, | |
| "mean_token_accuracy": 0.7234189510345459, | |
| "num_tokens": 138988950.0, | |
| "step": 476 | |
| }, | |
| { | |
| "entropy": 1.5274913311004639, | |
| "epoch": 0.3442800433056658, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0758335590362549, | |
| "mean_token_accuracy": 0.7413328886032104, | |
| "num_tokens": 139292949.0, | |
| "step": 477 | |
| }, | |
| { | |
| "entropy": 1.6365973353385925, | |
| "epoch": 0.3450018044027427, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1590934991836548, | |
| "mean_token_accuracy": 0.7286208271980286, | |
| "num_tokens": 139580133.0, | |
| "step": 478 | |
| }, | |
| { | |
| "entropy": 1.5947922468185425, | |
| "epoch": 0.3457235654998196, | |
| "grad_norm": 0.234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1198360919952393, | |
| "mean_token_accuracy": 0.7345921695232391, | |
| "num_tokens": 139869022.0, | |
| "step": 479 | |
| }, | |
| { | |
| "entropy": 1.694524109363556, | |
| "epoch": 0.3464453265968964, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1883211135864258, | |
| "mean_token_accuracy": 0.7211547493934631, | |
| "num_tokens": 140150569.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 1.5630242824554443, | |
| "epoch": 0.3471670876939733, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0875773429870605, | |
| "mean_token_accuracy": 0.7402580082416534, | |
| "num_tokens": 140434750.0, | |
| "step": 481 | |
| }, | |
| { | |
| "entropy": 1.6516925692558289, | |
| "epoch": 0.34788884879105014, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1840791702270508, | |
| "mean_token_accuracy": 0.7221535444259644, | |
| "num_tokens": 140720441.0, | |
| "step": 482 | |
| }, | |
| { | |
| "entropy": 1.7140132188796997, | |
| "epoch": 0.34861060988812703, | |
| "grad_norm": 0.2392578125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.2227469682693481, | |
| "mean_token_accuracy": 0.7167387306690216, | |
| "num_tokens": 140999992.0, | |
| "step": 483 | |
| }, | |
| { | |
| "entropy": 1.6601051688194275, | |
| "epoch": 0.3493323709852039, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.161919355392456, | |
| "mean_token_accuracy": 0.7271238267421722, | |
| "num_tokens": 141287519.0, | |
| "step": 484 | |
| }, | |
| { | |
| "entropy": 1.604166030883789, | |
| "epoch": 0.35005413208228076, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.154355764389038, | |
| "mean_token_accuracy": 0.7296633720397949, | |
| "num_tokens": 141584679.0, | |
| "step": 485 | |
| }, | |
| { | |
| "entropy": 1.5881201028823853, | |
| "epoch": 0.35077589317935765, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0983965396881104, | |
| "mean_token_accuracy": 0.7383449971675873, | |
| "num_tokens": 141875862.0, | |
| "step": 486 | |
| }, | |
| { | |
| "entropy": 1.5642328262329102, | |
| "epoch": 0.3514976542764345, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0659217834472656, | |
| "mean_token_accuracy": 0.7433837652206421, | |
| "num_tokens": 142164611.0, | |
| "step": 487 | |
| }, | |
| { | |
| "entropy": 1.686498761177063, | |
| "epoch": 0.35221941537351137, | |
| "grad_norm": 0.244140625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1902477741241455, | |
| "mean_token_accuracy": 0.7220829427242279, | |
| "num_tokens": 142448355.0, | |
| "step": 488 | |
| }, | |
| { | |
| "entropy": 1.640208661556244, | |
| "epoch": 0.35294117647058826, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1746337413787842, | |
| "mean_token_accuracy": 0.7261711359024048, | |
| "num_tokens": 142733589.0, | |
| "step": 489 | |
| }, | |
| { | |
| "entropy": 1.5611519813537598, | |
| "epoch": 0.3536629375676651, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0779387950897217, | |
| "mean_token_accuracy": 0.7451270520687103, | |
| "num_tokens": 143033403.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 1.6265342831611633, | |
| "epoch": 0.354384698664742, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1600427627563477, | |
| "mean_token_accuracy": 0.7262018322944641, | |
| "num_tokens": 143327143.0, | |
| "step": 491 | |
| }, | |
| { | |
| "entropy": 1.619410514831543, | |
| "epoch": 0.3551064597618188, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1135916709899902, | |
| "mean_token_accuracy": 0.7344923913478851, | |
| "num_tokens": 143615166.0, | |
| "step": 492 | |
| }, | |
| { | |
| "entropy": 1.563113272190094, | |
| "epoch": 0.3558282208588957, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0901010036468506, | |
| "mean_token_accuracy": 0.7397356629371643, | |
| "num_tokens": 143914756.0, | |
| "step": 493 | |
| }, | |
| { | |
| "entropy": 1.6229987144470215, | |
| "epoch": 0.3565499819559726, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1394100189208984, | |
| "mean_token_accuracy": 0.7302195429801941, | |
| "num_tokens": 144208689.0, | |
| "step": 494 | |
| }, | |
| { | |
| "entropy": 1.5569855570793152, | |
| "epoch": 0.35727174305304943, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0614022016525269, | |
| "mean_token_accuracy": 0.7458212077617645, | |
| "num_tokens": 144507322.0, | |
| "step": 495 | |
| }, | |
| { | |
| "entropy": 1.6149272322654724, | |
| "epoch": 0.3579935041501263, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1146332025527954, | |
| "mean_token_accuracy": 0.7338621318340302, | |
| "num_tokens": 144813778.0, | |
| "step": 496 | |
| }, | |
| { | |
| "entropy": 1.6173784136772156, | |
| "epoch": 0.35871526524720315, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1169593334197998, | |
| "mean_token_accuracy": 0.7356456518173218, | |
| "num_tokens": 145107467.0, | |
| "step": 497 | |
| }, | |
| { | |
| "entropy": 1.5864734649658203, | |
| "epoch": 0.35943702634428004, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.113652229309082, | |
| "mean_token_accuracy": 0.736335813999176, | |
| "num_tokens": 145395822.0, | |
| "step": 498 | |
| }, | |
| { | |
| "entropy": 1.6256049871444702, | |
| "epoch": 0.36015878744135693, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1368718147277832, | |
| "mean_token_accuracy": 0.7322273254394531, | |
| "num_tokens": 145682027.0, | |
| "step": 499 | |
| }, | |
| { | |
| "entropy": 1.650820791721344, | |
| "epoch": 0.36088054853843377, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1954458951950073, | |
| "mean_token_accuracy": 0.7189677953720093, | |
| "num_tokens": 145973099.0, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 1.6234460473060608, | |
| "epoch": 0.36160230963551065, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1179085969924927, | |
| "mean_token_accuracy": 0.7338059842586517, | |
| "num_tokens": 146264526.0, | |
| "step": 501 | |
| }, | |
| { | |
| "entropy": 1.521238386631012, | |
| "epoch": 0.3623240707325875, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0389149188995361, | |
| "mean_token_accuracy": 0.749197393655777, | |
| "num_tokens": 146558488.0, | |
| "step": 502 | |
| }, | |
| { | |
| "entropy": 1.639874815940857, | |
| "epoch": 0.3630458318296644, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1338131427764893, | |
| "mean_token_accuracy": 0.7293655574321747, | |
| "num_tokens": 146846354.0, | |
| "step": 503 | |
| }, | |
| { | |
| "entropy": 1.6013176441192627, | |
| "epoch": 0.36376759292674127, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1406679153442383, | |
| "mean_token_accuracy": 0.7299897372722626, | |
| "num_tokens": 147140581.0, | |
| "step": 504 | |
| }, | |
| { | |
| "entropy": 1.5941340327262878, | |
| "epoch": 0.3644893540238181, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1104350090026855, | |
| "mean_token_accuracy": 0.7347866594791412, | |
| "num_tokens": 147437231.0, | |
| "step": 505 | |
| }, | |
| { | |
| "entropy": 1.6880006194114685, | |
| "epoch": 0.365211115120895, | |
| "grad_norm": 0.2490234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1863410472869873, | |
| "mean_token_accuracy": 0.7211757600307465, | |
| "num_tokens": 147715105.0, | |
| "step": 506 | |
| }, | |
| { | |
| "entropy": 1.6045259237289429, | |
| "epoch": 0.3659328762179718, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0905433893203735, | |
| "mean_token_accuracy": 0.738747239112854, | |
| "num_tokens": 148006113.0, | |
| "step": 507 | |
| }, | |
| { | |
| "entropy": 1.5972481966018677, | |
| "epoch": 0.3666546373150487, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.07442307472229, | |
| "mean_token_accuracy": 0.7435567378997803, | |
| "num_tokens": 148294812.0, | |
| "step": 508 | |
| }, | |
| { | |
| "entropy": 1.5952850580215454, | |
| "epoch": 0.3673763984121256, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0867395401000977, | |
| "mean_token_accuracy": 0.738838255405426, | |
| "num_tokens": 148581680.0, | |
| "step": 509 | |
| }, | |
| { | |
| "entropy": 1.654758095741272, | |
| "epoch": 0.36809815950920244, | |
| "grad_norm": 0.2431640625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.2238576412200928, | |
| "mean_token_accuracy": 0.7157804071903229, | |
| "num_tokens": 148875357.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 1.5775583982467651, | |
| "epoch": 0.36881992060627933, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.100191593170166, | |
| "mean_token_accuracy": 0.7378706634044647, | |
| "num_tokens": 149168032.0, | |
| "step": 511 | |
| }, | |
| { | |
| "entropy": 1.6066675186157227, | |
| "epoch": 0.36954168170335616, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1110823154449463, | |
| "mean_token_accuracy": 0.7374165058135986, | |
| "num_tokens": 149456863.0, | |
| "step": 512 | |
| }, | |
| { | |
| "entropy": 1.5269203186035156, | |
| "epoch": 0.37026344280043305, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.045570731163025, | |
| "mean_token_accuracy": 0.7492022514343262, | |
| "num_tokens": 149761317.0, | |
| "step": 513 | |
| }, | |
| { | |
| "entropy": 1.6037903428077698, | |
| "epoch": 0.37098520389750994, | |
| "grad_norm": 0.244140625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0834832191467285, | |
| "mean_token_accuracy": 0.7384363412857056, | |
| "num_tokens": 150040731.0, | |
| "step": 514 | |
| }, | |
| { | |
| "entropy": 1.575134038925171, | |
| "epoch": 0.3717069649945868, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0952000617980957, | |
| "mean_token_accuracy": 0.7380004823207855, | |
| "num_tokens": 150332462.0, | |
| "step": 515 | |
| }, | |
| { | |
| "entropy": 1.5994169116020203, | |
| "epoch": 0.37242872609166366, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1556189060211182, | |
| "mean_token_accuracy": 0.7256032228469849, | |
| "num_tokens": 150636674.0, | |
| "step": 516 | |
| }, | |
| { | |
| "entropy": 1.5312954187393188, | |
| "epoch": 0.37315048718874055, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0574302673339844, | |
| "mean_token_accuracy": 0.745067685842514, | |
| "num_tokens": 150930660.0, | |
| "step": 517 | |
| }, | |
| { | |
| "entropy": 1.67794930934906, | |
| "epoch": 0.3738722482858174, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1918485164642334, | |
| "mean_token_accuracy": 0.7211729884147644, | |
| "num_tokens": 151219803.0, | |
| "step": 518 | |
| }, | |
| { | |
| "entropy": 1.6255905628204346, | |
| "epoch": 0.3745940093828943, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1673014163970947, | |
| "mean_token_accuracy": 0.7239619791507721, | |
| "num_tokens": 151512477.0, | |
| "step": 519 | |
| }, | |
| { | |
| "entropy": 1.5946731567382812, | |
| "epoch": 0.3753157704799711, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1183192729949951, | |
| "mean_token_accuracy": 0.735003262758255, | |
| "num_tokens": 151812465.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 1.5862481594085693, | |
| "epoch": 0.376037531577048, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1259503364562988, | |
| "mean_token_accuracy": 0.733224481344223, | |
| "num_tokens": 152117234.0, | |
| "step": 521 | |
| }, | |
| { | |
| "entropy": 1.6424601078033447, | |
| "epoch": 0.3767592926741249, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1575448513031006, | |
| "mean_token_accuracy": 0.7274467945098877, | |
| "num_tokens": 152406858.0, | |
| "step": 522 | |
| }, | |
| { | |
| "entropy": 1.6223523616790771, | |
| "epoch": 0.3774810537712017, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1335670948028564, | |
| "mean_token_accuracy": 0.731343150138855, | |
| "num_tokens": 152698817.0, | |
| "step": 523 | |
| }, | |
| { | |
| "entropy": 1.5604919791221619, | |
| "epoch": 0.3782028148682786, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.05845046043396, | |
| "mean_token_accuracy": 0.7442804574966431, | |
| "num_tokens": 152988206.0, | |
| "step": 524 | |
| }, | |
| { | |
| "entropy": 1.5547953844070435, | |
| "epoch": 0.37892457596535545, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0744178295135498, | |
| "mean_token_accuracy": 0.7433354556560516, | |
| "num_tokens": 153287520.0, | |
| "step": 525 | |
| }, | |
| { | |
| "entropy": 1.60216623544693, | |
| "epoch": 0.37964633706243234, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1089946031570435, | |
| "mean_token_accuracy": 0.7342951893806458, | |
| "num_tokens": 153585099.0, | |
| "step": 526 | |
| }, | |
| { | |
| "entropy": 1.6457003355026245, | |
| "epoch": 0.3803680981595092, | |
| "grad_norm": 0.248046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1213321685791016, | |
| "mean_token_accuracy": 0.7341791689395905, | |
| "num_tokens": 153862802.0, | |
| "step": 527 | |
| }, | |
| { | |
| "entropy": 1.5871397852897644, | |
| "epoch": 0.38108985925658606, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1059651374816895, | |
| "mean_token_accuracy": 0.7380181550979614, | |
| "num_tokens": 154156937.0, | |
| "step": 528 | |
| }, | |
| { | |
| "entropy": 1.5240612626075745, | |
| "epoch": 0.38181162035366295, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0416841506958008, | |
| "mean_token_accuracy": 0.7477717697620392, | |
| "num_tokens": 154449044.0, | |
| "step": 529 | |
| }, | |
| { | |
| "entropy": 1.6725059151649475, | |
| "epoch": 0.3825333814507398, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.206442952156067, | |
| "mean_token_accuracy": 0.7201514542102814, | |
| "num_tokens": 154745747.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 1.5932756066322327, | |
| "epoch": 0.3832551425478167, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.104332685470581, | |
| "mean_token_accuracy": 0.7362224757671356, | |
| "num_tokens": 155041763.0, | |
| "step": 531 | |
| }, | |
| { | |
| "entropy": 1.619477391242981, | |
| "epoch": 0.38397690364489356, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1414731740951538, | |
| "mean_token_accuracy": 0.7293963134288788, | |
| "num_tokens": 155343233.0, | |
| "step": 532 | |
| }, | |
| { | |
| "entropy": 1.6683776378631592, | |
| "epoch": 0.3846986647419704, | |
| "grad_norm": 0.23828125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.2070395946502686, | |
| "mean_token_accuracy": 0.7182894647121429, | |
| "num_tokens": 155616541.0, | |
| "step": 533 | |
| }, | |
| { | |
| "entropy": 1.6081397533416748, | |
| "epoch": 0.3854204258390473, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.11167311668396, | |
| "mean_token_accuracy": 0.735749214887619, | |
| "num_tokens": 155898796.0, | |
| "step": 534 | |
| }, | |
| { | |
| "entropy": 1.5718166828155518, | |
| "epoch": 0.3861421869361241, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0903773307800293, | |
| "mean_token_accuracy": 0.742428183555603, | |
| "num_tokens": 156185038.0, | |
| "step": 535 | |
| }, | |
| { | |
| "entropy": 1.5812262296676636, | |
| "epoch": 0.386863948033201, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1161175966262817, | |
| "mean_token_accuracy": 0.7331889867782593, | |
| "num_tokens": 156478529.0, | |
| "step": 536 | |
| }, | |
| { | |
| "entropy": 1.6485041379928589, | |
| "epoch": 0.3875857091302779, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1999492645263672, | |
| "mean_token_accuracy": 0.7204713821411133, | |
| "num_tokens": 156767019.0, | |
| "step": 537 | |
| }, | |
| { | |
| "entropy": 1.6294238567352295, | |
| "epoch": 0.38830747022735473, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.170448660850525, | |
| "mean_token_accuracy": 0.7244228422641754, | |
| "num_tokens": 157053449.0, | |
| "step": 538 | |
| }, | |
| { | |
| "entropy": 1.609671413898468, | |
| "epoch": 0.3890292313244316, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1324965953826904, | |
| "mean_token_accuracy": 0.7311051785945892, | |
| "num_tokens": 157347534.0, | |
| "step": 539 | |
| }, | |
| { | |
| "entropy": 1.5387115478515625, | |
| "epoch": 0.38975099242150846, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.042873501777649, | |
| "mean_token_accuracy": 0.7503548562526703, | |
| "num_tokens": 157645101.0, | |
| "step": 540 | |
| }, | |
| { | |
| "entropy": 1.6158974766731262, | |
| "epoch": 0.39047275351858535, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1192989349365234, | |
| "mean_token_accuracy": 0.7362866103649139, | |
| "num_tokens": 157936813.0, | |
| "step": 541 | |
| }, | |
| { | |
| "entropy": 1.5527795553207397, | |
| "epoch": 0.39119451461566224, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0566433668136597, | |
| "mean_token_accuracy": 0.7469334006309509, | |
| "num_tokens": 158243610.0, | |
| "step": 542 | |
| }, | |
| { | |
| "entropy": 1.66873037815094, | |
| "epoch": 0.39191627571273907, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1918654441833496, | |
| "mean_token_accuracy": 0.7211013734340668, | |
| "num_tokens": 158523467.0, | |
| "step": 543 | |
| }, | |
| { | |
| "entropy": 1.6005805134773254, | |
| "epoch": 0.39263803680981596, | |
| "grad_norm": 0.24609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1130868196487427, | |
| "mean_token_accuracy": 0.7366756498813629, | |
| "num_tokens": 158794263.0, | |
| "step": 544 | |
| }, | |
| { | |
| "entropy": 1.6101391315460205, | |
| "epoch": 0.3933597979068928, | |
| "grad_norm": 0.2412109375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0829110145568848, | |
| "mean_token_accuracy": 0.7414791584014893, | |
| "num_tokens": 159067792.0, | |
| "step": 545 | |
| }, | |
| { | |
| "entropy": 1.5730063319206238, | |
| "epoch": 0.3940815590039697, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1043363809585571, | |
| "mean_token_accuracy": 0.7362913191318512, | |
| "num_tokens": 159358082.0, | |
| "step": 546 | |
| }, | |
| { | |
| "entropy": 1.574350655078888, | |
| "epoch": 0.3948033201010466, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0880415439605713, | |
| "mean_token_accuracy": 0.7417057156562805, | |
| "num_tokens": 159646729.0, | |
| "step": 547 | |
| }, | |
| { | |
| "entropy": 1.6222990155220032, | |
| "epoch": 0.3955250811981234, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1325570344924927, | |
| "mean_token_accuracy": 0.7328086793422699, | |
| "num_tokens": 159943992.0, | |
| "step": 548 | |
| }, | |
| { | |
| "entropy": 1.5958244800567627, | |
| "epoch": 0.3962468422952003, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1163383722305298, | |
| "mean_token_accuracy": 0.7354134619235992, | |
| "num_tokens": 160232263.0, | |
| "step": 549 | |
| }, | |
| { | |
| "entropy": 1.5758973360061646, | |
| "epoch": 0.39696860339227713, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1021897792816162, | |
| "mean_token_accuracy": 0.7413262128829956, | |
| "num_tokens": 160517606.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 1.6239079236984253, | |
| "epoch": 0.397690364489354, | |
| "grad_norm": 0.2392578125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1313995122909546, | |
| "mean_token_accuracy": 0.7314778864383698, | |
| "num_tokens": 160785773.0, | |
| "step": 551 | |
| }, | |
| { | |
| "entropy": 1.5758706331253052, | |
| "epoch": 0.3984121255864309, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0905269384384155, | |
| "mean_token_accuracy": 0.7405203878879547, | |
| "num_tokens": 161080326.0, | |
| "step": 552 | |
| }, | |
| { | |
| "entropy": 1.5831953287124634, | |
| "epoch": 0.39913388668350774, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.08497953414917, | |
| "mean_token_accuracy": 0.7396326065063477, | |
| "num_tokens": 161370820.0, | |
| "step": 553 | |
| }, | |
| { | |
| "entropy": 1.5894896388053894, | |
| "epoch": 0.39985564778058463, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.101989984512329, | |
| "mean_token_accuracy": 0.7365857660770416, | |
| "num_tokens": 161664811.0, | |
| "step": 554 | |
| }, | |
| { | |
| "entropy": 1.5507331490516663, | |
| "epoch": 0.40057740887766147, | |
| "grad_norm": 0.2158203125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0602821111679077, | |
| "mean_token_accuracy": 0.7463509738445282, | |
| "num_tokens": 161972369.0, | |
| "step": 555 | |
| }, | |
| { | |
| "entropy": 1.5711430311203003, | |
| "epoch": 0.40129916997473836, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0783026218414307, | |
| "mean_token_accuracy": 0.7421069145202637, | |
| "num_tokens": 162261889.0, | |
| "step": 556 | |
| }, | |
| { | |
| "entropy": 1.6123376488685608, | |
| "epoch": 0.40202093107181525, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1508086919784546, | |
| "mean_token_accuracy": 0.7289157211780548, | |
| "num_tokens": 162559254.0, | |
| "step": 557 | |
| }, | |
| { | |
| "entropy": 1.5799905061721802, | |
| "epoch": 0.4027426921688921, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0845707654953003, | |
| "mean_token_accuracy": 0.7415472269058228, | |
| "num_tokens": 162858478.0, | |
| "step": 558 | |
| }, | |
| { | |
| "entropy": 1.587117075920105, | |
| "epoch": 0.40346445326596897, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0976632833480835, | |
| "mean_token_accuracy": 0.7396935820579529, | |
| "num_tokens": 163165145.0, | |
| "step": 559 | |
| }, | |
| { | |
| "entropy": 1.613345980644226, | |
| "epoch": 0.40418621436304586, | |
| "grad_norm": 0.240234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.114994764328003, | |
| "mean_token_accuracy": 0.7337165474891663, | |
| "num_tokens": 163451954.0, | |
| "step": 560 | |
| }, | |
| { | |
| "entropy": 1.668515920639038, | |
| "epoch": 0.4049079754601227, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1915613412857056, | |
| "mean_token_accuracy": 0.7234081327915192, | |
| "num_tokens": 163737641.0, | |
| "step": 561 | |
| }, | |
| { | |
| "entropy": 1.5208616256713867, | |
| "epoch": 0.4056297365571996, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0531387329101562, | |
| "mean_token_accuracy": 0.749419242143631, | |
| "num_tokens": 164037024.0, | |
| "step": 562 | |
| }, | |
| { | |
| "entropy": 1.6033462285995483, | |
| "epoch": 0.4063514976542764, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1200551986694336, | |
| "mean_token_accuracy": 0.7345983982086182, | |
| "num_tokens": 164329172.0, | |
| "step": 563 | |
| }, | |
| { | |
| "entropy": 1.5849531888961792, | |
| "epoch": 0.4070732587513533, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1128225326538086, | |
| "mean_token_accuracy": 0.734798938035965, | |
| "num_tokens": 164623938.0, | |
| "step": 564 | |
| }, | |
| { | |
| "entropy": 1.646580457687378, | |
| "epoch": 0.4077950198484302, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.133863091468811, | |
| "mean_token_accuracy": 0.7330552637577057, | |
| "num_tokens": 164920363.0, | |
| "step": 565 | |
| }, | |
| { | |
| "entropy": 1.577094852924347, | |
| "epoch": 0.40851678094550703, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1257433891296387, | |
| "mean_token_accuracy": 0.7362552881240845, | |
| "num_tokens": 165216763.0, | |
| "step": 566 | |
| }, | |
| { | |
| "entropy": 1.5967845916748047, | |
| "epoch": 0.4092385420425839, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.111537218093872, | |
| "mean_token_accuracy": 0.7356201708316803, | |
| "num_tokens": 165509854.0, | |
| "step": 567 | |
| }, | |
| { | |
| "entropy": 1.6197506785392761, | |
| "epoch": 0.40996030313966075, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1577668190002441, | |
| "mean_token_accuracy": 0.7305867671966553, | |
| "num_tokens": 165799189.0, | |
| "step": 568 | |
| }, | |
| { | |
| "entropy": 1.5556058883666992, | |
| "epoch": 0.41068206423673764, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.068633794784546, | |
| "mean_token_accuracy": 0.7456537187099457, | |
| "num_tokens": 166118062.0, | |
| "step": 569 | |
| }, | |
| { | |
| "entropy": 1.6202342510223389, | |
| "epoch": 0.41140382533381453, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1549452543258667, | |
| "mean_token_accuracy": 0.7305187284946442, | |
| "num_tokens": 166416033.0, | |
| "step": 570 | |
| }, | |
| { | |
| "entropy": 1.5584017038345337, | |
| "epoch": 0.41212558643089137, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0887541770935059, | |
| "mean_token_accuracy": 0.7395859658718109, | |
| "num_tokens": 166705562.0, | |
| "step": 571 | |
| }, | |
| { | |
| "entropy": 1.5947588682174683, | |
| "epoch": 0.41284734752796826, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.135185956954956, | |
| "mean_token_accuracy": 0.733391523361206, | |
| "num_tokens": 166989123.0, | |
| "step": 572 | |
| }, | |
| { | |
| "entropy": 1.6097568273544312, | |
| "epoch": 0.4135691086250451, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1329129934310913, | |
| "mean_token_accuracy": 0.7322137951850891, | |
| "num_tokens": 167283829.0, | |
| "step": 573 | |
| }, | |
| { | |
| "entropy": 1.5255873799324036, | |
| "epoch": 0.414290869722122, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0224134922027588, | |
| "mean_token_accuracy": 0.7541805505752563, | |
| "num_tokens": 167575588.0, | |
| "step": 574 | |
| }, | |
| { | |
| "entropy": 1.6699268817901611, | |
| "epoch": 0.41501263081919887, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.21880042552948, | |
| "mean_token_accuracy": 0.7172015011310577, | |
| "num_tokens": 167864879.0, | |
| "step": 575 | |
| }, | |
| { | |
| "entropy": 1.5860467553138733, | |
| "epoch": 0.4157343919162757, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1169731616973877, | |
| "mean_token_accuracy": 0.7358094453811646, | |
| "num_tokens": 168179429.0, | |
| "step": 576 | |
| }, | |
| { | |
| "entropy": 1.638533890247345, | |
| "epoch": 0.4164561530133526, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1146371364593506, | |
| "mean_token_accuracy": 0.7352512776851654, | |
| "num_tokens": 168464276.0, | |
| "step": 577 | |
| }, | |
| { | |
| "entropy": 1.5950394868850708, | |
| "epoch": 0.4171779141104294, | |
| "grad_norm": 0.234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0820252895355225, | |
| "mean_token_accuracy": 0.7411385774612427, | |
| "num_tokens": 168754408.0, | |
| "step": 578 | |
| }, | |
| { | |
| "entropy": 1.604824960231781, | |
| "epoch": 0.4178996752075063, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.107188105583191, | |
| "mean_token_accuracy": 0.7359773814678192, | |
| "num_tokens": 169057792.0, | |
| "step": 579 | |
| }, | |
| { | |
| "entropy": 1.6170887351036072, | |
| "epoch": 0.4186214363045832, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1709554195404053, | |
| "mean_token_accuracy": 0.7239348292350769, | |
| "num_tokens": 169343645.0, | |
| "step": 580 | |
| }, | |
| { | |
| "entropy": 1.6033573746681213, | |
| "epoch": 0.41934319740166004, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0957493782043457, | |
| "mean_token_accuracy": 0.7377727627754211, | |
| "num_tokens": 169615900.0, | |
| "step": 581 | |
| }, | |
| { | |
| "entropy": 1.6110195517539978, | |
| "epoch": 0.42006495849873693, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1462652683258057, | |
| "mean_token_accuracy": 0.7303279042243958, | |
| "num_tokens": 169912844.0, | |
| "step": 582 | |
| }, | |
| { | |
| "entropy": 1.5689895153045654, | |
| "epoch": 0.42078671959581376, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0666075944900513, | |
| "mean_token_accuracy": 0.7409502267837524, | |
| "num_tokens": 170210824.0, | |
| "step": 583 | |
| }, | |
| { | |
| "entropy": 1.6190383434295654, | |
| "epoch": 0.42150848069289065, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1103408336639404, | |
| "mean_token_accuracy": 0.7364379465579987, | |
| "num_tokens": 170504556.0, | |
| "step": 584 | |
| }, | |
| { | |
| "entropy": 1.6314138174057007, | |
| "epoch": 0.42223024178996754, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1286349296569824, | |
| "mean_token_accuracy": 0.7336636483669281, | |
| "num_tokens": 170798181.0, | |
| "step": 585 | |
| }, | |
| { | |
| "entropy": 1.5862641334533691, | |
| "epoch": 0.4229520028870444, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.094423770904541, | |
| "mean_token_accuracy": 0.7372421324253082, | |
| "num_tokens": 171088368.0, | |
| "step": 586 | |
| }, | |
| { | |
| "entropy": 1.5879143476486206, | |
| "epoch": 0.42367376398412127, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.056355357170105, | |
| "mean_token_accuracy": 0.7458110451698303, | |
| "num_tokens": 171385652.0, | |
| "step": 587 | |
| }, | |
| { | |
| "entropy": 1.5860403776168823, | |
| "epoch": 0.4243955250811981, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.097168207168579, | |
| "mean_token_accuracy": 0.7403504252433777, | |
| "num_tokens": 171684931.0, | |
| "step": 588 | |
| }, | |
| { | |
| "entropy": 1.6759730577468872, | |
| "epoch": 0.425117286178275, | |
| "grad_norm": 0.24609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1897363662719727, | |
| "mean_token_accuracy": 0.7232668101787567, | |
| "num_tokens": 171964514.0, | |
| "step": 589 | |
| }, | |
| { | |
| "entropy": 1.520399808883667, | |
| "epoch": 0.4258390472753519, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.035184383392334, | |
| "mean_token_accuracy": 0.7492556869983673, | |
| "num_tokens": 172264390.0, | |
| "step": 590 | |
| }, | |
| { | |
| "entropy": 1.6313410997390747, | |
| "epoch": 0.4265608083724287, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.163499116897583, | |
| "mean_token_accuracy": 0.7279493510723114, | |
| "num_tokens": 172547615.0, | |
| "step": 591 | |
| }, | |
| { | |
| "entropy": 1.5925041437149048, | |
| "epoch": 0.4272825694695056, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0813722610473633, | |
| "mean_token_accuracy": 0.7417286932468414, | |
| "num_tokens": 172826644.0, | |
| "step": 592 | |
| }, | |
| { | |
| "entropy": 1.5489137768745422, | |
| "epoch": 0.42800433056658244, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0677440166473389, | |
| "mean_token_accuracy": 0.7456187009811401, | |
| "num_tokens": 173123956.0, | |
| "step": 593 | |
| }, | |
| { | |
| "entropy": 1.5734184980392456, | |
| "epoch": 0.4287260916636593, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1042060852050781, | |
| "mean_token_accuracy": 0.7373843789100647, | |
| "num_tokens": 173400487.0, | |
| "step": 594 | |
| }, | |
| { | |
| "entropy": 1.525161862373352, | |
| "epoch": 0.4294478527607362, | |
| "grad_norm": 0.2080078125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0642828941345215, | |
| "mean_token_accuracy": 0.743177741765976, | |
| "num_tokens": 173705679.0, | |
| "step": 595 | |
| }, | |
| { | |
| "entropy": 1.6212319135665894, | |
| "epoch": 0.43016961385781305, | |
| "grad_norm": 0.2392578125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0785200595855713, | |
| "mean_token_accuracy": 0.7418456375598907, | |
| "num_tokens": 173994257.0, | |
| "step": 596 | |
| }, | |
| { | |
| "entropy": 1.537688434123993, | |
| "epoch": 0.43089137495488994, | |
| "grad_norm": 0.2138671875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.105448603630066, | |
| "mean_token_accuracy": 0.7366429567337036, | |
| "num_tokens": 174299972.0, | |
| "step": 597 | |
| }, | |
| { | |
| "entropy": 1.4883345365524292, | |
| "epoch": 0.4316131360519668, | |
| "grad_norm": 0.2138671875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0198166370391846, | |
| "mean_token_accuracy": 0.7522725164890289, | |
| "num_tokens": 174621935.0, | |
| "step": 598 | |
| }, | |
| { | |
| "entropy": 1.6207319498062134, | |
| "epoch": 0.43233489714904366, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1435185670852661, | |
| "mean_token_accuracy": 0.7298184335231781, | |
| "num_tokens": 174916856.0, | |
| "step": 599 | |
| }, | |
| { | |
| "entropy": 1.5687963962554932, | |
| "epoch": 0.43305665824612055, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0685855150222778, | |
| "mean_token_accuracy": 0.746061235666275, | |
| "num_tokens": 175199086.0, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 1.6493581533432007, | |
| "epoch": 0.4337784193431974, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1782745122909546, | |
| "mean_token_accuracy": 0.7219648659229279, | |
| "num_tokens": 175478040.0, | |
| "step": 601 | |
| }, | |
| { | |
| "entropy": 1.5671702027320862, | |
| "epoch": 0.4345001804402743, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.107558012008667, | |
| "mean_token_accuracy": 0.7378929853439331, | |
| "num_tokens": 175779576.0, | |
| "step": 602 | |
| }, | |
| { | |
| "entropy": 1.634715735912323, | |
| "epoch": 0.4352219415373511, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1946144104003906, | |
| "mean_token_accuracy": 0.7233321666717529, | |
| "num_tokens": 176068704.0, | |
| "step": 603 | |
| }, | |
| { | |
| "entropy": 1.6247583627700806, | |
| "epoch": 0.435943702634428, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1194121837615967, | |
| "mean_token_accuracy": 0.7345558106899261, | |
| "num_tokens": 176363459.0, | |
| "step": 604 | |
| }, | |
| { | |
| "entropy": 1.5927094221115112, | |
| "epoch": 0.4366654637315049, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.133838176727295, | |
| "mean_token_accuracy": 0.7321049571037292, | |
| "num_tokens": 176662152.0, | |
| "step": 605 | |
| }, | |
| { | |
| "entropy": 1.595038890838623, | |
| "epoch": 0.4373872248285817, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1084964275360107, | |
| "mean_token_accuracy": 0.7408521771430969, | |
| "num_tokens": 176961599.0, | |
| "step": 606 | |
| }, | |
| { | |
| "entropy": 1.5658250451087952, | |
| "epoch": 0.4381089859256586, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0990076065063477, | |
| "mean_token_accuracy": 0.7400900721549988, | |
| "num_tokens": 177275443.0, | |
| "step": 607 | |
| }, | |
| { | |
| "entropy": 1.568897008895874, | |
| "epoch": 0.4388307470227355, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0616424083709717, | |
| "mean_token_accuracy": 0.7452067732810974, | |
| "num_tokens": 177570170.0, | |
| "step": 608 | |
| }, | |
| { | |
| "entropy": 1.6283130645751953, | |
| "epoch": 0.43955250811981234, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1616148948669434, | |
| "mean_token_accuracy": 0.7277311980724335, | |
| "num_tokens": 177853293.0, | |
| "step": 609 | |
| }, | |
| { | |
| "entropy": 1.6555038094520569, | |
| "epoch": 0.4402742692168892, | |
| "grad_norm": 0.2470703125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1288264989852905, | |
| "mean_token_accuracy": 0.7303913235664368, | |
| "num_tokens": 178117342.0, | |
| "step": 610 | |
| }, | |
| { | |
| "entropy": 1.7070843577384949, | |
| "epoch": 0.44099603031396606, | |
| "grad_norm": 0.25390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.2027482986450195, | |
| "mean_token_accuracy": 0.7181344926357269, | |
| "num_tokens": 178378140.0, | |
| "step": 611 | |
| }, | |
| { | |
| "entropy": 1.5714011192321777, | |
| "epoch": 0.44171779141104295, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1155734062194824, | |
| "mean_token_accuracy": 0.7347998321056366, | |
| "num_tokens": 178688075.0, | |
| "step": 612 | |
| }, | |
| { | |
| "entropy": 1.6551893949508667, | |
| "epoch": 0.44243955250811984, | |
| "grad_norm": 0.2470703125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1968450546264648, | |
| "mean_token_accuracy": 0.7199910581111908, | |
| "num_tokens": 178970176.0, | |
| "step": 613 | |
| }, | |
| { | |
| "entropy": 1.620234489440918, | |
| "epoch": 0.44316131360519667, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.117511510848999, | |
| "mean_token_accuracy": 0.7368182241916656, | |
| "num_tokens": 179254258.0, | |
| "step": 614 | |
| }, | |
| { | |
| "entropy": 1.5781685709953308, | |
| "epoch": 0.44388307470227356, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1195204257965088, | |
| "mean_token_accuracy": 0.7345280647277832, | |
| "num_tokens": 179558501.0, | |
| "step": 615 | |
| }, | |
| { | |
| "entropy": 1.618272304534912, | |
| "epoch": 0.4446048357993504, | |
| "grad_norm": 0.234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1281987428665161, | |
| "mean_token_accuracy": 0.7336892485618591, | |
| "num_tokens": 179837870.0, | |
| "step": 616 | |
| }, | |
| { | |
| "entropy": 1.6428250074386597, | |
| "epoch": 0.4453265968964273, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.162510633468628, | |
| "mean_token_accuracy": 0.7278911769390106, | |
| "num_tokens": 180132435.0, | |
| "step": 617 | |
| }, | |
| { | |
| "entropy": 1.5328102111816406, | |
| "epoch": 0.4460483579935042, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.052917718887329, | |
| "mean_token_accuracy": 0.7460933029651642, | |
| "num_tokens": 180425097.0, | |
| "step": 618 | |
| }, | |
| { | |
| "entropy": 1.6320355534553528, | |
| "epoch": 0.446770119090581, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.139484167098999, | |
| "mean_token_accuracy": 0.7285744547843933, | |
| "num_tokens": 180706516.0, | |
| "step": 619 | |
| }, | |
| { | |
| "entropy": 1.5652714371681213, | |
| "epoch": 0.4474918801876579, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.086570382118225, | |
| "mean_token_accuracy": 0.7394826710224152, | |
| "num_tokens": 180996007.0, | |
| "step": 620 | |
| }, | |
| { | |
| "entropy": 1.606342375278473, | |
| "epoch": 0.44821364128473473, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1467411518096924, | |
| "mean_token_accuracy": 0.729450911283493, | |
| "num_tokens": 181281113.0, | |
| "step": 621 | |
| }, | |
| { | |
| "entropy": 1.6066087484359741, | |
| "epoch": 0.4489354023818116, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1205601692199707, | |
| "mean_token_accuracy": 0.7346542477607727, | |
| "num_tokens": 181578322.0, | |
| "step": 622 | |
| }, | |
| { | |
| "entropy": 1.5791630148887634, | |
| "epoch": 0.4496571634788885, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0966300964355469, | |
| "mean_token_accuracy": 0.7400188744068146, | |
| "num_tokens": 181878723.0, | |
| "step": 623 | |
| }, | |
| { | |
| "entropy": 1.6084871888160706, | |
| "epoch": 0.45037892457596534, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.156432867050171, | |
| "mean_token_accuracy": 0.7263132631778717, | |
| "num_tokens": 182172247.0, | |
| "step": 624 | |
| }, | |
| { | |
| "entropy": 1.5561562180519104, | |
| "epoch": 0.45110068567304223, | |
| "grad_norm": 0.23828125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0497633218765259, | |
| "mean_token_accuracy": 0.7450034916400909, | |
| "num_tokens": 182462762.0, | |
| "step": 625 | |
| }, | |
| { | |
| "entropy": 1.6379727721214294, | |
| "epoch": 0.45182244677011907, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1475250720977783, | |
| "mean_token_accuracy": 0.7291487157344818, | |
| "num_tokens": 182756157.0, | |
| "step": 626 | |
| }, | |
| { | |
| "entropy": 1.6894765496253967, | |
| "epoch": 0.45254420786719596, | |
| "grad_norm": 0.2392578125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.2011867761611938, | |
| "mean_token_accuracy": 0.7200133204460144, | |
| "num_tokens": 183036617.0, | |
| "step": 627 | |
| }, | |
| { | |
| "entropy": 1.5869178175926208, | |
| "epoch": 0.45326596896427285, | |
| "grad_norm": 0.2158203125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1228361129760742, | |
| "mean_token_accuracy": 0.7344334423542023, | |
| "num_tokens": 183348140.0, | |
| "step": 628 | |
| }, | |
| { | |
| "entropy": 1.6546363234519958, | |
| "epoch": 0.4539877300613497, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.188132643699646, | |
| "mean_token_accuracy": 0.7210691571235657, | |
| "num_tokens": 183632590.0, | |
| "step": 629 | |
| }, | |
| { | |
| "entropy": 1.5817923545837402, | |
| "epoch": 0.45470949115842657, | |
| "grad_norm": 0.2158203125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0979231595993042, | |
| "mean_token_accuracy": 0.7370417416095734, | |
| "num_tokens": 183941080.0, | |
| "step": 630 | |
| }, | |
| { | |
| "entropy": 1.587474763393402, | |
| "epoch": 0.4554312522555034, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1231613159179688, | |
| "mean_token_accuracy": 0.7337916791439056, | |
| "num_tokens": 184243362.0, | |
| "step": 631 | |
| }, | |
| { | |
| "entropy": 1.603373408317566, | |
| "epoch": 0.4561530133525803, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0991640090942383, | |
| "mean_token_accuracy": 0.7383999228477478, | |
| "num_tokens": 184537236.0, | |
| "step": 632 | |
| }, | |
| { | |
| "entropy": 1.564449429512024, | |
| "epoch": 0.4568747744496572, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0939512252807617, | |
| "mean_token_accuracy": 0.737929105758667, | |
| "num_tokens": 184833680.0, | |
| "step": 633 | |
| }, | |
| { | |
| "entropy": 1.6056365370750427, | |
| "epoch": 0.457596535546734, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.115877628326416, | |
| "mean_token_accuracy": 0.7351059019565582, | |
| "num_tokens": 185123176.0, | |
| "step": 634 | |
| }, | |
| { | |
| "entropy": 1.5983636975288391, | |
| "epoch": 0.4583182966438109, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0739091634750366, | |
| "mean_token_accuracy": 0.7426251769065857, | |
| "num_tokens": 185412323.0, | |
| "step": 635 | |
| }, | |
| { | |
| "entropy": 1.6540947556495667, | |
| "epoch": 0.45904005774088774, | |
| "grad_norm": 0.2412109375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.162091851234436, | |
| "mean_token_accuracy": 0.726731926202774, | |
| "num_tokens": 185689214.0, | |
| "step": 636 | |
| }, | |
| { | |
| "entropy": 1.622604250907898, | |
| "epoch": 0.45976181883796463, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.114464282989502, | |
| "mean_token_accuracy": 0.7352300584316254, | |
| "num_tokens": 185970599.0, | |
| "step": 637 | |
| }, | |
| { | |
| "entropy": 1.6669327020645142, | |
| "epoch": 0.4604835799350415, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1498149633407593, | |
| "mean_token_accuracy": 0.7285056114196777, | |
| "num_tokens": 186252357.0, | |
| "step": 638 | |
| }, | |
| { | |
| "entropy": 1.6214895844459534, | |
| "epoch": 0.46120534103211835, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1100432872772217, | |
| "mean_token_accuracy": 0.7351499199867249, | |
| "num_tokens": 186539557.0, | |
| "step": 639 | |
| }, | |
| { | |
| "entropy": 1.6481537818908691, | |
| "epoch": 0.46192710212919524, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1853032112121582, | |
| "mean_token_accuracy": 0.7189866304397583, | |
| "num_tokens": 186823460.0, | |
| "step": 640 | |
| }, | |
| { | |
| "entropy": 1.5723902583122253, | |
| "epoch": 0.4626488632262721, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0628620386123657, | |
| "mean_token_accuracy": 0.7429164946079254, | |
| "num_tokens": 187116446.0, | |
| "step": 641 | |
| }, | |
| { | |
| "entropy": 1.5936442613601685, | |
| "epoch": 0.46337062432334897, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0995248556137085, | |
| "mean_token_accuracy": 0.7391497790813446, | |
| "num_tokens": 187412019.0, | |
| "step": 642 | |
| }, | |
| { | |
| "entropy": 1.605537235736847, | |
| "epoch": 0.46409238542042586, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.11756432056427, | |
| "mean_token_accuracy": 0.7350789308547974, | |
| "num_tokens": 187700361.0, | |
| "step": 643 | |
| }, | |
| { | |
| "entropy": 1.5693391561508179, | |
| "epoch": 0.4648141465175027, | |
| "grad_norm": 0.23828125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.073732614517212, | |
| "mean_token_accuracy": 0.7433812916278839, | |
| "num_tokens": 187991327.0, | |
| "step": 644 | |
| }, | |
| { | |
| "entropy": 1.6424800753593445, | |
| "epoch": 0.4655359076145796, | |
| "grad_norm": 0.2421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1322355270385742, | |
| "mean_token_accuracy": 0.7325375080108643, | |
| "num_tokens": 188278118.0, | |
| "step": 645 | |
| }, | |
| { | |
| "entropy": 1.611929714679718, | |
| "epoch": 0.4662576687116564, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1274209022521973, | |
| "mean_token_accuracy": 0.7318012714385986, | |
| "num_tokens": 188566399.0, | |
| "step": 646 | |
| }, | |
| { | |
| "entropy": 1.5719982385635376, | |
| "epoch": 0.4669794298087333, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1048223972320557, | |
| "mean_token_accuracy": 0.738763689994812, | |
| "num_tokens": 188864913.0, | |
| "step": 647 | |
| }, | |
| { | |
| "entropy": 1.6606945991516113, | |
| "epoch": 0.4677011909058102, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1570045948028564, | |
| "mean_token_accuracy": 0.7274243533611298, | |
| "num_tokens": 189150148.0, | |
| "step": 648 | |
| }, | |
| { | |
| "entropy": 1.6256017684936523, | |
| "epoch": 0.468422952002887, | |
| "grad_norm": 0.240234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1306378841400146, | |
| "mean_token_accuracy": 0.7327773571014404, | |
| "num_tokens": 189439982.0, | |
| "step": 649 | |
| }, | |
| { | |
| "entropy": 1.5620282292366028, | |
| "epoch": 0.4691447130999639, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0708484649658203, | |
| "mean_token_accuracy": 0.7419192790985107, | |
| "num_tokens": 189727299.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 1.6509064435958862, | |
| "epoch": 0.4698664741970408, | |
| "grad_norm": 0.248046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1609551906585693, | |
| "mean_token_accuracy": 0.7286869585514069, | |
| "num_tokens": 189998066.0, | |
| "step": 651 | |
| }, | |
| { | |
| "entropy": 1.5195523500442505, | |
| "epoch": 0.47058823529411764, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.041045904159546, | |
| "mean_token_accuracy": 0.7484868466854095, | |
| "num_tokens": 190295353.0, | |
| "step": 652 | |
| }, | |
| { | |
| "entropy": 1.5961876511573792, | |
| "epoch": 0.47130999639119453, | |
| "grad_norm": 0.23828125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.121506690979004, | |
| "mean_token_accuracy": 0.7356399297714233, | |
| "num_tokens": 190585486.0, | |
| "step": 653 | |
| }, | |
| { | |
| "entropy": 1.5453141927719116, | |
| "epoch": 0.47203175748827136, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0569976568222046, | |
| "mean_token_accuracy": 0.7470882534980774, | |
| "num_tokens": 190879665.0, | |
| "step": 654 | |
| }, | |
| { | |
| "entropy": 1.6514986157417297, | |
| "epoch": 0.47275351858534825, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1591620445251465, | |
| "mean_token_accuracy": 0.7283570468425751, | |
| "num_tokens": 191170902.0, | |
| "step": 655 | |
| }, | |
| { | |
| "entropy": 1.587424099445343, | |
| "epoch": 0.47347527968242514, | |
| "grad_norm": 0.24609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1275274753570557, | |
| "mean_token_accuracy": 0.7337370216846466, | |
| "num_tokens": 191461695.0, | |
| "step": 656 | |
| }, | |
| { | |
| "entropy": 1.5560713410377502, | |
| "epoch": 0.474197040779502, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0729756355285645, | |
| "mean_token_accuracy": 0.745042473077774, | |
| "num_tokens": 191766997.0, | |
| "step": 657 | |
| }, | |
| { | |
| "entropy": 1.568974256515503, | |
| "epoch": 0.47491880187657887, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0732755661010742, | |
| "mean_token_accuracy": 0.7431631684303284, | |
| "num_tokens": 192067054.0, | |
| "step": 658 | |
| }, | |
| { | |
| "entropy": 1.6971306204795837, | |
| "epoch": 0.4756405629736557, | |
| "grad_norm": 0.2431640625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.2363803386688232, | |
| "mean_token_accuracy": 0.7114757001399994, | |
| "num_tokens": 192339792.0, | |
| "step": 659 | |
| }, | |
| { | |
| "entropy": 1.6199973821640015, | |
| "epoch": 0.4763623240707326, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1400244235992432, | |
| "mean_token_accuracy": 0.7326892018318176, | |
| "num_tokens": 192624463.0, | |
| "step": 660 | |
| }, | |
| { | |
| "entropy": 1.5810664296150208, | |
| "epoch": 0.4770840851678095, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.123009443283081, | |
| "mean_token_accuracy": 0.7335048913955688, | |
| "num_tokens": 192923613.0, | |
| "step": 661 | |
| }, | |
| { | |
| "entropy": 1.5902155637741089, | |
| "epoch": 0.4778058462648863, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1028482913970947, | |
| "mean_token_accuracy": 0.7386082112789154, | |
| "num_tokens": 193212115.0, | |
| "step": 662 | |
| }, | |
| { | |
| "entropy": 1.5616875290870667, | |
| "epoch": 0.4785276073619632, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.065674901008606, | |
| "mean_token_accuracy": 0.7435760498046875, | |
| "num_tokens": 193497670.0, | |
| "step": 663 | |
| }, | |
| { | |
| "entropy": 1.6460754871368408, | |
| "epoch": 0.47924936845904004, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.136084794998169, | |
| "mean_token_accuracy": 0.7333773374557495, | |
| "num_tokens": 193788000.0, | |
| "step": 664 | |
| }, | |
| { | |
| "entropy": 1.5939441919326782, | |
| "epoch": 0.4799711295561169, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1227505207061768, | |
| "mean_token_accuracy": 0.7324938774108887, | |
| "num_tokens": 194078849.0, | |
| "step": 665 | |
| }, | |
| { | |
| "entropy": 1.5230796337127686, | |
| "epoch": 0.4806928906531938, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.050563097000122, | |
| "mean_token_accuracy": 0.7465680241584778, | |
| "num_tokens": 194385043.0, | |
| "step": 666 | |
| }, | |
| { | |
| "entropy": 1.6082165241241455, | |
| "epoch": 0.48141465175027065, | |
| "grad_norm": 0.2392578125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1573779582977295, | |
| "mean_token_accuracy": 0.7278335690498352, | |
| "num_tokens": 194670188.0, | |
| "step": 667 | |
| }, | |
| { | |
| "entropy": 1.637342929840088, | |
| "epoch": 0.48213641284734754, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1635621786117554, | |
| "mean_token_accuracy": 0.7252436280250549, | |
| "num_tokens": 194958968.0, | |
| "step": 668 | |
| }, | |
| { | |
| "entropy": 1.5465983152389526, | |
| "epoch": 0.4828581739444244, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0449684858322144, | |
| "mean_token_accuracy": 0.7488506436347961, | |
| "num_tokens": 195255181.0, | |
| "step": 669 | |
| }, | |
| { | |
| "entropy": 1.6894134283065796, | |
| "epoch": 0.48357993504150126, | |
| "grad_norm": 0.2392578125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.173258662223816, | |
| "mean_token_accuracy": 0.7246471047401428, | |
| "num_tokens": 195537152.0, | |
| "step": 670 | |
| }, | |
| { | |
| "entropy": 1.6284174919128418, | |
| "epoch": 0.48430169613857815, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1489765644073486, | |
| "mean_token_accuracy": 0.7306946516036987, | |
| "num_tokens": 195824582.0, | |
| "step": 671 | |
| }, | |
| { | |
| "entropy": 1.5324783325195312, | |
| "epoch": 0.485023457235655, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0439010858535767, | |
| "mean_token_accuracy": 0.7482569813728333, | |
| "num_tokens": 196129732.0, | |
| "step": 672 | |
| }, | |
| { | |
| "entropy": 1.6040911674499512, | |
| "epoch": 0.4857452183327319, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.144684076309204, | |
| "mean_token_accuracy": 0.7303643524646759, | |
| "num_tokens": 196425107.0, | |
| "step": 673 | |
| }, | |
| { | |
| "entropy": 1.5590643882751465, | |
| "epoch": 0.4864669794298087, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0613505840301514, | |
| "mean_token_accuracy": 0.7450262010097504, | |
| "num_tokens": 196714340.0, | |
| "step": 674 | |
| }, | |
| { | |
| "entropy": 1.5467580556869507, | |
| "epoch": 0.4871887405268856, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0995728969573975, | |
| "mean_token_accuracy": 0.7413800358772278, | |
| "num_tokens": 197014444.0, | |
| "step": 675 | |
| }, | |
| { | |
| "entropy": 1.5725948214530945, | |
| "epoch": 0.4879105016239625, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0914405584335327, | |
| "mean_token_accuracy": 0.741468071937561, | |
| "num_tokens": 197313172.0, | |
| "step": 676 | |
| }, | |
| { | |
| "entropy": 1.5964953303337097, | |
| "epoch": 0.4886322627210393, | |
| "grad_norm": 0.23828125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1271400451660156, | |
| "mean_token_accuracy": 0.7343786656856537, | |
| "num_tokens": 197597697.0, | |
| "step": 677 | |
| }, | |
| { | |
| "entropy": 1.5927131175994873, | |
| "epoch": 0.4893540238181162, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1044011116027832, | |
| "mean_token_accuracy": 0.7394413650035858, | |
| "num_tokens": 197900874.0, | |
| "step": 678 | |
| }, | |
| { | |
| "entropy": 1.5892881155014038, | |
| "epoch": 0.49007578491519305, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1124653816223145, | |
| "mean_token_accuracy": 0.7377249896526337, | |
| "num_tokens": 198196955.0, | |
| "step": 679 | |
| }, | |
| { | |
| "entropy": 1.6059691309928894, | |
| "epoch": 0.49079754601226994, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1000442504882812, | |
| "mean_token_accuracy": 0.739370733499527, | |
| "num_tokens": 198493108.0, | |
| "step": 680 | |
| }, | |
| { | |
| "entropy": 1.54014652967453, | |
| "epoch": 0.4915193071093468, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0418875217437744, | |
| "mean_token_accuracy": 0.7487592995166779, | |
| "num_tokens": 198791969.0, | |
| "step": 681 | |
| }, | |
| { | |
| "entropy": 1.6647250056266785, | |
| "epoch": 0.49224106820642366, | |
| "grad_norm": 0.2470703125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1950716972351074, | |
| "mean_token_accuracy": 0.7244281470775604, | |
| "num_tokens": 199063673.0, | |
| "step": 682 | |
| }, | |
| { | |
| "entropy": 1.5725098848342896, | |
| "epoch": 0.49296282930350055, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0622737407684326, | |
| "mean_token_accuracy": 0.7449676990509033, | |
| "num_tokens": 199348725.0, | |
| "step": 683 | |
| }, | |
| { | |
| "entropy": 1.6303915977478027, | |
| "epoch": 0.4936845904005774, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.176119089126587, | |
| "mean_token_accuracy": 0.7237577736377716, | |
| "num_tokens": 199638940.0, | |
| "step": 684 | |
| }, | |
| { | |
| "entropy": 1.534554362297058, | |
| "epoch": 0.4944063514976543, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.067025899887085, | |
| "mean_token_accuracy": 0.7451322376728058, | |
| "num_tokens": 199951747.0, | |
| "step": 685 | |
| }, | |
| { | |
| "entropy": 1.6310531497001648, | |
| "epoch": 0.49512811259473116, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1522763967514038, | |
| "mean_token_accuracy": 0.730984091758728, | |
| "num_tokens": 200228943.0, | |
| "step": 686 | |
| }, | |
| { | |
| "entropy": 1.5954148769378662, | |
| "epoch": 0.495849873691808, | |
| "grad_norm": 0.234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.130525827407837, | |
| "mean_token_accuracy": 0.7352426648139954, | |
| "num_tokens": 200522695.0, | |
| "step": 687 | |
| }, | |
| { | |
| "entropy": 1.5577545762062073, | |
| "epoch": 0.4965716347888849, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.9982820749282837, | |
| "mean_token_accuracy": 0.7559614181518555, | |
| "num_tokens": 200803558.0, | |
| "step": 688 | |
| }, | |
| { | |
| "entropy": 1.595203697681427, | |
| "epoch": 0.4972933958859617, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.077322244644165, | |
| "mean_token_accuracy": 0.7435383200645447, | |
| "num_tokens": 201101711.0, | |
| "step": 689 | |
| }, | |
| { | |
| "entropy": 1.650526225566864, | |
| "epoch": 0.4980151569830386, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1505811214447021, | |
| "mean_token_accuracy": 0.7304760217666626, | |
| "num_tokens": 201387237.0, | |
| "step": 690 | |
| }, | |
| { | |
| "entropy": 1.5869037508964539, | |
| "epoch": 0.4987369180801155, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0749313831329346, | |
| "mean_token_accuracy": 0.743472695350647, | |
| "num_tokens": 201681505.0, | |
| "step": 691 | |
| }, | |
| { | |
| "entropy": 1.609952688217163, | |
| "epoch": 0.49945867917719233, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1253128051757812, | |
| "mean_token_accuracy": 0.7331607341766357, | |
| "num_tokens": 201975086.0, | |
| "step": 692 | |
| }, | |
| { | |
| "entropy": 1.6648921966552734, | |
| "epoch": 0.5001804402742692, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.2114580869674683, | |
| "mean_token_accuracy": 0.7181891202926636, | |
| "num_tokens": 202268769.0, | |
| "step": 693 | |
| }, | |
| { | |
| "entropy": 1.6200402975082397, | |
| "epoch": 0.5009022013713461, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1498284339904785, | |
| "mean_token_accuracy": 0.7282987534999847, | |
| "num_tokens": 202560197.0, | |
| "step": 694 | |
| }, | |
| { | |
| "entropy": 1.5734732747077942, | |
| "epoch": 0.501623962468423, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1015775203704834, | |
| "mean_token_accuracy": 0.7396606802940369, | |
| "num_tokens": 202860320.0, | |
| "step": 695 | |
| }, | |
| { | |
| "entropy": 1.6191856861114502, | |
| "epoch": 0.5023457235654998, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1405696868896484, | |
| "mean_token_accuracy": 0.7288414835929871, | |
| "num_tokens": 203140737.0, | |
| "step": 696 | |
| }, | |
| { | |
| "entropy": 1.6207420825958252, | |
| "epoch": 0.5030674846625767, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1624443531036377, | |
| "mean_token_accuracy": 0.7294009029865265, | |
| "num_tokens": 203440927.0, | |
| "step": 697 | |
| }, | |
| { | |
| "entropy": 1.5846516489982605, | |
| "epoch": 0.5037892457596536, | |
| "grad_norm": 0.234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0885640382766724, | |
| "mean_token_accuracy": 0.742593377828598, | |
| "num_tokens": 203720121.0, | |
| "step": 698 | |
| }, | |
| { | |
| "entropy": 1.5653942227363586, | |
| "epoch": 0.5045110068567304, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0944230556488037, | |
| "mean_token_accuracy": 0.7408529818058014, | |
| "num_tokens": 204010618.0, | |
| "step": 699 | |
| }, | |
| { | |
| "entropy": 1.543853998184204, | |
| "epoch": 0.5052327679538073, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0595479011535645, | |
| "mean_token_accuracy": 0.750019758939743, | |
| "num_tokens": 204297559.0, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 1.5826244354248047, | |
| "epoch": 0.5059545290508841, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0853207111358643, | |
| "mean_token_accuracy": 0.741440087556839, | |
| "num_tokens": 204598088.0, | |
| "step": 701 | |
| }, | |
| { | |
| "entropy": 1.5929073095321655, | |
| "epoch": 0.506676290147961, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1123278141021729, | |
| "mean_token_accuracy": 0.7367078959941864, | |
| "num_tokens": 204883696.0, | |
| "step": 702 | |
| }, | |
| { | |
| "entropy": 1.562481939792633, | |
| "epoch": 0.5073980512450379, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0944299697875977, | |
| "mean_token_accuracy": 0.7395003736019135, | |
| "num_tokens": 205170910.0, | |
| "step": 703 | |
| }, | |
| { | |
| "entropy": 1.610314965248108, | |
| "epoch": 0.5081198123421148, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.14693021774292, | |
| "mean_token_accuracy": 0.7300017774105072, | |
| "num_tokens": 205462592.0, | |
| "step": 704 | |
| }, | |
| { | |
| "entropy": 1.6552006006240845, | |
| "epoch": 0.5088415734391917, | |
| "grad_norm": 0.23828125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.182011604309082, | |
| "mean_token_accuracy": 0.724445641040802, | |
| "num_tokens": 205755920.0, | |
| "step": 705 | |
| }, | |
| { | |
| "entropy": 1.5695324540138245, | |
| "epoch": 0.5095633345362685, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.079665184020996, | |
| "mean_token_accuracy": 0.7417963445186615, | |
| "num_tokens": 206049664.0, | |
| "step": 706 | |
| }, | |
| { | |
| "entropy": 1.5728851556777954, | |
| "epoch": 0.5102850956333453, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0848987102508545, | |
| "mean_token_accuracy": 0.7398684620857239, | |
| "num_tokens": 206337675.0, | |
| "step": 707 | |
| }, | |
| { | |
| "entropy": 1.5587612986564636, | |
| "epoch": 0.5110068567304222, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0988192558288574, | |
| "mean_token_accuracy": 0.7362302541732788, | |
| "num_tokens": 206641009.0, | |
| "step": 708 | |
| }, | |
| { | |
| "entropy": 1.5805190801620483, | |
| "epoch": 0.5117286178274991, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0927263498306274, | |
| "mean_token_accuracy": 0.7389773726463318, | |
| "num_tokens": 206939065.0, | |
| "step": 709 | |
| }, | |
| { | |
| "entropy": 1.6272093653678894, | |
| "epoch": 0.512450378924576, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.159834623336792, | |
| "mean_token_accuracy": 0.7274581789970398, | |
| "num_tokens": 207224261.0, | |
| "step": 710 | |
| }, | |
| { | |
| "entropy": 1.5737531185150146, | |
| "epoch": 0.5131721400216528, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0826325416564941, | |
| "mean_token_accuracy": 0.7398369312286377, | |
| "num_tokens": 207503697.0, | |
| "step": 711 | |
| }, | |
| { | |
| "entropy": 1.6702247858047485, | |
| "epoch": 0.5138939011187297, | |
| "grad_norm": 0.240234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.2031714916229248, | |
| "mean_token_accuracy": 0.7175164520740509, | |
| "num_tokens": 207783395.0, | |
| "step": 712 | |
| }, | |
| { | |
| "entropy": 1.5755795240402222, | |
| "epoch": 0.5146156622158066, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0748627185821533, | |
| "mean_token_accuracy": 0.7444513440132141, | |
| "num_tokens": 208085969.0, | |
| "step": 713 | |
| }, | |
| { | |
| "entropy": 1.6208624839782715, | |
| "epoch": 0.5153374233128835, | |
| "grad_norm": 0.234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.183680772781372, | |
| "mean_token_accuracy": 0.7229005992412567, | |
| "num_tokens": 208376141.0, | |
| "step": 714 | |
| }, | |
| { | |
| "entropy": 1.5311552286148071, | |
| "epoch": 0.5160591844099603, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0270320177078247, | |
| "mean_token_accuracy": 0.750772088766098, | |
| "num_tokens": 208679981.0, | |
| "step": 715 | |
| }, | |
| { | |
| "entropy": 1.5969658493995667, | |
| "epoch": 0.5167809455070371, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1303260326385498, | |
| "mean_token_accuracy": 0.734829843044281, | |
| "num_tokens": 208977455.0, | |
| "step": 716 | |
| }, | |
| { | |
| "entropy": 1.623799204826355, | |
| "epoch": 0.517502706604114, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1339950561523438, | |
| "mean_token_accuracy": 0.7317747175693512, | |
| "num_tokens": 209268325.0, | |
| "step": 717 | |
| }, | |
| { | |
| "entropy": 1.5979817509651184, | |
| "epoch": 0.5182244677011909, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1302764415740967, | |
| "mean_token_accuracy": 0.7326715290546417, | |
| "num_tokens": 209573915.0, | |
| "step": 718 | |
| }, | |
| { | |
| "entropy": 1.6247981786727905, | |
| "epoch": 0.5189462287982678, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1484184265136719, | |
| "mean_token_accuracy": 0.7294766902923584, | |
| "num_tokens": 209861269.0, | |
| "step": 719 | |
| }, | |
| { | |
| "entropy": 1.556791067123413, | |
| "epoch": 0.5196679898953447, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0870962142944336, | |
| "mean_token_accuracy": 0.7413330376148224, | |
| "num_tokens": 210161404.0, | |
| "step": 720 | |
| }, | |
| { | |
| "entropy": 1.5457544922828674, | |
| "epoch": 0.5203897509924215, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.064863920211792, | |
| "mean_token_accuracy": 0.7458622455596924, | |
| "num_tokens": 210450616.0, | |
| "step": 721 | |
| }, | |
| { | |
| "entropy": 1.6112803220748901, | |
| "epoch": 0.5211115120894984, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1411374807357788, | |
| "mean_token_accuracy": 0.7312836050987244, | |
| "num_tokens": 210755250.0, | |
| "step": 722 | |
| }, | |
| { | |
| "entropy": 1.6802918910980225, | |
| "epoch": 0.5218332731865752, | |
| "grad_norm": 0.240234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.2060627937316895, | |
| "mean_token_accuracy": 0.7200701534748077, | |
| "num_tokens": 211038739.0, | |
| "step": 723 | |
| }, | |
| { | |
| "entropy": 1.6193804740905762, | |
| "epoch": 0.5225550342836521, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1414573192596436, | |
| "mean_token_accuracy": 0.7297541201114655, | |
| "num_tokens": 211324256.0, | |
| "step": 724 | |
| }, | |
| { | |
| "entropy": 1.5679888129234314, | |
| "epoch": 0.523276795380729, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0963985919952393, | |
| "mean_token_accuracy": 0.7375109791755676, | |
| "num_tokens": 211626419.0, | |
| "step": 725 | |
| }, | |
| { | |
| "entropy": 1.6310662031173706, | |
| "epoch": 0.5239985564778058, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1537364721298218, | |
| "mean_token_accuracy": 0.7281556725502014, | |
| "num_tokens": 211901539.0, | |
| "step": 726 | |
| }, | |
| { | |
| "entropy": 1.5483182072639465, | |
| "epoch": 0.5247203175748827, | |
| "grad_norm": 0.212890625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0793248414993286, | |
| "mean_token_accuracy": 0.7428569197654724, | |
| "num_tokens": 212216106.0, | |
| "step": 727 | |
| }, | |
| { | |
| "entropy": 1.6444163918495178, | |
| "epoch": 0.5254420786719596, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1351898908615112, | |
| "mean_token_accuracy": 0.7313067317008972, | |
| "num_tokens": 212505574.0, | |
| "step": 728 | |
| }, | |
| { | |
| "entropy": 1.6563920974731445, | |
| "epoch": 0.5261638397690365, | |
| "grad_norm": 0.240234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1693782806396484, | |
| "mean_token_accuracy": 0.7237509489059448, | |
| "num_tokens": 212785524.0, | |
| "step": 729 | |
| }, | |
| { | |
| "entropy": 1.6383143067359924, | |
| "epoch": 0.5268856008661134, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1533615589141846, | |
| "mean_token_accuracy": 0.7285424172878265, | |
| "num_tokens": 213069244.0, | |
| "step": 730 | |
| }, | |
| { | |
| "entropy": 1.6152408719062805, | |
| "epoch": 0.5276073619631901, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1325241327285767, | |
| "mean_token_accuracy": 0.7335585355758667, | |
| "num_tokens": 213353319.0, | |
| "step": 731 | |
| }, | |
| { | |
| "entropy": 1.6316519379615784, | |
| "epoch": 0.528329123060267, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1464107036590576, | |
| "mean_token_accuracy": 0.7307578325271606, | |
| "num_tokens": 213641823.0, | |
| "step": 732 | |
| }, | |
| { | |
| "entropy": 1.5490845441818237, | |
| "epoch": 0.5290508841573439, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.036945104598999, | |
| "mean_token_accuracy": 0.7499020099639893, | |
| "num_tokens": 213931030.0, | |
| "step": 733 | |
| }, | |
| { | |
| "entropy": 1.601906955242157, | |
| "epoch": 0.5297726452544208, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1039929389953613, | |
| "mean_token_accuracy": 0.7359420657157898, | |
| "num_tokens": 214237040.0, | |
| "step": 734 | |
| }, | |
| { | |
| "entropy": 1.5261776447296143, | |
| "epoch": 0.5304944063514977, | |
| "grad_norm": 0.234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0283230543136597, | |
| "mean_token_accuracy": 0.7512286603450775, | |
| "num_tokens": 214525487.0, | |
| "step": 735 | |
| }, | |
| { | |
| "entropy": 1.5909109711647034, | |
| "epoch": 0.5312161674485745, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1032295227050781, | |
| "mean_token_accuracy": 0.7379952669143677, | |
| "num_tokens": 214824766.0, | |
| "step": 736 | |
| }, | |
| { | |
| "entropy": 1.6112946271896362, | |
| "epoch": 0.5319379285456514, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1571892499923706, | |
| "mean_token_accuracy": 0.728611558675766, | |
| "num_tokens": 215108168.0, | |
| "step": 737 | |
| }, | |
| { | |
| "entropy": 1.6201320886611938, | |
| "epoch": 0.5326596896427283, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1244066953659058, | |
| "mean_token_accuracy": 0.7328232526779175, | |
| "num_tokens": 215405154.0, | |
| "step": 738 | |
| }, | |
| { | |
| "entropy": 1.62282395362854, | |
| "epoch": 0.5333814507398051, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1463348865509033, | |
| "mean_token_accuracy": 0.7319855988025665, | |
| "num_tokens": 215700997.0, | |
| "step": 739 | |
| }, | |
| { | |
| "entropy": 1.6334552764892578, | |
| "epoch": 0.534103211836882, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.125380516052246, | |
| "mean_token_accuracy": 0.7308821380138397, | |
| "num_tokens": 215980949.0, | |
| "step": 740 | |
| }, | |
| { | |
| "entropy": 1.5650272369384766, | |
| "epoch": 0.5348249729339588, | |
| "grad_norm": 0.2421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.109032392501831, | |
| "mean_token_accuracy": 0.73592209815979, | |
| "num_tokens": 216269420.0, | |
| "step": 741 | |
| }, | |
| { | |
| "entropy": 1.5470651388168335, | |
| "epoch": 0.5355467340310357, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0759892463684082, | |
| "mean_token_accuracy": 0.741902083158493, | |
| "num_tokens": 216563116.0, | |
| "step": 742 | |
| }, | |
| { | |
| "entropy": 1.557494044303894, | |
| "epoch": 0.5362684951281126, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0595998764038086, | |
| "mean_token_accuracy": 0.7465083301067352, | |
| "num_tokens": 216852019.0, | |
| "step": 743 | |
| }, | |
| { | |
| "entropy": 1.544226348400116, | |
| "epoch": 0.5369902562251895, | |
| "grad_norm": 0.234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.047929048538208, | |
| "mean_token_accuracy": 0.7467096149921417, | |
| "num_tokens": 217153768.0, | |
| "step": 744 | |
| }, | |
| { | |
| "entropy": 1.6541085839271545, | |
| "epoch": 0.5377120173222664, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1345123052597046, | |
| "mean_token_accuracy": 0.7301164865493774, | |
| "num_tokens": 217448613.0, | |
| "step": 745 | |
| }, | |
| { | |
| "entropy": 1.5435039401054382, | |
| "epoch": 0.5384337784193431, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0374845266342163, | |
| "mean_token_accuracy": 0.7508023083209991, | |
| "num_tokens": 217753223.0, | |
| "step": 746 | |
| }, | |
| { | |
| "entropy": 1.5598674416542053, | |
| "epoch": 0.53915553951642, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0819848775863647, | |
| "mean_token_accuracy": 0.7415176033973694, | |
| "num_tokens": 218048926.0, | |
| "step": 747 | |
| }, | |
| { | |
| "entropy": 1.629036545753479, | |
| "epoch": 0.5398773006134969, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1421637535095215, | |
| "mean_token_accuracy": 0.7299045026302338, | |
| "num_tokens": 218331313.0, | |
| "step": 748 | |
| }, | |
| { | |
| "entropy": 1.685838222503662, | |
| "epoch": 0.5405990617105738, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.2454309463500977, | |
| "mean_token_accuracy": 0.7117385566234589, | |
| "num_tokens": 218627942.0, | |
| "step": 749 | |
| }, | |
| { | |
| "entropy": 1.5957022905349731, | |
| "epoch": 0.5413208228076507, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1155003309249878, | |
| "mean_token_accuracy": 0.7351410686969757, | |
| "num_tokens": 218943828.0, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 1.5749086141586304, | |
| "epoch": 0.5420425839047275, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0841670036315918, | |
| "mean_token_accuracy": 0.7402811944484711, | |
| "num_tokens": 219225072.0, | |
| "step": 751 | |
| }, | |
| { | |
| "entropy": 1.6269233226776123, | |
| "epoch": 0.5427643450018044, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1289806365966797, | |
| "mean_token_accuracy": 0.7308678030967712, | |
| "num_tokens": 219518554.0, | |
| "step": 752 | |
| }, | |
| { | |
| "entropy": 1.5796434879302979, | |
| "epoch": 0.5434861060988813, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0627174377441406, | |
| "mean_token_accuracy": 0.746688038110733, | |
| "num_tokens": 219825271.0, | |
| "step": 753 | |
| }, | |
| { | |
| "entropy": 1.6288336515426636, | |
| "epoch": 0.5442078671959582, | |
| "grad_norm": 0.2431640625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1438758373260498, | |
| "mean_token_accuracy": 0.7289855182170868, | |
| "num_tokens": 220118104.0, | |
| "step": 754 | |
| }, | |
| { | |
| "entropy": 1.6030755639076233, | |
| "epoch": 0.544929628293035, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1219998598098755, | |
| "mean_token_accuracy": 0.7331314086914062, | |
| "num_tokens": 220420691.0, | |
| "step": 755 | |
| }, | |
| { | |
| "entropy": 1.6133848428726196, | |
| "epoch": 0.5456513893901118, | |
| "grad_norm": 0.234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1374256610870361, | |
| "mean_token_accuracy": 0.7301732897758484, | |
| "num_tokens": 220702451.0, | |
| "step": 756 | |
| }, | |
| { | |
| "entropy": 1.6308754682540894, | |
| "epoch": 0.5463731504871887, | |
| "grad_norm": 0.2431640625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.14863920211792, | |
| "mean_token_accuracy": 0.7287968397140503, | |
| "num_tokens": 220987117.0, | |
| "step": 757 | |
| }, | |
| { | |
| "entropy": 1.6142635345458984, | |
| "epoch": 0.5470949115842656, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1083922386169434, | |
| "mean_token_accuracy": 0.7354268729686737, | |
| "num_tokens": 221271908.0, | |
| "step": 758 | |
| }, | |
| { | |
| "entropy": 1.630098819732666, | |
| "epoch": 0.5478166726813425, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1474804878234863, | |
| "mean_token_accuracy": 0.7301556468009949, | |
| "num_tokens": 221552965.0, | |
| "step": 759 | |
| }, | |
| { | |
| "entropy": 1.591932773590088, | |
| "epoch": 0.5485384337784194, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1405705213546753, | |
| "mean_token_accuracy": 0.7305808067321777, | |
| "num_tokens": 221844361.0, | |
| "step": 760 | |
| }, | |
| { | |
| "entropy": 1.6235320568084717, | |
| "epoch": 0.5492601948754963, | |
| "grad_norm": 0.2392578125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1320708990097046, | |
| "mean_token_accuracy": 0.7303408980369568, | |
| "num_tokens": 222137005.0, | |
| "step": 761 | |
| }, | |
| { | |
| "entropy": 1.5472183227539062, | |
| "epoch": 0.549981955972573, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.082100510597229, | |
| "mean_token_accuracy": 0.7425680458545685, | |
| "num_tokens": 222445577.0, | |
| "step": 762 | |
| }, | |
| { | |
| "entropy": 1.6391677260398865, | |
| "epoch": 0.5507037170696499, | |
| "grad_norm": 0.234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.156656265258789, | |
| "mean_token_accuracy": 0.7282198667526245, | |
| "num_tokens": 222734766.0, | |
| "step": 763 | |
| }, | |
| { | |
| "entropy": 1.5806949138641357, | |
| "epoch": 0.5514254781667268, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0894607305526733, | |
| "mean_token_accuracy": 0.7413555979728699, | |
| "num_tokens": 223024882.0, | |
| "step": 764 | |
| }, | |
| { | |
| "entropy": 1.6494798064231873, | |
| "epoch": 0.5521472392638037, | |
| "grad_norm": 0.25, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1354026794433594, | |
| "mean_token_accuracy": 0.7325569689273834, | |
| "num_tokens": 223300972.0, | |
| "step": 765 | |
| }, | |
| { | |
| "entropy": 1.6756582260131836, | |
| "epoch": 0.5528690003608806, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1743569374084473, | |
| "mean_token_accuracy": 0.7269199192523956, | |
| "num_tokens": 223595612.0, | |
| "step": 766 | |
| }, | |
| { | |
| "entropy": 1.6009389758110046, | |
| "epoch": 0.5535907614579574, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1179879903793335, | |
| "mean_token_accuracy": 0.7335212528705597, | |
| "num_tokens": 223895608.0, | |
| "step": 767 | |
| }, | |
| { | |
| "entropy": 1.5963427424430847, | |
| "epoch": 0.5543125225550343, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1309130191802979, | |
| "mean_token_accuracy": 0.733909398317337, | |
| "num_tokens": 224188531.0, | |
| "step": 768 | |
| }, | |
| { | |
| "entropy": 1.6220159530639648, | |
| "epoch": 0.5550342836521112, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.131821870803833, | |
| "mean_token_accuracy": 0.7359085381031036, | |
| "num_tokens": 224470391.0, | |
| "step": 769 | |
| }, | |
| { | |
| "entropy": 1.5496108531951904, | |
| "epoch": 0.555756044749188, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0798261165618896, | |
| "mean_token_accuracy": 0.7418549656867981, | |
| "num_tokens": 224782620.0, | |
| "step": 770 | |
| }, | |
| { | |
| "entropy": 1.6066091060638428, | |
| "epoch": 0.5564778058462649, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.143924355506897, | |
| "mean_token_accuracy": 0.7313476204872131, | |
| "num_tokens": 225077949.0, | |
| "step": 771 | |
| }, | |
| { | |
| "entropy": 1.6423416137695312, | |
| "epoch": 0.5571995669433417, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1612894535064697, | |
| "mean_token_accuracy": 0.7264752089977264, | |
| "num_tokens": 225360863.0, | |
| "step": 772 | |
| }, | |
| { | |
| "entropy": 1.628053069114685, | |
| "epoch": 0.5579213280404186, | |
| "grad_norm": 0.244140625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1273075342178345, | |
| "mean_token_accuracy": 0.7317852079868317, | |
| "num_tokens": 225659408.0, | |
| "step": 773 | |
| }, | |
| { | |
| "entropy": 1.6222748160362244, | |
| "epoch": 0.5586430891374955, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1602423191070557, | |
| "mean_token_accuracy": 0.7270052134990692, | |
| "num_tokens": 225954561.0, | |
| "step": 774 | |
| }, | |
| { | |
| "entropy": 1.5906856656074524, | |
| "epoch": 0.5593648502345724, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1184130907058716, | |
| "mean_token_accuracy": 0.7324536144733429, | |
| "num_tokens": 226250518.0, | |
| "step": 775 | |
| }, | |
| { | |
| "entropy": 1.6030939221382141, | |
| "epoch": 0.5600866113316493, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0950555801391602, | |
| "mean_token_accuracy": 0.7394770979881287, | |
| "num_tokens": 226532673.0, | |
| "step": 776 | |
| }, | |
| { | |
| "entropy": 1.5924795866012573, | |
| "epoch": 0.560808372428726, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1120665073394775, | |
| "mean_token_accuracy": 0.7360101342201233, | |
| "num_tokens": 226829900.0, | |
| "step": 777 | |
| }, | |
| { | |
| "entropy": 1.5758381485939026, | |
| "epoch": 0.5615301335258029, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0907894372940063, | |
| "mean_token_accuracy": 0.7394452691078186, | |
| "num_tokens": 227123175.0, | |
| "step": 778 | |
| }, | |
| { | |
| "entropy": 1.5781712532043457, | |
| "epoch": 0.5622518946228798, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0869512557983398, | |
| "mean_token_accuracy": 0.7403784096240997, | |
| "num_tokens": 227422444.0, | |
| "step": 779 | |
| }, | |
| { | |
| "entropy": 1.6636009216308594, | |
| "epoch": 0.5629736557199567, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.198331356048584, | |
| "mean_token_accuracy": 0.7186297476291656, | |
| "num_tokens": 227730231.0, | |
| "step": 780 | |
| }, | |
| { | |
| "entropy": 1.5778379440307617, | |
| "epoch": 0.5636954168170336, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0727307796478271, | |
| "mean_token_accuracy": 0.7411287426948547, | |
| "num_tokens": 228022142.0, | |
| "step": 781 | |
| }, | |
| { | |
| "entropy": 1.535947561264038, | |
| "epoch": 0.5644171779141104, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0535205602645874, | |
| "mean_token_accuracy": 0.7461379766464233, | |
| "num_tokens": 228330142.0, | |
| "step": 782 | |
| }, | |
| { | |
| "entropy": 1.5787633061408997, | |
| "epoch": 0.5651389390111873, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0756618976593018, | |
| "mean_token_accuracy": 0.7427201569080353, | |
| "num_tokens": 228627155.0, | |
| "step": 783 | |
| }, | |
| { | |
| "entropy": 1.5963982343673706, | |
| "epoch": 0.5658607001082642, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1443114280700684, | |
| "mean_token_accuracy": 0.7296222448348999, | |
| "num_tokens": 228924588.0, | |
| "step": 784 | |
| }, | |
| { | |
| "entropy": 1.6304583549499512, | |
| "epoch": 0.5665824612053411, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1516461372375488, | |
| "mean_token_accuracy": 0.7281636297702789, | |
| "num_tokens": 229218488.0, | |
| "step": 785 | |
| }, | |
| { | |
| "entropy": 1.5999409556388855, | |
| "epoch": 0.567304222302418, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1249396800994873, | |
| "mean_token_accuracy": 0.7317104339599609, | |
| "num_tokens": 229514549.0, | |
| "step": 786 | |
| }, | |
| { | |
| "entropy": 1.6203616261482239, | |
| "epoch": 0.5680259833994947, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1535447835922241, | |
| "mean_token_accuracy": 0.7284615933895111, | |
| "num_tokens": 229810235.0, | |
| "step": 787 | |
| }, | |
| { | |
| "entropy": 1.641538381576538, | |
| "epoch": 0.5687477444965716, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1865267753601074, | |
| "mean_token_accuracy": 0.724246621131897, | |
| "num_tokens": 230115175.0, | |
| "step": 788 | |
| }, | |
| { | |
| "entropy": 1.7468316555023193, | |
| "epoch": 0.5694695055936485, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.2349027395248413, | |
| "mean_token_accuracy": 0.7116598784923553, | |
| "num_tokens": 230384741.0, | |
| "step": 789 | |
| }, | |
| { | |
| "entropy": 1.6420515179634094, | |
| "epoch": 0.5701912666907254, | |
| "grad_norm": 0.2392578125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1442468166351318, | |
| "mean_token_accuracy": 0.7302609384059906, | |
| "num_tokens": 230661857.0, | |
| "step": 790 | |
| }, | |
| { | |
| "entropy": 1.6136687397956848, | |
| "epoch": 0.5709130277878023, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1194534301757812, | |
| "mean_token_accuracy": 0.7354243993759155, | |
| "num_tokens": 230944663.0, | |
| "step": 791 | |
| }, | |
| { | |
| "entropy": 1.5768129229545593, | |
| "epoch": 0.5716347888848791, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0868890285491943, | |
| "mean_token_accuracy": 0.7402880489826202, | |
| "num_tokens": 231240727.0, | |
| "step": 792 | |
| }, | |
| { | |
| "entropy": 1.6554696559906006, | |
| "epoch": 0.572356549981956, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1407463550567627, | |
| "mean_token_accuracy": 0.7300878465175629, | |
| "num_tokens": 231534404.0, | |
| "step": 793 | |
| }, | |
| { | |
| "entropy": 1.5718886256217957, | |
| "epoch": 0.5730783110790328, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.050676703453064, | |
| "mean_token_accuracy": 0.7486723959445953, | |
| "num_tokens": 231829902.0, | |
| "step": 794 | |
| }, | |
| { | |
| "entropy": 1.5340809226036072, | |
| "epoch": 0.5738000721761097, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0349626541137695, | |
| "mean_token_accuracy": 0.7493254542350769, | |
| "num_tokens": 232128819.0, | |
| "step": 795 | |
| }, | |
| { | |
| "entropy": 1.5420336723327637, | |
| "epoch": 0.5745218332731866, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0301799774169922, | |
| "mean_token_accuracy": 0.7503442466259003, | |
| "num_tokens": 232422368.0, | |
| "step": 796 | |
| }, | |
| { | |
| "entropy": 1.601945161819458, | |
| "epoch": 0.5752435943702634, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1416549682617188, | |
| "mean_token_accuracy": 0.731245368719101, | |
| "num_tokens": 232721865.0, | |
| "step": 797 | |
| }, | |
| { | |
| "entropy": 1.5999817252159119, | |
| "epoch": 0.5759653554673403, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0979357957839966, | |
| "mean_token_accuracy": 0.7386476993560791, | |
| "num_tokens": 233007261.0, | |
| "step": 798 | |
| }, | |
| { | |
| "entropy": 1.6433138251304626, | |
| "epoch": 0.5766871165644172, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.174140214920044, | |
| "mean_token_accuracy": 0.7229587435722351, | |
| "num_tokens": 233305743.0, | |
| "step": 799 | |
| }, | |
| { | |
| "entropy": 1.6058536767959595, | |
| "epoch": 0.5774088776614941, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1397472620010376, | |
| "mean_token_accuracy": 0.7314248979091644, | |
| "num_tokens": 233604387.0, | |
| "step": 800 | |
| }, | |
| { | |
| "entropy": 1.6596161127090454, | |
| "epoch": 0.578130638758571, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.20133638381958, | |
| "mean_token_accuracy": 0.7219476997852325, | |
| "num_tokens": 233901570.0, | |
| "step": 801 | |
| }, | |
| { | |
| "entropy": 1.591832458972931, | |
| "epoch": 0.5788523998556477, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.132655143737793, | |
| "mean_token_accuracy": 0.7314733564853668, | |
| "num_tokens": 234197534.0, | |
| "step": 802 | |
| }, | |
| { | |
| "entropy": 1.5689212083816528, | |
| "epoch": 0.5795741609527246, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0715937614440918, | |
| "mean_token_accuracy": 0.7423879504203796, | |
| "num_tokens": 234485552.0, | |
| "step": 803 | |
| }, | |
| { | |
| "entropy": 1.6184499859809875, | |
| "epoch": 0.5802959220498015, | |
| "grad_norm": 0.2412109375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1449391841888428, | |
| "mean_token_accuracy": 0.7319357991218567, | |
| "num_tokens": 234772765.0, | |
| "step": 804 | |
| }, | |
| { | |
| "entropy": 1.560195803642273, | |
| "epoch": 0.5810176831468784, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0644077062606812, | |
| "mean_token_accuracy": 0.7447086870670319, | |
| "num_tokens": 235071126.0, | |
| "step": 805 | |
| }, | |
| { | |
| "entropy": 1.5500043630599976, | |
| "epoch": 0.5817394442439553, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.062759280204773, | |
| "mean_token_accuracy": 0.7457413375377655, | |
| "num_tokens": 235376234.0, | |
| "step": 806 | |
| }, | |
| { | |
| "entropy": 1.727298617362976, | |
| "epoch": 0.5824612053410321, | |
| "grad_norm": 0.234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.2301886081695557, | |
| "mean_token_accuracy": 0.7164928317070007, | |
| "num_tokens": 235664471.0, | |
| "step": 807 | |
| }, | |
| { | |
| "entropy": 1.6400126218795776, | |
| "epoch": 0.583182966438109, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1442259550094604, | |
| "mean_token_accuracy": 0.7289875149726868, | |
| "num_tokens": 235954455.0, | |
| "step": 808 | |
| }, | |
| { | |
| "entropy": 1.5891234874725342, | |
| "epoch": 0.5839047275351859, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1428073644638062, | |
| "mean_token_accuracy": 0.731132298707962, | |
| "num_tokens": 236249908.0, | |
| "step": 809 | |
| }, | |
| { | |
| "entropy": 1.6795213222503662, | |
| "epoch": 0.5846264886322627, | |
| "grad_norm": 0.244140625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1815935373306274, | |
| "mean_token_accuracy": 0.7229796350002289, | |
| "num_tokens": 236529709.0, | |
| "step": 810 | |
| }, | |
| { | |
| "entropy": 1.5735644698143005, | |
| "epoch": 0.5853482497293396, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0912050008773804, | |
| "mean_token_accuracy": 0.7410696148872375, | |
| "num_tokens": 236824950.0, | |
| "step": 811 | |
| }, | |
| { | |
| "entropy": 1.599673092365265, | |
| "epoch": 0.5860700108264164, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1251013278961182, | |
| "mean_token_accuracy": 0.7329800128936768, | |
| "num_tokens": 237126802.0, | |
| "step": 812 | |
| }, | |
| { | |
| "entropy": 1.646547257900238, | |
| "epoch": 0.5867917719234933, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1593366861343384, | |
| "mean_token_accuracy": 0.7274273931980133, | |
| "num_tokens": 237408940.0, | |
| "step": 813 | |
| }, | |
| { | |
| "entropy": 1.638170838356018, | |
| "epoch": 0.5875135330205702, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.184498906135559, | |
| "mean_token_accuracy": 0.7247344553470612, | |
| "num_tokens": 237705246.0, | |
| "step": 814 | |
| }, | |
| { | |
| "entropy": 1.625171184539795, | |
| "epoch": 0.5882352941176471, | |
| "grad_norm": 0.240234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.110393762588501, | |
| "mean_token_accuracy": 0.7368338108062744, | |
| "num_tokens": 237998903.0, | |
| "step": 815 | |
| }, | |
| { | |
| "entropy": 1.5660130977630615, | |
| "epoch": 0.588957055214724, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.103783130645752, | |
| "mean_token_accuracy": 0.7363527119159698, | |
| "num_tokens": 238303813.0, | |
| "step": 816 | |
| }, | |
| { | |
| "entropy": 1.6444847583770752, | |
| "epoch": 0.5896788163118007, | |
| "grad_norm": 0.244140625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1604297161102295, | |
| "mean_token_accuracy": 0.7265297770500183, | |
| "num_tokens": 238576924.0, | |
| "step": 817 | |
| }, | |
| { | |
| "entropy": 1.5979716181755066, | |
| "epoch": 0.5904005774088776, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0916812419891357, | |
| "mean_token_accuracy": 0.7400945723056793, | |
| "num_tokens": 238873179.0, | |
| "step": 818 | |
| }, | |
| { | |
| "entropy": 1.6076260805130005, | |
| "epoch": 0.5911223385059545, | |
| "grad_norm": 0.248046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1293686628341675, | |
| "mean_token_accuracy": 0.7330831289291382, | |
| "num_tokens": 239160059.0, | |
| "step": 819 | |
| }, | |
| { | |
| "entropy": 1.5772748589515686, | |
| "epoch": 0.5918440996030314, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0674822330474854, | |
| "mean_token_accuracy": 0.7467565536499023, | |
| "num_tokens": 239441306.0, | |
| "step": 820 | |
| }, | |
| { | |
| "entropy": 1.619528889656067, | |
| "epoch": 0.5925658607001083, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1263235807418823, | |
| "mean_token_accuracy": 0.7331737279891968, | |
| "num_tokens": 239744900.0, | |
| "step": 821 | |
| }, | |
| { | |
| "entropy": 1.5617541670799255, | |
| "epoch": 0.5932876217971851, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.090696096420288, | |
| "mean_token_accuracy": 0.7420140504837036, | |
| "num_tokens": 240035188.0, | |
| "step": 822 | |
| }, | |
| { | |
| "entropy": 1.5940911173820496, | |
| "epoch": 0.594009382894262, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1337205171585083, | |
| "mean_token_accuracy": 0.7307989299297333, | |
| "num_tokens": 240327739.0, | |
| "step": 823 | |
| }, | |
| { | |
| "entropy": 1.6092841625213623, | |
| "epoch": 0.5947311439913389, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1504008769989014, | |
| "mean_token_accuracy": 0.7267559170722961, | |
| "num_tokens": 240610249.0, | |
| "step": 824 | |
| }, | |
| { | |
| "entropy": 1.6440231204032898, | |
| "epoch": 0.5954529050884158, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1251955032348633, | |
| "mean_token_accuracy": 0.7332545220851898, | |
| "num_tokens": 240899880.0, | |
| "step": 825 | |
| }, | |
| { | |
| "entropy": 1.63515704870224, | |
| "epoch": 0.5961746661854926, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.139757513999939, | |
| "mean_token_accuracy": 0.7306568026542664, | |
| "num_tokens": 241186750.0, | |
| "step": 826 | |
| }, | |
| { | |
| "entropy": 1.574657678604126, | |
| "epoch": 0.5968964272825694, | |
| "grad_norm": 0.2392578125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.086228847503662, | |
| "mean_token_accuracy": 0.7400854825973511, | |
| "num_tokens": 241466949.0, | |
| "step": 827 | |
| }, | |
| { | |
| "entropy": 1.6359160542488098, | |
| "epoch": 0.5976181883796463, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1688497066497803, | |
| "mean_token_accuracy": 0.7251756489276886, | |
| "num_tokens": 241775442.0, | |
| "step": 828 | |
| }, | |
| { | |
| "entropy": 1.5677058100700378, | |
| "epoch": 0.5983399494767232, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0687313079833984, | |
| "mean_token_accuracy": 0.746554434299469, | |
| "num_tokens": 242083256.0, | |
| "step": 829 | |
| }, | |
| { | |
| "entropy": 1.609235405921936, | |
| "epoch": 0.5990617105738001, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1303436756134033, | |
| "mean_token_accuracy": 0.7338172197341919, | |
| "num_tokens": 242379273.0, | |
| "step": 830 | |
| }, | |
| { | |
| "entropy": 1.5625333786010742, | |
| "epoch": 0.599783471670877, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0885215997695923, | |
| "mean_token_accuracy": 0.7415518462657928, | |
| "num_tokens": 242685269.0, | |
| "step": 831 | |
| }, | |
| { | |
| "entropy": 1.5976009368896484, | |
| "epoch": 0.6005052327679538, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1217962503433228, | |
| "mean_token_accuracy": 0.7356420159339905, | |
| "num_tokens": 242982642.0, | |
| "step": 832 | |
| }, | |
| { | |
| "entropy": 1.6693939566612244, | |
| "epoch": 0.6012269938650306, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1966601610183716, | |
| "mean_token_accuracy": 0.7216682136058807, | |
| "num_tokens": 243265802.0, | |
| "step": 833 | |
| }, | |
| { | |
| "entropy": 1.5951752662658691, | |
| "epoch": 0.6019487549621075, | |
| "grad_norm": 0.240234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1119740009307861, | |
| "mean_token_accuracy": 0.7357187867164612, | |
| "num_tokens": 243548853.0, | |
| "step": 834 | |
| }, | |
| { | |
| "entropy": 1.5999594926834106, | |
| "epoch": 0.6026705160591844, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1252211332321167, | |
| "mean_token_accuracy": 0.7341367900371552, | |
| "num_tokens": 243843685.0, | |
| "step": 835 | |
| }, | |
| { | |
| "entropy": 1.5888774991035461, | |
| "epoch": 0.6033922771562613, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1227599382400513, | |
| "mean_token_accuracy": 0.7352261245250702, | |
| "num_tokens": 244145521.0, | |
| "step": 836 | |
| }, | |
| { | |
| "entropy": 1.6378155946731567, | |
| "epoch": 0.6041140382533381, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1893961429595947, | |
| "mean_token_accuracy": 0.7203771471977234, | |
| "num_tokens": 244440744.0, | |
| "step": 837 | |
| }, | |
| { | |
| "entropy": 1.5931916236877441, | |
| "epoch": 0.604835799350415, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0919225215911865, | |
| "mean_token_accuracy": 0.7403932213783264, | |
| "num_tokens": 244734254.0, | |
| "step": 838 | |
| }, | |
| { | |
| "entropy": 1.519165277481079, | |
| "epoch": 0.6055575604474919, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0478832721710205, | |
| "mean_token_accuracy": 0.7476430833339691, | |
| "num_tokens": 245045811.0, | |
| "step": 839 | |
| }, | |
| { | |
| "entropy": 1.6530899405479431, | |
| "epoch": 0.6062793215445688, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1435611248016357, | |
| "mean_token_accuracy": 0.7297418415546417, | |
| "num_tokens": 245328853.0, | |
| "step": 840 | |
| }, | |
| { | |
| "entropy": 1.5310051441192627, | |
| "epoch": 0.6070010826416457, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0675201416015625, | |
| "mean_token_accuracy": 0.7423709034919739, | |
| "num_tokens": 245637169.0, | |
| "step": 841 | |
| }, | |
| { | |
| "entropy": 1.5507161617279053, | |
| "epoch": 0.6077228437387224, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0705628395080566, | |
| "mean_token_accuracy": 0.7419393360614777, | |
| "num_tokens": 245937163.0, | |
| "step": 842 | |
| }, | |
| { | |
| "entropy": 1.5738581418991089, | |
| "epoch": 0.6084446048357993, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0876317024230957, | |
| "mean_token_accuracy": 0.7430408298969269, | |
| "num_tokens": 246241707.0, | |
| "step": 843 | |
| }, | |
| { | |
| "entropy": 1.6529176831245422, | |
| "epoch": 0.6091663659328762, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1885097026824951, | |
| "mean_token_accuracy": 0.7237593829631805, | |
| "num_tokens": 246535964.0, | |
| "step": 844 | |
| }, | |
| { | |
| "entropy": 1.6576657891273499, | |
| "epoch": 0.6098881270299531, | |
| "grad_norm": 0.2392578125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1649441719055176, | |
| "mean_token_accuracy": 0.7286129593849182, | |
| "num_tokens": 246815910.0, | |
| "step": 845 | |
| }, | |
| { | |
| "entropy": 1.5953180193901062, | |
| "epoch": 0.61060988812703, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.092428207397461, | |
| "mean_token_accuracy": 0.7374953329563141, | |
| "num_tokens": 247101046.0, | |
| "step": 846 | |
| }, | |
| { | |
| "entropy": 1.6287794709205627, | |
| "epoch": 0.6113316492241068, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.167184829711914, | |
| "mean_token_accuracy": 0.7245387136936188, | |
| "num_tokens": 247398266.0, | |
| "step": 847 | |
| }, | |
| { | |
| "entropy": 1.6177645325660706, | |
| "epoch": 0.6120534103211837, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1393437385559082, | |
| "mean_token_accuracy": 0.730632871389389, | |
| "num_tokens": 247695970.0, | |
| "step": 848 | |
| }, | |
| { | |
| "entropy": 1.6005699038505554, | |
| "epoch": 0.6127751714182605, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1284637451171875, | |
| "mean_token_accuracy": 0.7324575185775757, | |
| "num_tokens": 247976005.0, | |
| "step": 849 | |
| }, | |
| { | |
| "entropy": 1.6011532545089722, | |
| "epoch": 0.6134969325153374, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0987575054168701, | |
| "mean_token_accuracy": 0.7377772927284241, | |
| "num_tokens": 248263761.0, | |
| "step": 850 | |
| }, | |
| { | |
| "entropy": 1.5941656827926636, | |
| "epoch": 0.6142186936124143, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.116324543952942, | |
| "mean_token_accuracy": 0.7360095977783203, | |
| "num_tokens": 248547416.0, | |
| "step": 851 | |
| }, | |
| { | |
| "entropy": 1.6171147227287292, | |
| "epoch": 0.6149404547094912, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1542052030563354, | |
| "mean_token_accuracy": 0.7280252277851105, | |
| "num_tokens": 248850035.0, | |
| "step": 852 | |
| }, | |
| { | |
| "entropy": 1.5732914209365845, | |
| "epoch": 0.615662215806568, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0778353214263916, | |
| "mean_token_accuracy": 0.7420243918895721, | |
| "num_tokens": 249153157.0, | |
| "step": 853 | |
| }, | |
| { | |
| "entropy": 1.5666159391403198, | |
| "epoch": 0.6163839769036449, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0931177139282227, | |
| "mean_token_accuracy": 0.7384633719921112, | |
| "num_tokens": 249457894.0, | |
| "step": 854 | |
| }, | |
| { | |
| "entropy": 1.5923348665237427, | |
| "epoch": 0.6171057380007218, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.10996413230896, | |
| "mean_token_accuracy": 0.7368112206459045, | |
| "num_tokens": 249748785.0, | |
| "step": 855 | |
| }, | |
| { | |
| "entropy": 1.5746431350708008, | |
| "epoch": 0.6178274990977987, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0802783966064453, | |
| "mean_token_accuracy": 0.7404201924800873, | |
| "num_tokens": 250046612.0, | |
| "step": 856 | |
| }, | |
| { | |
| "entropy": 1.6133906245231628, | |
| "epoch": 0.6185492601948755, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1414340734481812, | |
| "mean_token_accuracy": 0.7342709004878998, | |
| "num_tokens": 250335253.0, | |
| "step": 857 | |
| }, | |
| { | |
| "entropy": 1.5678685903549194, | |
| "epoch": 0.6192710212919523, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1016035079956055, | |
| "mean_token_accuracy": 0.7387181222438812, | |
| "num_tokens": 250642741.0, | |
| "step": 858 | |
| }, | |
| { | |
| "entropy": 1.6146076917648315, | |
| "epoch": 0.6199927823890292, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.115307331085205, | |
| "mean_token_accuracy": 0.7368897795677185, | |
| "num_tokens": 250926978.0, | |
| "step": 859 | |
| }, | |
| { | |
| "entropy": 1.6452381610870361, | |
| "epoch": 0.6207145434861061, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1519224643707275, | |
| "mean_token_accuracy": 0.7301294803619385, | |
| "num_tokens": 251213211.0, | |
| "step": 860 | |
| }, | |
| { | |
| "entropy": 1.62114417552948, | |
| "epoch": 0.621436304583183, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1675444841384888, | |
| "mean_token_accuracy": 0.7257243990898132, | |
| "num_tokens": 251508742.0, | |
| "step": 861 | |
| }, | |
| { | |
| "entropy": 1.6345261335372925, | |
| "epoch": 0.6221580656802599, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.15352201461792, | |
| "mean_token_accuracy": 0.7280356287956238, | |
| "num_tokens": 251803755.0, | |
| "step": 862 | |
| }, | |
| { | |
| "entropy": 1.5738179683685303, | |
| "epoch": 0.6228798267773367, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1155591011047363, | |
| "mean_token_accuracy": 0.7351842820644379, | |
| "num_tokens": 252114410.0, | |
| "step": 863 | |
| }, | |
| { | |
| "entropy": 1.6967588663101196, | |
| "epoch": 0.6236015878744136, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.2030296325683594, | |
| "mean_token_accuracy": 0.7169192135334015, | |
| "num_tokens": 252408452.0, | |
| "step": 864 | |
| }, | |
| { | |
| "entropy": 1.6355851888656616, | |
| "epoch": 0.6243233489714904, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1457853317260742, | |
| "mean_token_accuracy": 0.7292567193508148, | |
| "num_tokens": 252698923.0, | |
| "step": 865 | |
| }, | |
| { | |
| "entropy": 1.5897729992866516, | |
| "epoch": 0.6250451100685673, | |
| "grad_norm": 0.2138671875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1399295330047607, | |
| "mean_token_accuracy": 0.732798844575882, | |
| "num_tokens": 253002243.0, | |
| "step": 866 | |
| }, | |
| { | |
| "entropy": 1.5955116748809814, | |
| "epoch": 0.6257668711656442, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1202375888824463, | |
| "mean_token_accuracy": 0.7347755134105682, | |
| "num_tokens": 253300759.0, | |
| "step": 867 | |
| }, | |
| { | |
| "entropy": 1.5647806525230408, | |
| "epoch": 0.626488632262721, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0740394592285156, | |
| "mean_token_accuracy": 0.7438717782497406, | |
| "num_tokens": 253602815.0, | |
| "step": 868 | |
| }, | |
| { | |
| "entropy": 1.6749018430709839, | |
| "epoch": 0.6272103933597979, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1937298774719238, | |
| "mean_token_accuracy": 0.722371369600296, | |
| "num_tokens": 253897331.0, | |
| "step": 869 | |
| }, | |
| { | |
| "entropy": 1.6225835084915161, | |
| "epoch": 0.6279321544568748, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.166698932647705, | |
| "mean_token_accuracy": 0.7268076837062836, | |
| "num_tokens": 254190942.0, | |
| "step": 870 | |
| }, | |
| { | |
| "entropy": 1.5822492837905884, | |
| "epoch": 0.6286539155539517, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0897023677825928, | |
| "mean_token_accuracy": 0.7397574186325073, | |
| "num_tokens": 254488326.0, | |
| "step": 871 | |
| }, | |
| { | |
| "entropy": 1.5915055871009827, | |
| "epoch": 0.6293756766510286, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0920429229736328, | |
| "mean_token_accuracy": 0.7381505370140076, | |
| "num_tokens": 254783741.0, | |
| "step": 872 | |
| }, | |
| { | |
| "entropy": 1.6168046593666077, | |
| "epoch": 0.6300974377481053, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1350312232971191, | |
| "mean_token_accuracy": 0.7319681644439697, | |
| "num_tokens": 255076495.0, | |
| "step": 873 | |
| }, | |
| { | |
| "entropy": 1.593022108078003, | |
| "epoch": 0.6308191988451822, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0999274253845215, | |
| "mean_token_accuracy": 0.7374843060970306, | |
| "num_tokens": 255378884.0, | |
| "step": 874 | |
| }, | |
| { | |
| "entropy": 1.6086589694023132, | |
| "epoch": 0.6315409599422591, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.097945213317871, | |
| "mean_token_accuracy": 0.7406039834022522, | |
| "num_tokens": 255654971.0, | |
| "step": 875 | |
| }, | |
| { | |
| "entropy": 1.5572444200515747, | |
| "epoch": 0.632262721039336, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0677345991134644, | |
| "mean_token_accuracy": 0.745005875825882, | |
| "num_tokens": 255951683.0, | |
| "step": 876 | |
| }, | |
| { | |
| "entropy": 1.6466046571731567, | |
| "epoch": 0.6329844821364129, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.142011046409607, | |
| "mean_token_accuracy": 0.7301119565963745, | |
| "num_tokens": 256244505.0, | |
| "step": 877 | |
| }, | |
| { | |
| "entropy": 1.597251296043396, | |
| "epoch": 0.6337062432334897, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1165742874145508, | |
| "mean_token_accuracy": 0.7368136942386627, | |
| "num_tokens": 256532009.0, | |
| "step": 878 | |
| }, | |
| { | |
| "entropy": 1.5828416347503662, | |
| "epoch": 0.6344280043305666, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0812149047851562, | |
| "mean_token_accuracy": 0.7404722273349762, | |
| "num_tokens": 256826755.0, | |
| "step": 879 | |
| }, | |
| { | |
| "entropy": 1.6681491136550903, | |
| "epoch": 0.6351497654276435, | |
| "grad_norm": 0.2431640625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1748732328414917, | |
| "mean_token_accuracy": 0.727603018283844, | |
| "num_tokens": 257100921.0, | |
| "step": 880 | |
| }, | |
| { | |
| "entropy": 1.5638583302497864, | |
| "epoch": 0.6358715265247203, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0835227966308594, | |
| "mean_token_accuracy": 0.7433188855648041, | |
| "num_tokens": 257404665.0, | |
| "step": 881 | |
| }, | |
| { | |
| "entropy": 1.620697557926178, | |
| "epoch": 0.6365932876217972, | |
| "grad_norm": 0.23828125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1546287536621094, | |
| "mean_token_accuracy": 0.729461669921875, | |
| "num_tokens": 257692210.0, | |
| "step": 882 | |
| }, | |
| { | |
| "entropy": 1.5801892280578613, | |
| "epoch": 0.637315048718874, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0944933891296387, | |
| "mean_token_accuracy": 0.7411805987358093, | |
| "num_tokens": 257983293.0, | |
| "step": 883 | |
| }, | |
| { | |
| "entropy": 1.6741604208946228, | |
| "epoch": 0.6380368098159509, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1767834424972534, | |
| "mean_token_accuracy": 0.7239200174808502, | |
| "num_tokens": 258269415.0, | |
| "step": 884 | |
| }, | |
| { | |
| "entropy": 1.6092719435691833, | |
| "epoch": 0.6387585709130278, | |
| "grad_norm": 0.234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1343512535095215, | |
| "mean_token_accuracy": 0.7311052978038788, | |
| "num_tokens": 258564567.0, | |
| "step": 885 | |
| }, | |
| { | |
| "entropy": 1.5514914989471436, | |
| "epoch": 0.6394803320101047, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0301095247268677, | |
| "mean_token_accuracy": 0.7504675090312958, | |
| "num_tokens": 258854120.0, | |
| "step": 886 | |
| }, | |
| { | |
| "entropy": 1.621547818183899, | |
| "epoch": 0.6402020931071816, | |
| "grad_norm": 0.23828125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1654298305511475, | |
| "mean_token_accuracy": 0.7246925234794617, | |
| "num_tokens": 259133554.0, | |
| "step": 887 | |
| }, | |
| { | |
| "entropy": 1.607251226902008, | |
| "epoch": 0.6409238542042583, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1854426860809326, | |
| "mean_token_accuracy": 0.7221417725086212, | |
| "num_tokens": 259425427.0, | |
| "step": 888 | |
| }, | |
| { | |
| "entropy": 1.6045910120010376, | |
| "epoch": 0.6416456153013352, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1444283723831177, | |
| "mean_token_accuracy": 0.7326542437076569, | |
| "num_tokens": 259715199.0, | |
| "step": 889 | |
| }, | |
| { | |
| "entropy": 1.630848467350006, | |
| "epoch": 0.6423673763984121, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0887811183929443, | |
| "mean_token_accuracy": 0.7423493564128876, | |
| "num_tokens": 260006079.0, | |
| "step": 890 | |
| }, | |
| { | |
| "entropy": 1.5295590162277222, | |
| "epoch": 0.643089137495489, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0422464609146118, | |
| "mean_token_accuracy": 0.7499716877937317, | |
| "num_tokens": 260309699.0, | |
| "step": 891 | |
| }, | |
| { | |
| "entropy": 1.648544192314148, | |
| "epoch": 0.6438108985925659, | |
| "grad_norm": 0.244140625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1558257341384888, | |
| "mean_token_accuracy": 0.7308394014835358, | |
| "num_tokens": 260599774.0, | |
| "step": 892 | |
| }, | |
| { | |
| "entropy": 1.648951768875122, | |
| "epoch": 0.6445326596896427, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1616456508636475, | |
| "mean_token_accuracy": 0.7257620394229889, | |
| "num_tokens": 260881109.0, | |
| "step": 893 | |
| }, | |
| { | |
| "entropy": 1.6252157092094421, | |
| "epoch": 0.6452544207867196, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1504735946655273, | |
| "mean_token_accuracy": 0.7270793616771698, | |
| "num_tokens": 261165325.0, | |
| "step": 894 | |
| }, | |
| { | |
| "entropy": 1.6127622723579407, | |
| "epoch": 0.6459761818837965, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1419482231140137, | |
| "mean_token_accuracy": 0.7316944897174835, | |
| "num_tokens": 261457436.0, | |
| "step": 895 | |
| }, | |
| { | |
| "entropy": 1.53473562002182, | |
| "epoch": 0.6466979429808734, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0490422248840332, | |
| "mean_token_accuracy": 0.748588889837265, | |
| "num_tokens": 261772169.0, | |
| "step": 896 | |
| }, | |
| { | |
| "entropy": 1.557247281074524, | |
| "epoch": 0.6474197040779502, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0470713376998901, | |
| "mean_token_accuracy": 0.7490670084953308, | |
| "num_tokens": 262058729.0, | |
| "step": 897 | |
| }, | |
| { | |
| "entropy": 1.581372618675232, | |
| "epoch": 0.648141465175027, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1146125793457031, | |
| "mean_token_accuracy": 0.7369222939014435, | |
| "num_tokens": 262359564.0, | |
| "step": 898 | |
| }, | |
| { | |
| "entropy": 1.5919791460037231, | |
| "epoch": 0.6488632262721039, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1010425090789795, | |
| "mean_token_accuracy": 0.7405194044113159, | |
| "num_tokens": 262656392.0, | |
| "step": 899 | |
| }, | |
| { | |
| "entropy": 1.6238026022911072, | |
| "epoch": 0.6495849873691808, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1433484554290771, | |
| "mean_token_accuracy": 0.7324753701686859, | |
| "num_tokens": 262950769.0, | |
| "step": 900 | |
| }, | |
| { | |
| "entropy": 1.5863648653030396, | |
| "epoch": 0.6503067484662577, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.110450267791748, | |
| "mean_token_accuracy": 0.7372980713844299, | |
| "num_tokens": 263238762.0, | |
| "step": 901 | |
| }, | |
| { | |
| "entropy": 1.5862815976142883, | |
| "epoch": 0.6510285095633346, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1120389699935913, | |
| "mean_token_accuracy": 0.7377087771892548, | |
| "num_tokens": 263532186.0, | |
| "step": 902 | |
| }, | |
| { | |
| "entropy": 1.6178622245788574, | |
| "epoch": 0.6517502706604114, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1054493188858032, | |
| "mean_token_accuracy": 0.7389742732048035, | |
| "num_tokens": 263810322.0, | |
| "step": 903 | |
| }, | |
| { | |
| "entropy": 1.6152253150939941, | |
| "epoch": 0.6524720317574882, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1155157089233398, | |
| "mean_token_accuracy": 0.7362486720085144, | |
| "num_tokens": 264093526.0, | |
| "step": 904 | |
| }, | |
| { | |
| "entropy": 1.5610551238059998, | |
| "epoch": 0.6531937928545651, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0780706405639648, | |
| "mean_token_accuracy": 0.7398194372653961, | |
| "num_tokens": 264386431.0, | |
| "step": 905 | |
| }, | |
| { | |
| "entropy": 1.6352735757827759, | |
| "epoch": 0.653915553951642, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1377203464508057, | |
| "mean_token_accuracy": 0.7319419384002686, | |
| "num_tokens": 264662763.0, | |
| "step": 906 | |
| }, | |
| { | |
| "entropy": 1.6403372287750244, | |
| "epoch": 0.6546373150487189, | |
| "grad_norm": 0.2421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1423468589782715, | |
| "mean_token_accuracy": 0.7309231460094452, | |
| "num_tokens": 264938342.0, | |
| "step": 907 | |
| }, | |
| { | |
| "entropy": 1.5906200408935547, | |
| "epoch": 0.6553590761457957, | |
| "grad_norm": 0.23828125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1279376745224, | |
| "mean_token_accuracy": 0.7327744960784912, | |
| "num_tokens": 265228160.0, | |
| "step": 908 | |
| }, | |
| { | |
| "entropy": 1.6102629899978638, | |
| "epoch": 0.6560808372428726, | |
| "grad_norm": 0.2392578125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.10719633102417, | |
| "mean_token_accuracy": 0.7365886569023132, | |
| "num_tokens": 265509748.0, | |
| "step": 909 | |
| }, | |
| { | |
| "entropy": 1.5446515083312988, | |
| "epoch": 0.6568025983399495, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0781011581420898, | |
| "mean_token_accuracy": 0.7421518266201019, | |
| "num_tokens": 265814965.0, | |
| "step": 910 | |
| }, | |
| { | |
| "entropy": 1.5656993389129639, | |
| "epoch": 0.6575243594370264, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0570569038391113, | |
| "mean_token_accuracy": 0.7460503876209259, | |
| "num_tokens": 266112327.0, | |
| "step": 911 | |
| }, | |
| { | |
| "entropy": 1.6066370606422424, | |
| "epoch": 0.6582461205341033, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.13433837890625, | |
| "mean_token_accuracy": 0.7298902571201324, | |
| "num_tokens": 266417027.0, | |
| "step": 912 | |
| }, | |
| { | |
| "entropy": 1.5417880415916443, | |
| "epoch": 0.65896788163118, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0931062698364258, | |
| "mean_token_accuracy": 0.7403707206249237, | |
| "num_tokens": 266718813.0, | |
| "step": 913 | |
| }, | |
| { | |
| "entropy": 1.6033689975738525, | |
| "epoch": 0.6596896427282569, | |
| "grad_norm": 0.2392578125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1128414869308472, | |
| "mean_token_accuracy": 0.735693097114563, | |
| "num_tokens": 267005821.0, | |
| "step": 914 | |
| }, | |
| { | |
| "entropy": 1.5902194380760193, | |
| "epoch": 0.6604114038253338, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0905375480651855, | |
| "mean_token_accuracy": 0.739936113357544, | |
| "num_tokens": 267286801.0, | |
| "step": 915 | |
| }, | |
| { | |
| "entropy": 1.6343393921852112, | |
| "epoch": 0.6611331649224107, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1188874244689941, | |
| "mean_token_accuracy": 0.7347078323364258, | |
| "num_tokens": 267574678.0, | |
| "step": 916 | |
| }, | |
| { | |
| "entropy": 1.640225112438202, | |
| "epoch": 0.6618549260194876, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1373822689056396, | |
| "mean_token_accuracy": 0.7312084138393402, | |
| "num_tokens": 267856177.0, | |
| "step": 917 | |
| }, | |
| { | |
| "entropy": 1.5904943943023682, | |
| "epoch": 0.6625766871165644, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.107534408569336, | |
| "mean_token_accuracy": 0.7372018694877625, | |
| "num_tokens": 268153899.0, | |
| "step": 918 | |
| }, | |
| { | |
| "entropy": 1.56828773021698, | |
| "epoch": 0.6632984482136413, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0689823627471924, | |
| "mean_token_accuracy": 0.7442007660865784, | |
| "num_tokens": 268452188.0, | |
| "step": 919 | |
| }, | |
| { | |
| "entropy": 1.594265639781952, | |
| "epoch": 0.6640202093107181, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1469228267669678, | |
| "mean_token_accuracy": 0.7300191819667816, | |
| "num_tokens": 268749324.0, | |
| "step": 920 | |
| }, | |
| { | |
| "entropy": 1.5534809827804565, | |
| "epoch": 0.664741970407795, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0656530857086182, | |
| "mean_token_accuracy": 0.7445902526378632, | |
| "num_tokens": 269039727.0, | |
| "step": 921 | |
| }, | |
| { | |
| "entropy": 1.6094595789909363, | |
| "epoch": 0.6654637315048719, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1346147060394287, | |
| "mean_token_accuracy": 0.733146071434021, | |
| "num_tokens": 269333705.0, | |
| "step": 922 | |
| }, | |
| { | |
| "entropy": 1.6785868406295776, | |
| "epoch": 0.6661854926019487, | |
| "grad_norm": 0.2392578125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.218106985092163, | |
| "mean_token_accuracy": 0.7160632610321045, | |
| "num_tokens": 269611617.0, | |
| "step": 923 | |
| }, | |
| { | |
| "entropy": 1.664020299911499, | |
| "epoch": 0.6669072536990256, | |
| "grad_norm": 0.251953125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1749920845031738, | |
| "mean_token_accuracy": 0.7255774736404419, | |
| "num_tokens": 269891725.0, | |
| "step": 924 | |
| }, | |
| { | |
| "entropy": 1.6315612196922302, | |
| "epoch": 0.6676290147961025, | |
| "grad_norm": 0.23828125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.123667597770691, | |
| "mean_token_accuracy": 0.7347060143947601, | |
| "num_tokens": 270176655.0, | |
| "step": 925 | |
| }, | |
| { | |
| "entropy": 1.58631032705307, | |
| "epoch": 0.6683507758931794, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.107755422592163, | |
| "mean_token_accuracy": 0.7377076745033264, | |
| "num_tokens": 270468761.0, | |
| "step": 926 | |
| }, | |
| { | |
| "entropy": 1.5503064393997192, | |
| "epoch": 0.6690725369902563, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0641939640045166, | |
| "mean_token_accuracy": 0.7454411685466766, | |
| "num_tokens": 270770579.0, | |
| "step": 927 | |
| }, | |
| { | |
| "entropy": 1.6055181622505188, | |
| "epoch": 0.669794298087333, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1491901874542236, | |
| "mean_token_accuracy": 0.7300946116447449, | |
| "num_tokens": 271060224.0, | |
| "step": 928 | |
| }, | |
| { | |
| "entropy": 1.5941227674484253, | |
| "epoch": 0.6705160591844099, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1114215850830078, | |
| "mean_token_accuracy": 0.7382023632526398, | |
| "num_tokens": 271366518.0, | |
| "step": 929 | |
| }, | |
| { | |
| "entropy": 1.5658720135688782, | |
| "epoch": 0.6712378202814868, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0909024477005005, | |
| "mean_token_accuracy": 0.7400651574134827, | |
| "num_tokens": 271669690.0, | |
| "step": 930 | |
| }, | |
| { | |
| "entropy": 1.602582335472107, | |
| "epoch": 0.6719595813785637, | |
| "grad_norm": 0.234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.124122142791748, | |
| "mean_token_accuracy": 0.7325031459331512, | |
| "num_tokens": 271950805.0, | |
| "step": 931 | |
| }, | |
| { | |
| "entropy": 1.6303736567497253, | |
| "epoch": 0.6726813424756406, | |
| "grad_norm": 0.23828125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.145336389541626, | |
| "mean_token_accuracy": 0.7321729063987732, | |
| "num_tokens": 272249314.0, | |
| "step": 932 | |
| }, | |
| { | |
| "entropy": 1.5918556451797485, | |
| "epoch": 0.6734031035727174, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1050128936767578, | |
| "mean_token_accuracy": 0.7372748255729675, | |
| "num_tokens": 272550435.0, | |
| "step": 933 | |
| }, | |
| { | |
| "entropy": 1.5954800844192505, | |
| "epoch": 0.6741248646697943, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1313841342926025, | |
| "mean_token_accuracy": 0.7309823036193848, | |
| "num_tokens": 272843679.0, | |
| "step": 934 | |
| }, | |
| { | |
| "entropy": 1.55681973695755, | |
| "epoch": 0.6748466257668712, | |
| "grad_norm": 0.2138671875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.094477891921997, | |
| "mean_token_accuracy": 0.7430874407291412, | |
| "num_tokens": 273157926.0, | |
| "step": 935 | |
| }, | |
| { | |
| "entropy": 1.5433151125907898, | |
| "epoch": 0.675568386863948, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0513269901275635, | |
| "mean_token_accuracy": 0.747769683599472, | |
| "num_tokens": 273461019.0, | |
| "step": 936 | |
| }, | |
| { | |
| "entropy": 1.5947770476341248, | |
| "epoch": 0.6762901479610249, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1324204206466675, | |
| "mean_token_accuracy": 0.7329586744308472, | |
| "num_tokens": 273746093.0, | |
| "step": 937 | |
| }, | |
| { | |
| "entropy": 1.5466710329055786, | |
| "epoch": 0.6770119090581017, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0900349617004395, | |
| "mean_token_accuracy": 0.7388436198234558, | |
| "num_tokens": 274050211.0, | |
| "step": 938 | |
| }, | |
| { | |
| "entropy": 1.657389223575592, | |
| "epoch": 0.6777336701551786, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1392896175384521, | |
| "mean_token_accuracy": 0.7313405871391296, | |
| "num_tokens": 274344408.0, | |
| "step": 939 | |
| }, | |
| { | |
| "entropy": 1.6149133443832397, | |
| "epoch": 0.6784554312522555, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0918140411376953, | |
| "mean_token_accuracy": 0.7390757501125336, | |
| "num_tokens": 274638677.0, | |
| "step": 940 | |
| }, | |
| { | |
| "entropy": 1.6076053977012634, | |
| "epoch": 0.6791771923493324, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0931899547576904, | |
| "mean_token_accuracy": 0.7379167377948761, | |
| "num_tokens": 274923490.0, | |
| "step": 941 | |
| }, | |
| { | |
| "entropy": 1.6174147725105286, | |
| "epoch": 0.6798989534464093, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.126497745513916, | |
| "mean_token_accuracy": 0.73365917801857, | |
| "num_tokens": 275228626.0, | |
| "step": 942 | |
| }, | |
| { | |
| "entropy": 1.5941608548164368, | |
| "epoch": 0.6806207145434862, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.106541395187378, | |
| "mean_token_accuracy": 0.7357300519943237, | |
| "num_tokens": 275522330.0, | |
| "step": 943 | |
| }, | |
| { | |
| "entropy": 1.6859710812568665, | |
| "epoch": 0.6813424756405629, | |
| "grad_norm": 0.25, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1974095106124878, | |
| "mean_token_accuracy": 0.7205123007297516, | |
| "num_tokens": 275809745.0, | |
| "step": 944 | |
| }, | |
| { | |
| "entropy": 1.6501319408416748, | |
| "epoch": 0.6820642367376398, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.174088716506958, | |
| "mean_token_accuracy": 0.7260400354862213, | |
| "num_tokens": 276101032.0, | |
| "step": 945 | |
| }, | |
| { | |
| "entropy": 1.580397367477417, | |
| "epoch": 0.6827859978347167, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0914998054504395, | |
| "mean_token_accuracy": 0.7411804497241974, | |
| "num_tokens": 276413998.0, | |
| "step": 946 | |
| }, | |
| { | |
| "entropy": 1.6093501448631287, | |
| "epoch": 0.6835077589317936, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1446261405944824, | |
| "mean_token_accuracy": 0.7326328456401825, | |
| "num_tokens": 276710068.0, | |
| "step": 947 | |
| }, | |
| { | |
| "entropy": 1.5760272145271301, | |
| "epoch": 0.6842295200288705, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0904967784881592, | |
| "mean_token_accuracy": 0.7404919564723969, | |
| "num_tokens": 277007079.0, | |
| "step": 948 | |
| }, | |
| { | |
| "entropy": 1.580204427242279, | |
| "epoch": 0.6849512811259473, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0839993953704834, | |
| "mean_token_accuracy": 0.7390538454055786, | |
| "num_tokens": 277297549.0, | |
| "step": 949 | |
| }, | |
| { | |
| "entropy": 1.6093292832374573, | |
| "epoch": 0.6856730422230242, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.084509015083313, | |
| "mean_token_accuracy": 0.742249071598053, | |
| "num_tokens": 277584510.0, | |
| "step": 950 | |
| }, | |
| { | |
| "entropy": 1.67963707447052, | |
| "epoch": 0.686394803320101, | |
| "grad_norm": 0.2490234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.2040464878082275, | |
| "mean_token_accuracy": 0.7204844653606415, | |
| "num_tokens": 277867161.0, | |
| "step": 951 | |
| }, | |
| { | |
| "entropy": 1.6682324409484863, | |
| "epoch": 0.6871165644171779, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1984906196594238, | |
| "mean_token_accuracy": 0.7199585139751434, | |
| "num_tokens": 278162754.0, | |
| "step": 952 | |
| }, | |
| { | |
| "entropy": 1.649233341217041, | |
| "epoch": 0.6878383255142548, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1511235237121582, | |
| "mean_token_accuracy": 0.7316516935825348, | |
| "num_tokens": 278449207.0, | |
| "step": 953 | |
| }, | |
| { | |
| "entropy": 1.6017658710479736, | |
| "epoch": 0.6885600866113316, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0907299518585205, | |
| "mean_token_accuracy": 0.7412089705467224, | |
| "num_tokens": 278747205.0, | |
| "step": 954 | |
| }, | |
| { | |
| "entropy": 1.5632635354995728, | |
| "epoch": 0.6892818477084085, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0943294763565063, | |
| "mean_token_accuracy": 0.7398828566074371, | |
| "num_tokens": 279035066.0, | |
| "step": 955 | |
| }, | |
| { | |
| "entropy": 1.67336106300354, | |
| "epoch": 0.6900036088054854, | |
| "grad_norm": 0.240234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1602253913879395, | |
| "mean_token_accuracy": 0.7243440747261047, | |
| "num_tokens": 279303244.0, | |
| "step": 956 | |
| }, | |
| { | |
| "entropy": 1.645988404750824, | |
| "epoch": 0.6907253699025623, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1323683261871338, | |
| "mean_token_accuracy": 0.7338282465934753, | |
| "num_tokens": 279597702.0, | |
| "step": 957 | |
| }, | |
| { | |
| "entropy": 1.5881378054618835, | |
| "epoch": 0.6914471309996392, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.076913595199585, | |
| "mean_token_accuracy": 0.7412590384483337, | |
| "num_tokens": 279908965.0, | |
| "step": 958 | |
| }, | |
| { | |
| "entropy": 1.6132688522338867, | |
| "epoch": 0.692168892096716, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.092913031578064, | |
| "mean_token_accuracy": 0.7388288378715515, | |
| "num_tokens": 280193105.0, | |
| "step": 959 | |
| }, | |
| { | |
| "entropy": 1.588767647743225, | |
| "epoch": 0.6928906531937928, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0910826921463013, | |
| "mean_token_accuracy": 0.7414247989654541, | |
| "num_tokens": 280492885.0, | |
| "step": 960 | |
| }, | |
| { | |
| "entropy": 1.65106862783432, | |
| "epoch": 0.6936124142908697, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1789040565490723, | |
| "mean_token_accuracy": 0.7246316969394684, | |
| "num_tokens": 280782331.0, | |
| "step": 961 | |
| }, | |
| { | |
| "entropy": 1.6243720054626465, | |
| "epoch": 0.6943341753879466, | |
| "grad_norm": 0.2451171875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.153637170791626, | |
| "mean_token_accuracy": 0.7275400161743164, | |
| "num_tokens": 281081612.0, | |
| "step": 962 | |
| }, | |
| { | |
| "entropy": 1.5809986591339111, | |
| "epoch": 0.6950559364850235, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1162421703338623, | |
| "mean_token_accuracy": 0.7366466820240021, | |
| "num_tokens": 281391574.0, | |
| "step": 963 | |
| }, | |
| { | |
| "entropy": 1.5244627594947815, | |
| "epoch": 0.6957776975821003, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.049726963043213, | |
| "mean_token_accuracy": 0.7491204142570496, | |
| "num_tokens": 281700399.0, | |
| "step": 964 | |
| }, | |
| { | |
| "entropy": 1.590291142463684, | |
| "epoch": 0.6964994586791772, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.122822642326355, | |
| "mean_token_accuracy": 0.7341059446334839, | |
| "num_tokens": 282008010.0, | |
| "step": 965 | |
| }, | |
| { | |
| "entropy": 1.5317293405532837, | |
| "epoch": 0.6972212197762541, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0474406480789185, | |
| "mean_token_accuracy": 0.7487011551856995, | |
| "num_tokens": 282306786.0, | |
| "step": 966 | |
| }, | |
| { | |
| "entropy": 1.598990797996521, | |
| "epoch": 0.697942980873331, | |
| "grad_norm": 0.2431640625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0971026420593262, | |
| "mean_token_accuracy": 0.7404988408088684, | |
| "num_tokens": 282592379.0, | |
| "step": 967 | |
| }, | |
| { | |
| "entropy": 1.5990492105484009, | |
| "epoch": 0.6986647419704078, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.124864101409912, | |
| "mean_token_accuracy": 0.7342464923858643, | |
| "num_tokens": 282885332.0, | |
| "step": 968 | |
| }, | |
| { | |
| "entropy": 1.5604233741760254, | |
| "epoch": 0.6993865030674846, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0768327713012695, | |
| "mean_token_accuracy": 0.7427924275398254, | |
| "num_tokens": 283180489.0, | |
| "step": 969 | |
| }, | |
| { | |
| "entropy": 1.6985811591148376, | |
| "epoch": 0.7001082641645615, | |
| "grad_norm": 0.2412109375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.217573881149292, | |
| "mean_token_accuracy": 0.7172922194004059, | |
| "num_tokens": 283458906.0, | |
| "step": 970 | |
| }, | |
| { | |
| "entropy": 1.679398536682129, | |
| "epoch": 0.7008300252616384, | |
| "grad_norm": 0.2392578125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.154064416885376, | |
| "mean_token_accuracy": 0.7272751331329346, | |
| "num_tokens": 283740762.0, | |
| "step": 971 | |
| }, | |
| { | |
| "entropy": 1.6003840565681458, | |
| "epoch": 0.7015517863587153, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1534478664398193, | |
| "mean_token_accuracy": 0.7289105951786041, | |
| "num_tokens": 284040954.0, | |
| "step": 972 | |
| }, | |
| { | |
| "entropy": 1.6483497023582458, | |
| "epoch": 0.7022735474557922, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1512547731399536, | |
| "mean_token_accuracy": 0.7295123338699341, | |
| "num_tokens": 284325851.0, | |
| "step": 973 | |
| }, | |
| { | |
| "entropy": 1.567041277885437, | |
| "epoch": 0.702995308552869, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0615626573562622, | |
| "mean_token_accuracy": 0.7467664480209351, | |
| "num_tokens": 284612967.0, | |
| "step": 974 | |
| }, | |
| { | |
| "entropy": 1.5734952688217163, | |
| "epoch": 0.7037170696499458, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1232900619506836, | |
| "mean_token_accuracy": 0.7348988652229309, | |
| "num_tokens": 284908035.0, | |
| "step": 975 | |
| }, | |
| { | |
| "entropy": 1.5315465331077576, | |
| "epoch": 0.7044388307470227, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0262224674224854, | |
| "mean_token_accuracy": 0.7503110468387604, | |
| "num_tokens": 285213882.0, | |
| "step": 976 | |
| }, | |
| { | |
| "entropy": 1.6075134873390198, | |
| "epoch": 0.7051605918440996, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1080231666564941, | |
| "mean_token_accuracy": 0.7375926971435547, | |
| "num_tokens": 285490494.0, | |
| "step": 977 | |
| }, | |
| { | |
| "entropy": 1.6598886847496033, | |
| "epoch": 0.7058823529411765, | |
| "grad_norm": 0.240234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1537938117980957, | |
| "mean_token_accuracy": 0.7295781373977661, | |
| "num_tokens": 285784594.0, | |
| "step": 978 | |
| }, | |
| { | |
| "entropy": 1.6115790605545044, | |
| "epoch": 0.7066041140382533, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.160152792930603, | |
| "mean_token_accuracy": 0.7279682755470276, | |
| "num_tokens": 286076590.0, | |
| "step": 979 | |
| }, | |
| { | |
| "entropy": 1.6706183552742004, | |
| "epoch": 0.7073258751353302, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1740875244140625, | |
| "mean_token_accuracy": 0.7252408266067505, | |
| "num_tokens": 286352047.0, | |
| "step": 980 | |
| }, | |
| { | |
| "entropy": 1.610364556312561, | |
| "epoch": 0.7080476362324071, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1607551574707031, | |
| "mean_token_accuracy": 0.729096919298172, | |
| "num_tokens": 286650614.0, | |
| "step": 981 | |
| }, | |
| { | |
| "entropy": 1.6068995594978333, | |
| "epoch": 0.708769397329484, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.095207929611206, | |
| "mean_token_accuracy": 0.7393112778663635, | |
| "num_tokens": 286941258.0, | |
| "step": 982 | |
| }, | |
| { | |
| "entropy": 1.5314761400222778, | |
| "epoch": 0.7094911584265609, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.067337989807129, | |
| "mean_token_accuracy": 0.7435638010501862, | |
| "num_tokens": 287241552.0, | |
| "step": 983 | |
| }, | |
| { | |
| "entropy": 1.556878924369812, | |
| "epoch": 0.7102129195236376, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0859746932983398, | |
| "mean_token_accuracy": 0.7392235696315765, | |
| "num_tokens": 287532086.0, | |
| "step": 984 | |
| }, | |
| { | |
| "entropy": 1.631901204586029, | |
| "epoch": 0.7109346806207145, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1279494762420654, | |
| "mean_token_accuracy": 0.7334802448749542, | |
| "num_tokens": 287820978.0, | |
| "step": 985 | |
| }, | |
| { | |
| "entropy": 1.6896315217018127, | |
| "epoch": 0.7116564417177914, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.209764003753662, | |
| "mean_token_accuracy": 0.719461053609848, | |
| "num_tokens": 288114034.0, | |
| "step": 986 | |
| }, | |
| { | |
| "entropy": 1.5682026743888855, | |
| "epoch": 0.7123782028148683, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0700600147247314, | |
| "mean_token_accuracy": 0.7445848882198334, | |
| "num_tokens": 288416245.0, | |
| "step": 987 | |
| }, | |
| { | |
| "entropy": 1.6246426105499268, | |
| "epoch": 0.7130999639119452, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.170893907546997, | |
| "mean_token_accuracy": 0.7248595356941223, | |
| "num_tokens": 288708707.0, | |
| "step": 988 | |
| }, | |
| { | |
| "entropy": 1.626733422279358, | |
| "epoch": 0.713821725009022, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1392338275909424, | |
| "mean_token_accuracy": 0.7327068746089935, | |
| "num_tokens": 288992657.0, | |
| "step": 989 | |
| }, | |
| { | |
| "entropy": 1.6286635994911194, | |
| "epoch": 0.7145434861060989, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.166459560394287, | |
| "mean_token_accuracy": 0.7263357937335968, | |
| "num_tokens": 289296719.0, | |
| "step": 990 | |
| }, | |
| { | |
| "entropy": 1.6180419921875, | |
| "epoch": 0.7152652472031757, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1272691488265991, | |
| "mean_token_accuracy": 0.7338026463985443, | |
| "num_tokens": 289578617.0, | |
| "step": 991 | |
| }, | |
| { | |
| "entropy": 1.5990166068077087, | |
| "epoch": 0.7159870083002526, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1410884857177734, | |
| "mean_token_accuracy": 0.7329325079917908, | |
| "num_tokens": 289871134.0, | |
| "step": 992 | |
| }, | |
| { | |
| "entropy": 1.5957458019256592, | |
| "epoch": 0.7167087693973295, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1002070903778076, | |
| "mean_token_accuracy": 0.739589661359787, | |
| "num_tokens": 290174939.0, | |
| "step": 993 | |
| }, | |
| { | |
| "entropy": 1.6043808460235596, | |
| "epoch": 0.7174305304944063, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.104749083518982, | |
| "mean_token_accuracy": 0.736821860074997, | |
| "num_tokens": 290470725.0, | |
| "step": 994 | |
| }, | |
| { | |
| "entropy": 1.5915351510047913, | |
| "epoch": 0.7181522915914832, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1227054595947266, | |
| "mean_token_accuracy": 0.7356365025043488, | |
| "num_tokens": 290768937.0, | |
| "step": 995 | |
| }, | |
| { | |
| "entropy": 1.617738664150238, | |
| "epoch": 0.7188740526885601, | |
| "grad_norm": 0.240234375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1229993104934692, | |
| "mean_token_accuracy": 0.7319775223731995, | |
| "num_tokens": 291051445.0, | |
| "step": 996 | |
| }, | |
| { | |
| "entropy": 1.5803756713867188, | |
| "epoch": 0.719595813785637, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.110452651977539, | |
| "mean_token_accuracy": 0.7355692982673645, | |
| "num_tokens": 291356207.0, | |
| "step": 997 | |
| }, | |
| { | |
| "entropy": 1.5010618567466736, | |
| "epoch": 0.7203175748827139, | |
| "grad_norm": 0.2099609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0443360805511475, | |
| "mean_token_accuracy": 0.749332457780838, | |
| "num_tokens": 291667885.0, | |
| "step": 998 | |
| }, | |
| { | |
| "entropy": 1.5546172261238098, | |
| "epoch": 0.7210393359797906, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0694785118103027, | |
| "mean_token_accuracy": 0.7447611689567566, | |
| "num_tokens": 291972285.0, | |
| "step": 999 | |
| }, | |
| { | |
| "entropy": 1.6047680377960205, | |
| "epoch": 0.7217610970768675, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.113739013671875, | |
| "mean_token_accuracy": 0.7364310026168823, | |
| "num_tokens": 292274886.0, | |
| "step": 1000 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 2772, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.976964789614674e+17, | |
| "train_batch_size": 32, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |