MgGladys committed on Feb 7

Commit

7bcff03

verified ·

1 Parent(s): 07afa9a

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +6 -0
checkpoint-100/rng_state.pth +3 -0
checkpoint-100/scheduler.pt +3 -0
checkpoint-100/training_args.bin +3 -0
checkpoint-1000/rng_state.pth +3 -0
checkpoint-1000/training_args.bin +3 -0
checkpoint-200/added_tokens.json +24 -0
checkpoint-200/chat_template.jinja +7 -0
checkpoint-200/merges.txt +0 -0
checkpoint-200/model.safetensors +3 -0
checkpoint-200/optimizer.pt +3 -0
checkpoint-200/preprocessor_config.json +29 -0
checkpoint-200/rng_state.pth +3 -0
checkpoint-200/scheduler.pt +3 -0
checkpoint-200/special_tokens_map.json +31 -0
checkpoint-200/tokenizer.json +3 -0
checkpoint-200/tokenizer_config.json +208 -0
checkpoint-200/trainer_state.json +1434 -0
checkpoint-200/training_args.bin +3 -0
checkpoint-200/vocab.json +0 -0
checkpoint-300/added_tokens.json +24 -0
checkpoint-300/chat_template.jinja +7 -0
checkpoint-300/merges.txt +0 -0
checkpoint-300/model.safetensors +3 -0
checkpoint-300/optimizer.pt +3 -0
checkpoint-300/preprocessor_config.json +29 -0
checkpoint-300/rng_state.pth +3 -0
checkpoint-300/scheduler.pt +3 -0
checkpoint-300/special_tokens_map.json +31 -0
checkpoint-300/tokenizer.json +3 -0
checkpoint-300/tokenizer_config.json +208 -0
checkpoint-300/trainer_state.json +2134 -0
checkpoint-300/training_args.bin +3 -0
checkpoint-300/vocab.json +0 -0
checkpoint-400/added_tokens.json +24 -0
checkpoint-400/chat_template.jinja +7 -0
checkpoint-400/merges.txt +0 -0
checkpoint-400/model.safetensors +3 -0
checkpoint-400/optimizer.pt +3 -0
checkpoint-400/preprocessor_config.json +29 -0
checkpoint-400/scheduler.pt +3 -0
checkpoint-400/special_tokens_map.json +31 -0
checkpoint-400/tokenizer.json +3 -0
checkpoint-400/tokenizer_config.json +208 -0
checkpoint-400/trainer_state.json +2834 -0
checkpoint-400/vocab.json +0 -0
checkpoint-500/merges.txt +0 -0
checkpoint-500/tokenizer.json +3 -0
checkpoint-500/vocab.json +0 -0
checkpoint-600/added_tokens.json +24 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+checkpoint-500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-400/tokenizer.json filter=lfs diff=lfs merge=lfs -text

checkpoint-100/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f128e58608eb4e1f136160e2d7565d16396476050ff6d069ce6670d8d13d43bd
+size 14645

checkpoint-100/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3222e0bb19d7f9cf6eb8a496a9297573dc0b270b4c55c8e2eaa9443c5b18b442
+size 1465

checkpoint-100/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:004527189128ad9f86554b0f384032fa8c4c91478964cf149c179071f96bf50a
+size 6289

checkpoint-1000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e63e79316d282a810ce0ff83b6087314a51322d0f49e91f693bcdf1447be448a
+size 14645

checkpoint-1000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:004527189128ad9f86554b0f384032fa8c4c91478964cf149c179071f96bf50a
+size 6289

checkpoint-200/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoint-200/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,7 @@

+{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
+You are a helpful assistant.<|im_end|>
+{% endif %}<|im_start|>{{ message['role'] }}
+{% if message['content'] is string %}{{ message['content'] }}<|im_end|>
+{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
+{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
+{% endif %}

checkpoint-200/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-200/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1ddb1989e7c777e845ab853e85d2dae3d031a4f55465bbf484b9bc41a5e37bcb
+size 1003852

checkpoint-200/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2d7fb43e584d0eca965cd8a97e80c7cb516a3b93b7a3f8986dc0f585fb6b62bc
+size 2019836

checkpoint-200/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_processor_type": "Qwen2_5_VLImageProcessor",
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "max_pixels": 1003520,
+  "merge_size": 2,
+  "min_pixels": 3136,
+  "patch_size": 14,
+  "processor_class": "Qwen2_5_VLProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "max_pixels": 1003520,
+    "min_pixels": 3136
+  },
+  "temporal_patch_size": 2
+}

checkpoint-200/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:222fc825955a4b313634dc5514da060205ad9cb082bd4c56e0bfb1208a802436
+size 14645

checkpoint-200/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cc4996496bbf363708baba32f39889a80aadbe5b598653714ed77f13a8eecaa5
+size 1465

checkpoint-200/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-200/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896

checkpoint-200/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,208 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "processor_class": "Qwen2_5_VLProcessor",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoint-200/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1434 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.10432968179447052,
+  "eval_steps": 500,
+  "global_step": 200,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0005216484089723526,
+      "grad_norm": 3.6680614948272705,
+      "learning_rate": 0.0,
+      "loss": 0.541,
+      "step": 1
+    },
+    {
+      "epoch": 0.0010432968179447052,
+      "grad_norm": 1.7092307806015015,
+      "learning_rate": 5e-06,
+      "loss": 0.2639,
+      "step": 2
+    },
+    {
+      "epoch": 0.001564945226917058,
+      "grad_norm": 1.990319013595581,
+      "learning_rate": 1e-05,
+      "loss": 0.3118,
+      "step": 3
+    },
+    {
+      "epoch": 0.0020865936358894104,
+      "grad_norm": 3.750917434692383,
+      "learning_rate": 1.5e-05,
+      "loss": 0.4562,
+      "step": 4
+    },
+    {
+      "epoch": 0.0026082420448617634,
+      "grad_norm": 4.690845966339111,
+      "learning_rate": 2e-05,
+      "loss": 0.673,
+      "step": 5
+    },
+    {
+      "epoch": 0.003129890453834116,
+      "grad_norm": 1.4218288660049438,
+      "learning_rate": 2.5e-05,
+      "loss": 0.2984,
+      "step": 6
+    },
+    {
+      "epoch": 0.0036515388628064684,
+      "grad_norm": 4.896511077880859,
+      "learning_rate": 3e-05,
+      "loss": 0.7113,
+      "step": 7
+    },
+    {
+      "epoch": 0.004173187271778821,
+      "grad_norm": 2.5787155628204346,
+      "learning_rate": 3.5000000000000004e-05,
+      "loss": 0.4226,
+      "step": 8
+    },
+    {
+      "epoch": 0.004694835680751174,
+      "grad_norm": 1.028937578201294,
+      "learning_rate": 4e-05,
+      "loss": 0.1873,
+      "step": 9
+    },
+    {
+      "epoch": 0.005216484089723527,
+      "grad_norm": 3.9262092113494873,
+      "learning_rate": 4.4999999999999996e-05,
+      "loss": 0.5728,
+      "step": 10
+    },
+    {
+      "epoch": 0.005738132498695879,
+      "grad_norm": 4.9360198974609375,
+      "learning_rate": 5e-05,
+      "loss": 0.725,
+      "step": 11
+    },
+    {
+      "epoch": 0.006259780907668232,
+      "grad_norm": 4.287437915802002,
+      "learning_rate": 5.5e-05,
+      "loss": 0.6361,
+      "step": 12
+    },
+    {
+      "epoch": 0.006781429316640584,
+      "grad_norm": 1.3290928602218628,
+      "learning_rate": 6e-05,
+      "loss": 0.3109,
+      "step": 13
+    },
+    {
+      "epoch": 0.007303077725612937,
+      "grad_norm": 2.0050501823425293,
+      "learning_rate": 6.500000000000001e-05,
+      "loss": 0.4099,
+      "step": 14
+    },
+    {
+      "epoch": 0.00782472613458529,
+      "grad_norm": 4.360481262207031,
+      "learning_rate": 7.000000000000001e-05,
+      "loss": 0.6363,
+      "step": 15
+    },
+    {
+      "epoch": 0.008346374543557642,
+      "grad_norm": 3.9680511951446533,
+      "learning_rate": 7.5e-05,
+      "loss": 0.6124,
+      "step": 16
+    },
+    {
+      "epoch": 0.008868022952529996,
+      "grad_norm": 1.701784610748291,
+      "learning_rate": 8e-05,
+      "loss": 0.3439,
+      "step": 17
+    },
+    {
+      "epoch": 0.009389671361502348,
+      "grad_norm": 4.544748783111572,
+      "learning_rate": 8.5e-05,
+      "loss": 0.6253,
+      "step": 18
+    },
+    {
+      "epoch": 0.0099113197704747,
+      "grad_norm": 4.58634090423584,
+      "learning_rate": 8.999999999999999e-05,
+      "loss": 0.7669,
+      "step": 19
+    },
+    {
+      "epoch": 0.010432968179447054,
+      "grad_norm": 2.89898419380188,
+      "learning_rate": 9.5e-05,
+      "loss": 0.512,
+      "step": 20
+    },
+    {
+      "epoch": 0.010954616588419406,
+      "grad_norm": 2.61112904548645,
+      "learning_rate": 0.0001,
+      "loss": 0.542,
+      "step": 21
+    },
+    {
+      "epoch": 0.011476264997391758,
+      "grad_norm": 3.217054843902588,
+      "learning_rate": 0.000105,
+      "loss": 0.4959,
+      "step": 22
+    },
+    {
+      "epoch": 0.011997913406364111,
+      "grad_norm": 2.569636821746826,
+      "learning_rate": 0.00011,
+      "loss": 0.3918,
+      "step": 23
+    },
+    {
+      "epoch": 0.012519561815336464,
+      "grad_norm": 1.4626373052597046,
+      "learning_rate": 0.000115,
+      "loss": 0.3316,
+      "step": 24
+    },
+    {
+      "epoch": 0.013041210224308816,
+      "grad_norm": 1.2480732202529907,
+      "learning_rate": 0.00012,
+      "loss": 0.3484,
+      "step": 25
+    },
+    {
+      "epoch": 0.013562858633281168,
+      "grad_norm": 2.5430543422698975,
+      "learning_rate": 0.000125,
+      "loss": 0.4699,
+      "step": 26
+    },
+    {
+      "epoch": 0.014084507042253521,
+      "grad_norm": 1.7051862478256226,
+      "learning_rate": 0.00013000000000000002,
+      "loss": 0.2139,
+      "step": 27
+    },
+    {
+      "epoch": 0.014606155451225874,
+      "grad_norm": 1.1670981645584106,
+      "learning_rate": 0.000135,
+      "loss": 0.3883,
+      "step": 28
+    },
+    {
+      "epoch": 0.015127803860198226,
+      "grad_norm": 1.336538314819336,
+      "learning_rate": 0.00014000000000000001,
+      "loss": 0.3714,
+      "step": 29
+    },
+    {
+      "epoch": 0.01564945226917058,
+      "grad_norm": 2.018078565597534,
+      "learning_rate": 0.000145,
+      "loss": 0.2301,
+      "step": 30
+    },
+    {
+      "epoch": 0.01617110067814293,
+      "grad_norm": 1.5743223428726196,
+      "learning_rate": 0.00015,
+      "loss": 0.2935,
+      "step": 31
+    },
+    {
+      "epoch": 0.016692749087115284,
+      "grad_norm": 1.2724987268447876,
+      "learning_rate": 0.000155,
+      "loss": 0.3141,
+      "step": 32
+    },
+    {
+      "epoch": 0.017214397496087636,
+      "grad_norm": 2.2347893714904785,
+      "learning_rate": 0.00016,
+      "loss": 0.2917,
+      "step": 33
+    },
+    {
+      "epoch": 0.01773604590505999,
+      "grad_norm": 1.6726069450378418,
+      "learning_rate": 0.000165,
+      "loss": 0.377,
+      "step": 34
+    },
+    {
+      "epoch": 0.018257694314032343,
+      "grad_norm": 1.2217071056365967,
+      "learning_rate": 0.00017,
+      "loss": 0.3027,
+      "step": 35
+    },
+    {
+      "epoch": 0.018779342723004695,
+      "grad_norm": 1.3436322212219238,
+      "learning_rate": 0.000175,
+      "loss": 0.2853,
+      "step": 36
+    },
+    {
+      "epoch": 0.019300991131977047,
+      "grad_norm": 1.2247120141983032,
+      "learning_rate": 0.00017999999999999998,
+      "loss": 0.2967,
+      "step": 37
+    },
+    {
+      "epoch": 0.0198226395409494,
+      "grad_norm": 1.0636978149414062,
+      "learning_rate": 0.000185,
+      "loss": 0.2745,
+      "step": 38
+    },
+    {
+      "epoch": 0.02034428794992175,
+      "grad_norm": 1.302099347114563,
+      "learning_rate": 0.00019,
+      "loss": 0.2688,
+      "step": 39
+    },
+    {
+      "epoch": 0.020865936358894107,
+      "grad_norm": 1.0052679777145386,
+      "learning_rate": 0.00019500000000000002,
+      "loss": 0.2539,
+      "step": 40
+    },
+    {
+      "epoch": 0.02138758476786646,
+      "grad_norm": 1.0164084434509277,
+      "learning_rate": 0.0002,
+      "loss": 0.1978,
+      "step": 41
+    },
+    {
+      "epoch": 0.02190923317683881,
+      "grad_norm": 1.3891016244888306,
+      "learning_rate": 0.000205,
+      "loss": 0.3189,
+      "step": 42
+    },
+    {
+      "epoch": 0.022430881585811163,
+      "grad_norm": 0.960986852645874,
+      "learning_rate": 0.00021,
+      "loss": 0.2321,
+      "step": 43
+    },
+    {
+      "epoch": 0.022952529994783515,
+      "grad_norm": 0.9918408393859863,
+      "learning_rate": 0.000215,
+      "loss": 0.2359,
+      "step": 44
+    },
+    {
+      "epoch": 0.023474178403755867,
+      "grad_norm": 1.190205693244934,
+      "learning_rate": 0.00022,
+      "loss": 0.2347,
+      "step": 45
+    },
+    {
+      "epoch": 0.023995826812728223,
+      "grad_norm": 0.7985232472419739,
+      "learning_rate": 0.00022500000000000002,
+      "loss": 0.2048,
+      "step": 46
+    },
+    {
+      "epoch": 0.024517475221700575,
+      "grad_norm": 0.5192842483520508,
+      "learning_rate": 0.00023,
+      "loss": 0.1116,
+      "step": 47
+    },
+    {
+      "epoch": 0.025039123630672927,
+      "grad_norm": 1.1033375263214111,
+      "learning_rate": 0.000235,
+      "loss": 0.2665,
+      "step": 48
+    },
+    {
+      "epoch": 0.02556077203964528,
+      "grad_norm": 0.7089418172836304,
+      "learning_rate": 0.00024,
+      "loss": 0.1639,
+      "step": 49
+    },
+    {
+      "epoch": 0.02608242044861763,
+      "grad_norm": 1.08647882938385,
+      "learning_rate": 0.000245,
+      "loss": 0.2072,
+      "step": 50
+    },
+    {
+      "epoch": 0.026604068857589983,
+      "grad_norm": 0.9901174902915955,
+      "learning_rate": 0.00025,
+      "loss": 0.2035,
+      "step": 51
+    },
+    {
+      "epoch": 0.027125717266562335,
+      "grad_norm": 0.6938351988792419,
+      "learning_rate": 0.000255,
+      "loss": 0.1851,
+      "step": 52
+    },
+    {
+      "epoch": 0.02764736567553469,
+      "grad_norm": 0.8392678499221802,
+      "learning_rate": 0.00026000000000000003,
+      "loss": 0.1918,
+      "step": 53
+    },
+    {
+      "epoch": 0.028169014084507043,
+      "grad_norm": 0.5979602932929993,
+      "learning_rate": 0.00026500000000000004,
+      "loss": 0.1243,
+      "step": 54
+    },
+    {
+      "epoch": 0.028690662493479395,
+      "grad_norm": 0.7119799852371216,
+      "learning_rate": 0.00027,
+      "loss": 0.1594,
+      "step": 55
+    },
+    {
+      "epoch": 0.029212310902451747,
+      "grad_norm": 0.5519995093345642,
+      "learning_rate": 0.000275,
+      "loss": 0.078,
+      "step": 56
+    },
+    {
+      "epoch": 0.0297339593114241,
+      "grad_norm": 0.5917723774909973,
+      "learning_rate": 0.00028000000000000003,
+      "loss": 0.134,
+      "step": 57
+    },
+    {
+      "epoch": 0.03025560772039645,
+      "grad_norm": 0.6265603303909302,
+      "learning_rate": 0.000285,
+      "loss": 0.1848,
+      "step": 58
+    },
+    {
+      "epoch": 0.030777256129368807,
+      "grad_norm": 1.0653454065322876,
+      "learning_rate": 0.00029,
+      "loss": 0.1831,
+      "step": 59
+    },
+    {
+      "epoch": 0.03129890453834116,
+      "grad_norm": 0.3466293513774872,
+      "learning_rate": 0.000295,
+      "loss": 0.0878,
+      "step": 60
+    },
+    {
+      "epoch": 0.03182055294731351,
+      "grad_norm": 0.5498062372207642,
+      "learning_rate": 0.0003,
+      "loss": 0.1733,
+      "step": 61
+    },
+    {
+      "epoch": 0.03234220135628586,
+      "grad_norm": 0.7708966135978699,
+      "learning_rate": 0.000305,
+      "loss": 0.1975,
+      "step": 62
+    },
+    {
+      "epoch": 0.03286384976525822,
+      "grad_norm": 0.7717278003692627,
+      "learning_rate": 0.00031,
+      "loss": 0.1863,
+      "step": 63
+    },
+    {
+      "epoch": 0.03338549817423057,
+      "grad_norm": 0.8076028823852539,
+      "learning_rate": 0.000315,
+      "loss": 0.1938,
+      "step": 64
+    },
+    {
+      "epoch": 0.03390714658320292,
+      "grad_norm": 0.5629755258560181,
+      "learning_rate": 0.00032,
+      "loss": 0.1471,
+      "step": 65
+    },
+    {
+      "epoch": 0.03442879499217527,
+      "grad_norm": 0.5237282514572144,
+      "learning_rate": 0.00032500000000000004,
+      "loss": 0.1244,
+      "step": 66
+    },
+    {
+      "epoch": 0.03495044340114763,
+      "grad_norm": 0.7248942852020264,
+      "learning_rate": 0.00033,
+      "loss": 0.1933,
+      "step": 67
+    },
+    {
+      "epoch": 0.03547209181011998,
+      "grad_norm": 0.49564772844314575,
+      "learning_rate": 0.000335,
+      "loss": 0.1389,
+      "step": 68
+    },
+    {
+      "epoch": 0.03599374021909233,
+      "grad_norm": 0.4806594252586365,
+      "learning_rate": 0.00034,
+      "loss": 0.1295,
+      "step": 69
+    },
+    {
+      "epoch": 0.036515388628064686,
+      "grad_norm": 0.39995619654655457,
+      "learning_rate": 0.000345,
+      "loss": 0.1324,
+      "step": 70
+    },
+    {
+      "epoch": 0.037037037037037035,
+      "grad_norm": 0.6496027708053589,
+      "learning_rate": 0.00035,
+      "loss": 0.1002,
+      "step": 71
+    },
+    {
+      "epoch": 0.03755868544600939,
+      "grad_norm": 0.5661569237709045,
+      "learning_rate": 0.000355,
+      "loss": 0.1277,
+      "step": 72
+    },
+    {
+      "epoch": 0.03808033385498174,
+      "grad_norm": 0.49875250458717346,
+      "learning_rate": 0.00035999999999999997,
+      "loss": 0.1322,
+      "step": 73
+    },
+    {
+      "epoch": 0.038601982263954095,
+      "grad_norm": 0.44551461935043335,
+      "learning_rate": 0.000365,
+      "loss": 0.1278,
+      "step": 74
+    },
+    {
+      "epoch": 0.03912363067292645,
+      "grad_norm": 0.3314933478832245,
+      "learning_rate": 0.00037,
+      "loss": 0.0918,
+      "step": 75
+    },
+    {
+      "epoch": 0.0396452790818988,
+      "grad_norm": 0.3463922441005707,
+      "learning_rate": 0.000375,
+      "loss": 0.0948,
+      "step": 76
+    },
+    {
+      "epoch": 0.040166927490871154,
+      "grad_norm": 0.5401505827903748,
+      "learning_rate": 0.00038,
+      "loss": 0.1574,
+      "step": 77
+    },
+    {
+      "epoch": 0.0406885758998435,
+      "grad_norm": 0.39233317971229553,
+      "learning_rate": 0.00038500000000000003,
+      "loss": 0.1312,
+      "step": 78
+    },
+    {
+      "epoch": 0.04121022430881586,
+      "grad_norm": 0.4380398988723755,
+      "learning_rate": 0.00039000000000000005,
+      "loss": 0.0601,
+      "step": 79
+    },
+    {
+      "epoch": 0.041731872717788214,
+      "grad_norm": 0.3931694030761719,
+      "learning_rate": 0.000395,
+      "loss": 0.0962,
+      "step": 80
+    },
+    {
+      "epoch": 0.04225352112676056,
+      "grad_norm": 0.3566243648529053,
+      "learning_rate": 0.0004,
+      "loss": 0.1137,
+      "step": 81
+    },
+    {
+      "epoch": 0.04277516953573292,
+      "grad_norm": 0.40159469842910767,
+      "learning_rate": 0.00040500000000000003,
+      "loss": 0.1128,
+      "step": 82
+    },
+    {
+      "epoch": 0.04329681794470527,
+      "grad_norm": 0.30474773049354553,
+      "learning_rate": 0.00041,
+      "loss": 0.0922,
+      "step": 83
+    },
+    {
+      "epoch": 0.04381846635367762,
+      "grad_norm": 0.31177017092704773,
+      "learning_rate": 0.000415,
+      "loss": 0.1015,
+      "step": 84
+    },
+    {
+      "epoch": 0.04434011476264997,
+      "grad_norm": 0.3996855914592743,
+      "learning_rate": 0.00042,
+      "loss": 0.1266,
+      "step": 85
+    },
+    {
+      "epoch": 0.044861763171622326,
+      "grad_norm": 0.2281728833913803,
+      "learning_rate": 0.000425,
+      "loss": 0.0758,
+      "step": 86
+    },
+    {
+      "epoch": 0.04538341158059468,
+      "grad_norm": 0.5169669985771179,
+      "learning_rate": 0.00043,
+      "loss": 0.1092,
+      "step": 87
+    },
+    {
+      "epoch": 0.04590505998956703,
+      "grad_norm": 0.5525585412979126,
+      "learning_rate": 0.000435,
+      "loss": 0.1226,
+      "step": 88
+    },
+    {
+      "epoch": 0.046426708398539386,
+      "grad_norm": 0.33093884587287903,
+      "learning_rate": 0.00044,
+      "loss": 0.0879,
+      "step": 89
+    },
+    {
+      "epoch": 0.046948356807511735,
+      "grad_norm": 0.3713582158088684,
+      "learning_rate": 0.00044500000000000003,
+      "loss": 0.1121,
+      "step": 90
+    },
+    {
+      "epoch": 0.04747000521648409,
+      "grad_norm": 0.565517246723175,
+      "learning_rate": 0.00045000000000000004,
+      "loss": 0.1469,
+      "step": 91
+    },
+    {
+      "epoch": 0.047991653625456446,
+      "grad_norm": 0.31801939010620117,
+      "learning_rate": 0.000455,
+      "loss": 0.0915,
+      "step": 92
+    },
+    {
+      "epoch": 0.048513302034428794,
+      "grad_norm": 0.42586401104927063,
+      "learning_rate": 0.00046,
+      "loss": 0.0411,
+      "step": 93
+    },
+    {
+      "epoch": 0.04903495044340115,
+      "grad_norm": 0.42403289675712585,
+      "learning_rate": 0.000465,
+      "loss": 0.0589,
+      "step": 94
+    },
+    {
+      "epoch": 0.0495565988523735,
+      "grad_norm": 0.2604529559612274,
+      "learning_rate": 0.00047,
+      "loss": 0.0779,
+      "step": 95
+    },
+    {
+      "epoch": 0.050078247261345854,
+      "grad_norm": 0.32257840037345886,
+      "learning_rate": 0.000475,
+      "loss": 0.0958,
+      "step": 96
+    },
+    {
+      "epoch": 0.0505998956703182,
+      "grad_norm": 0.2648946940898895,
+      "learning_rate": 0.00048,
+      "loss": 0.0591,
+      "step": 97
+    },
+    {
+      "epoch": 0.05112154407929056,
+      "grad_norm": 0.26664629578590393,
+      "learning_rate": 0.00048499999999999997,
+      "loss": 0.0607,
+      "step": 98
+    },
+    {
+      "epoch": 0.051643192488262914,
+      "grad_norm": 0.2891658544540405,
+      "learning_rate": 0.00049,
+      "loss": 0.0478,
+      "step": 99
+    },
+    {
+      "epoch": 0.05216484089723526,
+      "grad_norm": 0.35936883091926575,
+      "learning_rate": 0.000495,
+      "loss": 0.1126,
+      "step": 100
+    },
+    {
+      "epoch": 0.05268648930620762,
+      "grad_norm": 0.3226841986179352,
+      "learning_rate": 0.0005,
+      "loss": 0.0995,
+      "step": 101
+    },
+    {
+      "epoch": 0.053208137715179966,
+      "grad_norm": 0.2140406370162964,
+      "learning_rate": 0.0004994444444444445,
+      "loss": 0.0636,
+      "step": 102
+    },
+    {
+      "epoch": 0.05372978612415232,
+      "grad_norm": 0.28297877311706543,
+      "learning_rate": 0.0004988888888888889,
+      "loss": 0.0674,
+      "step": 103
+    },
+    {
+      "epoch": 0.05425143453312467,
+      "grad_norm": 0.27131739258766174,
+      "learning_rate": 0.0004983333333333334,
+      "loss": 0.0657,
+      "step": 104
+    },
+    {
+      "epoch": 0.054773082942097026,
+      "grad_norm": 0.28402701020240784,
+      "learning_rate": 0.0004977777777777778,
+      "loss": 0.0894,
+      "step": 105
+    },
+    {
+      "epoch": 0.05529473135106938,
+      "grad_norm": 0.33924373984336853,
+      "learning_rate": 0.0004972222222222222,
+      "loss": 0.1264,
+      "step": 106
+    },
+    {
+      "epoch": 0.05581637976004173,
+      "grad_norm": 0.3655984401702881,
+      "learning_rate": 0.0004966666666666666,
+      "loss": 0.0828,
+      "step": 107
+    },
+    {
+      "epoch": 0.056338028169014086,
+      "grad_norm": 0.2262953370809555,
+      "learning_rate": 0.0004961111111111111,
+      "loss": 0.0662,
+      "step": 108
+    },
+    {
+      "epoch": 0.056859676577986434,
+      "grad_norm": 0.23988084495067596,
+      "learning_rate": 0.0004955555555555556,
+      "loss": 0.0672,
+      "step": 109
+    },
+    {
+      "epoch": 0.05738132498695879,
+      "grad_norm": 0.228820338845253,
+      "learning_rate": 0.000495,
+      "loss": 0.0615,
+      "step": 110
+    },
+    {
+      "epoch": 0.057902973395931145,
+      "grad_norm": 0.32484373450279236,
+      "learning_rate": 0.0004944444444444445,
+      "loss": 0.0833,
+      "step": 111
+    },
+    {
+      "epoch": 0.058424621804903494,
+      "grad_norm": 0.22520330548286438,
+      "learning_rate": 0.0004938888888888889,
+      "loss": 0.0767,
+      "step": 112
+    },
+    {
+      "epoch": 0.05894627021387585,
+      "grad_norm": 0.4783564805984497,
+      "learning_rate": 0.0004933333333333334,
+      "loss": 0.0999,
+      "step": 113
+    },
+    {
+      "epoch": 0.0594679186228482,
+      "grad_norm": 0.2565033733844757,
+      "learning_rate": 0.0004927777777777777,
+      "loss": 0.0819,
+      "step": 114
+    },
+    {
+      "epoch": 0.059989567031820554,
+      "grad_norm": 0.19332879781723022,
+      "learning_rate": 0.0004922222222222222,
+      "loss": 0.0702,
+      "step": 115
+    },
+    {
+      "epoch": 0.0605112154407929,
+      "grad_norm": 0.2507823705673218,
+      "learning_rate": 0.0004916666666666666,
+      "loss": 0.076,
+      "step": 116
+    },
+    {
+      "epoch": 0.06103286384976526,
+      "grad_norm": 0.29689472913742065,
+      "learning_rate": 0.0004911111111111111,
+      "loss": 0.0748,
+      "step": 117
+    },
+    {
+      "epoch": 0.06155451225873761,
+      "grad_norm": 0.34821203351020813,
+      "learning_rate": 0.0004905555555555556,
+      "loss": 0.0949,
+      "step": 118
+    },
+    {
+      "epoch": 0.06207616066770996,
+      "grad_norm": 0.25025618076324463,
+      "learning_rate": 0.00049,
+      "loss": 0.0813,
+      "step": 119
+    },
+    {
+      "epoch": 0.06259780907668232,
+      "grad_norm": 0.23138757050037384,
+      "learning_rate": 0.0004894444444444445,
+      "loss": 0.0806,
+      "step": 120
+    },
+    {
+      "epoch": 0.06311945748565467,
+      "grad_norm": 0.25655433535575867,
+      "learning_rate": 0.0004888888888888889,
+      "loss": 0.0864,
+      "step": 121
+    },
+    {
+      "epoch": 0.06364110589462701,
+      "grad_norm": 0.2863710820674896,
+      "learning_rate": 0.0004883333333333333,
+      "loss": 0.0659,
+      "step": 122
+    },
+    {
+      "epoch": 0.06416275430359937,
+      "grad_norm": 0.2628318965435028,
+      "learning_rate": 0.0004877777777777778,
+      "loss": 0.0746,
+      "step": 123
+    },
+    {
+      "epoch": 0.06468440271257173,
+      "grad_norm": 0.2095496952533722,
+      "learning_rate": 0.0004872222222222222,
+      "loss": 0.0746,
+      "step": 124
+    },
+    {
+      "epoch": 0.06520605112154408,
+      "grad_norm": 0.25687775015830994,
+      "learning_rate": 0.0004866666666666667,
+      "loss": 0.0867,
+      "step": 125
+    },
+    {
+      "epoch": 0.06572769953051644,
+      "grad_norm": 0.3623638153076172,
+      "learning_rate": 0.0004861111111111111,
+      "loss": 0.0859,
+      "step": 126
+    },
+    {
+      "epoch": 0.06624934793948878,
+      "grad_norm": 0.22254744172096252,
+      "learning_rate": 0.0004855555555555556,
+      "loss": 0.0956,
+      "step": 127
+    },
+    {
+      "epoch": 0.06677099634846113,
+      "grad_norm": 0.42705070972442627,
+      "learning_rate": 0.00048499999999999997,
+      "loss": 0.0885,
+      "step": 128
+    },
+    {
+      "epoch": 0.06729264475743349,
+      "grad_norm": 0.23360145092010498,
+      "learning_rate": 0.00048444444444444446,
+      "loss": 0.0521,
+      "step": 129
+    },
+    {
+      "epoch": 0.06781429316640585,
+      "grad_norm": 0.1959061473608017,
+      "learning_rate": 0.0004838888888888889,
+      "loss": 0.043,
+      "step": 130
+    },
+    {
+      "epoch": 0.0683359415753782,
+      "grad_norm": 0.32006219029426575,
+      "learning_rate": 0.00048333333333333334,
+      "loss": 0.0942,
+      "step": 131
+    },
+    {
+      "epoch": 0.06885758998435054,
+      "grad_norm": 0.20010985434055328,
+      "learning_rate": 0.0004827777777777778,
+      "loss": 0.0645,
+      "step": 132
+    },
+    {
+      "epoch": 0.0693792383933229,
+      "grad_norm": 0.18007700145244598,
+      "learning_rate": 0.0004822222222222222,
+      "loss": 0.0593,
+      "step": 133
+    },
+    {
+      "epoch": 0.06990088680229525,
+      "grad_norm": 0.23080182075500488,
+      "learning_rate": 0.0004816666666666667,
+      "loss": 0.069,
+      "step": 134
+    },
+    {
+      "epoch": 0.07042253521126761,
+      "grad_norm": 0.16220460832118988,
+      "learning_rate": 0.0004811111111111111,
+      "loss": 0.0499,
+      "step": 135
+    },
+    {
+      "epoch": 0.07094418362023996,
+      "grad_norm": 0.19325301051139832,
+      "learning_rate": 0.0004805555555555556,
+      "loss": 0.0616,
+      "step": 136
+    },
+    {
+      "epoch": 0.0714658320292123,
+      "grad_norm": 0.16364900767803192,
+      "learning_rate": 0.00048,
+      "loss": 0.0612,
+      "step": 137
+    },
+    {
+      "epoch": 0.07198748043818466,
+      "grad_norm": 0.15745937824249268,
+      "learning_rate": 0.00047944444444444445,
+      "loss": 0.0526,
+      "step": 138
+    },
+    {
+      "epoch": 0.07250912884715702,
+      "grad_norm": 0.22706539928913116,
+      "learning_rate": 0.0004788888888888889,
+      "loss": 0.067,
+      "step": 139
+    },
+    {
+      "epoch": 0.07303077725612937,
+      "grad_norm": 0.22147034108638763,
+      "learning_rate": 0.0004783333333333333,
+      "loss": 0.0684,
+      "step": 140
+    },
+    {
+      "epoch": 0.07355242566510173,
+      "grad_norm": 0.2623853385448456,
+      "learning_rate": 0.0004777777777777778,
+      "loss": 0.0491,
+      "step": 141
+    },
+    {
+      "epoch": 0.07407407407407407,
+      "grad_norm": 0.1899435669183731,
+      "learning_rate": 0.00047722222222222225,
+      "loss": 0.029,
+      "step": 142
+    },
+    {
+      "epoch": 0.07459572248304643,
+      "grad_norm": 0.2669859826564789,
+      "learning_rate": 0.0004766666666666667,
+      "loss": 0.064,
+      "step": 143
+    },
+    {
+      "epoch": 0.07511737089201878,
+      "grad_norm": 0.18063829839229584,
+      "learning_rate": 0.0004761111111111111,
+      "loss": 0.0624,
+      "step": 144
+    },
+    {
+      "epoch": 0.07563901930099114,
+      "grad_norm": 0.22147716581821442,
+      "learning_rate": 0.00047555555555555556,
+      "loss": 0.0544,
+      "step": 145
+    },
+    {
+      "epoch": 0.07616066770996348,
+      "grad_norm": 0.30522170662879944,
+      "learning_rate": 0.000475,
+      "loss": 0.077,
+      "step": 146
+    },
+    {
+      "epoch": 0.07668231611893583,
+      "grad_norm": 0.15942497551441193,
+      "learning_rate": 0.00047444444444444444,
+      "loss": 0.0372,
+      "step": 147
+    },
+    {
+      "epoch": 0.07720396452790819,
+      "grad_norm": 0.1456826627254486,
+      "learning_rate": 0.00047388888888888893,
+      "loss": 0.0423,
+      "step": 148
+    },
+    {
+      "epoch": 0.07772561293688054,
+      "grad_norm": 0.17793269455432892,
+      "learning_rate": 0.00047333333333333336,
+      "loss": 0.0559,
+      "step": 149
+    },
+    {
+      "epoch": 0.0782472613458529,
+      "grad_norm": 0.152329221367836,
+      "learning_rate": 0.0004727777777777778,
+      "loss": 0.0266,
+      "step": 150
+    },
+    {
+      "epoch": 0.07876890975482524,
+      "grad_norm": 0.19327858090400696,
+      "learning_rate": 0.00047222222222222224,
+      "loss": 0.0608,
+      "step": 151
+    },
+    {
+      "epoch": 0.0792905581637976,
+      "grad_norm": 0.15060095489025116,
+      "learning_rate": 0.0004716666666666667,
+      "loss": 0.0461,
+      "step": 152
+    },
+    {
+      "epoch": 0.07981220657276995,
+      "grad_norm": 0.1864742785692215,
+      "learning_rate": 0.0004711111111111111,
+      "loss": 0.0724,
+      "step": 153
+    },
+    {
+      "epoch": 0.08033385498174231,
+      "grad_norm": 0.1422508805990219,
+      "learning_rate": 0.00047055555555555555,
+      "loss": 0.0325,
+      "step": 154
+    },
+    {
+      "epoch": 0.08085550339071466,
+      "grad_norm": 0.21115481853485107,
+      "learning_rate": 0.00047,
+      "loss": 0.0535,
+      "step": 155
+    },
+    {
+      "epoch": 0.081377151799687,
+      "grad_norm": 0.2197350263595581,
+      "learning_rate": 0.0004694444444444445,
+      "loss": 0.0703,
+      "step": 156
+    },
+    {
+      "epoch": 0.08189880020865936,
+      "grad_norm": 0.1608528196811676,
+      "learning_rate": 0.0004688888888888889,
+      "loss": 0.0447,
+      "step": 157
+    },
+    {
+      "epoch": 0.08242044861763172,
+      "grad_norm": 0.1445985585451126,
+      "learning_rate": 0.00046833333333333335,
+      "loss": 0.0469,
+      "step": 158
+    },
+    {
+      "epoch": 0.08294209702660407,
+      "grad_norm": 0.25215667486190796,
+      "learning_rate": 0.0004677777777777778,
+      "loss": 0.0709,
+      "step": 159
+    },
+    {
+      "epoch": 0.08346374543557643,
+      "grad_norm": 0.14391636848449707,
+      "learning_rate": 0.0004672222222222222,
+      "loss": 0.0457,
+      "step": 160
+    },
+    {
+      "epoch": 0.08398539384454877,
+      "grad_norm": 0.29619306325912476,
+      "learning_rate": 0.00046666666666666666,
+      "loss": 0.0765,
+      "step": 161
+    },
+    {
+      "epoch": 0.08450704225352113,
+      "grad_norm": 0.15701289474964142,
+      "learning_rate": 0.0004661111111111111,
+      "loss": 0.0418,
+      "step": 162
+    },
+    {
+      "epoch": 0.08502869066249348,
+      "grad_norm": 0.1698683649301529,
+      "learning_rate": 0.0004655555555555556,
+      "loss": 0.0294,
+      "step": 163
+    },
+    {
+      "epoch": 0.08555033907146584,
+      "grad_norm": 0.12165573239326477,
+      "learning_rate": 0.000465,
+      "loss": 0.0258,
+      "step": 164
+    },
+    {
+      "epoch": 0.08607198748043818,
+      "grad_norm": 0.1611219197511673,
+      "learning_rate": 0.00046444444444444446,
+      "loss": 0.0491,
+      "step": 165
+    },
+    {
+      "epoch": 0.08659363588941053,
+      "grad_norm": 0.1486036628484726,
+      "learning_rate": 0.0004638888888888889,
+      "loss": 0.0479,
+      "step": 166
+    },
+    {
+      "epoch": 0.08711528429838289,
+      "grad_norm": 0.13054965436458588,
+      "learning_rate": 0.00046333333333333334,
+      "loss": 0.0401,
+      "step": 167
+    },
+    {
+      "epoch": 0.08763693270735524,
+      "grad_norm": 0.15433131158351898,
+      "learning_rate": 0.0004627777777777778,
+      "loss": 0.048,
+      "step": 168
+    },
+    {
+      "epoch": 0.0881585811163276,
+      "grad_norm": 0.17511604726314545,
+      "learning_rate": 0.0004622222222222222,
+      "loss": 0.0569,
+      "step": 169
+    },
+    {
+      "epoch": 0.08868022952529994,
+      "grad_norm": 0.1398395150899887,
+      "learning_rate": 0.0004616666666666667,
+      "loss": 0.034,
+      "step": 170
+    },
+    {
+      "epoch": 0.0892018779342723,
+      "grad_norm": 0.15484075248241425,
+      "learning_rate": 0.00046111111111111114,
+      "loss": 0.0514,
+      "step": 171
+    },
+    {
+      "epoch": 0.08972352634324465,
+      "grad_norm": 0.17851784825325012,
+      "learning_rate": 0.0004605555555555556,
+      "loss": 0.0571,
+      "step": 172
+    },
+    {
+      "epoch": 0.09024517475221701,
+      "grad_norm": 0.18745650351047516,
+      "learning_rate": 0.00046,
+      "loss": 0.0523,
+      "step": 173
+    },
+    {
+      "epoch": 0.09076682316118936,
+      "grad_norm": 0.18322691321372986,
+      "learning_rate": 0.00045944444444444445,
+      "loss": 0.0642,
+      "step": 174
+    },
+    {
+      "epoch": 0.0912884715701617,
+      "grad_norm": 0.1173708513379097,
+      "learning_rate": 0.0004588888888888889,
+      "loss": 0.0267,
+      "step": 175
+    },
+    {
+      "epoch": 0.09181011997913406,
+      "grad_norm": 0.1754874438047409,
+      "learning_rate": 0.0004583333333333333,
+      "loss": 0.0657,
+      "step": 176
+    },
+    {
+      "epoch": 0.09233176838810642,
+      "grad_norm": 0.13830502331256866,
+      "learning_rate": 0.0004577777777777778,
+      "loss": 0.0433,
+      "step": 177
+    },
+    {
+      "epoch": 0.09285341679707877,
+      "grad_norm": 0.11174938827753067,
+      "learning_rate": 0.0004572222222222222,
+      "loss": 0.04,
+      "step": 178
+    },
+    {
+      "epoch": 0.09337506520605113,
+      "grad_norm": 0.1829378753900528,
+      "learning_rate": 0.0004566666666666667,
+      "loss": 0.0453,
+      "step": 179
+    },
+    {
+      "epoch": 0.09389671361502347,
+      "grad_norm": 0.10748015344142914,
+      "learning_rate": 0.0004561111111111111,
+      "loss": 0.05,
+      "step": 180
+    },
+    {
+      "epoch": 0.09441836202399582,
+      "grad_norm": 0.1160806268453598,
+      "learning_rate": 0.00045555555555555556,
+      "loss": 0.0245,
+      "step": 181
+    },
+    {
+      "epoch": 0.09494001043296818,
+      "grad_norm": 0.12387479841709137,
+      "learning_rate": 0.000455,
+      "loss": 0.0259,
+      "step": 182
+    },
+    {
+      "epoch": 0.09546165884194054,
+      "grad_norm": 0.1586403250694275,
+      "learning_rate": 0.00045444444444444444,
+      "loss": 0.0378,
+      "step": 183
+    },
+    {
+      "epoch": 0.09598330725091289,
+      "grad_norm": 0.18905822932720184,
+      "learning_rate": 0.00045388888888888893,
+      "loss": 0.0484,
+      "step": 184
+    },
+    {
+      "epoch": 0.09650495565988523,
+      "grad_norm": 0.17541544139385223,
+      "learning_rate": 0.0004533333333333333,
+      "loss": 0.0503,
+      "step": 185
+    },
+    {
+      "epoch": 0.09702660406885759,
+      "grad_norm": 0.1083071306347847,
+      "learning_rate": 0.0004527777777777778,
+      "loss": 0.0439,
+      "step": 186
+    },
+    {
+      "epoch": 0.09754825247782994,
+      "grad_norm": 0.10464104264974594,
+      "learning_rate": 0.00045222222222222224,
+      "loss": 0.0271,
+      "step": 187
+    },
+    {
+      "epoch": 0.0980699008868023,
+      "grad_norm": 0.18022054433822632,
+      "learning_rate": 0.0004516666666666667,
+      "loss": 0.0589,
+      "step": 188
+    },
+    {
+      "epoch": 0.09859154929577464,
+      "grad_norm": 0.18715251982212067,
+      "learning_rate": 0.0004511111111111111,
+      "loss": 0.0489,
+      "step": 189
+    },
+    {
+      "epoch": 0.099113197704747,
+      "grad_norm": 0.10440787672996521,
+      "learning_rate": 0.00045055555555555555,
+      "loss": 0.0221,
+      "step": 190
+    },
+    {
+      "epoch": 0.09963484611371935,
+      "grad_norm": 0.11525921523571014,
+      "learning_rate": 0.00045000000000000004,
+      "loss": 0.0427,
+      "step": 191
+    },
+    {
+      "epoch": 0.10015649452269171,
+      "grad_norm": 0.1573028564453125,
+      "learning_rate": 0.0004494444444444444,
+      "loss": 0.04,
+      "step": 192
+    },
+    {
+      "epoch": 0.10067814293166406,
+      "grad_norm": 0.15942253172397614,
+      "learning_rate": 0.0004488888888888889,
+      "loss": 0.045,
+      "step": 193
+    },
+    {
+      "epoch": 0.1011997913406364,
+      "grad_norm": 0.2997572422027588,
+      "learning_rate": 0.0004483333333333333,
+      "loss": 0.062,
+      "step": 194
+    },
+    {
+      "epoch": 0.10172143974960876,
+      "grad_norm": 0.1859196424484253,
+      "learning_rate": 0.0004477777777777778,
+      "loss": 0.0496,
+      "step": 195
+    },
+    {
+      "epoch": 0.10224308815858112,
+      "grad_norm": 0.1265893131494522,
+      "learning_rate": 0.0004472222222222222,
+      "loss": 0.0457,
+      "step": 196
+    },
+    {
+      "epoch": 0.10276473656755347,
+      "grad_norm": 0.16036029160022736,
+      "learning_rate": 0.00044666666666666666,
+      "loss": 0.046,
+      "step": 197
+    },
+    {
+      "epoch": 0.10328638497652583,
+      "grad_norm": 0.10421448945999146,
+      "learning_rate": 0.00044611111111111115,
+      "loss": 0.033,
+      "step": 198
+    },
+    {
+      "epoch": 0.10380803338549817,
+      "grad_norm": 0.12321974337100983,
+      "learning_rate": 0.00044555555555555554,
+      "loss": 0.0458,
+      "step": 199
+    },
+    {
+      "epoch": 0.10432968179447052,
+      "grad_norm": 0.13863791525363922,
+      "learning_rate": 0.00044500000000000003,
+      "loss": 0.0221,
+      "step": 200
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 1000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 512,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-200/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:004527189128ad9f86554b0f384032fa8c4c91478964cf149c179071f96bf50a
+size 6289

checkpoint-200/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-300/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoint-300/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,7 @@

+{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
+You are a helpful assistant.<|im_end|>
+{% endif %}<|im_start|>{{ message['role'] }}
+{% if message['content'] is string %}{{ message['content'] }}<|im_end|>
+{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
+{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
+{% endif %}

checkpoint-300/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-300/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2cf35d506aafebb35b7859faa1facfed21af871dc2f4d41e5c3a9afdfb2943b0
+size 1003852

checkpoint-300/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:472841ee20f3dbe212d8d9876009229b558cfea68399543d0ef958cfe3432199
+size 2019836

checkpoint-300/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_processor_type": "Qwen2_5_VLImageProcessor",
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "max_pixels": 1003520,
+  "merge_size": 2,
+  "min_pixels": 3136,
+  "patch_size": 14,
+  "processor_class": "Qwen2_5_VLProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "max_pixels": 1003520,
+    "min_pixels": 3136
+  },
+  "temporal_patch_size": 2
+}

checkpoint-300/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5d3453595821a7c223876546271ef57345e33f2a1aeb7d5040617d7190c545aa
+size 14645

checkpoint-300/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9008526ba75ad62032e440b0c008b057b115ab20ed57dfeabf0d04a531a70829
+size 1465

checkpoint-300/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-300/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896

checkpoint-300/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,208 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "processor_class": "Qwen2_5_VLProcessor",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoint-300/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2134 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.1564945226917058,
+  "eval_steps": 500,
+  "global_step": 300,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0005216484089723526,
+      "grad_norm": 3.6680614948272705,
+      "learning_rate": 0.0,
+      "loss": 0.541,
+      "step": 1
+    },
+    {
+      "epoch": 0.0010432968179447052,
+      "grad_norm": 1.7092307806015015,
+      "learning_rate": 5e-06,
+      "loss": 0.2639,
+      "step": 2
+    },
+    {
+      "epoch": 0.001564945226917058,
+      "grad_norm": 1.990319013595581,
+      "learning_rate": 1e-05,
+      "loss": 0.3118,
+      "step": 3
+    },
+    {
+      "epoch": 0.0020865936358894104,
+      "grad_norm": 3.750917434692383,
+      "learning_rate": 1.5e-05,
+      "loss": 0.4562,
+      "step": 4
+    },
+    {
+      "epoch": 0.0026082420448617634,
+      "grad_norm": 4.690845966339111,
+      "learning_rate": 2e-05,
+      "loss": 0.673,
+      "step": 5
+    },
+    {
+      "epoch": 0.003129890453834116,
+      "grad_norm": 1.4218288660049438,
+      "learning_rate": 2.5e-05,
+      "loss": 0.2984,
+      "step": 6
+    },
+    {
+      "epoch": 0.0036515388628064684,
+      "grad_norm": 4.896511077880859,
+      "learning_rate": 3e-05,
+      "loss": 0.7113,
+      "step": 7
+    },
+    {
+      "epoch": 0.004173187271778821,
+      "grad_norm": 2.5787155628204346,
+      "learning_rate": 3.5000000000000004e-05,
+      "loss": 0.4226,
+      "step": 8
+    },
+    {
+      "epoch": 0.004694835680751174,
+      "grad_norm": 1.028937578201294,
+      "learning_rate": 4e-05,
+      "loss": 0.1873,
+      "step": 9
+    },
+    {
+      "epoch": 0.005216484089723527,
+      "grad_norm": 3.9262092113494873,
+      "learning_rate": 4.4999999999999996e-05,
+      "loss": 0.5728,
+      "step": 10
+    },
+    {
+      "epoch": 0.005738132498695879,
+      "grad_norm": 4.9360198974609375,
+      "learning_rate": 5e-05,
+      "loss": 0.725,
+      "step": 11
+    },
+    {
+      "epoch": 0.006259780907668232,
+      "grad_norm": 4.287437915802002,
+      "learning_rate": 5.5e-05,
+      "loss": 0.6361,
+      "step": 12
+    },
+    {
+      "epoch": 0.006781429316640584,
+      "grad_norm": 1.3290928602218628,
+      "learning_rate": 6e-05,
+      "loss": 0.3109,
+      "step": 13
+    },
+    {
+      "epoch": 0.007303077725612937,
+      "grad_norm": 2.0050501823425293,
+      "learning_rate": 6.500000000000001e-05,
+      "loss": 0.4099,
+      "step": 14
+    },
+    {
+      "epoch": 0.00782472613458529,
+      "grad_norm": 4.360481262207031,
+      "learning_rate": 7.000000000000001e-05,
+      "loss": 0.6363,
+      "step": 15
+    },
+    {
+      "epoch": 0.008346374543557642,
+      "grad_norm": 3.9680511951446533,
+      "learning_rate": 7.5e-05,
+      "loss": 0.6124,
+      "step": 16
+    },
+    {
+      "epoch": 0.008868022952529996,
+      "grad_norm": 1.701784610748291,
+      "learning_rate": 8e-05,
+      "loss": 0.3439,
+      "step": 17
+    },
+    {
+      "epoch": 0.009389671361502348,
+      "grad_norm": 4.544748783111572,
+      "learning_rate": 8.5e-05,
+      "loss": 0.6253,
+      "step": 18
+    },
+    {
+      "epoch": 0.0099113197704747,
+      "grad_norm": 4.58634090423584,
+      "learning_rate": 8.999999999999999e-05,
+      "loss": 0.7669,
+      "step": 19
+    },
+    {
+      "epoch": 0.010432968179447054,
+      "grad_norm": 2.89898419380188,
+      "learning_rate": 9.5e-05,
+      "loss": 0.512,
+      "step": 20
+    },
+    {
+      "epoch": 0.010954616588419406,
+      "grad_norm": 2.61112904548645,
+      "learning_rate": 0.0001,
+      "loss": 0.542,
+      "step": 21
+    },
+    {
+      "epoch": 0.011476264997391758,
+      "grad_norm": 3.217054843902588,
+      "learning_rate": 0.000105,
+      "loss": 0.4959,
+      "step": 22
+    },
+    {
+      "epoch": 0.011997913406364111,
+      "grad_norm": 2.569636821746826,
+      "learning_rate": 0.00011,
+      "loss": 0.3918,
+      "step": 23
+    },
+    {
+      "epoch": 0.012519561815336464,
+      "grad_norm": 1.4626373052597046,
+      "learning_rate": 0.000115,
+      "loss": 0.3316,
+      "step": 24
+    },
+    {
+      "epoch": 0.013041210224308816,
+      "grad_norm": 1.2480732202529907,
+      "learning_rate": 0.00012,
+      "loss": 0.3484,
+      "step": 25
+    },
+    {
+      "epoch": 0.013562858633281168,
+      "grad_norm": 2.5430543422698975,
+      "learning_rate": 0.000125,
+      "loss": 0.4699,
+      "step": 26
+    },
+    {
+      "epoch": 0.014084507042253521,
+      "grad_norm": 1.7051862478256226,
+      "learning_rate": 0.00013000000000000002,
+      "loss": 0.2139,
+      "step": 27
+    },
+    {
+      "epoch": 0.014606155451225874,
+      "grad_norm": 1.1670981645584106,
+      "learning_rate": 0.000135,
+      "loss": 0.3883,
+      "step": 28
+    },
+    {
+      "epoch": 0.015127803860198226,
+      "grad_norm": 1.336538314819336,
+      "learning_rate": 0.00014000000000000001,
+      "loss": 0.3714,
+      "step": 29
+    },
+    {
+      "epoch": 0.01564945226917058,
+      "grad_norm": 2.018078565597534,
+      "learning_rate": 0.000145,
+      "loss": 0.2301,
+      "step": 30
+    },
+    {
+      "epoch": 0.01617110067814293,
+      "grad_norm": 1.5743223428726196,
+      "learning_rate": 0.00015,
+      "loss": 0.2935,
+      "step": 31
+    },
+    {
+      "epoch": 0.016692749087115284,
+      "grad_norm": 1.2724987268447876,
+      "learning_rate": 0.000155,
+      "loss": 0.3141,
+      "step": 32
+    },
+    {
+      "epoch": 0.017214397496087636,
+      "grad_norm": 2.2347893714904785,
+      "learning_rate": 0.00016,
+      "loss": 0.2917,
+      "step": 33
+    },
+    {
+      "epoch": 0.01773604590505999,
+      "grad_norm": 1.6726069450378418,
+      "learning_rate": 0.000165,
+      "loss": 0.377,
+      "step": 34
+    },
+    {
+      "epoch": 0.018257694314032343,
+      "grad_norm": 1.2217071056365967,
+      "learning_rate": 0.00017,
+      "loss": 0.3027,
+      "step": 35
+    },
+    {
+      "epoch": 0.018779342723004695,
+      "grad_norm": 1.3436322212219238,
+      "learning_rate": 0.000175,
+      "loss": 0.2853,
+      "step": 36
+    },
+    {
+      "epoch": 0.019300991131977047,
+      "grad_norm": 1.2247120141983032,
+      "learning_rate": 0.00017999999999999998,
+      "loss": 0.2967,
+      "step": 37
+    },
+    {
+      "epoch": 0.0198226395409494,
+      "grad_norm": 1.0636978149414062,
+      "learning_rate": 0.000185,
+      "loss": 0.2745,
+      "step": 38
+    },
+    {
+      "epoch": 0.02034428794992175,
+      "grad_norm": 1.302099347114563,
+      "learning_rate": 0.00019,
+      "loss": 0.2688,
+      "step": 39
+    },
+    {
+      "epoch": 0.020865936358894107,
+      "grad_norm": 1.0052679777145386,
+      "learning_rate": 0.00019500000000000002,
+      "loss": 0.2539,
+      "step": 40
+    },
+    {
+      "epoch": 0.02138758476786646,
+      "grad_norm": 1.0164084434509277,
+      "learning_rate": 0.0002,
+      "loss": 0.1978,
+      "step": 41
+    },
+    {
+      "epoch": 0.02190923317683881,
+      "grad_norm": 1.3891016244888306,
+      "learning_rate": 0.000205,
+      "loss": 0.3189,
+      "step": 42
+    },
+    {
+      "epoch": 0.022430881585811163,
+      "grad_norm": 0.960986852645874,
+      "learning_rate": 0.00021,
+      "loss": 0.2321,
+      "step": 43
+    },
+    {
+      "epoch": 0.022952529994783515,
+      "grad_norm": 0.9918408393859863,
+      "learning_rate": 0.000215,
+      "loss": 0.2359,
+      "step": 44
+    },
+    {
+      "epoch": 0.023474178403755867,
+      "grad_norm": 1.190205693244934,
+      "learning_rate": 0.00022,
+      "loss": 0.2347,
+      "step": 45
+    },
+    {
+      "epoch": 0.023995826812728223,
+      "grad_norm": 0.7985232472419739,
+      "learning_rate": 0.00022500000000000002,
+      "loss": 0.2048,
+      "step": 46
+    },
+    {
+      "epoch": 0.024517475221700575,
+      "grad_norm": 0.5192842483520508,
+      "learning_rate": 0.00023,
+      "loss": 0.1116,
+      "step": 47
+    },
+    {
+      "epoch": 0.025039123630672927,
+      "grad_norm": 1.1033375263214111,
+      "learning_rate": 0.000235,
+      "loss": 0.2665,
+      "step": 48
+    },
+    {
+      "epoch": 0.02556077203964528,
+      "grad_norm": 0.7089418172836304,
+      "learning_rate": 0.00024,
+      "loss": 0.1639,
+      "step": 49
+    },
+    {
+      "epoch": 0.02608242044861763,
+      "grad_norm": 1.08647882938385,
+      "learning_rate": 0.000245,
+      "loss": 0.2072,
+      "step": 50
+    },
+    {
+      "epoch": 0.026604068857589983,
+      "grad_norm": 0.9901174902915955,
+      "learning_rate": 0.00025,
+      "loss": 0.2035,
+      "step": 51
+    },
+    {
+      "epoch": 0.027125717266562335,
+      "grad_norm": 0.6938351988792419,
+      "learning_rate": 0.000255,
+      "loss": 0.1851,
+      "step": 52
+    },
+    {
+      "epoch": 0.02764736567553469,
+      "grad_norm": 0.8392678499221802,
+      "learning_rate": 0.00026000000000000003,
+      "loss": 0.1918,
+      "step": 53
+    },
+    {
+      "epoch": 0.028169014084507043,
+      "grad_norm": 0.5979602932929993,
+      "learning_rate": 0.00026500000000000004,
+      "loss": 0.1243,
+      "step": 54
+    },
+    {
+      "epoch": 0.028690662493479395,
+      "grad_norm": 0.7119799852371216,
+      "learning_rate": 0.00027,
+      "loss": 0.1594,
+      "step": 55
+    },
+    {
+      "epoch": 0.029212310902451747,
+      "grad_norm": 0.5519995093345642,
+      "learning_rate": 0.000275,
+      "loss": 0.078,
+      "step": 56
+    },
+    {
+      "epoch": 0.0297339593114241,
+      "grad_norm": 0.5917723774909973,
+      "learning_rate": 0.00028000000000000003,
+      "loss": 0.134,
+      "step": 57
+    },
+    {
+      "epoch": 0.03025560772039645,
+      "grad_norm": 0.6265603303909302,
+      "learning_rate": 0.000285,
+      "loss": 0.1848,
+      "step": 58
+    },
+    {
+      "epoch": 0.030777256129368807,
+      "grad_norm": 1.0653454065322876,
+      "learning_rate": 0.00029,
+      "loss": 0.1831,
+      "step": 59
+    },
+    {
+      "epoch": 0.03129890453834116,
+      "grad_norm": 0.3466293513774872,
+      "learning_rate": 0.000295,
+      "loss": 0.0878,
+      "step": 60
+    },
+    {
+      "epoch": 0.03182055294731351,
+      "grad_norm": 0.5498062372207642,
+      "learning_rate": 0.0003,
+      "loss": 0.1733,
+      "step": 61
+    },
+    {
+      "epoch": 0.03234220135628586,
+      "grad_norm": 0.7708966135978699,
+      "learning_rate": 0.000305,
+      "loss": 0.1975,
+      "step": 62
+    },
+    {
+      "epoch": 0.03286384976525822,
+      "grad_norm": 0.7717278003692627,
+      "learning_rate": 0.00031,
+      "loss": 0.1863,
+      "step": 63
+    },
+    {
+      "epoch": 0.03338549817423057,
+      "grad_norm": 0.8076028823852539,
+      "learning_rate": 0.000315,
+      "loss": 0.1938,
+      "step": 64
+    },
+    {
+      "epoch": 0.03390714658320292,
+      "grad_norm": 0.5629755258560181,
+      "learning_rate": 0.00032,
+      "loss": 0.1471,
+      "step": 65
+    },
+    {
+      "epoch": 0.03442879499217527,
+      "grad_norm": 0.5237282514572144,
+      "learning_rate": 0.00032500000000000004,
+      "loss": 0.1244,
+      "step": 66
+    },
+    {
+      "epoch": 0.03495044340114763,
+      "grad_norm": 0.7248942852020264,
+      "learning_rate": 0.00033,
+      "loss": 0.1933,
+      "step": 67
+    },
+    {
+      "epoch": 0.03547209181011998,
+      "grad_norm": 0.49564772844314575,
+      "learning_rate": 0.000335,
+      "loss": 0.1389,
+      "step": 68
+    },
+    {
+      "epoch": 0.03599374021909233,
+      "grad_norm": 0.4806594252586365,
+      "learning_rate": 0.00034,
+      "loss": 0.1295,
+      "step": 69
+    },
+    {
+      "epoch": 0.036515388628064686,
+      "grad_norm": 0.39995619654655457,
+      "learning_rate": 0.000345,
+      "loss": 0.1324,
+      "step": 70
+    },
+    {
+      "epoch": 0.037037037037037035,
+      "grad_norm": 0.6496027708053589,
+      "learning_rate": 0.00035,
+      "loss": 0.1002,
+      "step": 71
+    },
+    {
+      "epoch": 0.03755868544600939,
+      "grad_norm": 0.5661569237709045,
+      "learning_rate": 0.000355,
+      "loss": 0.1277,
+      "step": 72
+    },
+    {
+      "epoch": 0.03808033385498174,
+      "grad_norm": 0.49875250458717346,
+      "learning_rate": 0.00035999999999999997,
+      "loss": 0.1322,
+      "step": 73
+    },
+    {
+      "epoch": 0.038601982263954095,
+      "grad_norm": 0.44551461935043335,
+      "learning_rate": 0.000365,
+      "loss": 0.1278,
+      "step": 74
+    },
+    {
+      "epoch": 0.03912363067292645,
+      "grad_norm": 0.3314933478832245,
+      "learning_rate": 0.00037,
+      "loss": 0.0918,
+      "step": 75
+    },
+    {
+      "epoch": 0.0396452790818988,
+      "grad_norm": 0.3463922441005707,
+      "learning_rate": 0.000375,
+      "loss": 0.0948,
+      "step": 76
+    },
+    {
+      "epoch": 0.040166927490871154,
+      "grad_norm": 0.5401505827903748,
+      "learning_rate": 0.00038,
+      "loss": 0.1574,
+      "step": 77
+    },
+    {
+      "epoch": 0.0406885758998435,
+      "grad_norm": 0.39233317971229553,
+      "learning_rate": 0.00038500000000000003,
+      "loss": 0.1312,
+      "step": 78
+    },
+    {
+      "epoch": 0.04121022430881586,
+      "grad_norm": 0.4380398988723755,
+      "learning_rate": 0.00039000000000000005,
+      "loss": 0.0601,
+      "step": 79
+    },
+    {
+      "epoch": 0.041731872717788214,
+      "grad_norm": 0.3931694030761719,
+      "learning_rate": 0.000395,
+      "loss": 0.0962,
+      "step": 80
+    },
+    {
+      "epoch": 0.04225352112676056,
+      "grad_norm": 0.3566243648529053,
+      "learning_rate": 0.0004,
+      "loss": 0.1137,
+      "step": 81
+    },
+    {
+      "epoch": 0.04277516953573292,
+      "grad_norm": 0.40159469842910767,
+      "learning_rate": 0.00040500000000000003,
+      "loss": 0.1128,
+      "step": 82
+    },
+    {
+      "epoch": 0.04329681794470527,
+      "grad_norm": 0.30474773049354553,
+      "learning_rate": 0.00041,
+      "loss": 0.0922,
+      "step": 83
+    },
+    {
+      "epoch": 0.04381846635367762,
+      "grad_norm": 0.31177017092704773,
+      "learning_rate": 0.000415,
+      "loss": 0.1015,
+      "step": 84
+    },
+    {
+      "epoch": 0.04434011476264997,
+      "grad_norm": 0.3996855914592743,
+      "learning_rate": 0.00042,
+      "loss": 0.1266,
+      "step": 85
+    },
+    {
+      "epoch": 0.044861763171622326,
+      "grad_norm": 0.2281728833913803,
+      "learning_rate": 0.000425,
+      "loss": 0.0758,
+      "step": 86
+    },
+    {
+      "epoch": 0.04538341158059468,
+      "grad_norm": 0.5169669985771179,
+      "learning_rate": 0.00043,
+      "loss": 0.1092,
+      "step": 87
+    },
+    {
+      "epoch": 0.04590505998956703,
+      "grad_norm": 0.5525585412979126,
+      "learning_rate": 0.000435,
+      "loss": 0.1226,
+      "step": 88
+    },
+    {
+      "epoch": 0.046426708398539386,
+      "grad_norm": 0.33093884587287903,
+      "learning_rate": 0.00044,
+      "loss": 0.0879,
+      "step": 89
+    },
+    {
+      "epoch": 0.046948356807511735,
+      "grad_norm": 0.3713582158088684,
+      "learning_rate": 0.00044500000000000003,
+      "loss": 0.1121,
+      "step": 90
+    },
+    {
+      "epoch": 0.04747000521648409,
+      "grad_norm": 0.565517246723175,
+      "learning_rate": 0.00045000000000000004,
+      "loss": 0.1469,
+      "step": 91
+    },
+    {
+      "epoch": 0.047991653625456446,
+      "grad_norm": 0.31801939010620117,
+      "learning_rate": 0.000455,
+      "loss": 0.0915,
+      "step": 92
+    },
+    {
+      "epoch": 0.048513302034428794,
+      "grad_norm": 0.42586401104927063,
+      "learning_rate": 0.00046,
+      "loss": 0.0411,
+      "step": 93
+    },
+    {
+      "epoch": 0.04903495044340115,
+      "grad_norm": 0.42403289675712585,
+      "learning_rate": 0.000465,
+      "loss": 0.0589,
+      "step": 94
+    },
+    {
+      "epoch": 0.0495565988523735,
+      "grad_norm": 0.2604529559612274,
+      "learning_rate": 0.00047,
+      "loss": 0.0779,
+      "step": 95
+    },
+    {
+      "epoch": 0.050078247261345854,
+      "grad_norm": 0.32257840037345886,
+      "learning_rate": 0.000475,
+      "loss": 0.0958,
+      "step": 96
+    },
+    {
+      "epoch": 0.0505998956703182,
+      "grad_norm": 0.2648946940898895,
+      "learning_rate": 0.00048,
+      "loss": 0.0591,
+      "step": 97
+    },
+    {
+      "epoch": 0.05112154407929056,
+      "grad_norm": 0.26664629578590393,
+      "learning_rate": 0.00048499999999999997,
+      "loss": 0.0607,
+      "step": 98
+    },
+    {
+      "epoch": 0.051643192488262914,
+      "grad_norm": 0.2891658544540405,
+      "learning_rate": 0.00049,
+      "loss": 0.0478,
+      "step": 99
+    },
+    {
+      "epoch": 0.05216484089723526,
+      "grad_norm": 0.35936883091926575,
+      "learning_rate": 0.000495,
+      "loss": 0.1126,
+      "step": 100
+    },
+    {
+      "epoch": 0.05268648930620762,
+      "grad_norm": 0.3226841986179352,
+      "learning_rate": 0.0005,
+      "loss": 0.0995,
+      "step": 101
+    },
+    {
+      "epoch": 0.053208137715179966,
+      "grad_norm": 0.2140406370162964,
+      "learning_rate": 0.0004994444444444445,
+      "loss": 0.0636,
+      "step": 102
+    },
+    {
+      "epoch": 0.05372978612415232,
+      "grad_norm": 0.28297877311706543,
+      "learning_rate": 0.0004988888888888889,
+      "loss": 0.0674,
+      "step": 103
+    },
+    {
+      "epoch": 0.05425143453312467,
+      "grad_norm": 0.27131739258766174,
+      "learning_rate": 0.0004983333333333334,
+      "loss": 0.0657,
+      "step": 104
+    },
+    {
+      "epoch": 0.054773082942097026,
+      "grad_norm": 0.28402701020240784,
+      "learning_rate": 0.0004977777777777778,
+      "loss": 0.0894,
+      "step": 105
+    },
+    {
+      "epoch": 0.05529473135106938,
+      "grad_norm": 0.33924373984336853,
+      "learning_rate": 0.0004972222222222222,
+      "loss": 0.1264,
+      "step": 106
+    },
+    {
+      "epoch": 0.05581637976004173,
+      "grad_norm": 0.3655984401702881,
+      "learning_rate": 0.0004966666666666666,
+      "loss": 0.0828,
+      "step": 107
+    },
+    {
+      "epoch": 0.056338028169014086,
+      "grad_norm": 0.2262953370809555,
+      "learning_rate": 0.0004961111111111111,
+      "loss": 0.0662,
+      "step": 108
+    },
+    {
+      "epoch": 0.056859676577986434,
+      "grad_norm": 0.23988084495067596,
+      "learning_rate": 0.0004955555555555556,
+      "loss": 0.0672,
+      "step": 109
+    },
+    {
+      "epoch": 0.05738132498695879,
+      "grad_norm": 0.228820338845253,
+      "learning_rate": 0.000495,
+      "loss": 0.0615,
+      "step": 110
+    },
+    {
+      "epoch": 0.057902973395931145,
+      "grad_norm": 0.32484373450279236,
+      "learning_rate": 0.0004944444444444445,
+      "loss": 0.0833,
+      "step": 111
+    },
+    {
+      "epoch": 0.058424621804903494,
+      "grad_norm": 0.22520330548286438,
+      "learning_rate": 0.0004938888888888889,
+      "loss": 0.0767,
+      "step": 112
+    },
+    {
+      "epoch": 0.05894627021387585,
+      "grad_norm": 0.4783564805984497,
+      "learning_rate": 0.0004933333333333334,
+      "loss": 0.0999,
+      "step": 113
+    },
+    {
+      "epoch": 0.0594679186228482,
+      "grad_norm": 0.2565033733844757,
+      "learning_rate": 0.0004927777777777777,
+      "loss": 0.0819,
+      "step": 114
+    },
+    {
+      "epoch": 0.059989567031820554,
+      "grad_norm": 0.19332879781723022,
+      "learning_rate": 0.0004922222222222222,
+      "loss": 0.0702,
+      "step": 115
+    },
+    {
+      "epoch": 0.0605112154407929,
+      "grad_norm": 0.2507823705673218,
+      "learning_rate": 0.0004916666666666666,
+      "loss": 0.076,
+      "step": 116
+    },
+    {
+      "epoch": 0.06103286384976526,
+      "grad_norm": 0.29689472913742065,
+      "learning_rate": 0.0004911111111111111,
+      "loss": 0.0748,
+      "step": 117
+    },
+    {
+      "epoch": 0.06155451225873761,
+      "grad_norm": 0.34821203351020813,
+      "learning_rate": 0.0004905555555555556,
+      "loss": 0.0949,
+      "step": 118
+    },
+    {
+      "epoch": 0.06207616066770996,
+      "grad_norm": 0.25025618076324463,
+      "learning_rate": 0.00049,
+      "loss": 0.0813,
+      "step": 119
+    },
+    {
+      "epoch": 0.06259780907668232,
+      "grad_norm": 0.23138757050037384,
+      "learning_rate": 0.0004894444444444445,
+      "loss": 0.0806,
+      "step": 120
+    },
+    {
+      "epoch": 0.06311945748565467,
+      "grad_norm": 0.25655433535575867,
+      "learning_rate": 0.0004888888888888889,
+      "loss": 0.0864,
+      "step": 121
+    },
+    {
+      "epoch": 0.06364110589462701,
+      "grad_norm": 0.2863710820674896,
+      "learning_rate": 0.0004883333333333333,
+      "loss": 0.0659,
+      "step": 122
+    },
+    {
+      "epoch": 0.06416275430359937,
+      "grad_norm": 0.2628318965435028,
+      "learning_rate": 0.0004877777777777778,
+      "loss": 0.0746,
+      "step": 123
+    },
+    {
+      "epoch": 0.06468440271257173,
+      "grad_norm": 0.2095496952533722,
+      "learning_rate": 0.0004872222222222222,
+      "loss": 0.0746,
+      "step": 124
+    },
+    {
+      "epoch": 0.06520605112154408,
+      "grad_norm": 0.25687775015830994,
+      "learning_rate": 0.0004866666666666667,
+      "loss": 0.0867,
+      "step": 125
+    },
+    {
+      "epoch": 0.06572769953051644,
+      "grad_norm": 0.3623638153076172,
+      "learning_rate": 0.0004861111111111111,
+      "loss": 0.0859,
+      "step": 126
+    },
+    {
+      "epoch": 0.06624934793948878,
+      "grad_norm": 0.22254744172096252,
+      "learning_rate": 0.0004855555555555556,
+      "loss": 0.0956,
+      "step": 127
+    },
+    {
+      "epoch": 0.06677099634846113,
+      "grad_norm": 0.42705070972442627,
+      "learning_rate": 0.00048499999999999997,
+      "loss": 0.0885,
+      "step": 128
+    },
+    {
+      "epoch": 0.06729264475743349,
+      "grad_norm": 0.23360145092010498,
+      "learning_rate": 0.00048444444444444446,
+      "loss": 0.0521,
+      "step": 129
+    },
+    {
+      "epoch": 0.06781429316640585,
+      "grad_norm": 0.1959061473608017,
+      "learning_rate": 0.0004838888888888889,
+      "loss": 0.043,
+      "step": 130
+    },
+    {
+      "epoch": 0.0683359415753782,
+      "grad_norm": 0.32006219029426575,
+      "learning_rate": 0.00048333333333333334,
+      "loss": 0.0942,
+      "step": 131
+    },
+    {
+      "epoch": 0.06885758998435054,
+      "grad_norm": 0.20010985434055328,
+      "learning_rate": 0.0004827777777777778,
+      "loss": 0.0645,
+      "step": 132
+    },
+    {
+      "epoch": 0.0693792383933229,
+      "grad_norm": 0.18007700145244598,
+      "learning_rate": 0.0004822222222222222,
+      "loss": 0.0593,
+      "step": 133
+    },
+    {
+      "epoch": 0.06990088680229525,
+      "grad_norm": 0.23080182075500488,
+      "learning_rate": 0.0004816666666666667,
+      "loss": 0.069,
+      "step": 134
+    },
+    {
+      "epoch": 0.07042253521126761,
+      "grad_norm": 0.16220460832118988,
+      "learning_rate": 0.0004811111111111111,
+      "loss": 0.0499,
+      "step": 135
+    },
+    {
+      "epoch": 0.07094418362023996,
+      "grad_norm": 0.19325301051139832,
+      "learning_rate": 0.0004805555555555556,
+      "loss": 0.0616,
+      "step": 136
+    },
+    {
+      "epoch": 0.0714658320292123,
+      "grad_norm": 0.16364900767803192,
+      "learning_rate": 0.00048,
+      "loss": 0.0612,
+      "step": 137
+    },
+    {
+      "epoch": 0.07198748043818466,
+      "grad_norm": 0.15745937824249268,
+      "learning_rate": 0.00047944444444444445,
+      "loss": 0.0526,
+      "step": 138
+    },
+    {
+      "epoch": 0.07250912884715702,
+      "grad_norm": 0.22706539928913116,
+      "learning_rate": 0.0004788888888888889,
+      "loss": 0.067,
+      "step": 139
+    },
+    {
+      "epoch": 0.07303077725612937,
+      "grad_norm": 0.22147034108638763,
+      "learning_rate": 0.0004783333333333333,
+      "loss": 0.0684,
+      "step": 140
+    },
+    {
+      "epoch": 0.07355242566510173,
+      "grad_norm": 0.2623853385448456,
+      "learning_rate": 0.0004777777777777778,
+      "loss": 0.0491,
+      "step": 141
+    },
+    {
+      "epoch": 0.07407407407407407,
+      "grad_norm": 0.1899435669183731,
+      "learning_rate": 0.00047722222222222225,
+      "loss": 0.029,
+      "step": 142
+    },
+    {
+      "epoch": 0.07459572248304643,
+      "grad_norm": 0.2669859826564789,
+      "learning_rate": 0.0004766666666666667,
+      "loss": 0.064,
+      "step": 143
+    },
+    {
+      "epoch": 0.07511737089201878,
+      "grad_norm": 0.18063829839229584,
+      "learning_rate": 0.0004761111111111111,
+      "loss": 0.0624,
+      "step": 144
+    },
+    {
+      "epoch": 0.07563901930099114,
+      "grad_norm": 0.22147716581821442,
+      "learning_rate": 0.00047555555555555556,
+      "loss": 0.0544,
+      "step": 145
+    },
+    {
+      "epoch": 0.07616066770996348,
+      "grad_norm": 0.30522170662879944,
+      "learning_rate": 0.000475,
+      "loss": 0.077,
+      "step": 146
+    },
+    {
+      "epoch": 0.07668231611893583,
+      "grad_norm": 0.15942497551441193,
+      "learning_rate": 0.00047444444444444444,
+      "loss": 0.0372,
+      "step": 147
+    },
+    {
+      "epoch": 0.07720396452790819,
+      "grad_norm": 0.1456826627254486,
+      "learning_rate": 0.00047388888888888893,
+      "loss": 0.0423,
+      "step": 148
+    },
+    {
+      "epoch": 0.07772561293688054,
+      "grad_norm": 0.17793269455432892,
+      "learning_rate": 0.00047333333333333336,
+      "loss": 0.0559,
+      "step": 149
+    },
+    {
+      "epoch": 0.0782472613458529,
+      "grad_norm": 0.152329221367836,
+      "learning_rate": 0.0004727777777777778,
+      "loss": 0.0266,
+      "step": 150
+    },
+    {
+      "epoch": 0.07876890975482524,
+      "grad_norm": 0.19327858090400696,
+      "learning_rate": 0.00047222222222222224,
+      "loss": 0.0608,
+      "step": 151
+    },
+    {
+      "epoch": 0.0792905581637976,
+      "grad_norm": 0.15060095489025116,
+      "learning_rate": 0.0004716666666666667,
+      "loss": 0.0461,
+      "step": 152
+    },
+    {
+      "epoch": 0.07981220657276995,
+      "grad_norm": 0.1864742785692215,
+      "learning_rate": 0.0004711111111111111,
+      "loss": 0.0724,
+      "step": 153
+    },
+    {
+      "epoch": 0.08033385498174231,
+      "grad_norm": 0.1422508805990219,
+      "learning_rate": 0.00047055555555555555,
+      "loss": 0.0325,
+      "step": 154
+    },
+    {
+      "epoch": 0.08085550339071466,
+      "grad_norm": 0.21115481853485107,
+      "learning_rate": 0.00047,
+      "loss": 0.0535,
+      "step": 155
+    },
+    {
+      "epoch": 0.081377151799687,
+      "grad_norm": 0.2197350263595581,
+      "learning_rate": 0.0004694444444444445,
+      "loss": 0.0703,
+      "step": 156
+    },
+    {
+      "epoch": 0.08189880020865936,
+      "grad_norm": 0.1608528196811676,
+      "learning_rate": 0.0004688888888888889,
+      "loss": 0.0447,
+      "step": 157
+    },
+    {
+      "epoch": 0.08242044861763172,
+      "grad_norm": 0.1445985585451126,
+      "learning_rate": 0.00046833333333333335,
+      "loss": 0.0469,
+      "step": 158
+    },
+    {
+      "epoch": 0.08294209702660407,
+      "grad_norm": 0.25215667486190796,
+      "learning_rate": 0.0004677777777777778,
+      "loss": 0.0709,
+      "step": 159
+    },
+    {
+      "epoch": 0.08346374543557643,
+      "grad_norm": 0.14391636848449707,
+      "learning_rate": 0.0004672222222222222,
+      "loss": 0.0457,
+      "step": 160
+    },
+    {
+      "epoch": 0.08398539384454877,
+      "grad_norm": 0.29619306325912476,
+      "learning_rate": 0.00046666666666666666,
+      "loss": 0.0765,
+      "step": 161
+    },
+    {
+      "epoch": 0.08450704225352113,
+      "grad_norm": 0.15701289474964142,
+      "learning_rate": 0.0004661111111111111,
+      "loss": 0.0418,
+      "step": 162
+    },
+    {
+      "epoch": 0.08502869066249348,
+      "grad_norm": 0.1698683649301529,
+      "learning_rate": 0.0004655555555555556,
+      "loss": 0.0294,
+      "step": 163
+    },
+    {
+      "epoch": 0.08555033907146584,
+      "grad_norm": 0.12165573239326477,
+      "learning_rate": 0.000465,
+      "loss": 0.0258,
+      "step": 164
+    },
+    {
+      "epoch": 0.08607198748043818,
+      "grad_norm": 0.1611219197511673,
+      "learning_rate": 0.00046444444444444446,
+      "loss": 0.0491,
+      "step": 165
+    },
+    {
+      "epoch": 0.08659363588941053,
+      "grad_norm": 0.1486036628484726,
+      "learning_rate": 0.0004638888888888889,
+      "loss": 0.0479,
+      "step": 166
+    },
+    {
+      "epoch": 0.08711528429838289,
+      "grad_norm": 0.13054965436458588,
+      "learning_rate": 0.00046333333333333334,
+      "loss": 0.0401,
+      "step": 167
+    },
+    {
+      "epoch": 0.08763693270735524,
+      "grad_norm": 0.15433131158351898,
+      "learning_rate": 0.0004627777777777778,
+      "loss": 0.048,
+      "step": 168
+    },
+    {
+      "epoch": 0.0881585811163276,
+      "grad_norm": 0.17511604726314545,
+      "learning_rate": 0.0004622222222222222,
+      "loss": 0.0569,
+      "step": 169
+    },
+    {
+      "epoch": 0.08868022952529994,
+      "grad_norm": 0.1398395150899887,
+      "learning_rate": 0.0004616666666666667,
+      "loss": 0.034,
+      "step": 170
+    },
+    {
+      "epoch": 0.0892018779342723,
+      "grad_norm": 0.15484075248241425,
+      "learning_rate": 0.00046111111111111114,
+      "loss": 0.0514,
+      "step": 171
+    },
+    {
+      "epoch": 0.08972352634324465,
+      "grad_norm": 0.17851784825325012,
+      "learning_rate": 0.0004605555555555556,
+      "loss": 0.0571,
+      "step": 172
+    },
+    {
+      "epoch": 0.09024517475221701,
+      "grad_norm": 0.18745650351047516,
+      "learning_rate": 0.00046,
+      "loss": 0.0523,
+      "step": 173
+    },
+    {
+      "epoch": 0.09076682316118936,
+      "grad_norm": 0.18322691321372986,
+      "learning_rate": 0.00045944444444444445,
+      "loss": 0.0642,
+      "step": 174
+    },
+    {
+      "epoch": 0.0912884715701617,
+      "grad_norm": 0.1173708513379097,
+      "learning_rate": 0.0004588888888888889,
+      "loss": 0.0267,
+      "step": 175
+    },
+    {
+      "epoch": 0.09181011997913406,
+      "grad_norm": 0.1754874438047409,
+      "learning_rate": 0.0004583333333333333,
+      "loss": 0.0657,
+      "step": 176
+    },
+    {
+      "epoch": 0.09233176838810642,
+      "grad_norm": 0.13830502331256866,
+      "learning_rate": 0.0004577777777777778,
+      "loss": 0.0433,
+      "step": 177
+    },
+    {
+      "epoch": 0.09285341679707877,
+      "grad_norm": 0.11174938827753067,
+      "learning_rate": 0.0004572222222222222,
+      "loss": 0.04,
+      "step": 178
+    },
+    {
+      "epoch": 0.09337506520605113,
+      "grad_norm": 0.1829378753900528,
+      "learning_rate": 0.0004566666666666667,
+      "loss": 0.0453,
+      "step": 179
+    },
+    {
+      "epoch": 0.09389671361502347,
+      "grad_norm": 0.10748015344142914,
+      "learning_rate": 0.0004561111111111111,
+      "loss": 0.05,
+      "step": 180
+    },
+    {
+      "epoch": 0.09441836202399582,
+      "grad_norm": 0.1160806268453598,
+      "learning_rate": 0.00045555555555555556,
+      "loss": 0.0245,
+      "step": 181
+    },
+    {
+      "epoch": 0.09494001043296818,
+      "grad_norm": 0.12387479841709137,
+      "learning_rate": 0.000455,
+      "loss": 0.0259,
+      "step": 182
+    },
+    {
+      "epoch": 0.09546165884194054,
+      "grad_norm": 0.1586403250694275,
+      "learning_rate": 0.00045444444444444444,
+      "loss": 0.0378,
+      "step": 183
+    },
+    {
+      "epoch": 0.09598330725091289,
+      "grad_norm": 0.18905822932720184,
+      "learning_rate": 0.00045388888888888893,
+      "loss": 0.0484,
+      "step": 184
+    },
+    {
+      "epoch": 0.09650495565988523,
+      "grad_norm": 0.17541544139385223,
+      "learning_rate": 0.0004533333333333333,
+      "loss": 0.0503,
+      "step": 185
+    },
+    {
+      "epoch": 0.09702660406885759,
+      "grad_norm": 0.1083071306347847,
+      "learning_rate": 0.0004527777777777778,
+      "loss": 0.0439,
+      "step": 186
+    },
+    {
+      "epoch": 0.09754825247782994,
+      "grad_norm": 0.10464104264974594,
+      "learning_rate": 0.00045222222222222224,
+      "loss": 0.0271,
+      "step": 187
+    },
+    {
+      "epoch": 0.0980699008868023,
+      "grad_norm": 0.18022054433822632,
+      "learning_rate": 0.0004516666666666667,
+      "loss": 0.0589,
+      "step": 188
+    },
+    {
+      "epoch": 0.09859154929577464,
+      "grad_norm": 0.18715251982212067,
+      "learning_rate": 0.0004511111111111111,
+      "loss": 0.0489,
+      "step": 189
+    },
+    {
+      "epoch": 0.099113197704747,
+      "grad_norm": 0.10440787672996521,
+      "learning_rate": 0.00045055555555555555,
+      "loss": 0.0221,
+      "step": 190
+    },
+    {
+      "epoch": 0.09963484611371935,
+      "grad_norm": 0.11525921523571014,
+      "learning_rate": 0.00045000000000000004,
+      "loss": 0.0427,
+      "step": 191
+    },
+    {
+      "epoch": 0.10015649452269171,
+      "grad_norm": 0.1573028564453125,
+      "learning_rate": 0.0004494444444444444,
+      "loss": 0.04,
+      "step": 192
+    },
+    {
+      "epoch": 0.10067814293166406,
+      "grad_norm": 0.15942253172397614,
+      "learning_rate": 0.0004488888888888889,
+      "loss": 0.045,
+      "step": 193
+    },
+    {
+      "epoch": 0.1011997913406364,
+      "grad_norm": 0.2997572422027588,
+      "learning_rate": 0.0004483333333333333,
+      "loss": 0.062,
+      "step": 194
+    },
+    {
+      "epoch": 0.10172143974960876,
+      "grad_norm": 0.1859196424484253,
+      "learning_rate": 0.0004477777777777778,
+      "loss": 0.0496,
+      "step": 195
+    },
+    {
+      "epoch": 0.10224308815858112,
+      "grad_norm": 0.1265893131494522,
+      "learning_rate": 0.0004472222222222222,
+      "loss": 0.0457,
+      "step": 196
+    },
+    {
+      "epoch": 0.10276473656755347,
+      "grad_norm": 0.16036029160022736,
+      "learning_rate": 0.00044666666666666666,
+      "loss": 0.046,
+      "step": 197
+    },
+    {
+      "epoch": 0.10328638497652583,
+      "grad_norm": 0.10421448945999146,
+      "learning_rate": 0.00044611111111111115,
+      "loss": 0.033,
+      "step": 198
+    },
+    {
+      "epoch": 0.10380803338549817,
+      "grad_norm": 0.12321974337100983,
+      "learning_rate": 0.00044555555555555554,
+      "loss": 0.0458,
+      "step": 199
+    },
+    {
+      "epoch": 0.10432968179447052,
+      "grad_norm": 0.13863791525363922,
+      "learning_rate": 0.00044500000000000003,
+      "loss": 0.0221,
+      "step": 200
+    },
+    {
+      "epoch": 0.10485133020344288,
+      "grad_norm": 0.11896353214979172,
+      "learning_rate": 0.0004444444444444444,
+      "loss": 0.0477,
+      "step": 201
+    },
+    {
+      "epoch": 0.10537297861241524,
+      "grad_norm": 0.1473105251789093,
+      "learning_rate": 0.0004438888888888889,
+      "loss": 0.056,
+      "step": 202
+    },
+    {
+      "epoch": 0.10589462702138759,
+      "grad_norm": 0.15058237314224243,
+      "learning_rate": 0.00044333333333333334,
+      "loss": 0.0477,
+      "step": 203
+    },
+    {
+      "epoch": 0.10641627543035993,
+      "grad_norm": 0.10770102590322495,
+      "learning_rate": 0.0004427777777777778,
+      "loss": 0.0316,
+      "step": 204
+    },
+    {
+      "epoch": 0.10693792383933229,
+      "grad_norm": 0.13766999542713165,
+      "learning_rate": 0.00044222222222222227,
+      "loss": 0.041,
+      "step": 205
+    },
+    {
+      "epoch": 0.10745957224830464,
+      "grad_norm": 0.11786706745624542,
+      "learning_rate": 0.00044166666666666665,
+      "loss": 0.0302,
+      "step": 206
+    },
+    {
+      "epoch": 0.107981220657277,
+      "grad_norm": 0.10209888964891434,
+      "learning_rate": 0.00044111111111111114,
+      "loss": 0.0396,
+      "step": 207
+    },
+    {
+      "epoch": 0.10850286906624934,
+      "grad_norm": 0.13609950244426727,
+      "learning_rate": 0.0004405555555555555,
+      "loss": 0.0394,
+      "step": 208
+    },
+    {
+      "epoch": 0.1090245174752217,
+      "grad_norm": 0.11915361881256104,
+      "learning_rate": 0.00044,
+      "loss": 0.0421,
+      "step": 209
+    },
+    {
+      "epoch": 0.10954616588419405,
+      "grad_norm": 0.11170439422130585,
+      "learning_rate": 0.0004394444444444445,
+      "loss": 0.0395,
+      "step": 210
+    },
+    {
+      "epoch": 0.11006781429316641,
+      "grad_norm": 0.12584055960178375,
+      "learning_rate": 0.0004388888888888889,
+      "loss": 0.0534,
+      "step": 211
+    },
+    {
+      "epoch": 0.11058946270213876,
+      "grad_norm": 0.1454746276140213,
+      "learning_rate": 0.0004383333333333334,
+      "loss": 0.0469,
+      "step": 212
+    },
+    {
+      "epoch": 0.1111111111111111,
+      "grad_norm": 0.10297106951475143,
+      "learning_rate": 0.00043777777777777776,
+      "loss": 0.0359,
+      "step": 213
+    },
+    {
+      "epoch": 0.11163275952008346,
+      "grad_norm": 0.10994141548871994,
+      "learning_rate": 0.00043722222222222225,
+      "loss": 0.0404,
+      "step": 214
+    },
+    {
+      "epoch": 0.11215440792905582,
+      "grad_norm": 0.13165079057216644,
+      "learning_rate": 0.00043666666666666664,
+      "loss": 0.0475,
+      "step": 215
+    },
+    {
+      "epoch": 0.11267605633802817,
+      "grad_norm": 0.11115416139364243,
+      "learning_rate": 0.00043611111111111113,
+      "loss": 0.0351,
+      "step": 216
+    },
+    {
+      "epoch": 0.11319770474700053,
+      "grad_norm": 0.15927758812904358,
+      "learning_rate": 0.0004355555555555555,
+      "loss": 0.0468,
+      "step": 217
+    },
+    {
+      "epoch": 0.11371935315597287,
+      "grad_norm": 0.0941813513636589,
+      "learning_rate": 0.000435,
+      "loss": 0.0337,
+      "step": 218
+    },
+    {
+      "epoch": 0.11424100156494522,
+      "grad_norm": 0.10850685834884644,
+      "learning_rate": 0.0004344444444444445,
+      "loss": 0.0211,
+      "step": 219
+    },
+    {
+      "epoch": 0.11476264997391758,
+      "grad_norm": 0.0790611058473587,
+      "learning_rate": 0.0004338888888888889,
+      "loss": 0.0196,
+      "step": 220
+    },
+    {
+      "epoch": 0.11528429838288994,
+      "grad_norm": 0.10849782079458237,
+      "learning_rate": 0.00043333333333333337,
+      "loss": 0.04,
+      "step": 221
+    },
+    {
+      "epoch": 0.11580594679186229,
+      "grad_norm": 0.09607880562543869,
+      "learning_rate": 0.00043277777777777775,
+      "loss": 0.0291,
+      "step": 222
+    },
+    {
+      "epoch": 0.11632759520083463,
+      "grad_norm": 0.17959930002689362,
+      "learning_rate": 0.00043222222222222224,
+      "loss": 0.0426,
+      "step": 223
+    },
+    {
+      "epoch": 0.11684924360980699,
+      "grad_norm": 0.08865644782781601,
+      "learning_rate": 0.0004316666666666667,
+      "loss": 0.0321,
+      "step": 224
+    },
+    {
+      "epoch": 0.11737089201877934,
+      "grad_norm": 0.17324934899806976,
+      "learning_rate": 0.0004311111111111111,
+      "loss": 0.0537,
+      "step": 225
+    },
+    {
+      "epoch": 0.1178925404277517,
+      "grad_norm": 0.10226263850927353,
+      "learning_rate": 0.0004305555555555556,
+      "loss": 0.0342,
+      "step": 226
+    },
+    {
+      "epoch": 0.11841418883672405,
+      "grad_norm": 0.10456152260303497,
+      "learning_rate": 0.00043,
+      "loss": 0.039,
+      "step": 227
+    },
+    {
+      "epoch": 0.1189358372456964,
+      "grad_norm": 0.10196290910243988,
+      "learning_rate": 0.0004294444444444445,
+      "loss": 0.0329,
+      "step": 228
+    },
+    {
+      "epoch": 0.11945748565466875,
+      "grad_norm": 0.12004778534173965,
+      "learning_rate": 0.00042888888888888886,
+      "loss": 0.0434,
+      "step": 229
+    },
+    {
+      "epoch": 0.11997913406364111,
+      "grad_norm": 0.10152442753314972,
+      "learning_rate": 0.00042833333333333335,
+      "loss": 0.0305,
+      "step": 230
+    },
+    {
+      "epoch": 0.12050078247261346,
+      "grad_norm": 0.1072554886341095,
+      "learning_rate": 0.0004277777777777778,
+      "loss": 0.0407,
+      "step": 231
+    },
+    {
+      "epoch": 0.1210224308815858,
+      "grad_norm": 0.08478479087352753,
+      "learning_rate": 0.00042722222222222223,
+      "loss": 0.0375,
+      "step": 232
+    },
+    {
+      "epoch": 0.12154407929055816,
+      "grad_norm": 0.11901957541704178,
+      "learning_rate": 0.0004266666666666667,
+      "loss": 0.0281,
+      "step": 233
+    },
+    {
+      "epoch": 0.12206572769953052,
+      "grad_norm": 0.097981758415699,
+      "learning_rate": 0.0004261111111111111,
+      "loss": 0.0365,
+      "step": 234
+    },
+    {
+      "epoch": 0.12258737610850287,
+      "grad_norm": 0.08464547991752625,
+      "learning_rate": 0.0004255555555555556,
+      "loss": 0.0227,
+      "step": 235
+    },
+    {
+      "epoch": 0.12310902451747523,
+      "grad_norm": 0.18886807560920715,
+      "learning_rate": 0.000425,
+      "loss": 0.0494,
+      "step": 236
+    },
+    {
+      "epoch": 0.12363067292644757,
+      "grad_norm": 0.08432997763156891,
+      "learning_rate": 0.00042444444444444447,
+      "loss": 0.031,
+      "step": 237
+    },
+    {
+      "epoch": 0.12415232133541992,
+      "grad_norm": 0.24738061428070068,
+      "learning_rate": 0.0004238888888888889,
+      "loss": 0.0611,
+      "step": 238
+    },
+    {
+      "epoch": 0.12467396974439228,
+      "grad_norm": 0.11955960839986801,
+      "learning_rate": 0.00042333333333333334,
+      "loss": 0.0481,
+      "step": 239
+    },
+    {
+      "epoch": 0.12519561815336464,
+      "grad_norm": 0.132662832736969,
+      "learning_rate": 0.0004227777777777778,
+      "loss": 0.0432,
+      "step": 240
+    },
+    {
+      "epoch": 0.12571726656233698,
+      "grad_norm": 0.08496639877557755,
+      "learning_rate": 0.0004222222222222222,
+      "loss": 0.0328,
+      "step": 241
+    },
+    {
+      "epoch": 0.12623891497130935,
+      "grad_norm": 0.13830861449241638,
+      "learning_rate": 0.0004216666666666667,
+      "loss": 0.0336,
+      "step": 242
+    },
+    {
+      "epoch": 0.1267605633802817,
+      "grad_norm": 0.12200845032930374,
+      "learning_rate": 0.0004211111111111111,
+      "loss": 0.0346,
+      "step": 243
+    },
+    {
+      "epoch": 0.12728221178925403,
+      "grad_norm": 0.10438041388988495,
+      "learning_rate": 0.0004205555555555556,
+      "loss": 0.039,
+      "step": 244
+    },
+    {
+      "epoch": 0.1278038601982264,
+      "grad_norm": 0.10238846391439438,
+      "learning_rate": 0.00042,
+      "loss": 0.0442,
+      "step": 245
+    },
+    {
+      "epoch": 0.12832550860719874,
+      "grad_norm": 0.10930721461772919,
+      "learning_rate": 0.00041944444444444445,
+      "loss": 0.0426,
+      "step": 246
+    },
+    {
+      "epoch": 0.1288471570161711,
+      "grad_norm": 0.09867265820503235,
+      "learning_rate": 0.0004188888888888889,
+      "loss": 0.0402,
+      "step": 247
+    },
+    {
+      "epoch": 0.12936880542514345,
+      "grad_norm": 0.1137848049402237,
+      "learning_rate": 0.00041833333333333333,
+      "loss": 0.0278,
+      "step": 248
+    },
+    {
+      "epoch": 0.1298904538341158,
+      "grad_norm": 0.1364007592201233,
+      "learning_rate": 0.0004177777777777778,
+      "loss": 0.0437,
+      "step": 249
+    },
+    {
+      "epoch": 0.13041210224308816,
+      "grad_norm": 0.09385659545660019,
+      "learning_rate": 0.0004172222222222222,
+      "loss": 0.0353,
+      "step": 250
+    },
+    {
+      "epoch": 0.1309337506520605,
+      "grad_norm": 0.1302153617143631,
+      "learning_rate": 0.0004166666666666667,
+      "loss": 0.0287,
+      "step": 251
+    },
+    {
+      "epoch": 0.13145539906103287,
+      "grad_norm": 0.09976278990507126,
+      "learning_rate": 0.00041611111111111113,
+      "loss": 0.0381,
+      "step": 252
+    },
+    {
+      "epoch": 0.13197704747000522,
+      "grad_norm": 0.0966271236538887,
+      "learning_rate": 0.00041555555555555557,
+      "loss": 0.0204,
+      "step": 253
+    },
+    {
+      "epoch": 0.13249869587897756,
+      "grad_norm": 0.0773528590798378,
+      "learning_rate": 0.000415,
+      "loss": 0.0285,
+      "step": 254
+    },
+    {
+      "epoch": 0.13302034428794993,
+      "grad_norm": 0.2350674420595169,
+      "learning_rate": 0.00041444444444444444,
+      "loss": 0.0511,
+      "step": 255
+    },
+    {
+      "epoch": 0.13354199269692227,
+      "grad_norm": 0.08375384658575058,
+      "learning_rate": 0.0004138888888888889,
+      "loss": 0.0341,
+      "step": 256
+    },
+    {
+      "epoch": 0.13406364110589464,
+      "grad_norm": 0.09229125827550888,
+      "learning_rate": 0.0004133333333333333,
+      "loss": 0.0329,
+      "step": 257
+    },
+    {
+      "epoch": 0.13458528951486698,
+      "grad_norm": 0.08750821650028229,
+      "learning_rate": 0.0004127777777777778,
+      "loss": 0.0283,
+      "step": 258
+    },
+    {
+      "epoch": 0.13510693792383932,
+      "grad_norm": 0.075618676841259,
+      "learning_rate": 0.00041222222222222224,
+      "loss": 0.0291,
+      "step": 259
+    },
+    {
+      "epoch": 0.1356285863328117,
+      "grad_norm": 0.16954250633716583,
+      "learning_rate": 0.0004116666666666667,
+      "loss": 0.0441,
+      "step": 260
+    },
+    {
+      "epoch": 0.13615023474178403,
+      "grad_norm": 0.07529555261135101,
+      "learning_rate": 0.0004111111111111111,
+      "loss": 0.0139,
+      "step": 261
+    },
+    {
+      "epoch": 0.1366718831507564,
+      "grad_norm": 0.08884407579898834,
+      "learning_rate": 0.00041055555555555555,
+      "loss": 0.0299,
+      "step": 262
+    },
+    {
+      "epoch": 0.13719353155972874,
+      "grad_norm": 0.09607396274805069,
+      "learning_rate": 0.00041,
+      "loss": 0.0337,
+      "step": 263
+    },
+    {
+      "epoch": 0.13771517996870108,
+      "grad_norm": 0.08453882485628128,
+      "learning_rate": 0.00040944444444444443,
+      "loss": 0.0315,
+      "step": 264
+    },
+    {
+      "epoch": 0.13823682837767345,
+      "grad_norm": 0.09629228711128235,
+      "learning_rate": 0.0004088888888888889,
+      "loss": 0.0181,
+      "step": 265
+    },
+    {
+      "epoch": 0.1387584767866458,
+      "grad_norm": 0.07212290167808533,
+      "learning_rate": 0.00040833333333333336,
+      "loss": 0.0262,
+      "step": 266
+    },
+    {
+      "epoch": 0.13928012519561817,
+      "grad_norm": 0.09192827343940735,
+      "learning_rate": 0.0004077777777777778,
+      "loss": 0.018,
+      "step": 267
+    },
+    {
+      "epoch": 0.1398017736045905,
+      "grad_norm": 0.10876122117042542,
+      "learning_rate": 0.00040722222222222223,
+      "loss": 0.0316,
+      "step": 268
+    },
+    {
+      "epoch": 0.14032342201356285,
+      "grad_norm": 0.10796765238046646,
+      "learning_rate": 0.00040666666666666667,
+      "loss": 0.0323,
+      "step": 269
+    },
+    {
+      "epoch": 0.14084507042253522,
+      "grad_norm": 0.08297892659902573,
+      "learning_rate": 0.0004061111111111111,
+      "loss": 0.0244,
+      "step": 270
+    },
+    {
+      "epoch": 0.14136671883150756,
+      "grad_norm": 0.09534858912229538,
+      "learning_rate": 0.00040555555555555554,
+      "loss": 0.0279,
+      "step": 271
+    },
+    {
+      "epoch": 0.14188836724047993,
+      "grad_norm": 0.07854770123958588,
+      "learning_rate": 0.00040500000000000003,
+      "loss": 0.032,
+      "step": 272
+    },
+    {
+      "epoch": 0.14241001564945227,
+      "grad_norm": 0.16025401651859283,
+      "learning_rate": 0.00040444444444444447,
+      "loss": 0.0413,
+      "step": 273
+    },
+    {
+      "epoch": 0.1429316640584246,
+      "grad_norm": 0.07919424772262573,
+      "learning_rate": 0.0004038888888888889,
+      "loss": 0.0277,
+      "step": 274
+    },
+    {
+      "epoch": 0.14345331246739698,
+      "grad_norm": 0.07335282117128372,
+      "learning_rate": 0.00040333333333333334,
+      "loss": 0.0363,
+      "step": 275
+    },
+    {
+      "epoch": 0.14397496087636932,
+      "grad_norm": 0.1280767321586609,
+      "learning_rate": 0.0004027777777777778,
+      "loss": 0.0402,
+      "step": 276
+    },
+    {
+      "epoch": 0.1444966092853417,
+      "grad_norm": 0.11371007561683655,
+      "learning_rate": 0.0004022222222222222,
+      "loss": 0.0312,
+      "step": 277
+    },
+    {
+      "epoch": 0.14501825769431403,
+      "grad_norm": 0.12229876220226288,
+      "learning_rate": 0.00040166666666666665,
+      "loss": 0.0357,
+      "step": 278
+    },
+    {
+      "epoch": 0.14553990610328638,
+      "grad_norm": 0.11436333507299423,
+      "learning_rate": 0.0004011111111111111,
+      "loss": 0.0135,
+      "step": 279
+    },
+    {
+      "epoch": 0.14606155451225875,
+      "grad_norm": 0.08084696531295776,
+      "learning_rate": 0.0004005555555555556,
+      "loss": 0.0302,
+      "step": 280
+    },
+    {
+      "epoch": 0.1465832029212311,
+      "grad_norm": 0.09421739727258682,
+      "learning_rate": 0.0004,
+      "loss": 0.0376,
+      "step": 281
+    },
+    {
+      "epoch": 0.14710485133020346,
+      "grad_norm": 0.0744849219918251,
+      "learning_rate": 0.00039944444444444446,
+      "loss": 0.0291,
+      "step": 282
+    },
+    {
+      "epoch": 0.1476264997391758,
+      "grad_norm": 0.06754301488399506,
+      "learning_rate": 0.0003988888888888889,
+      "loss": 0.0262,
+      "step": 283
+    },
+    {
+      "epoch": 0.14814814814814814,
+      "grad_norm": 0.06215747445821762,
+      "learning_rate": 0.00039833333333333333,
+      "loss": 0.0223,
+      "step": 284
+    },
+    {
+      "epoch": 0.1486697965571205,
+      "grad_norm": 0.10289556533098221,
+      "learning_rate": 0.00039777777777777777,
+      "loss": 0.0401,
+      "step": 285
+    },
+    {
+      "epoch": 0.14919144496609285,
+      "grad_norm": 0.10723885893821716,
+      "learning_rate": 0.0003972222222222222,
+      "loss": 0.018,
+      "step": 286
+    },
+    {
+      "epoch": 0.1497130933750652,
+      "grad_norm": 0.12683100998401642,
+      "learning_rate": 0.0003966666666666667,
+      "loss": 0.0155,
+      "step": 287
+    },
+    {
+      "epoch": 0.15023474178403756,
+      "grad_norm": 0.10709403455257416,
+      "learning_rate": 0.00039611111111111113,
+      "loss": 0.0186,
+      "step": 288
+    },
+    {
+      "epoch": 0.1507563901930099,
+      "grad_norm": 0.09857751429080963,
+      "learning_rate": 0.00039555555555555557,
+      "loss": 0.0311,
+      "step": 289
+    },
+    {
+      "epoch": 0.15127803860198227,
+      "grad_norm": 0.07990946620702744,
+      "learning_rate": 0.000395,
+      "loss": 0.032,
+      "step": 290
+    },
+    {
+      "epoch": 0.15179968701095461,
+      "grad_norm": 0.06873098760843277,
+      "learning_rate": 0.00039444444444444444,
+      "loss": 0.0163,
+      "step": 291
+    },
+    {
+      "epoch": 0.15232133541992696,
+      "grad_norm": 0.0788077712059021,
+      "learning_rate": 0.00039388888888888893,
+      "loss": 0.0319,
+      "step": 292
+    },
+    {
+      "epoch": 0.15284298382889933,
+      "grad_norm": 0.08789033442735672,
+      "learning_rate": 0.0003933333333333333,
+      "loss": 0.0352,
+      "step": 293
+    },
+    {
+      "epoch": 0.15336463223787167,
+      "grad_norm": 0.10574653744697571,
+      "learning_rate": 0.0003927777777777778,
+      "loss": 0.0411,
+      "step": 294
+    },
+    {
+      "epoch": 0.15388628064684404,
+      "grad_norm": 0.08198726177215576,
+      "learning_rate": 0.00039222222222222225,
+      "loss": 0.0286,
+      "step": 295
+    },
+    {
+      "epoch": 0.15440792905581638,
+      "grad_norm": 0.2811417579650879,
+      "learning_rate": 0.0003916666666666667,
+      "loss": 0.0508,
+      "step": 296
+    },
+    {
+      "epoch": 0.15492957746478872,
+      "grad_norm": 0.1203279122710228,
+      "learning_rate": 0.0003911111111111111,
+      "loss": 0.0384,
+      "step": 297
+    },
+    {
+      "epoch": 0.1554512258737611,
+      "grad_norm": 0.08802422881126404,
+      "learning_rate": 0.00039055555555555556,
+      "loss": 0.0305,
+      "step": 298
+    },
+    {
+      "epoch": 0.15597287428273343,
+      "grad_norm": 0.05368930101394653,
+      "learning_rate": 0.00039000000000000005,
+      "loss": 0.0167,
+      "step": 299
+    },
+    {
+      "epoch": 0.1564945226917058,
+      "grad_norm": 0.16041633486747742,
+      "learning_rate": 0.00038944444444444443,
+      "loss": 0.047,
+      "step": 300
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 1000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 512,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-300/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:004527189128ad9f86554b0f384032fa8c4c91478964cf149c179071f96bf50a
+size 6289

checkpoint-300/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-400/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoint-400/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,7 @@

+{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
+You are a helpful assistant.<|im_end|>
+{% endif %}<|im_start|>{{ message['role'] }}
+{% if message['content'] is string %}{{ message['content'] }}<|im_end|>
+{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
+{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
+{% endif %}

checkpoint-400/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-400/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0a3f38afafc744b666ec6ce2714c92c526aaf1ec6e9b64e3b84919bb07be9015
+size 1003852

checkpoint-400/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:959dfc59ee5d3b022d936b427fa4633d450f05c9034ad2e29efc7a0a9e66e12a
+size 2019836

checkpoint-400/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_processor_type": "Qwen2_5_VLImageProcessor",
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "max_pixels": 1003520,
+  "merge_size": 2,
+  "min_pixels": 3136,
+  "patch_size": 14,
+  "processor_class": "Qwen2_5_VLProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "max_pixels": 1003520,
+    "min_pixels": 3136
+  },
+  "temporal_patch_size": 2
+}

checkpoint-400/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c68cfb429def849049d41e062d23f9c36c2b134509fe9fa5e19678b55cdccbe7
+size 1465

checkpoint-400/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-400/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896

checkpoint-400/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,208 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "processor_class": "Qwen2_5_VLProcessor",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoint-400/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2834 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.20865936358894105,
+  "eval_steps": 500,
+  "global_step": 400,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0005216484089723526,
+      "grad_norm": 3.6680614948272705,
+      "learning_rate": 0.0,
+      "loss": 0.541,
+      "step": 1
+    },
+    {
+      "epoch": 0.0010432968179447052,
+      "grad_norm": 1.7092307806015015,
+      "learning_rate": 5e-06,
+      "loss": 0.2639,
+      "step": 2
+    },
+    {
+      "epoch": 0.001564945226917058,
+      "grad_norm": 1.990319013595581,
+      "learning_rate": 1e-05,
+      "loss": 0.3118,
+      "step": 3
+    },
+    {
+      "epoch": 0.0020865936358894104,
+      "grad_norm": 3.750917434692383,
+      "learning_rate": 1.5e-05,
+      "loss": 0.4562,
+      "step": 4
+    },
+    {
+      "epoch": 0.0026082420448617634,
+      "grad_norm": 4.690845966339111,
+      "learning_rate": 2e-05,
+      "loss": 0.673,
+      "step": 5
+    },
+    {
+      "epoch": 0.003129890453834116,
+      "grad_norm": 1.4218288660049438,
+      "learning_rate": 2.5e-05,
+      "loss": 0.2984,
+      "step": 6
+    },
+    {
+      "epoch": 0.0036515388628064684,
+      "grad_norm": 4.896511077880859,
+      "learning_rate": 3e-05,
+      "loss": 0.7113,
+      "step": 7
+    },
+    {
+      "epoch": 0.004173187271778821,
+      "grad_norm": 2.5787155628204346,
+      "learning_rate": 3.5000000000000004e-05,
+      "loss": 0.4226,
+      "step": 8
+    },
+    {
+      "epoch": 0.004694835680751174,
+      "grad_norm": 1.028937578201294,
+      "learning_rate": 4e-05,
+      "loss": 0.1873,
+      "step": 9
+    },
+    {
+      "epoch": 0.005216484089723527,
+      "grad_norm": 3.9262092113494873,
+      "learning_rate": 4.4999999999999996e-05,
+      "loss": 0.5728,
+      "step": 10
+    },
+    {
+      "epoch": 0.005738132498695879,
+      "grad_norm": 4.9360198974609375,
+      "learning_rate": 5e-05,
+      "loss": 0.725,
+      "step": 11
+    },
+    {
+      "epoch": 0.006259780907668232,
+      "grad_norm": 4.287437915802002,
+      "learning_rate": 5.5e-05,
+      "loss": 0.6361,
+      "step": 12
+    },
+    {
+      "epoch": 0.006781429316640584,
+      "grad_norm": 1.3290928602218628,
+      "learning_rate": 6e-05,
+      "loss": 0.3109,
+      "step": 13
+    },
+    {
+      "epoch": 0.007303077725612937,
+      "grad_norm": 2.0050501823425293,
+      "learning_rate": 6.500000000000001e-05,
+      "loss": 0.4099,
+      "step": 14
+    },
+    {
+      "epoch": 0.00782472613458529,
+      "grad_norm": 4.360481262207031,
+      "learning_rate": 7.000000000000001e-05,
+      "loss": 0.6363,
+      "step": 15
+    },
+    {
+      "epoch": 0.008346374543557642,
+      "grad_norm": 3.9680511951446533,
+      "learning_rate": 7.5e-05,
+      "loss": 0.6124,
+      "step": 16
+    },
+    {
+      "epoch": 0.008868022952529996,
+      "grad_norm": 1.701784610748291,
+      "learning_rate": 8e-05,
+      "loss": 0.3439,
+      "step": 17
+    },
+    {
+      "epoch": 0.009389671361502348,
+      "grad_norm": 4.544748783111572,
+      "learning_rate": 8.5e-05,
+      "loss": 0.6253,
+      "step": 18
+    },
+    {
+      "epoch": 0.0099113197704747,
+      "grad_norm": 4.58634090423584,
+      "learning_rate": 8.999999999999999e-05,
+      "loss": 0.7669,
+      "step": 19
+    },
+    {
+      "epoch": 0.010432968179447054,
+      "grad_norm": 2.89898419380188,
+      "learning_rate": 9.5e-05,
+      "loss": 0.512,
+      "step": 20
+    },
+    {
+      "epoch": 0.010954616588419406,
+      "grad_norm": 2.61112904548645,
+      "learning_rate": 0.0001,
+      "loss": 0.542,
+      "step": 21
+    },
+    {
+      "epoch": 0.011476264997391758,
+      "grad_norm": 3.217054843902588,
+      "learning_rate": 0.000105,
+      "loss": 0.4959,
+      "step": 22
+    },
+    {
+      "epoch": 0.011997913406364111,
+      "grad_norm": 2.569636821746826,
+      "learning_rate": 0.00011,
+      "loss": 0.3918,
+      "step": 23
+    },
+    {
+      "epoch": 0.012519561815336464,
+      "grad_norm": 1.4626373052597046,
+      "learning_rate": 0.000115,
+      "loss": 0.3316,
+      "step": 24
+    },
+    {
+      "epoch": 0.013041210224308816,
+      "grad_norm": 1.2480732202529907,
+      "learning_rate": 0.00012,
+      "loss": 0.3484,
+      "step": 25
+    },
+    {
+      "epoch": 0.013562858633281168,
+      "grad_norm": 2.5430543422698975,
+      "learning_rate": 0.000125,
+      "loss": 0.4699,
+      "step": 26
+    },
+    {
+      "epoch": 0.014084507042253521,
+      "grad_norm": 1.7051862478256226,
+      "learning_rate": 0.00013000000000000002,
+      "loss": 0.2139,
+      "step": 27
+    },
+    {
+      "epoch": 0.014606155451225874,
+      "grad_norm": 1.1670981645584106,
+      "learning_rate": 0.000135,
+      "loss": 0.3883,
+      "step": 28
+    },
+    {
+      "epoch": 0.015127803860198226,
+      "grad_norm": 1.336538314819336,
+      "learning_rate": 0.00014000000000000001,
+      "loss": 0.3714,
+      "step": 29
+    },
+    {
+      "epoch": 0.01564945226917058,
+      "grad_norm": 2.018078565597534,
+      "learning_rate": 0.000145,
+      "loss": 0.2301,
+      "step": 30
+    },
+    {
+      "epoch": 0.01617110067814293,
+      "grad_norm": 1.5743223428726196,
+      "learning_rate": 0.00015,
+      "loss": 0.2935,
+      "step": 31
+    },
+    {
+      "epoch": 0.016692749087115284,
+      "grad_norm": 1.2724987268447876,
+      "learning_rate": 0.000155,
+      "loss": 0.3141,
+      "step": 32
+    },
+    {
+      "epoch": 0.017214397496087636,
+      "grad_norm": 2.2347893714904785,
+      "learning_rate": 0.00016,
+      "loss": 0.2917,
+      "step": 33
+    },
+    {
+      "epoch": 0.01773604590505999,
+      "grad_norm": 1.6726069450378418,
+      "learning_rate": 0.000165,
+      "loss": 0.377,
+      "step": 34
+    },
+    {
+      "epoch": 0.018257694314032343,
+      "grad_norm": 1.2217071056365967,
+      "learning_rate": 0.00017,
+      "loss": 0.3027,
+      "step": 35
+    },
+    {
+      "epoch": 0.018779342723004695,
+      "grad_norm": 1.3436322212219238,
+      "learning_rate": 0.000175,
+      "loss": 0.2853,
+      "step": 36
+    },
+    {
+      "epoch": 0.019300991131977047,
+      "grad_norm": 1.2247120141983032,
+      "learning_rate": 0.00017999999999999998,
+      "loss": 0.2967,
+      "step": 37
+    },
+    {
+      "epoch": 0.0198226395409494,
+      "grad_norm": 1.0636978149414062,
+      "learning_rate": 0.000185,
+      "loss": 0.2745,
+      "step": 38
+    },
+    {
+      "epoch": 0.02034428794992175,
+      "grad_norm": 1.302099347114563,
+      "learning_rate": 0.00019,
+      "loss": 0.2688,
+      "step": 39
+    },
+    {
+      "epoch": 0.020865936358894107,
+      "grad_norm": 1.0052679777145386,
+      "learning_rate": 0.00019500000000000002,
+      "loss": 0.2539,
+      "step": 40
+    },
+    {
+      "epoch": 0.02138758476786646,
+      "grad_norm": 1.0164084434509277,
+      "learning_rate": 0.0002,
+      "loss": 0.1978,
+      "step": 41
+    },
+    {
+      "epoch": 0.02190923317683881,
+      "grad_norm": 1.3891016244888306,
+      "learning_rate": 0.000205,
+      "loss": 0.3189,
+      "step": 42
+    },
+    {
+      "epoch": 0.022430881585811163,
+      "grad_norm": 0.960986852645874,
+      "learning_rate": 0.00021,
+      "loss": 0.2321,
+      "step": 43
+    },
+    {
+      "epoch": 0.022952529994783515,
+      "grad_norm": 0.9918408393859863,
+      "learning_rate": 0.000215,
+      "loss": 0.2359,
+      "step": 44
+    },
+    {
+      "epoch": 0.023474178403755867,
+      "grad_norm": 1.190205693244934,
+      "learning_rate": 0.00022,
+      "loss": 0.2347,
+      "step": 45
+    },
+    {
+      "epoch": 0.023995826812728223,
+      "grad_norm": 0.7985232472419739,
+      "learning_rate": 0.00022500000000000002,
+      "loss": 0.2048,
+      "step": 46
+    },
+    {
+      "epoch": 0.024517475221700575,
+      "grad_norm": 0.5192842483520508,
+      "learning_rate": 0.00023,
+      "loss": 0.1116,
+      "step": 47
+    },
+    {
+      "epoch": 0.025039123630672927,
+      "grad_norm": 1.1033375263214111,
+      "learning_rate": 0.000235,
+      "loss": 0.2665,
+      "step": 48
+    },
+    {
+      "epoch": 0.02556077203964528,
+      "grad_norm": 0.7089418172836304,
+      "learning_rate": 0.00024,
+      "loss": 0.1639,
+      "step": 49
+    },
+    {
+      "epoch": 0.02608242044861763,
+      "grad_norm": 1.08647882938385,
+      "learning_rate": 0.000245,
+      "loss": 0.2072,
+      "step": 50
+    },
+    {
+      "epoch": 0.026604068857589983,
+      "grad_norm": 0.9901174902915955,
+      "learning_rate": 0.00025,
+      "loss": 0.2035,
+      "step": 51
+    },
+    {
+      "epoch": 0.027125717266562335,
+      "grad_norm": 0.6938351988792419,
+      "learning_rate": 0.000255,
+      "loss": 0.1851,
+      "step": 52
+    },
+    {
+      "epoch": 0.02764736567553469,
+      "grad_norm": 0.8392678499221802,
+      "learning_rate": 0.00026000000000000003,
+      "loss": 0.1918,
+      "step": 53
+    },
+    {
+      "epoch": 0.028169014084507043,
+      "grad_norm": 0.5979602932929993,
+      "learning_rate": 0.00026500000000000004,
+      "loss": 0.1243,
+      "step": 54
+    },
+    {
+      "epoch": 0.028690662493479395,
+      "grad_norm": 0.7119799852371216,
+      "learning_rate": 0.00027,
+      "loss": 0.1594,
+      "step": 55
+    },
+    {
+      "epoch": 0.029212310902451747,
+      "grad_norm": 0.5519995093345642,
+      "learning_rate": 0.000275,
+      "loss": 0.078,
+      "step": 56
+    },
+    {
+      "epoch": 0.0297339593114241,
+      "grad_norm": 0.5917723774909973,
+      "learning_rate": 0.00028000000000000003,
+      "loss": 0.134,
+      "step": 57
+    },
+    {
+      "epoch": 0.03025560772039645,
+      "grad_norm": 0.6265603303909302,
+      "learning_rate": 0.000285,
+      "loss": 0.1848,
+      "step": 58
+    },
+    {
+      "epoch": 0.030777256129368807,
+      "grad_norm": 1.0653454065322876,
+      "learning_rate": 0.00029,
+      "loss": 0.1831,
+      "step": 59
+    },
+    {
+      "epoch": 0.03129890453834116,
+      "grad_norm": 0.3466293513774872,
+      "learning_rate": 0.000295,
+      "loss": 0.0878,
+      "step": 60
+    },
+    {
+      "epoch": 0.03182055294731351,
+      "grad_norm": 0.5498062372207642,
+      "learning_rate": 0.0003,
+      "loss": 0.1733,
+      "step": 61
+    },
+    {
+      "epoch": 0.03234220135628586,
+      "grad_norm": 0.7708966135978699,
+      "learning_rate": 0.000305,
+      "loss": 0.1975,
+      "step": 62
+    },
+    {
+      "epoch": 0.03286384976525822,
+      "grad_norm": 0.7717278003692627,
+      "learning_rate": 0.00031,
+      "loss": 0.1863,
+      "step": 63
+    },
+    {
+      "epoch": 0.03338549817423057,
+      "grad_norm": 0.8076028823852539,
+      "learning_rate": 0.000315,
+      "loss": 0.1938,
+      "step": 64
+    },
+    {
+      "epoch": 0.03390714658320292,
+      "grad_norm": 0.5629755258560181,
+      "learning_rate": 0.00032,
+      "loss": 0.1471,
+      "step": 65
+    },
+    {
+      "epoch": 0.03442879499217527,
+      "grad_norm": 0.5237282514572144,
+      "learning_rate": 0.00032500000000000004,
+      "loss": 0.1244,
+      "step": 66
+    },
+    {
+      "epoch": 0.03495044340114763,
+      "grad_norm": 0.7248942852020264,
+      "learning_rate": 0.00033,
+      "loss": 0.1933,
+      "step": 67
+    },
+    {
+      "epoch": 0.03547209181011998,
+      "grad_norm": 0.49564772844314575,
+      "learning_rate": 0.000335,
+      "loss": 0.1389,
+      "step": 68
+    },
+    {
+      "epoch": 0.03599374021909233,
+      "grad_norm": 0.4806594252586365,
+      "learning_rate": 0.00034,
+      "loss": 0.1295,
+      "step": 69
+    },
+    {
+      "epoch": 0.036515388628064686,
+      "grad_norm": 0.39995619654655457,
+      "learning_rate": 0.000345,
+      "loss": 0.1324,
+      "step": 70
+    },
+    {
+      "epoch": 0.037037037037037035,
+      "grad_norm": 0.6496027708053589,
+      "learning_rate": 0.00035,
+      "loss": 0.1002,
+      "step": 71
+    },
+    {
+      "epoch": 0.03755868544600939,
+      "grad_norm": 0.5661569237709045,
+      "learning_rate": 0.000355,
+      "loss": 0.1277,
+      "step": 72
+    },
+    {
+      "epoch": 0.03808033385498174,
+      "grad_norm": 0.49875250458717346,
+      "learning_rate": 0.00035999999999999997,
+      "loss": 0.1322,
+      "step": 73
+    },
+    {
+      "epoch": 0.038601982263954095,
+      "grad_norm": 0.44551461935043335,
+      "learning_rate": 0.000365,
+      "loss": 0.1278,
+      "step": 74
+    },
+    {
+      "epoch": 0.03912363067292645,
+      "grad_norm": 0.3314933478832245,
+      "learning_rate": 0.00037,
+      "loss": 0.0918,
+      "step": 75
+    },
+    {
+      "epoch": 0.0396452790818988,
+      "grad_norm": 0.3463922441005707,
+      "learning_rate": 0.000375,
+      "loss": 0.0948,
+      "step": 76
+    },
+    {
+      "epoch": 0.040166927490871154,
+      "grad_norm": 0.5401505827903748,
+      "learning_rate": 0.00038,
+      "loss": 0.1574,
+      "step": 77
+    },
+    {
+      "epoch": 0.0406885758998435,
+      "grad_norm": 0.39233317971229553,
+      "learning_rate": 0.00038500000000000003,
+      "loss": 0.1312,
+      "step": 78
+    },
+    {
+      "epoch": 0.04121022430881586,
+      "grad_norm": 0.4380398988723755,
+      "learning_rate": 0.00039000000000000005,
+      "loss": 0.0601,
+      "step": 79
+    },
+    {
+      "epoch": 0.041731872717788214,
+      "grad_norm": 0.3931694030761719,
+      "learning_rate": 0.000395,
+      "loss": 0.0962,
+      "step": 80
+    },
+    {
+      "epoch": 0.04225352112676056,
+      "grad_norm": 0.3566243648529053,
+      "learning_rate": 0.0004,
+      "loss": 0.1137,
+      "step": 81
+    },
+    {
+      "epoch": 0.04277516953573292,
+      "grad_norm": 0.40159469842910767,
+      "learning_rate": 0.00040500000000000003,
+      "loss": 0.1128,
+      "step": 82
+    },
+    {
+      "epoch": 0.04329681794470527,
+      "grad_norm": 0.30474773049354553,
+      "learning_rate": 0.00041,
+      "loss": 0.0922,
+      "step": 83
+    },
+    {
+      "epoch": 0.04381846635367762,
+      "grad_norm": 0.31177017092704773,
+      "learning_rate": 0.000415,
+      "loss": 0.1015,
+      "step": 84
+    },
+    {
+      "epoch": 0.04434011476264997,
+      "grad_norm": 0.3996855914592743,
+      "learning_rate": 0.00042,
+      "loss": 0.1266,
+      "step": 85
+    },
+    {
+      "epoch": 0.044861763171622326,
+      "grad_norm": 0.2281728833913803,
+      "learning_rate": 0.000425,
+      "loss": 0.0758,
+      "step": 86
+    },
+    {
+      "epoch": 0.04538341158059468,
+      "grad_norm": 0.5169669985771179,
+      "learning_rate": 0.00043,
+      "loss": 0.1092,
+      "step": 87
+    },
+    {
+      "epoch": 0.04590505998956703,
+      "grad_norm": 0.5525585412979126,
+      "learning_rate": 0.000435,
+      "loss": 0.1226,
+      "step": 88
+    },
+    {
+      "epoch": 0.046426708398539386,
+      "grad_norm": 0.33093884587287903,
+      "learning_rate": 0.00044,
+      "loss": 0.0879,
+      "step": 89
+    },
+    {
+      "epoch": 0.046948356807511735,
+      "grad_norm": 0.3713582158088684,
+      "learning_rate": 0.00044500000000000003,
+      "loss": 0.1121,
+      "step": 90
+    },
+    {
+      "epoch": 0.04747000521648409,
+      "grad_norm": 0.565517246723175,
+      "learning_rate": 0.00045000000000000004,
+      "loss": 0.1469,
+      "step": 91
+    },
+    {
+      "epoch": 0.047991653625456446,
+      "grad_norm": 0.31801939010620117,
+      "learning_rate": 0.000455,
+      "loss": 0.0915,
+      "step": 92
+    },
+    {
+      "epoch": 0.048513302034428794,
+      "grad_norm": 0.42586401104927063,
+      "learning_rate": 0.00046,
+      "loss": 0.0411,
+      "step": 93
+    },
+    {
+      "epoch": 0.04903495044340115,
+      "grad_norm": 0.42403289675712585,
+      "learning_rate": 0.000465,
+      "loss": 0.0589,
+      "step": 94
+    },
+    {
+      "epoch": 0.0495565988523735,
+      "grad_norm": 0.2604529559612274,
+      "learning_rate": 0.00047,
+      "loss": 0.0779,
+      "step": 95
+    },
+    {
+      "epoch": 0.050078247261345854,
+      "grad_norm": 0.32257840037345886,
+      "learning_rate": 0.000475,
+      "loss": 0.0958,
+      "step": 96
+    },
+    {
+      "epoch": 0.0505998956703182,
+      "grad_norm": 0.2648946940898895,
+      "learning_rate": 0.00048,
+      "loss": 0.0591,
+      "step": 97
+    },
+    {
+      "epoch": 0.05112154407929056,
+      "grad_norm": 0.26664629578590393,
+      "learning_rate": 0.00048499999999999997,
+      "loss": 0.0607,
+      "step": 98
+    },
+    {
+      "epoch": 0.051643192488262914,
+      "grad_norm": 0.2891658544540405,
+      "learning_rate": 0.00049,
+      "loss": 0.0478,
+      "step": 99
+    },
+    {
+      "epoch": 0.05216484089723526,
+      "grad_norm": 0.35936883091926575,
+      "learning_rate": 0.000495,
+      "loss": 0.1126,
+      "step": 100
+    },
+    {
+      "epoch": 0.05268648930620762,
+      "grad_norm": 0.3226841986179352,
+      "learning_rate": 0.0005,
+      "loss": 0.0995,
+      "step": 101
+    },
+    {
+      "epoch": 0.053208137715179966,
+      "grad_norm": 0.2140406370162964,
+      "learning_rate": 0.0004994444444444445,
+      "loss": 0.0636,
+      "step": 102
+    },
+    {
+      "epoch": 0.05372978612415232,
+      "grad_norm": 0.28297877311706543,
+      "learning_rate": 0.0004988888888888889,
+      "loss": 0.0674,
+      "step": 103
+    },
+    {
+      "epoch": 0.05425143453312467,
+      "grad_norm": 0.27131739258766174,
+      "learning_rate": 0.0004983333333333334,
+      "loss": 0.0657,
+      "step": 104
+    },
+    {
+      "epoch": 0.054773082942097026,
+      "grad_norm": 0.28402701020240784,
+      "learning_rate": 0.0004977777777777778,
+      "loss": 0.0894,
+      "step": 105
+    },
+    {
+      "epoch": 0.05529473135106938,
+      "grad_norm": 0.33924373984336853,
+      "learning_rate": 0.0004972222222222222,
+      "loss": 0.1264,
+      "step": 106
+    },
+    {
+      "epoch": 0.05581637976004173,
+      "grad_norm": 0.3655984401702881,
+      "learning_rate": 0.0004966666666666666,
+      "loss": 0.0828,
+      "step": 107
+    },
+    {
+      "epoch": 0.056338028169014086,
+      "grad_norm": 0.2262953370809555,
+      "learning_rate": 0.0004961111111111111,
+      "loss": 0.0662,
+      "step": 108
+    },
+    {
+      "epoch": 0.056859676577986434,
+      "grad_norm": 0.23988084495067596,
+      "learning_rate": 0.0004955555555555556,
+      "loss": 0.0672,
+      "step": 109
+    },
+    {
+      "epoch": 0.05738132498695879,
+      "grad_norm": 0.228820338845253,
+      "learning_rate": 0.000495,
+      "loss": 0.0615,
+      "step": 110
+    },
+    {
+      "epoch": 0.057902973395931145,
+      "grad_norm": 0.32484373450279236,
+      "learning_rate": 0.0004944444444444445,
+      "loss": 0.0833,
+      "step": 111
+    },
+    {
+      "epoch": 0.058424621804903494,
+      "grad_norm": 0.22520330548286438,
+      "learning_rate": 0.0004938888888888889,
+      "loss": 0.0767,
+      "step": 112
+    },
+    {
+      "epoch": 0.05894627021387585,
+      "grad_norm": 0.4783564805984497,
+      "learning_rate": 0.0004933333333333334,
+      "loss": 0.0999,
+      "step": 113
+    },
+    {
+      "epoch": 0.0594679186228482,
+      "grad_norm": 0.2565033733844757,
+      "learning_rate": 0.0004927777777777777,
+      "loss": 0.0819,
+      "step": 114
+    },
+    {
+      "epoch": 0.059989567031820554,
+      "grad_norm": 0.19332879781723022,
+      "learning_rate": 0.0004922222222222222,
+      "loss": 0.0702,
+      "step": 115
+    },
+    {
+      "epoch": 0.0605112154407929,
+      "grad_norm": 0.2507823705673218,
+      "learning_rate": 0.0004916666666666666,
+      "loss": 0.076,
+      "step": 116
+    },
+    {
+      "epoch": 0.06103286384976526,
+      "grad_norm": 0.29689472913742065,
+      "learning_rate": 0.0004911111111111111,
+      "loss": 0.0748,
+      "step": 117
+    },
+    {
+      "epoch": 0.06155451225873761,
+      "grad_norm": 0.34821203351020813,
+      "learning_rate": 0.0004905555555555556,
+      "loss": 0.0949,
+      "step": 118
+    },
+    {
+      "epoch": 0.06207616066770996,
+      "grad_norm": 0.25025618076324463,
+      "learning_rate": 0.00049,
+      "loss": 0.0813,
+      "step": 119
+    },
+    {
+      "epoch": 0.06259780907668232,
+      "grad_norm": 0.23138757050037384,
+      "learning_rate": 0.0004894444444444445,
+      "loss": 0.0806,
+      "step": 120
+    },
+    {
+      "epoch": 0.06311945748565467,
+      "grad_norm": 0.25655433535575867,
+      "learning_rate": 0.0004888888888888889,
+      "loss": 0.0864,
+      "step": 121
+    },
+    {
+      "epoch": 0.06364110589462701,
+      "grad_norm": 0.2863710820674896,
+      "learning_rate": 0.0004883333333333333,
+      "loss": 0.0659,
+      "step": 122
+    },
+    {
+      "epoch": 0.06416275430359937,
+      "grad_norm": 0.2628318965435028,
+      "learning_rate": 0.0004877777777777778,
+      "loss": 0.0746,
+      "step": 123
+    },
+    {
+      "epoch": 0.06468440271257173,
+      "grad_norm": 0.2095496952533722,
+      "learning_rate": 0.0004872222222222222,
+      "loss": 0.0746,
+      "step": 124
+    },
+    {
+      "epoch": 0.06520605112154408,
+      "grad_norm": 0.25687775015830994,
+      "learning_rate": 0.0004866666666666667,
+      "loss": 0.0867,
+      "step": 125
+    },
+    {
+      "epoch": 0.06572769953051644,
+      "grad_norm": 0.3623638153076172,
+      "learning_rate": 0.0004861111111111111,
+      "loss": 0.0859,
+      "step": 126
+    },
+    {
+      "epoch": 0.06624934793948878,
+      "grad_norm": 0.22254744172096252,
+      "learning_rate": 0.0004855555555555556,
+      "loss": 0.0956,
+      "step": 127
+    },
+    {
+      "epoch": 0.06677099634846113,
+      "grad_norm": 0.42705070972442627,
+      "learning_rate": 0.00048499999999999997,
+      "loss": 0.0885,
+      "step": 128
+    },
+    {
+      "epoch": 0.06729264475743349,
+      "grad_norm": 0.23360145092010498,
+      "learning_rate": 0.00048444444444444446,
+      "loss": 0.0521,
+      "step": 129
+    },
+    {
+      "epoch": 0.06781429316640585,
+      "grad_norm": 0.1959061473608017,
+      "learning_rate": 0.0004838888888888889,
+      "loss": 0.043,
+      "step": 130
+    },
+    {
+      "epoch": 0.0683359415753782,
+      "grad_norm": 0.32006219029426575,
+      "learning_rate": 0.00048333333333333334,
+      "loss": 0.0942,
+      "step": 131
+    },
+    {
+      "epoch": 0.06885758998435054,
+      "grad_norm": 0.20010985434055328,
+      "learning_rate": 0.0004827777777777778,
+      "loss": 0.0645,
+      "step": 132
+    },
+    {
+      "epoch": 0.0693792383933229,
+      "grad_norm": 0.18007700145244598,
+      "learning_rate": 0.0004822222222222222,
+      "loss": 0.0593,
+      "step": 133
+    },
+    {
+      "epoch": 0.06990088680229525,
+      "grad_norm": 0.23080182075500488,
+      "learning_rate": 0.0004816666666666667,
+      "loss": 0.069,
+      "step": 134
+    },
+    {
+      "epoch": 0.07042253521126761,
+      "grad_norm": 0.16220460832118988,
+      "learning_rate": 0.0004811111111111111,
+      "loss": 0.0499,
+      "step": 135
+    },
+    {
+      "epoch": 0.07094418362023996,
+      "grad_norm": 0.19325301051139832,
+      "learning_rate": 0.0004805555555555556,
+      "loss": 0.0616,
+      "step": 136
+    },
+    {
+      "epoch": 0.0714658320292123,
+      "grad_norm": 0.16364900767803192,
+      "learning_rate": 0.00048,
+      "loss": 0.0612,
+      "step": 137
+    },
+    {
+      "epoch": 0.07198748043818466,
+      "grad_norm": 0.15745937824249268,
+      "learning_rate": 0.00047944444444444445,
+      "loss": 0.0526,
+      "step": 138
+    },
+    {
+      "epoch": 0.07250912884715702,
+      "grad_norm": 0.22706539928913116,
+      "learning_rate": 0.0004788888888888889,
+      "loss": 0.067,
+      "step": 139
+    },
+    {
+      "epoch": 0.07303077725612937,
+      "grad_norm": 0.22147034108638763,
+      "learning_rate": 0.0004783333333333333,
+      "loss": 0.0684,
+      "step": 140
+    },
+    {
+      "epoch": 0.07355242566510173,
+      "grad_norm": 0.2623853385448456,
+      "learning_rate": 0.0004777777777777778,
+      "loss": 0.0491,
+      "step": 141
+    },
+    {
+      "epoch": 0.07407407407407407,
+      "grad_norm": 0.1899435669183731,
+      "learning_rate": 0.00047722222222222225,
+      "loss": 0.029,
+      "step": 142
+    },
+    {
+      "epoch": 0.07459572248304643,
+      "grad_norm": 0.2669859826564789,
+      "learning_rate": 0.0004766666666666667,
+      "loss": 0.064,
+      "step": 143
+    },
+    {
+      "epoch": 0.07511737089201878,
+      "grad_norm": 0.18063829839229584,
+      "learning_rate": 0.0004761111111111111,
+      "loss": 0.0624,
+      "step": 144
+    },
+    {
+      "epoch": 0.07563901930099114,
+      "grad_norm": 0.22147716581821442,
+      "learning_rate": 0.00047555555555555556,
+      "loss": 0.0544,
+      "step": 145
+    },
+    {
+      "epoch": 0.07616066770996348,
+      "grad_norm": 0.30522170662879944,
+      "learning_rate": 0.000475,
+      "loss": 0.077,
+      "step": 146
+    },
+    {
+      "epoch": 0.07668231611893583,
+      "grad_norm": 0.15942497551441193,
+      "learning_rate": 0.00047444444444444444,
+      "loss": 0.0372,
+      "step": 147
+    },
+    {
+      "epoch": 0.07720396452790819,
+      "grad_norm": 0.1456826627254486,
+      "learning_rate": 0.00047388888888888893,
+      "loss": 0.0423,
+      "step": 148
+    },
+    {
+      "epoch": 0.07772561293688054,
+      "grad_norm": 0.17793269455432892,
+      "learning_rate": 0.00047333333333333336,
+      "loss": 0.0559,
+      "step": 149
+    },
+    {
+      "epoch": 0.0782472613458529,
+      "grad_norm": 0.152329221367836,
+      "learning_rate": 0.0004727777777777778,
+      "loss": 0.0266,
+      "step": 150
+    },
+    {
+      "epoch": 0.07876890975482524,
+      "grad_norm": 0.19327858090400696,
+      "learning_rate": 0.00047222222222222224,
+      "loss": 0.0608,
+      "step": 151
+    },
+    {
+      "epoch": 0.0792905581637976,
+      "grad_norm": 0.15060095489025116,
+      "learning_rate": 0.0004716666666666667,
+      "loss": 0.0461,
+      "step": 152
+    },
+    {
+      "epoch": 0.07981220657276995,
+      "grad_norm": 0.1864742785692215,
+      "learning_rate": 0.0004711111111111111,
+      "loss": 0.0724,
+      "step": 153
+    },
+    {
+      "epoch": 0.08033385498174231,
+      "grad_norm": 0.1422508805990219,
+      "learning_rate": 0.00047055555555555555,
+      "loss": 0.0325,
+      "step": 154
+    },
+    {
+      "epoch": 0.08085550339071466,
+      "grad_norm": 0.21115481853485107,
+      "learning_rate": 0.00047,
+      "loss": 0.0535,
+      "step": 155
+    },
+    {
+      "epoch": 0.081377151799687,
+      "grad_norm": 0.2197350263595581,
+      "learning_rate": 0.0004694444444444445,
+      "loss": 0.0703,
+      "step": 156
+    },
+    {
+      "epoch": 0.08189880020865936,
+      "grad_norm": 0.1608528196811676,
+      "learning_rate": 0.0004688888888888889,
+      "loss": 0.0447,
+      "step": 157
+    },
+    {
+      "epoch": 0.08242044861763172,
+      "grad_norm": 0.1445985585451126,
+      "learning_rate": 0.00046833333333333335,
+      "loss": 0.0469,
+      "step": 158
+    },
+    {
+      "epoch": 0.08294209702660407,
+      "grad_norm": 0.25215667486190796,
+      "learning_rate": 0.0004677777777777778,
+      "loss": 0.0709,
+      "step": 159
+    },
+    {
+      "epoch": 0.08346374543557643,
+      "grad_norm": 0.14391636848449707,
+      "learning_rate": 0.0004672222222222222,
+      "loss": 0.0457,
+      "step": 160
+    },
+    {
+      "epoch": 0.08398539384454877,
+      "grad_norm": 0.29619306325912476,
+      "learning_rate": 0.00046666666666666666,
+      "loss": 0.0765,
+      "step": 161
+    },
+    {
+      "epoch": 0.08450704225352113,
+      "grad_norm": 0.15701289474964142,
+      "learning_rate": 0.0004661111111111111,
+      "loss": 0.0418,
+      "step": 162
+    },
+    {
+      "epoch": 0.08502869066249348,
+      "grad_norm": 0.1698683649301529,
+      "learning_rate": 0.0004655555555555556,
+      "loss": 0.0294,
+      "step": 163
+    },
+    {
+      "epoch": 0.08555033907146584,
+      "grad_norm": 0.12165573239326477,
+      "learning_rate": 0.000465,
+      "loss": 0.0258,
+      "step": 164
+    },
+    {
+      "epoch": 0.08607198748043818,
+      "grad_norm": 0.1611219197511673,
+      "learning_rate": 0.00046444444444444446,
+      "loss": 0.0491,
+      "step": 165
+    },
+    {
+      "epoch": 0.08659363588941053,
+      "grad_norm": 0.1486036628484726,
+      "learning_rate": 0.0004638888888888889,
+      "loss": 0.0479,
+      "step": 166
+    },
+    {
+      "epoch": 0.08711528429838289,
+      "grad_norm": 0.13054965436458588,
+      "learning_rate": 0.00046333333333333334,
+      "loss": 0.0401,
+      "step": 167
+    },
+    {
+      "epoch": 0.08763693270735524,
+      "grad_norm": 0.15433131158351898,
+      "learning_rate": 0.0004627777777777778,
+      "loss": 0.048,
+      "step": 168
+    },
+    {
+      "epoch": 0.0881585811163276,
+      "grad_norm": 0.17511604726314545,
+      "learning_rate": 0.0004622222222222222,
+      "loss": 0.0569,
+      "step": 169
+    },
+    {
+      "epoch": 0.08868022952529994,
+      "grad_norm": 0.1398395150899887,
+      "learning_rate": 0.0004616666666666667,
+      "loss": 0.034,
+      "step": 170
+    },
+    {
+      "epoch": 0.0892018779342723,
+      "grad_norm": 0.15484075248241425,
+      "learning_rate": 0.00046111111111111114,
+      "loss": 0.0514,
+      "step": 171
+    },
+    {
+      "epoch": 0.08972352634324465,
+      "grad_norm": 0.17851784825325012,
+      "learning_rate": 0.0004605555555555556,
+      "loss": 0.0571,
+      "step": 172
+    },
+    {
+      "epoch": 0.09024517475221701,
+      "grad_norm": 0.18745650351047516,
+      "learning_rate": 0.00046,
+      "loss": 0.0523,
+      "step": 173
+    },
+    {
+      "epoch": 0.09076682316118936,
+      "grad_norm": 0.18322691321372986,
+      "learning_rate": 0.00045944444444444445,
+      "loss": 0.0642,
+      "step": 174
+    },
+    {
+      "epoch": 0.0912884715701617,
+      "grad_norm": 0.1173708513379097,
+      "learning_rate": 0.0004588888888888889,
+      "loss": 0.0267,
+      "step": 175
+    },
+    {
+      "epoch": 0.09181011997913406,
+      "grad_norm": 0.1754874438047409,
+      "learning_rate": 0.0004583333333333333,
+      "loss": 0.0657,
+      "step": 176
+    },
+    {
+      "epoch": 0.09233176838810642,
+      "grad_norm": 0.13830502331256866,
+      "learning_rate": 0.0004577777777777778,
+      "loss": 0.0433,
+      "step": 177
+    },
+    {
+      "epoch": 0.09285341679707877,
+      "grad_norm": 0.11174938827753067,
+      "learning_rate": 0.0004572222222222222,
+      "loss": 0.04,
+      "step": 178
+    },
+    {
+      "epoch": 0.09337506520605113,
+      "grad_norm": 0.1829378753900528,
+      "learning_rate": 0.0004566666666666667,
+      "loss": 0.0453,
+      "step": 179
+    },
+    {
+      "epoch": 0.09389671361502347,
+      "grad_norm": 0.10748015344142914,
+      "learning_rate": 0.0004561111111111111,
+      "loss": 0.05,
+      "step": 180
+    },
+    {
+      "epoch": 0.09441836202399582,
+      "grad_norm": 0.1160806268453598,
+      "learning_rate": 0.00045555555555555556,
+      "loss": 0.0245,
+      "step": 181
+    },
+    {
+      "epoch": 0.09494001043296818,
+      "grad_norm": 0.12387479841709137,
+      "learning_rate": 0.000455,
+      "loss": 0.0259,
+      "step": 182
+    },
+    {
+      "epoch": 0.09546165884194054,
+      "grad_norm": 0.1586403250694275,
+      "learning_rate": 0.00045444444444444444,
+      "loss": 0.0378,
+      "step": 183
+    },
+    {
+      "epoch": 0.09598330725091289,
+      "grad_norm": 0.18905822932720184,
+      "learning_rate": 0.00045388888888888893,
+      "loss": 0.0484,
+      "step": 184
+    },
+    {
+      "epoch": 0.09650495565988523,
+      "grad_norm": 0.17541544139385223,
+      "learning_rate": 0.0004533333333333333,
+      "loss": 0.0503,
+      "step": 185
+    },
+    {
+      "epoch": 0.09702660406885759,
+      "grad_norm": 0.1083071306347847,
+      "learning_rate": 0.0004527777777777778,
+      "loss": 0.0439,
+      "step": 186
+    },
+    {
+      "epoch": 0.09754825247782994,
+      "grad_norm": 0.10464104264974594,
+      "learning_rate": 0.00045222222222222224,
+      "loss": 0.0271,
+      "step": 187
+    },
+    {
+      "epoch": 0.0980699008868023,
+      "grad_norm": 0.18022054433822632,
+      "learning_rate": 0.0004516666666666667,
+      "loss": 0.0589,
+      "step": 188
+    },
+    {
+      "epoch": 0.09859154929577464,
+      "grad_norm": 0.18715251982212067,
+      "learning_rate": 0.0004511111111111111,
+      "loss": 0.0489,
+      "step": 189
+    },
+    {
+      "epoch": 0.099113197704747,
+      "grad_norm": 0.10440787672996521,
+      "learning_rate": 0.00045055555555555555,
+      "loss": 0.0221,
+      "step": 190
+    },
+    {
+      "epoch": 0.09963484611371935,
+      "grad_norm": 0.11525921523571014,
+      "learning_rate": 0.00045000000000000004,
+      "loss": 0.0427,
+      "step": 191
+    },
+    {
+      "epoch": 0.10015649452269171,
+      "grad_norm": 0.1573028564453125,
+      "learning_rate": 0.0004494444444444444,
+      "loss": 0.04,
+      "step": 192
+    },
+    {
+      "epoch": 0.10067814293166406,
+      "grad_norm": 0.15942253172397614,
+      "learning_rate": 0.0004488888888888889,
+      "loss": 0.045,
+      "step": 193
+    },
+    {
+      "epoch": 0.1011997913406364,
+      "grad_norm": 0.2997572422027588,
+      "learning_rate": 0.0004483333333333333,
+      "loss": 0.062,
+      "step": 194
+    },
+    {
+      "epoch": 0.10172143974960876,
+      "grad_norm": 0.1859196424484253,
+      "learning_rate": 0.0004477777777777778,
+      "loss": 0.0496,
+      "step": 195
+    },
+    {
+      "epoch": 0.10224308815858112,
+      "grad_norm": 0.1265893131494522,
+      "learning_rate": 0.0004472222222222222,
+      "loss": 0.0457,
+      "step": 196
+    },
+    {
+      "epoch": 0.10276473656755347,
+      "grad_norm": 0.16036029160022736,
+      "learning_rate": 0.00044666666666666666,
+      "loss": 0.046,
+      "step": 197
+    },
+    {
+      "epoch": 0.10328638497652583,
+      "grad_norm": 0.10421448945999146,
+      "learning_rate": 0.00044611111111111115,
+      "loss": 0.033,
+      "step": 198
+    },
+    {
+      "epoch": 0.10380803338549817,
+      "grad_norm": 0.12321974337100983,
+      "learning_rate": 0.00044555555555555554,
+      "loss": 0.0458,
+      "step": 199
+    },
+    {
+      "epoch": 0.10432968179447052,
+      "grad_norm": 0.13863791525363922,
+      "learning_rate": 0.00044500000000000003,
+      "loss": 0.0221,
+      "step": 200
+    },
+    {
+      "epoch": 0.10485133020344288,
+      "grad_norm": 0.11896353214979172,
+      "learning_rate": 0.0004444444444444444,
+      "loss": 0.0477,
+      "step": 201
+    },
+    {
+      "epoch": 0.10537297861241524,
+      "grad_norm": 0.1473105251789093,
+      "learning_rate": 0.0004438888888888889,
+      "loss": 0.056,
+      "step": 202
+    },
+    {
+      "epoch": 0.10589462702138759,
+      "grad_norm": 0.15058237314224243,
+      "learning_rate": 0.00044333333333333334,
+      "loss": 0.0477,
+      "step": 203
+    },
+    {
+      "epoch": 0.10641627543035993,
+      "grad_norm": 0.10770102590322495,
+      "learning_rate": 0.0004427777777777778,
+      "loss": 0.0316,
+      "step": 204
+    },
+    {
+      "epoch": 0.10693792383933229,
+      "grad_norm": 0.13766999542713165,
+      "learning_rate": 0.00044222222222222227,
+      "loss": 0.041,
+      "step": 205
+    },
+    {
+      "epoch": 0.10745957224830464,
+      "grad_norm": 0.11786706745624542,
+      "learning_rate": 0.00044166666666666665,
+      "loss": 0.0302,
+      "step": 206
+    },
+    {
+      "epoch": 0.107981220657277,
+      "grad_norm": 0.10209888964891434,
+      "learning_rate": 0.00044111111111111114,
+      "loss": 0.0396,
+      "step": 207
+    },
+    {
+      "epoch": 0.10850286906624934,
+      "grad_norm": 0.13609950244426727,
+      "learning_rate": 0.0004405555555555555,
+      "loss": 0.0394,
+      "step": 208
+    },
+    {
+      "epoch": 0.1090245174752217,
+      "grad_norm": 0.11915361881256104,
+      "learning_rate": 0.00044,
+      "loss": 0.0421,
+      "step": 209
+    },
+    {
+      "epoch": 0.10954616588419405,
+      "grad_norm": 0.11170439422130585,
+      "learning_rate": 0.0004394444444444445,
+      "loss": 0.0395,
+      "step": 210
+    },
+    {
+      "epoch": 0.11006781429316641,
+      "grad_norm": 0.12584055960178375,
+      "learning_rate": 0.0004388888888888889,
+      "loss": 0.0534,
+      "step": 211
+    },
+    {
+      "epoch": 0.11058946270213876,
+      "grad_norm": 0.1454746276140213,
+      "learning_rate": 0.0004383333333333334,
+      "loss": 0.0469,
+      "step": 212
+    },
+    {
+      "epoch": 0.1111111111111111,
+      "grad_norm": 0.10297106951475143,
+      "learning_rate": 0.00043777777777777776,
+      "loss": 0.0359,
+      "step": 213
+    },
+    {
+      "epoch": 0.11163275952008346,
+      "grad_norm": 0.10994141548871994,
+      "learning_rate": 0.00043722222222222225,
+      "loss": 0.0404,
+      "step": 214
+    },
+    {
+      "epoch": 0.11215440792905582,
+      "grad_norm": 0.13165079057216644,
+      "learning_rate": 0.00043666666666666664,
+      "loss": 0.0475,
+      "step": 215
+    },
+    {
+      "epoch": 0.11267605633802817,
+      "grad_norm": 0.11115416139364243,
+      "learning_rate": 0.00043611111111111113,
+      "loss": 0.0351,
+      "step": 216
+    },
+    {
+      "epoch": 0.11319770474700053,
+      "grad_norm": 0.15927758812904358,
+      "learning_rate": 0.0004355555555555555,
+      "loss": 0.0468,
+      "step": 217
+    },
+    {
+      "epoch": 0.11371935315597287,
+      "grad_norm": 0.0941813513636589,
+      "learning_rate": 0.000435,
+      "loss": 0.0337,
+      "step": 218
+    },
+    {
+      "epoch": 0.11424100156494522,
+      "grad_norm": 0.10850685834884644,
+      "learning_rate": 0.0004344444444444445,
+      "loss": 0.0211,
+      "step": 219
+    },
+    {
+      "epoch": 0.11476264997391758,
+      "grad_norm": 0.0790611058473587,
+      "learning_rate": 0.0004338888888888889,
+      "loss": 0.0196,
+      "step": 220
+    },
+    {
+      "epoch": 0.11528429838288994,
+      "grad_norm": 0.10849782079458237,
+      "learning_rate": 0.00043333333333333337,
+      "loss": 0.04,
+      "step": 221
+    },
+    {
+      "epoch": 0.11580594679186229,
+      "grad_norm": 0.09607880562543869,
+      "learning_rate": 0.00043277777777777775,
+      "loss": 0.0291,
+      "step": 222
+    },
+    {
+      "epoch": 0.11632759520083463,
+      "grad_norm": 0.17959930002689362,
+      "learning_rate": 0.00043222222222222224,
+      "loss": 0.0426,
+      "step": 223
+    },
+    {
+      "epoch": 0.11684924360980699,
+      "grad_norm": 0.08865644782781601,
+      "learning_rate": 0.0004316666666666667,
+      "loss": 0.0321,
+      "step": 224
+    },
+    {
+      "epoch": 0.11737089201877934,
+      "grad_norm": 0.17324934899806976,
+      "learning_rate": 0.0004311111111111111,
+      "loss": 0.0537,
+      "step": 225
+    },
+    {
+      "epoch": 0.1178925404277517,
+      "grad_norm": 0.10226263850927353,
+      "learning_rate": 0.0004305555555555556,
+      "loss": 0.0342,
+      "step": 226
+    },
+    {
+      "epoch": 0.11841418883672405,
+      "grad_norm": 0.10456152260303497,
+      "learning_rate": 0.00043,
+      "loss": 0.039,
+      "step": 227
+    },
+    {
+      "epoch": 0.1189358372456964,
+      "grad_norm": 0.10196290910243988,
+      "learning_rate": 0.0004294444444444445,
+      "loss": 0.0329,
+      "step": 228
+    },
+    {
+      "epoch": 0.11945748565466875,
+      "grad_norm": 0.12004778534173965,
+      "learning_rate": 0.00042888888888888886,
+      "loss": 0.0434,
+      "step": 229
+    },
+    {
+      "epoch": 0.11997913406364111,
+      "grad_norm": 0.10152442753314972,
+      "learning_rate": 0.00042833333333333335,
+      "loss": 0.0305,
+      "step": 230
+    },
+    {
+      "epoch": 0.12050078247261346,
+      "grad_norm": 0.1072554886341095,
+      "learning_rate": 0.0004277777777777778,
+      "loss": 0.0407,
+      "step": 231
+    },
+    {
+      "epoch": 0.1210224308815858,
+      "grad_norm": 0.08478479087352753,
+      "learning_rate": 0.00042722222222222223,
+      "loss": 0.0375,
+      "step": 232
+    },
+    {
+      "epoch": 0.12154407929055816,
+      "grad_norm": 0.11901957541704178,
+      "learning_rate": 0.0004266666666666667,
+      "loss": 0.0281,
+      "step": 233
+    },
+    {
+      "epoch": 0.12206572769953052,
+      "grad_norm": 0.097981758415699,
+      "learning_rate": 0.0004261111111111111,
+      "loss": 0.0365,
+      "step": 234
+    },
+    {
+      "epoch": 0.12258737610850287,
+      "grad_norm": 0.08464547991752625,
+      "learning_rate": 0.0004255555555555556,
+      "loss": 0.0227,
+      "step": 235
+    },
+    {
+      "epoch": 0.12310902451747523,
+      "grad_norm": 0.18886807560920715,
+      "learning_rate": 0.000425,
+      "loss": 0.0494,
+      "step": 236
+    },
+    {
+      "epoch": 0.12363067292644757,
+      "grad_norm": 0.08432997763156891,
+      "learning_rate": 0.00042444444444444447,
+      "loss": 0.031,
+      "step": 237
+    },
+    {
+      "epoch": 0.12415232133541992,
+      "grad_norm": 0.24738061428070068,
+      "learning_rate": 0.0004238888888888889,
+      "loss": 0.0611,
+      "step": 238
+    },
+    {
+      "epoch": 0.12467396974439228,
+      "grad_norm": 0.11955960839986801,
+      "learning_rate": 0.00042333333333333334,
+      "loss": 0.0481,
+      "step": 239
+    },
+    {
+      "epoch": 0.12519561815336464,
+      "grad_norm": 0.132662832736969,
+      "learning_rate": 0.0004227777777777778,
+      "loss": 0.0432,
+      "step": 240
+    },
+    {
+      "epoch": 0.12571726656233698,
+      "grad_norm": 0.08496639877557755,
+      "learning_rate": 0.0004222222222222222,
+      "loss": 0.0328,
+      "step": 241
+    },
+    {
+      "epoch": 0.12623891497130935,
+      "grad_norm": 0.13830861449241638,
+      "learning_rate": 0.0004216666666666667,
+      "loss": 0.0336,
+      "step": 242
+    },
+    {
+      "epoch": 0.1267605633802817,
+      "grad_norm": 0.12200845032930374,
+      "learning_rate": 0.0004211111111111111,
+      "loss": 0.0346,
+      "step": 243
+    },
+    {
+      "epoch": 0.12728221178925403,
+      "grad_norm": 0.10438041388988495,
+      "learning_rate": 0.0004205555555555556,
+      "loss": 0.039,
+      "step": 244
+    },
+    {
+      "epoch": 0.1278038601982264,
+      "grad_norm": 0.10238846391439438,
+      "learning_rate": 0.00042,
+      "loss": 0.0442,
+      "step": 245
+    },
+    {
+      "epoch": 0.12832550860719874,
+      "grad_norm": 0.10930721461772919,
+      "learning_rate": 0.00041944444444444445,
+      "loss": 0.0426,
+      "step": 246
+    },
+    {
+      "epoch": 0.1288471570161711,
+      "grad_norm": 0.09867265820503235,
+      "learning_rate": 0.0004188888888888889,
+      "loss": 0.0402,
+      "step": 247
+    },
+    {
+      "epoch": 0.12936880542514345,
+      "grad_norm": 0.1137848049402237,
+      "learning_rate": 0.00041833333333333333,
+      "loss": 0.0278,
+      "step": 248
+    },
+    {
+      "epoch": 0.1298904538341158,
+      "grad_norm": 0.1364007592201233,
+      "learning_rate": 0.0004177777777777778,
+      "loss": 0.0437,
+      "step": 249
+    },
+    {
+      "epoch": 0.13041210224308816,
+      "grad_norm": 0.09385659545660019,
+      "learning_rate": 0.0004172222222222222,
+      "loss": 0.0353,
+      "step": 250
+    },
+    {
+      "epoch": 0.1309337506520605,
+      "grad_norm": 0.1302153617143631,
+      "learning_rate": 0.0004166666666666667,
+      "loss": 0.0287,
+      "step": 251
+    },
+    {
+      "epoch": 0.13145539906103287,
+      "grad_norm": 0.09976278990507126,
+      "learning_rate": 0.00041611111111111113,
+      "loss": 0.0381,
+      "step": 252
+    },
+    {
+      "epoch": 0.13197704747000522,
+      "grad_norm": 0.0966271236538887,
+      "learning_rate": 0.00041555555555555557,
+      "loss": 0.0204,
+      "step": 253
+    },
+    {
+      "epoch": 0.13249869587897756,
+      "grad_norm": 0.0773528590798378,
+      "learning_rate": 0.000415,
+      "loss": 0.0285,
+      "step": 254
+    },
+    {
+      "epoch": 0.13302034428794993,
+      "grad_norm": 0.2350674420595169,
+      "learning_rate": 0.00041444444444444444,
+      "loss": 0.0511,
+      "step": 255
+    },
+    {
+      "epoch": 0.13354199269692227,
+      "grad_norm": 0.08375384658575058,
+      "learning_rate": 0.0004138888888888889,
+      "loss": 0.0341,
+      "step": 256
+    },
+    {
+      "epoch": 0.13406364110589464,
+      "grad_norm": 0.09229125827550888,
+      "learning_rate": 0.0004133333333333333,
+      "loss": 0.0329,
+      "step": 257
+    },
+    {
+      "epoch": 0.13458528951486698,
+      "grad_norm": 0.08750821650028229,
+      "learning_rate": 0.0004127777777777778,
+      "loss": 0.0283,
+      "step": 258
+    },
+    {
+      "epoch": 0.13510693792383932,
+      "grad_norm": 0.075618676841259,
+      "learning_rate": 0.00041222222222222224,
+      "loss": 0.0291,
+      "step": 259
+    },
+    {
+      "epoch": 0.1356285863328117,
+      "grad_norm": 0.16954250633716583,
+      "learning_rate": 0.0004116666666666667,
+      "loss": 0.0441,
+      "step": 260
+    },
+    {
+      "epoch": 0.13615023474178403,
+      "grad_norm": 0.07529555261135101,
+      "learning_rate": 0.0004111111111111111,
+      "loss": 0.0139,
+      "step": 261
+    },
+    {
+      "epoch": 0.1366718831507564,
+      "grad_norm": 0.08884407579898834,
+      "learning_rate": 0.00041055555555555555,
+      "loss": 0.0299,
+      "step": 262
+    },
+    {
+      "epoch": 0.13719353155972874,
+      "grad_norm": 0.09607396274805069,
+      "learning_rate": 0.00041,
+      "loss": 0.0337,
+      "step": 263
+    },
+    {
+      "epoch": 0.13771517996870108,
+      "grad_norm": 0.08453882485628128,
+      "learning_rate": 0.00040944444444444443,
+      "loss": 0.0315,
+      "step": 264
+    },
+    {
+      "epoch": 0.13823682837767345,
+      "grad_norm": 0.09629228711128235,
+      "learning_rate": 0.0004088888888888889,
+      "loss": 0.0181,
+      "step": 265
+    },
+    {
+      "epoch": 0.1387584767866458,
+      "grad_norm": 0.07212290167808533,
+      "learning_rate": 0.00040833333333333336,
+      "loss": 0.0262,
+      "step": 266
+    },
+    {
+      "epoch": 0.13928012519561817,
+      "grad_norm": 0.09192827343940735,
+      "learning_rate": 0.0004077777777777778,
+      "loss": 0.018,
+      "step": 267
+    },
+    {
+      "epoch": 0.1398017736045905,
+      "grad_norm": 0.10876122117042542,
+      "learning_rate": 0.00040722222222222223,
+      "loss": 0.0316,
+      "step": 268
+    },
+    {
+      "epoch": 0.14032342201356285,
+      "grad_norm": 0.10796765238046646,
+      "learning_rate": 0.00040666666666666667,
+      "loss": 0.0323,
+      "step": 269
+    },
+    {
+      "epoch": 0.14084507042253522,
+      "grad_norm": 0.08297892659902573,
+      "learning_rate": 0.0004061111111111111,
+      "loss": 0.0244,
+      "step": 270
+    },
+    {
+      "epoch": 0.14136671883150756,
+      "grad_norm": 0.09534858912229538,
+      "learning_rate": 0.00040555555555555554,
+      "loss": 0.0279,
+      "step": 271
+    },
+    {
+      "epoch": 0.14188836724047993,
+      "grad_norm": 0.07854770123958588,
+      "learning_rate": 0.00040500000000000003,
+      "loss": 0.032,
+      "step": 272
+    },
+    {
+      "epoch": 0.14241001564945227,
+      "grad_norm": 0.16025401651859283,
+      "learning_rate": 0.00040444444444444447,
+      "loss": 0.0413,
+      "step": 273
+    },
+    {
+      "epoch": 0.1429316640584246,
+      "grad_norm": 0.07919424772262573,
+      "learning_rate": 0.0004038888888888889,
+      "loss": 0.0277,
+      "step": 274
+    },
+    {
+      "epoch": 0.14345331246739698,
+      "grad_norm": 0.07335282117128372,
+      "learning_rate": 0.00040333333333333334,
+      "loss": 0.0363,
+      "step": 275
+    },
+    {
+      "epoch": 0.14397496087636932,
+      "grad_norm": 0.1280767321586609,
+      "learning_rate": 0.0004027777777777778,
+      "loss": 0.0402,
+      "step": 276
+    },
+    {
+      "epoch": 0.1444966092853417,
+      "grad_norm": 0.11371007561683655,
+      "learning_rate": 0.0004022222222222222,
+      "loss": 0.0312,
+      "step": 277
+    },
+    {
+      "epoch": 0.14501825769431403,
+      "grad_norm": 0.12229876220226288,
+      "learning_rate": 0.00040166666666666665,
+      "loss": 0.0357,
+      "step": 278
+    },
+    {
+      "epoch": 0.14553990610328638,
+      "grad_norm": 0.11436333507299423,
+      "learning_rate": 0.0004011111111111111,
+      "loss": 0.0135,
+      "step": 279
+    },
+    {
+      "epoch": 0.14606155451225875,
+      "grad_norm": 0.08084696531295776,
+      "learning_rate": 0.0004005555555555556,
+      "loss": 0.0302,
+      "step": 280
+    },
+    {
+      "epoch": 0.1465832029212311,
+      "grad_norm": 0.09421739727258682,
+      "learning_rate": 0.0004,
+      "loss": 0.0376,
+      "step": 281
+    },
+    {
+      "epoch": 0.14710485133020346,
+      "grad_norm": 0.0744849219918251,
+      "learning_rate": 0.00039944444444444446,
+      "loss": 0.0291,
+      "step": 282
+    },
+    {
+      "epoch": 0.1476264997391758,
+      "grad_norm": 0.06754301488399506,
+      "learning_rate": 0.0003988888888888889,
+      "loss": 0.0262,
+      "step": 283
+    },
+    {
+      "epoch": 0.14814814814814814,
+      "grad_norm": 0.06215747445821762,
+      "learning_rate": 0.00039833333333333333,
+      "loss": 0.0223,
+      "step": 284
+    },
+    {
+      "epoch": 0.1486697965571205,
+      "grad_norm": 0.10289556533098221,
+      "learning_rate": 0.00039777777777777777,
+      "loss": 0.0401,
+      "step": 285
+    },
+    {
+      "epoch": 0.14919144496609285,
+      "grad_norm": 0.10723885893821716,
+      "learning_rate": 0.0003972222222222222,
+      "loss": 0.018,
+      "step": 286
+    },
+    {
+      "epoch": 0.1497130933750652,
+      "grad_norm": 0.12683100998401642,
+      "learning_rate": 0.0003966666666666667,
+      "loss": 0.0155,
+      "step": 287
+    },
+    {
+      "epoch": 0.15023474178403756,
+      "grad_norm": 0.10709403455257416,
+      "learning_rate": 0.00039611111111111113,
+      "loss": 0.0186,
+      "step": 288
+    },
+    {
+      "epoch": 0.1507563901930099,
+      "grad_norm": 0.09857751429080963,
+      "learning_rate": 0.00039555555555555557,
+      "loss": 0.0311,
+      "step": 289
+    },
+    {
+      "epoch": 0.15127803860198227,
+      "grad_norm": 0.07990946620702744,
+      "learning_rate": 0.000395,
+      "loss": 0.032,
+      "step": 290
+    },
+    {
+      "epoch": 0.15179968701095461,
+      "grad_norm": 0.06873098760843277,
+      "learning_rate": 0.00039444444444444444,
+      "loss": 0.0163,
+      "step": 291
+    },
+    {
+      "epoch": 0.15232133541992696,
+      "grad_norm": 0.0788077712059021,
+      "learning_rate": 0.00039388888888888893,
+      "loss": 0.0319,
+      "step": 292
+    },
+    {
+      "epoch": 0.15284298382889933,
+      "grad_norm": 0.08789033442735672,
+      "learning_rate": 0.0003933333333333333,
+      "loss": 0.0352,
+      "step": 293
+    },
+    {
+      "epoch": 0.15336463223787167,
+      "grad_norm": 0.10574653744697571,
+      "learning_rate": 0.0003927777777777778,
+      "loss": 0.0411,
+      "step": 294
+    },
+    {
+      "epoch": 0.15388628064684404,
+      "grad_norm": 0.08198726177215576,
+      "learning_rate": 0.00039222222222222225,
+      "loss": 0.0286,
+      "step": 295
+    },
+    {
+      "epoch": 0.15440792905581638,
+      "grad_norm": 0.2811417579650879,
+      "learning_rate": 0.0003916666666666667,
+      "loss": 0.0508,
+      "step": 296
+    },
+    {
+      "epoch": 0.15492957746478872,
+      "grad_norm": 0.1203279122710228,
+      "learning_rate": 0.0003911111111111111,
+      "loss": 0.0384,
+      "step": 297
+    },
+    {
+      "epoch": 0.1554512258737611,
+      "grad_norm": 0.08802422881126404,
+      "learning_rate": 0.00039055555555555556,
+      "loss": 0.0305,
+      "step": 298
+    },
+    {
+      "epoch": 0.15597287428273343,
+      "grad_norm": 0.05368930101394653,
+      "learning_rate": 0.00039000000000000005,
+      "loss": 0.0167,
+      "step": 299
+    },
+    {
+      "epoch": 0.1564945226917058,
+      "grad_norm": 0.16041633486747742,
+      "learning_rate": 0.00038944444444444443,
+      "loss": 0.047,
+      "step": 300
+    },
+    {
+      "epoch": 0.15701617110067814,
+      "grad_norm": 0.06771723926067352,
+      "learning_rate": 0.0003888888888888889,
+      "loss": 0.0242,
+      "step": 301
+    },
+    {
+      "epoch": 0.15753781950965048,
+      "grad_norm": 0.09745685011148453,
+      "learning_rate": 0.0003883333333333333,
+      "loss": 0.0121,
+      "step": 302
+    },
+    {
+      "epoch": 0.15805946791862285,
+      "grad_norm": 0.1079089567065239,
+      "learning_rate": 0.0003877777777777778,
+      "loss": 0.0331,
+      "step": 303
+    },
+    {
+      "epoch": 0.1585811163275952,
+      "grad_norm": 0.07800073176622391,
+      "learning_rate": 0.00038722222222222223,
+      "loss": 0.0325,
+      "step": 304
+    },
+    {
+      "epoch": 0.15910276473656756,
+      "grad_norm": 0.13546329736709595,
+      "learning_rate": 0.00038666666666666667,
+      "loss": 0.0296,
+      "step": 305
+    },
+    {
+      "epoch": 0.1596244131455399,
+      "grad_norm": 0.0735045000910759,
+      "learning_rate": 0.00038611111111111116,
+      "loss": 0.0275,
+      "step": 306
+    },
+    {
+      "epoch": 0.16014606155451225,
+      "grad_norm": 0.056763097643852234,
+      "learning_rate": 0.00038555555555555554,
+      "loss": 0.025,
+      "step": 307
+    },
+    {
+      "epoch": 0.16066770996348462,
+      "grad_norm": 0.0723307803273201,
+      "learning_rate": 0.00038500000000000003,
+      "loss": 0.0269,
+      "step": 308
+    },
+    {
+      "epoch": 0.16118935837245696,
+      "grad_norm": 0.07295756787061691,
+      "learning_rate": 0.0003844444444444444,
+      "loss": 0.0224,
+      "step": 309
+    },
+    {
+      "epoch": 0.16171100678142933,
+      "grad_norm": 0.1010420173406601,
+      "learning_rate": 0.0003838888888888889,
+      "loss": 0.0154,
+      "step": 310
+    },
+    {
+      "epoch": 0.16223265519040167,
+      "grad_norm": 0.10790162533521652,
+      "learning_rate": 0.00038333333333333334,
+      "loss": 0.0334,
+      "step": 311
+    },
+    {
+      "epoch": 0.162754303599374,
+      "grad_norm": 0.06171411648392677,
+      "learning_rate": 0.0003827777777777778,
+      "loss": 0.026,
+      "step": 312
+    },
+    {
+      "epoch": 0.16327595200834638,
+      "grad_norm": 0.0646505281329155,
+      "learning_rate": 0.0003822222222222223,
+      "loss": 0.0283,
+      "step": 313
+    },
+    {
+      "epoch": 0.16379760041731872,
+      "grad_norm": 0.1241549476981163,
+      "learning_rate": 0.00038166666666666666,
+      "loss": 0.0433,
+      "step": 314
+    },
+    {
+      "epoch": 0.1643192488262911,
+      "grad_norm": 0.08475686609745026,
+      "learning_rate": 0.00038111111111111115,
+      "loss": 0.0264,
+      "step": 315
+    },
+    {
+      "epoch": 0.16484089723526343,
+      "grad_norm": 0.1006927415728569,
+      "learning_rate": 0.00038055555555555553,
+      "loss": 0.0373,
+      "step": 316
+    },
+    {
+      "epoch": 0.16536254564423578,
+      "grad_norm": 0.08395830541849136,
+      "learning_rate": 0.00038,
+      "loss": 0.0151,
+      "step": 317
+    },
+    {
+      "epoch": 0.16588419405320814,
+      "grad_norm": 0.05780460685491562,
+      "learning_rate": 0.0003794444444444444,
+      "loss": 0.018,
+      "step": 318
+    },
+    {
+      "epoch": 0.1664058424621805,
+      "grad_norm": 0.08385057002305984,
+      "learning_rate": 0.0003788888888888889,
+      "loss": 0.0347,
+      "step": 319
+    },
+    {
+      "epoch": 0.16692749087115286,
+      "grad_norm": 0.0629425197839737,
+      "learning_rate": 0.0003783333333333334,
+      "loss": 0.0288,
+      "step": 320
+    },
+    {
+      "epoch": 0.1674491392801252,
+      "grad_norm": 0.07353231310844421,
+      "learning_rate": 0.00037777777777777777,
+      "loss": 0.0318,
+      "step": 321
+    },
+    {
+      "epoch": 0.16797078768909754,
+      "grad_norm": 0.06632209569215775,
+      "learning_rate": 0.00037722222222222226,
+      "loss": 0.0286,
+      "step": 322
+    },
+    {
+      "epoch": 0.1684924360980699,
+      "grad_norm": 0.10224422067403793,
+      "learning_rate": 0.00037666666666666664,
+      "loss": 0.0287,
+      "step": 323
+    },
+    {
+      "epoch": 0.16901408450704225,
+      "grad_norm": 0.07615455985069275,
+      "learning_rate": 0.00037611111111111113,
+      "loss": 0.0245,
+      "step": 324
+    },
+    {
+      "epoch": 0.16953573291601462,
+      "grad_norm": 0.08341842144727707,
+      "learning_rate": 0.0003755555555555555,
+      "loss": 0.0272,
+      "step": 325
+    },
+    {
+      "epoch": 0.17005738132498696,
+      "grad_norm": 0.06340507417917252,
+      "learning_rate": 0.000375,
+      "loss": 0.0251,
+      "step": 326
+    },
+    {
+      "epoch": 0.1705790297339593,
+      "grad_norm": 0.05245117098093033,
+      "learning_rate": 0.0003744444444444445,
+      "loss": 0.016,
+      "step": 327
+    },
+    {
+      "epoch": 0.17110067814293167,
+      "grad_norm": 0.07821597903966904,
+      "learning_rate": 0.0003738888888888889,
+      "loss": 0.0166,
+      "step": 328
+    },
+    {
+      "epoch": 0.17162232655190401,
+      "grad_norm": 0.05091237649321556,
+      "learning_rate": 0.0003733333333333334,
+      "loss": 0.0169,
+      "step": 329
+    },
+    {
+      "epoch": 0.17214397496087636,
+      "grad_norm": 0.11584059149026871,
+      "learning_rate": 0.00037277777777777776,
+      "loss": 0.0424,
+      "step": 330
+    },
+    {
+      "epoch": 0.17266562336984873,
+      "grad_norm": 0.08996029943227768,
+      "learning_rate": 0.00037222222222222225,
+      "loss": 0.0287,
+      "step": 331
+    },
+    {
+      "epoch": 0.17318727177882107,
+      "grad_norm": 0.06258998066186905,
+      "learning_rate": 0.00037166666666666663,
+      "loss": 0.0243,
+      "step": 332
+    },
+    {
+      "epoch": 0.17370892018779344,
+      "grad_norm": 0.06734970957040787,
+      "learning_rate": 0.0003711111111111111,
+      "loss": 0.0294,
+      "step": 333
+    },
+    {
+      "epoch": 0.17423056859676578,
+      "grad_norm": 0.06081216409802437,
+      "learning_rate": 0.0003705555555555556,
+      "loss": 0.0262,
+      "step": 334
+    },
+    {
+      "epoch": 0.17475221700573812,
+      "grad_norm": 0.06397537142038345,
+      "learning_rate": 0.00037,
+      "loss": 0.0161,
+      "step": 335
+    },
+    {
+      "epoch": 0.1752738654147105,
+      "grad_norm": 0.07987434417009354,
+      "learning_rate": 0.0003694444444444445,
+      "loss": 0.0301,
+      "step": 336
+    },
+    {
+      "epoch": 0.17579551382368283,
+      "grad_norm": 0.09395250678062439,
+      "learning_rate": 0.00036888888888888887,
+      "loss": 0.0289,
+      "step": 337
+    },
+    {
+      "epoch": 0.1763171622326552,
+      "grad_norm": 0.05801301822066307,
+      "learning_rate": 0.00036833333333333336,
+      "loss": 0.0326,
+      "step": 338
+    },
+    {
+      "epoch": 0.17683881064162754,
+      "grad_norm": 0.06285756826400757,
+      "learning_rate": 0.00036777777777777774,
+      "loss": 0.0235,
+      "step": 339
+    },
+    {
+      "epoch": 0.17736045905059988,
+      "grad_norm": 0.06429009139537811,
+      "learning_rate": 0.00036722222222222223,
+      "loss": 0.0136,
+      "step": 340
+    },
+    {
+      "epoch": 0.17788210745957225,
+      "grad_norm": 0.05570930242538452,
+      "learning_rate": 0.00036666666666666667,
+      "loss": 0.0205,
+      "step": 341
+    },
+    {
+      "epoch": 0.1784037558685446,
+      "grad_norm": 0.061478108167648315,
+      "learning_rate": 0.0003661111111111111,
+      "loss": 0.026,
+      "step": 342
+    },
+    {
+      "epoch": 0.17892540427751696,
+      "grad_norm": 0.07520420104265213,
+      "learning_rate": 0.0003655555555555556,
+      "loss": 0.0265,
+      "step": 343
+    },
+    {
+      "epoch": 0.1794470526864893,
+      "grad_norm": 0.047426123172044754,
+      "learning_rate": 0.000365,
+      "loss": 0.0144,
+      "step": 344
+    },
+    {
+      "epoch": 0.17996870109546165,
+      "grad_norm": 0.09971431642770767,
+      "learning_rate": 0.00036444444444444447,
+      "loss": 0.0359,
+      "step": 345
+    },
+    {
+      "epoch": 0.18049034950443402,
+      "grad_norm": 0.0507560633122921,
+      "learning_rate": 0.00036388888888888886,
+      "loss": 0.0203,
+      "step": 346
+    },
+    {
+      "epoch": 0.18101199791340636,
+      "grad_norm": 0.09610850363969803,
+      "learning_rate": 0.00036333333333333335,
+      "loss": 0.0352,
+      "step": 347
+    },
+    {
+      "epoch": 0.18153364632237873,
+      "grad_norm": 0.04846423119306564,
+      "learning_rate": 0.0003627777777777778,
+      "loss": 0.0162,
+      "step": 348
+    },
+    {
+      "epoch": 0.18205529473135107,
+      "grad_norm": 0.15771976113319397,
+      "learning_rate": 0.0003622222222222222,
+      "loss": 0.0323,
+      "step": 349
+    },
+    {
+      "epoch": 0.1825769431403234,
+      "grad_norm": 0.07306705415248871,
+      "learning_rate": 0.0003616666666666667,
+      "loss": 0.0217,
+      "step": 350
+    },
+    {
+      "epoch": 0.18309859154929578,
+      "grad_norm": 0.05630479007959366,
+      "learning_rate": 0.0003611111111111111,
+      "loss": 0.0149,
+      "step": 351
+    },
+    {
+      "epoch": 0.18362023995826812,
+      "grad_norm": 0.08934023231267929,
+      "learning_rate": 0.0003605555555555556,
+      "loss": 0.0113,
+      "step": 352
+    },
+    {
+      "epoch": 0.1841418883672405,
+      "grad_norm": 0.1724640429019928,
+      "learning_rate": 0.00035999999999999997,
+      "loss": 0.0435,
+      "step": 353
+    },
+    {
+      "epoch": 0.18466353677621283,
+      "grad_norm": 0.06963273137807846,
+      "learning_rate": 0.00035944444444444446,
+      "loss": 0.0307,
+      "step": 354
+    },
+    {
+      "epoch": 0.18518518518518517,
+      "grad_norm": 0.06084301322698593,
+      "learning_rate": 0.0003588888888888889,
+      "loss": 0.0306,
+      "step": 355
+    },
+    {
+      "epoch": 0.18570683359415754,
+      "grad_norm": 0.07648872584104538,
+      "learning_rate": 0.00035833333333333333,
+      "loss": 0.0271,
+      "step": 356
+    },
+    {
+      "epoch": 0.18622848200312989,
+      "grad_norm": 0.07619331032037735,
+      "learning_rate": 0.00035777777777777777,
+      "loss": 0.0171,
+      "step": 357
+    },
+    {
+      "epoch": 0.18675013041210226,
+      "grad_norm": 0.08520349115133286,
+      "learning_rate": 0.0003572222222222222,
+      "loss": 0.0284,
+      "step": 358
+    },
+    {
+      "epoch": 0.1872717788210746,
+      "grad_norm": 0.057310912758111954,
+      "learning_rate": 0.0003566666666666667,
+      "loss": 0.0223,
+      "step": 359
+    },
+    {
+      "epoch": 0.18779342723004694,
+      "grad_norm": 0.08222941309213638,
+      "learning_rate": 0.0003561111111111111,
+      "loss": 0.0296,
+      "step": 360
+    },
+    {
+      "epoch": 0.1883150756390193,
+      "grad_norm": 0.08427579700946808,
+      "learning_rate": 0.00035555555555555557,
+      "loss": 0.031,
+      "step": 361
+    },
+    {
+      "epoch": 0.18883672404799165,
+      "grad_norm": 0.06189948692917824,
+      "learning_rate": 0.000355,
+      "loss": 0.0273,
+      "step": 362
+    },
+    {
+      "epoch": 0.18935837245696402,
+      "grad_norm": 0.07053495943546295,
+      "learning_rate": 0.00035444444444444445,
+      "loss": 0.0197,
+      "step": 363
+    },
+    {
+      "epoch": 0.18988002086593636,
+      "grad_norm": 0.0913248062133789,
+      "learning_rate": 0.0003538888888888889,
+      "loss": 0.0336,
+      "step": 364
+    },
+    {
+      "epoch": 0.1904016692749087,
+      "grad_norm": 0.062306199222803116,
+      "learning_rate": 0.0003533333333333333,
+      "loss": 0.0291,
+      "step": 365
+    },
+    {
+      "epoch": 0.19092331768388107,
+      "grad_norm": 0.09297792613506317,
+      "learning_rate": 0.0003527777777777778,
+      "loss": 0.0338,
+      "step": 366
+    },
+    {
+      "epoch": 0.1914449660928534,
+      "grad_norm": 0.12690134346485138,
+      "learning_rate": 0.00035222222222222225,
+      "loss": 0.0399,
+      "step": 367
+    },
+    {
+      "epoch": 0.19196661450182578,
+      "grad_norm": 0.07451540231704712,
+      "learning_rate": 0.0003516666666666667,
+      "loss": 0.0364,
+      "step": 368
+    },
+    {
+      "epoch": 0.19248826291079812,
+      "grad_norm": 0.09954366087913513,
+      "learning_rate": 0.0003511111111111111,
+      "loss": 0.036,
+      "step": 369
+    },
+    {
+      "epoch": 0.19300991131977047,
+      "grad_norm": 0.07105272263288498,
+      "learning_rate": 0.00035055555555555556,
+      "loss": 0.0226,
+      "step": 370
+    },
+    {
+      "epoch": 0.19353155972874284,
+      "grad_norm": 0.06857888400554657,
+      "learning_rate": 0.00035,
+      "loss": 0.0305,
+      "step": 371
+    },
+    {
+      "epoch": 0.19405320813771518,
+      "grad_norm": 0.060487356036901474,
+      "learning_rate": 0.00034944444444444443,
+      "loss": 0.0279,
+      "step": 372
+    },
+    {
+      "epoch": 0.19457485654668752,
+      "grad_norm": 0.07935786992311478,
+      "learning_rate": 0.0003488888888888889,
+      "loss": 0.0377,
+      "step": 373
+    },
+    {
+      "epoch": 0.1950965049556599,
+      "grad_norm": 0.10610669106245041,
+      "learning_rate": 0.00034833333333333336,
+      "loss": 0.0198,
+      "step": 374
+    },
+    {
+      "epoch": 0.19561815336463223,
+      "grad_norm": 0.06738949567079544,
+      "learning_rate": 0.0003477777777777778,
+      "loss": 0.0321,
+      "step": 375
+    },
+    {
+      "epoch": 0.1961398017736046,
+      "grad_norm": 0.09995345771312714,
+      "learning_rate": 0.00034722222222222224,
+      "loss": 0.0168,
+      "step": 376
+    },
+    {
+      "epoch": 0.19666145018257694,
+      "grad_norm": 0.07820367068052292,
+      "learning_rate": 0.00034666666666666667,
+      "loss": 0.0192,
+      "step": 377
+    },
+    {
+      "epoch": 0.19718309859154928,
+      "grad_norm": 0.05883244797587395,
+      "learning_rate": 0.0003461111111111111,
+      "loss": 0.0217,
+      "step": 378
+    },
+    {
+      "epoch": 0.19770474700052165,
+      "grad_norm": 0.06929990649223328,
+      "learning_rate": 0.00034555555555555555,
+      "loss": 0.0248,
+      "step": 379
+    },
+    {
+      "epoch": 0.198226395409494,
+      "grad_norm": 0.060583919286727905,
+      "learning_rate": 0.000345,
+      "loss": 0.0204,
+      "step": 380
+    },
+    {
+      "epoch": 0.19874804381846636,
+      "grad_norm": 0.08263508230447769,
+      "learning_rate": 0.0003444444444444445,
+      "loss": 0.0261,
+      "step": 381
+    },
+    {
+      "epoch": 0.1992696922274387,
+      "grad_norm": 0.07354709506034851,
+      "learning_rate": 0.0003438888888888889,
+      "loss": 0.0328,
+      "step": 382
+    },
+    {
+      "epoch": 0.19979134063641105,
+      "grad_norm": 0.09151386469602585,
+      "learning_rate": 0.00034333333333333335,
+      "loss": 0.0315,
+      "step": 383
+    },
+    {
+      "epoch": 0.20031298904538342,
+      "grad_norm": 0.06800325959920883,
+      "learning_rate": 0.0003427777777777778,
+      "loss": 0.0265,
+      "step": 384
+    },
+    {
+      "epoch": 0.20083463745435576,
+      "grad_norm": 0.05672604963183403,
+      "learning_rate": 0.0003422222222222222,
+      "loss": 0.0216,
+      "step": 385
+    },
+    {
+      "epoch": 0.20135628586332813,
+      "grad_norm": 0.07447244226932526,
+      "learning_rate": 0.00034166666666666666,
+      "loss": 0.0253,
+      "step": 386
+    },
+    {
+      "epoch": 0.20187793427230047,
+      "grad_norm": 0.051845699548721313,
+      "learning_rate": 0.0003411111111111111,
+      "loss": 0.0291,
+      "step": 387
+    },
+    {
+      "epoch": 0.2023995826812728,
+      "grad_norm": 0.04311797395348549,
+      "learning_rate": 0.0003405555555555556,
+      "loss": 0.0189,
+      "step": 388
+    },
+    {
+      "epoch": 0.20292123109024518,
+      "grad_norm": 0.08901547640562057,
+      "learning_rate": 0.00034,
+      "loss": 0.0313,
+      "step": 389
+    },
+    {
+      "epoch": 0.20344287949921752,
+      "grad_norm": 0.05055601894855499,
+      "learning_rate": 0.00033944444444444446,
+      "loss": 0.0231,
+      "step": 390
+    },
+    {
+      "epoch": 0.2039645279081899,
+      "grad_norm": 0.04943820461630821,
+      "learning_rate": 0.0003388888888888889,
+      "loss": 0.021,
+      "step": 391
+    },
+    {
+      "epoch": 0.20448617631716223,
+      "grad_norm": 0.0558842197060585,
+      "learning_rate": 0.00033833333333333334,
+      "loss": 0.0251,
+      "step": 392
+    },
+    {
+      "epoch": 0.20500782472613457,
+      "grad_norm": 0.06570509821176529,
+      "learning_rate": 0.00033777777777777777,
+      "loss": 0.012,
+      "step": 393
+    },
+    {
+      "epoch": 0.20552947313510694,
+      "grad_norm": 0.13640566170215607,
+      "learning_rate": 0.0003372222222222222,
+      "loss": 0.0396,
+      "step": 394
+    },
+    {
+      "epoch": 0.20605112154407929,
+      "grad_norm": 0.05271435156464577,
+      "learning_rate": 0.0003366666666666667,
+      "loss": 0.028,
+      "step": 395
+    },
+    {
+      "epoch": 0.20657276995305165,
+      "grad_norm": 0.04778929427266121,
+      "learning_rate": 0.00033611111111111114,
+      "loss": 0.0126,
+      "step": 396
+    },
+    {
+      "epoch": 0.207094418362024,
+      "grad_norm": 0.04178643599152565,
+      "learning_rate": 0.0003355555555555556,
+      "loss": 0.0148,
+      "step": 397
+    },
+    {
+      "epoch": 0.20761606677099634,
+      "grad_norm": 0.05933418869972229,
+      "learning_rate": 0.000335,
+      "loss": 0.0218,
+      "step": 398
+    },
+    {
+      "epoch": 0.2081377151799687,
+      "grad_norm": 0.05561219900846481,
+      "learning_rate": 0.00033444444444444445,
+      "loss": 0.0167,
+      "step": 399
+    },
+    {
+      "epoch": 0.20865936358894105,
+      "grad_norm": 0.0431622713804245,
+      "learning_rate": 0.0003338888888888889,
+      "loss": 0.0263,
+      "step": 400
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 1000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 512,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-400/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-500/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-500/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896

checkpoint-500/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-600/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}