bsq1989 committed on
Commit
eaecfc9
·
verified ·
1 Parent(s): 69f3572

Upload stronger Qwen3-4B SQL model

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: other
4
+ base_model: Qwen/Qwen3-4B-Base
5
+ tags:
6
+ - llama-factory
7
+ - full
8
+ - generated_from_trainer
9
+ model-index:
10
+ - name: train_run_06_qwen3_4b_formal
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # train_run_06_qwen3_4b_formal
18
+
19
+ This model is a fine-tuned version of [Qwen/Qwen3-4B-Base](https://huggingface.co/Qwen/Qwen3-4B-Base) on the sql_train dataset.
20
+ It achieves the following results on the evaluation set:
21
+ - Loss: 0.2490
22
+
23
+ ## Model description
24
+
25
+ More information needed
26
+
27
+ ## Intended uses & limitations
28
+
29
+ More information needed
30
+
31
+ ## Training and evaluation data
32
+
33
+ More information needed
34
+
35
+ ## Training procedure
36
+
37
+ ### Training hyperparameters
38
+
39
+ The following hyperparameters were used during training:
40
+ - learning_rate: 5e-06
41
+ - train_batch_size: 1
42
+ - eval_batch_size: 1
43
+ - seed: 42
44
+ - gradient_accumulation_steps: 8
45
+ - total_train_batch_size: 8
46
+ - optimizer: Use OptimizerNames.ADAMW_TORCH_FUSED with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
47
+ - lr_scheduler_type: cosine
48
+ - lr_scheduler_warmup_steps: 300
49
+ - num_epochs: 4.0
50
+
51
+ ### Training results
52
+
53
+ | Training Loss | Epoch | Step | Validation Loss |
54
+ |:-------------:|:------:|:----:|:---------------:|
55
+ | 0.1851 | 0.3044 | 500 | 0.2562 |
56
+ | 0.1331 | 0.6088 | 1000 | 0.2490 |
57
+ | 0.1293 | 0.9132 | 1500 | 0.2554 |
58
+ | 0.0885 | 1.2174 | 2000 | 0.2661 |
59
+ | 0.0830 | 1.5218 | 2500 | 0.2772 |
60
+ | 0.0843 | 1.8262 | 3000 | 0.2774 |
61
+ | 0.0443 | 2.1303 | 3500 | 0.3124 |
62
+ | 0.0402 | 2.4347 | 4000 | 0.3158 |
63
+ | 0.0414 | 2.7391 | 4500 | 0.3392 |
64
+ | 0.0178 | 3.0432 | 5000 | 0.3912 |
65
+ | 0.0128 | 3.3476 | 5500 | 0.4340 |
66
+ | 0.0110 | 3.6521 | 6000 | 0.4406 |
67
+ | 0.0109 | 3.9565 | 6500 | 0.4410 |
68
+
69
+
70
+ ### Framework versions
71
+
72
+ - Transformers 5.2.0
73
+ - Pytorch 2.9.1+cu130
74
+ - Datasets 4.0.0
75
+ - Tokenizers 0.22.2
all_results.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "effective_tokens_per_sec": 1153.798380212981,
3
+ "epoch": 4.0,
4
+ "eval_loss": 0.24901165068149567,
5
+ "eval_runtime": 46.8427,
6
+ "eval_samples_per_second": 18.786,
7
+ "eval_steps_per_second": 18.786,
8
+ "total_flos": 3.545203061907456e+17,
9
+ "train_loss": 0.08566030774240586,
10
+ "train_runtime": 13933.4985,
11
+ "train_samples_per_second": 3.772,
12
+ "train_steps_per_second": 0.472
13
+ }
chat_template.jinja ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {{- messages[0].content + '\n\n' }}
5
+ {%- endif %}
6
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
7
+ {%- for tool in tools %}
8
+ {{- "\n" }}
9
+ {{- tool | tojson }}
10
+ {%- endfor %}
11
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
12
+ {%- else %}
13
+ {%- if messages[0].role == 'system' %}
14
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
15
+ {%- endif %}
16
+ {%- endif %}
17
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
18
+ {%- for message in messages[::-1] %}
19
+ {%- set index = (messages|length - 1) - loop.index0 %}
20
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
21
+ {%- set ns.multi_step_tool = false %}
22
+ {%- set ns.last_query_index = index %}
23
+ {%- endif %}
24
+ {%- endfor %}
25
+ {%- for message in messages %}
26
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
27
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
28
+ {%- elif message.role == "assistant" %}
29
+ {%- set content = message.content %}
30
+ {%- set reasoning_content = '' %}
31
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
32
+ {%- set reasoning_content = message.reasoning_content %}
33
+ {%- else %}
34
+ {%- if '</think>' in message.content %}
35
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
36
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
37
+ {%- endif %}
38
+ {%- endif %}
39
+ {%- if loop.index0 > ns.last_query_index %}
40
+ {%- if loop.last or (not loop.last and reasoning_content) %}
41
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
42
+ {%- else %}
43
+ {{- '<|im_start|>' + message.role + '\n' + content }}
44
+ {%- endif %}
45
+ {%- else %}
46
+ {{- '<|im_start|>' + message.role + '\n' + content }}
47
+ {%- endif %}
48
+ {%- if message.tool_calls %}
49
+ {%- for tool_call in message.tool_calls %}
50
+ {%- if (loop.first and content) or (not loop.first) %}
51
+ {{- '\n' }}
52
+ {%- endif %}
53
+ {%- if tool_call.function %}
54
+ {%- set tool_call = tool_call.function %}
55
+ {%- endif %}
56
+ {{- '<tool_call>\n{"name": "' }}
57
+ {{- tool_call.name }}
58
+ {{- '", "arguments": ' }}
59
+ {%- if tool_call.arguments is string %}
60
+ {{- tool_call.arguments }}
61
+ {%- else %}
62
+ {{- tool_call.arguments | tojson }}
63
+ {%- endif %}
64
+ {{- '}\n</tool_call>' }}
65
+ {%- endfor %}
66
+ {%- endif %}
67
+ {{- '<|im_end|>\n' }}
68
+ {%- elif message.role == "tool" %}
69
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
70
+ {{- '<|im_start|>user' }}
71
+ {%- endif %}
72
+ {{- '\n<tool_response>\n' }}
73
+ {{- message.content }}
74
+ {{- '\n</tool_response>' }}
75
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
76
+ {{- '<|im_end|>\n' }}
77
+ {%- endif %}
78
+ {%- endif %}
79
+ {%- endfor %}
80
+ {%- if add_generation_prompt %}
81
+ {{- '<|im_start|>assistant\n' }}
82
+ {%- if enable_thinking is defined and enable_thinking is false %}
83
+ {{- '<think>\n\n</think>\n\n' }}
84
+ {%- endif %}
85
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": null,
8
+ "dtype": "float32",
9
+ "eos_token_id": 151645,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 2560,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 9728,
15
+ "layer_types": [
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention",
44
+ "full_attention",
45
+ "full_attention",
46
+ "full_attention",
47
+ "full_attention",
48
+ "full_attention",
49
+ "full_attention",
50
+ "full_attention",
51
+ "full_attention"
52
+ ],
53
+ "max_position_embeddings": 32768,
54
+ "max_window_layers": 36,
55
+ "model_type": "qwen3",
56
+ "num_attention_heads": 32,
57
+ "num_hidden_layers": 36,
58
+ "num_key_value_heads": 8,
59
+ "pad_token_id": 151643,
60
+ "rms_norm_eps": 1e-06,
61
+ "rope_parameters": {
62
+ "rope_theta": 1000000,
63
+ "rope_type": "default"
64
+ },
65
+ "sliding_window": null,
66
+ "tie_word_embeddings": true,
67
+ "transformers_version": "5.2.0",
68
+ "use_cache": false,
69
+ "use_sliding_window": false,
70
+ "vocab_size": 151936
71
+ }
eval_results.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 4.0,
3
+ "eval_loss": 0.24901165068149567,
4
+ "eval_runtime": 46.8427,
5
+ "eval_samples_per_second": 18.786,
6
+ "eval_steps_per_second": 18.786
7
+ }
generation_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_sample": false,
3
+ "eos_token_id": [
4
+ 151645,
5
+ 151643
6
+ ],
7
+ "max_new_tokens": 2048,
8
+ "pad_token_id": 151643,
9
+ "transformers_version": "5.2.0"
10
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a04a88d8d7c026499bc570cdb42e1cbc0af368cf779b2f1060543a5d47dee1a5
3
+ size 16089918232
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
3
+ size 11422650
tokenizer_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|im_start|>",
10
+ "<|im_end|>",
11
+ "<|object_ref_start|>",
12
+ "<|object_ref_end|>",
13
+ "<|box_start|>",
14
+ "<|box_end|>",
15
+ "<|quad_start|>",
16
+ "<|quad_end|>",
17
+ "<|vision_start|>",
18
+ "<|vision_end|>",
19
+ "<|vision_pad|>",
20
+ "<|image_pad|>",
21
+ "<|video_pad|>"
22
+ ],
23
+ "is_local": true,
24
+ "model_max_length": 131072,
25
+ "pad_token": "<|endoftext|>",
26
+ "padding_side": "right",
27
+ "split_special_tokens": false,
28
+ "tokenizer_class": "Qwen2Tokenizer",
29
+ "unk_token": null
30
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "effective_tokens_per_sec": 1153.798380212981,
3
+ "epoch": 4.0,
4
+ "total_flos": 3.545203061907456e+17,
5
+ "train_loss": 0.08566030774240586,
6
+ "train_runtime": 13933.4985,
7
+ "train_samples_per_second": 3.772,
8
+ "train_steps_per_second": 0.472
9
+ }
trainer_log.jsonl ADDED
@@ -0,0 +1,342 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"current_steps": 20, "total_steps": 6572, "loss": 1.119423007965088, "lr": 3.166666666666667e-07, "epoch": 0.0121765601217656, "percentage": 0.3, "elapsed_time": "0:00:54", "remaining_time": "4:54:53"}
2
+ {"current_steps": 40, "total_steps": 6572, "loss": 0.8098940849304199, "lr": 6.5e-07, "epoch": 0.0243531202435312, "percentage": 0.61, "elapsed_time": "0:01:37", "remaining_time": "4:25:19"}
3
+ {"current_steps": 60, "total_steps": 6572, "loss": 0.6310151576995849, "lr": 9.833333333333334e-07, "epoch": 0.0365296803652968, "percentage": 0.91, "elapsed_time": "0:02:14", "remaining_time": "4:03:37"}
4
+ {"current_steps": 80, "total_steps": 6572, "loss": 0.5526751518249512, "lr": 1.3166666666666666e-06, "epoch": 0.0487062404870624, "percentage": 1.22, "elapsed_time": "0:02:51", "remaining_time": "3:52:34"}
5
+ {"current_steps": 100, "total_steps": 6572, "loss": 0.4834024906158447, "lr": 1.6500000000000003e-06, "epoch": 0.060882800608828, "percentage": 1.52, "elapsed_time": "0:03:29", "remaining_time": "3:45:38"}
6
+ {"current_steps": 120, "total_steps": 6572, "loss": 0.44873433113098143, "lr": 1.9833333333333335e-06, "epoch": 0.0730593607305936, "percentage": 1.83, "elapsed_time": "0:04:06", "remaining_time": "3:40:39"}
7
+ {"current_steps": 140, "total_steps": 6572, "loss": 0.38286705017089845, "lr": 2.316666666666667e-06, "epoch": 0.0852359208523592, "percentage": 2.13, "elapsed_time": "0:04:44", "remaining_time": "3:37:42"}
8
+ {"current_steps": 160, "total_steps": 6572, "loss": 0.38465352058410646, "lr": 2.6500000000000005e-06, "epoch": 0.0974124809741248, "percentage": 2.43, "elapsed_time": "0:05:21", "remaining_time": "3:34:27"}
9
+ {"current_steps": 180, "total_steps": 6572, "loss": 0.3305965900421143, "lr": 2.9833333333333337e-06, "epoch": 0.1095890410958904, "percentage": 2.74, "elapsed_time": "0:05:56", "remaining_time": "3:31:11"}
10
+ {"current_steps": 200, "total_steps": 6572, "loss": 0.2875316619873047, "lr": 3.316666666666667e-06, "epoch": 0.121765601217656, "percentage": 3.04, "elapsed_time": "0:06:33", "remaining_time": "3:28:43"}
11
+ {"current_steps": 220, "total_steps": 6572, "loss": 0.3140716075897217, "lr": 3.65e-06, "epoch": 0.1339421613394216, "percentage": 3.35, "elapsed_time": "0:07:09", "remaining_time": "3:26:36"}
12
+ {"current_steps": 240, "total_steps": 6572, "loss": 0.2985520839691162, "lr": 3.983333333333334e-06, "epoch": 0.1461187214611872, "percentage": 3.65, "elapsed_time": "0:07:45", "remaining_time": "3:24:30"}
13
+ {"current_steps": 260, "total_steps": 6572, "loss": 0.2449601411819458, "lr": 4.316666666666667e-06, "epoch": 0.1582952815829528, "percentage": 3.96, "elapsed_time": "0:08:21", "remaining_time": "3:22:57"}
14
+ {"current_steps": 280, "total_steps": 6572, "loss": 0.24566495418548584, "lr": 4.65e-06, "epoch": 0.1704718417047184, "percentage": 4.26, "elapsed_time": "0:08:58", "remaining_time": "3:21:42"}
15
+ {"current_steps": 300, "total_steps": 6572, "loss": 0.20863604545593262, "lr": 4.983333333333334e-06, "epoch": 0.182648401826484, "percentage": 4.56, "elapsed_time": "0:09:34", "remaining_time": "3:20:20"}
16
+ {"current_steps": 320, "total_steps": 6572, "loss": 0.19693111181259154, "lr": 4.9998867856224845e-06, "epoch": 0.1948249619482496, "percentage": 4.87, "elapsed_time": "0:10:11", "remaining_time": "3:19:13"}
17
+ {"current_steps": 340, "total_steps": 6572, "loss": 0.21833207607269287, "lr": 4.999523005839606e-06, "epoch": 0.2070015220700152, "percentage": 5.17, "elapsed_time": "0:10:47", "remaining_time": "3:17:45"}
18
+ {"current_steps": 360, "total_steps": 6572, "loss": 0.2382877826690674, "lr": 4.998908383543311e-06, "epoch": 0.2191780821917808, "percentage": 5.48, "elapsed_time": "0:11:21", "remaining_time": "3:16:06"}
19
+ {"current_steps": 380, "total_steps": 6572, "loss": 0.21118538379669188, "lr": 4.9980429804147276e-06, "epoch": 0.2313546423135464, "percentage": 5.78, "elapsed_time": "0:11:58", "remaining_time": "3:15:01"}
20
+ {"current_steps": 400, "total_steps": 6572, "loss": 0.18367968797683715, "lr": 4.996926883302385e-06, "epoch": 0.243531202435312, "percentage": 6.09, "elapsed_time": "0:12:34", "remaining_time": "3:13:55"}
21
+ {"current_steps": 420, "total_steps": 6572, "loss": 0.22483460903167723, "lr": 4.995560204213496e-06, "epoch": 0.2557077625570776, "percentage": 6.39, "elapsed_time": "0:13:12", "remaining_time": "3:13:22"}
22
+ {"current_steps": 440, "total_steps": 6572, "loss": 0.22673561573028564, "lr": 4.993943080302715e-06, "epoch": 0.2678843226788432, "percentage": 6.7, "elapsed_time": "0:13:50", "remaining_time": "3:12:57"}
23
+ {"current_steps": 460, "total_steps": 6572, "loss": 0.185296630859375, "lr": 4.992075673858379e-06, "epoch": 0.2800608828006088, "percentage": 7.0, "elapsed_time": "0:14:26", "remaining_time": "3:11:55"}
24
+ {"current_steps": 480, "total_steps": 6572, "loss": 0.18437937498092652, "lr": 4.989958172286214e-06, "epoch": 0.2922374429223744, "percentage": 7.3, "elapsed_time": "0:15:04", "remaining_time": "3:11:23"}
25
+ {"current_steps": 500, "total_steps": 6572, "loss": 0.1850834846496582, "lr": 4.987590788090533e-06, "epoch": 0.30441400304414, "percentage": 7.61, "elapsed_time": "0:15:42", "remaining_time": "3:10:48"}
26
+ {"current_steps": 500, "total_steps": 6572, "eval_loss": 0.2562323212623596, "epoch": 0.30441400304414, "percentage": 7.61, "elapsed_time": "0:16:30", "remaining_time": "3:20:23"}
27
+ {"current_steps": 520, "total_steps": 6572, "loss": 0.16346561908721924, "lr": 4.984973758852904e-06, "epoch": 0.3165905631659056, "percentage": 7.91, "elapsed_time": "0:18:27", "remaining_time": "3:34:45"}
28
+ {"current_steps": 540, "total_steps": 6572, "loss": 0.18838067054748536, "lr": 4.982107347208317e-06, "epoch": 0.3287671232876712, "percentage": 8.22, "elapsed_time": "0:19:03", "remaining_time": "3:32:49"}
29
+ {"current_steps": 560, "total_steps": 6572, "loss": 0.177593994140625, "lr": 4.978991840818816e-06, "epoch": 0.3409436834094368, "percentage": 8.52, "elapsed_time": "0:19:40", "remaining_time": "3:31:13"}
30
+ {"current_steps": 580, "total_steps": 6572, "loss": 0.20775914192199707, "lr": 4.975627552344638e-06, "epoch": 0.3531202435312024, "percentage": 8.83, "elapsed_time": "0:20:18", "remaining_time": "3:29:49"}
31
+ {"current_steps": 600, "total_steps": 6572, "loss": 0.16498700380325318, "lr": 4.97201481941283e-06, "epoch": 0.365296803652968, "percentage": 9.13, "elapsed_time": "0:20:55", "remaining_time": "3:28:12"}
32
+ {"current_steps": 620, "total_steps": 6572, "loss": 0.17565951347351075, "lr": 4.968154004583374e-06, "epoch": 0.3774733637747336, "percentage": 9.43, "elapsed_time": "0:21:31", "remaining_time": "3:26:35"}
33
+ {"current_steps": 640, "total_steps": 6572, "loss": 0.16204673051834106, "lr": 4.964045495312794e-06, "epoch": 0.3896499238964992, "percentage": 9.74, "elapsed_time": "0:22:07", "remaining_time": "3:24:59"}
34
+ {"current_steps": 660, "total_steps": 6572, "loss": 0.17068564891815186, "lr": 4.959689703915272e-06, "epoch": 0.4018264840182648, "percentage": 10.04, "elapsed_time": "0:22:42", "remaining_time": "3:23:27"}
35
+ {"current_steps": 680, "total_steps": 6572, "loss": 0.1589680790901184, "lr": 4.95508706752128e-06, "epoch": 0.4140030441400304, "percentage": 10.35, "elapsed_time": "0:23:19", "remaining_time": "3:22:07"}
36
+ {"current_steps": 700, "total_steps": 6572, "loss": 0.17568455934524535, "lr": 4.9502380480337e-06, "epoch": 0.426179604261796, "percentage": 10.65, "elapsed_time": "0:23:56", "remaining_time": "3:20:51"}
37
+ {"current_steps": 720, "total_steps": 6572, "loss": 0.16204804182052612, "lr": 4.9451431320814715e-06, "epoch": 0.4383561643835616, "percentage": 10.96, "elapsed_time": "0:24:33", "remaining_time": "3:19:40"}
38
+ {"current_steps": 740, "total_steps": 6572, "loss": 0.16562143564224244, "lr": 4.939802830970762e-06, "epoch": 0.4505327245053272, "percentage": 11.26, "elapsed_time": "0:25:10", "remaining_time": "3:18:26"}
39
+ {"current_steps": 760, "total_steps": 6572, "loss": 0.17697544097900392, "lr": 4.934217680633646e-06, "epoch": 0.4627092846270928, "percentage": 11.56, "elapsed_time": "0:25:46", "remaining_time": "3:17:03"}
40
+ {"current_steps": 780, "total_steps": 6572, "loss": 0.1649466037750244, "lr": 4.928388241574327e-06, "epoch": 0.4748858447488584, "percentage": 11.87, "elapsed_time": "0:26:23", "remaining_time": "3:15:55"}
41
+ {"current_steps": 800, "total_steps": 6572, "loss": 0.1602837324142456, "lr": 4.922315098812883e-06, "epoch": 0.487062404870624, "percentage": 12.17, "elapsed_time": "0:26:59", "remaining_time": "3:14:43"}
42
+ {"current_steps": 820, "total_steps": 6572, "loss": 0.142719042301178, "lr": 4.9159988618265585e-06, "epoch": 0.4992389649923896, "percentage": 12.48, "elapsed_time": "0:27:36", "remaining_time": "3:13:37"}
43
+ {"current_steps": 840, "total_steps": 6572, "loss": 0.14508233070373536, "lr": 4.9094401644886e-06, "epoch": 0.5114155251141552, "percentage": 12.78, "elapsed_time": "0:28:12", "remaining_time": "3:12:31"}
44
+ {"current_steps": 860, "total_steps": 6572, "loss": 0.1821539044380188, "lr": 4.902639665004641e-06, "epoch": 0.5235920852359208, "percentage": 13.09, "elapsed_time": "0:28:49", "remaining_time": "3:11:27"}
45
+ {"current_steps": 880, "total_steps": 6572, "loss": 0.16131887435913086, "lr": 4.89559804584665e-06, "epoch": 0.5357686453576864, "percentage": 13.39, "elapsed_time": "0:29:24", "remaining_time": "3:10:16"}
46
+ {"current_steps": 900, "total_steps": 6572, "loss": 0.17404688596725465, "lr": 4.888316013684435e-06, "epoch": 0.547945205479452, "percentage": 13.69, "elapsed_time": "0:30:01", "remaining_time": "3:09:11"}
47
+ {"current_steps": 920, "total_steps": 6572, "loss": 0.14134640693664552, "lr": 4.880794299314732e-06, "epoch": 0.5601217656012176, "percentage": 14.0, "elapsed_time": "0:30:38", "remaining_time": "3:08:12"}
48
+ {"current_steps": 940, "total_steps": 6572, "loss": 0.14891813993453978, "lr": 4.87303365758786e-06, "epoch": 0.5722983257229832, "percentage": 14.3, "elapsed_time": "0:31:15", "remaining_time": "3:07:19"}
49
+ {"current_steps": 960, "total_steps": 6572, "loss": 0.1696299910545349, "lr": 4.865034867331967e-06, "epoch": 0.5844748858447488, "percentage": 14.61, "elapsed_time": "0:31:52", "remaining_time": "3:06:21"}
50
+ {"current_steps": 980, "total_steps": 6572, "loss": 0.14085158109664916, "lr": 4.856798731274874e-06, "epoch": 0.5966514459665144, "percentage": 14.91, "elapsed_time": "0:32:29", "remaining_time": "3:05:22"}
51
+ {"current_steps": 1000, "total_steps": 6572, "loss": 0.133053982257843, "lr": 4.84832607596351e-06, "epoch": 0.60882800608828, "percentage": 15.22, "elapsed_time": "0:33:04", "remaining_time": "3:04:16"}
52
+ {"current_steps": 1000, "total_steps": 6572, "eval_loss": 0.24901165068149567, "epoch": 0.60882800608828, "percentage": 15.22, "elapsed_time": "0:33:51", "remaining_time": "3:08:37"}
53
+ {"current_steps": 1020, "total_steps": 6572, "loss": 0.12680984735488893, "lr": 4.8396177516809695e-06, "epoch": 0.6210045662100456, "percentage": 15.52, "elapsed_time": "0:36:06", "remaining_time": "3:16:32"}
54
+ {"current_steps": 1040, "total_steps": 6572, "loss": 0.14880582094192504, "lr": 4.830674632361178e-06, "epoch": 0.6331811263318112, "percentage": 15.82, "elapsed_time": "0:36:43", "remaining_time": "3:15:23"}
55
+ {"current_steps": 1060, "total_steps": 6572, "loss": 0.1447562575340271, "lr": 4.821497615501186e-06, "epoch": 0.6453576864535768, "percentage": 16.13, "elapsed_time": "0:37:21", "remaining_time": "3:14:16"}
56
+ {"current_steps": 1080, "total_steps": 6572, "loss": 0.15530819892883302, "lr": 4.812087622071104e-06, "epoch": 0.6575342465753424, "percentage": 16.43, "elapsed_time": "0:37:59", "remaining_time": "3:13:10"}
57
+ {"current_steps": 1100, "total_steps": 6572, "loss": 0.14426586627960206, "lr": 4.80244559642167e-06, "epoch": 0.669710806697108, "percentage": 16.74, "elapsed_time": "0:38:36", "remaining_time": "3:12:03"}
58
+ {"current_steps": 1120, "total_steps": 6572, "loss": 0.15436025857925414, "lr": 4.792572506189489e-06, "epoch": 0.6818873668188736, "percentage": 17.04, "elapsed_time": "0:39:13", "remaining_time": "3:10:54"}
59
+ {"current_steps": 1140, "total_steps": 6572, "loss": 0.14860854148864747, "lr": 4.782469342199915e-06, "epoch": 0.6940639269406392, "percentage": 17.35, "elapsed_time": "0:39:50", "remaining_time": "3:09:48"}
60
+ {"current_steps": 1160, "total_steps": 6572, "loss": 0.1313084125518799, "lr": 4.7721371183676205e-06, "epoch": 0.7062404870624048, "percentage": 17.65, "elapsed_time": "0:40:26", "remaining_time": "3:08:40"}
61
+ {"current_steps": 1180, "total_steps": 6572, "loss": 0.150812029838562, "lr": 4.761576871594841e-06, "epoch": 0.7184170471841704, "percentage": 17.95, "elapsed_time": "0:41:02", "remaining_time": "3:07:33"}
62
+ {"current_steps": 1200, "total_steps": 6572, "loss": 0.13278884887695314, "lr": 4.750789661667318e-06, "epoch": 0.730593607305936, "percentage": 18.26, "elapsed_time": "0:41:38", "remaining_time": "3:06:24"}
63
+ {"current_steps": 1220, "total_steps": 6572, "loss": 0.1612934350967407, "lr": 4.739776571147943e-06, "epoch": 0.7427701674277016, "percentage": 18.56, "elapsed_time": "0:42:14", "remaining_time": "3:05:16"}
64
+ {"current_steps": 1240, "total_steps": 6572, "loss": 0.16211290359497071, "lr": 4.728538705268116e-06, "epoch": 0.7549467275494672, "percentage": 18.87, "elapsed_time": "0:42:49", "remaining_time": "3:04:10"}
65
+ {"current_steps": 1260, "total_steps": 6572, "loss": 0.14386119842529296, "lr": 4.717077191816824e-06, "epoch": 0.7671232876712328, "percentage": 19.17, "elapsed_time": "0:43:26", "remaining_time": "3:03:09"}
66
+ {"current_steps": 1280, "total_steps": 6572, "loss": 0.12942540645599365, "lr": 4.705393181027463e-06, "epoch": 0.7792998477929984, "percentage": 19.48, "elapsed_time": "0:44:01", "remaining_time": "3:02:02"}
67
+ {"current_steps": 1300, "total_steps": 6572, "loss": 0.14771063327789308, "lr": 4.693487845462413e-06, "epoch": 0.791476407914764, "percentage": 19.78, "elapsed_time": "0:44:38", "remaining_time": "3:01:02"}
68
+ {"current_steps": 1320, "total_steps": 6572, "loss": 0.1276724100112915, "lr": 4.681362379895349e-06, "epoch": 0.8036529680365296, "percentage": 20.09, "elapsed_time": "0:45:14", "remaining_time": "3:00:00"}
69
+ {"current_steps": 1340, "total_steps": 6572, "loss": 0.1319241166114807, "lr": 4.6690180011913524e-06, "epoch": 0.8158295281582952, "percentage": 20.39, "elapsed_time": "0:45:50", "remaining_time": "2:58:59"}
70
+ {"current_steps": 1360, "total_steps": 6572, "loss": 0.1557891011238098, "lr": 4.6564559481847795e-06, "epoch": 0.8280060882800608, "percentage": 20.69, "elapsed_time": "0:46:27", "remaining_time": "2:58:01"}
71
+ {"current_steps": 1380, "total_steps": 6572, "loss": 0.11075855493545532, "lr": 4.643677481554947e-06, "epoch": 0.8401826484018264, "percentage": 21.0, "elapsed_time": "0:47:04", "remaining_time": "2:57:07"}
72
+ {"current_steps": 1400, "total_steps": 6572, "loss": 0.1580789566040039, "lr": 4.630683883699607e-06, "epoch": 0.852359208523592, "percentage": 21.3, "elapsed_time": "0:47:40", "remaining_time": "2:56:08"}
73
+ {"current_steps": 1420, "total_steps": 6572, "loss": 0.16006500720977784, "lr": 4.6174764586062556e-06, "epoch": 0.8645357686453576, "percentage": 21.61, "elapsed_time": "0:48:18", "remaining_time": "2:55:14"}
74
+ {"current_steps": 1440, "total_steps": 6572, "loss": 0.1462727189064026, "lr": 4.6040565317212685e-06, "epoch": 0.8767123287671232, "percentage": 21.91, "elapsed_time": "0:48:54", "remaining_time": "2:54:20"}
75
+ {"current_steps": 1460, "total_steps": 6572, "loss": 0.14725338220596312, "lr": 4.59042544981688e-06, "epoch": 0.8888888888888888, "percentage": 22.22, "elapsed_time": "0:49:30", "remaining_time": "2:53:22"}
76
+ {"current_steps": 1480, "total_steps": 6572, "loss": 0.1304166793823242, "lr": 4.5765845808560334e-06, "epoch": 0.9010654490106544, "percentage": 22.52, "elapsed_time": "0:50:07", "remaining_time": "2:52:27"}
77
+ {"current_steps": 1500, "total_steps": 6572, "loss": 0.1293134570121765, "lr": 4.562535313855094e-06, "epoch": 0.91324200913242, "percentage": 22.82, "elapsed_time": "0:50:45", "remaining_time": "2:51:38"}
78
+ {"current_steps": 1500, "total_steps": 6572, "eval_loss": 0.25538697838783264, "epoch": 0.91324200913242, "percentage": 22.82, "elapsed_time": "0:51:32", "remaining_time": "2:54:17"}
79
+ {"current_steps": 1520, "total_steps": 6572, "loss": 0.11359381675720215, "lr": 4.548279058744451e-06, "epoch": 0.9254185692541856, "percentage": 23.13, "elapsed_time": "0:53:39", "remaining_time": "2:58:20"}
80
+ {"current_steps": 1540, "total_steps": 6572, "loss": 0.15145074129104613, "lr": 4.533817246227024e-06, "epoch": 0.9375951293759512, "percentage": 23.43, "elapsed_time": "0:54:15", "remaining_time": "2:57:18"}
81
+ {"current_steps": 1560, "total_steps": 6572, "loss": 0.11953675746917725, "lr": 4.519151327634685e-06, "epoch": 0.9497716894977168, "percentage": 23.74, "elapsed_time": "0:54:52", "remaining_time": "2:56:18"}
82
+ {"current_steps": 1580, "total_steps": 6572, "loss": 0.13977375030517578, "lr": 4.504282774782605e-06, "epoch": 0.9619482496194824, "percentage": 24.04, "elapsed_time": "0:55:30", "remaining_time": "2:55:21"}
83
+ {"current_steps": 1600, "total_steps": 6572, "loss": 0.1338045358657837, "lr": 4.489213079821551e-06, "epoch": 0.974124809741248, "percentage": 24.35, "elapsed_time": "0:56:07", "remaining_time": "2:54:25"}
84
+ {"current_steps": 1620, "total_steps": 6572, "loss": 0.11776142120361328, "lr": 4.4739437550881355e-06, "epoch": 0.9863013698630136, "percentage": 24.65, "elapsed_time": "0:56:43", "remaining_time": "2:53:24"}
85
+ {"current_steps": 1640, "total_steps": 6572, "loss": 0.12504475116729735, "lr": 4.458476332953051e-06, "epoch": 0.9984779299847792, "percentage": 24.95, "elapsed_time": "0:57:19", "remaining_time": "2:52:23"}
86
+ {"current_steps": 1660, "total_steps": 6572, "loss": 0.08379222154617309, "lr": 4.442812365667281e-06, "epoch": 1.0103500761035007, "percentage": 25.26, "elapsed_time": "0:57:55", "remaining_time": "2:51:23"}
87
+ {"current_steps": 1680, "total_steps": 6572, "loss": 0.08407147526741028, "lr": 4.426953425206322e-06, "epoch": 1.0225266362252663, "percentage": 25.56, "elapsed_time": "0:58:32", "remaining_time": "2:50:27"}
88
+ {"current_steps": 1700, "total_steps": 6572, "loss": 0.08041079640388489, "lr": 4.410901103112434e-06, "epoch": 1.034703196347032, "percentage": 25.87, "elapsed_time": "0:59:08", "remaining_time": "2:49:28"}
89
+ {"current_steps": 1720, "total_steps": 6572, "loss": 0.07876392006874085, "lr": 4.394657010334908e-06, "epoch": 1.0468797564687975, "percentage": 26.17, "elapsed_time": "0:59:43", "remaining_time": "2:48:29"}
90
+ {"current_steps": 1740, "total_steps": 6572, "loss": 0.10302903652191162, "lr": 4.378222777068406e-06, "epoch": 1.059056316590563, "percentage": 26.48, "elapsed_time": "1:00:19", "remaining_time": "2:47:32"}
91
+ {"current_steps": 1760, "total_steps": 6572, "loss": 0.08733606934547425, "lr": 4.361600052589358e-06, "epoch": 1.0712328767123287, "percentage": 26.78, "elapsed_time": "1:00:55", "remaining_time": "2:46:34"}
92
+ {"current_steps": 1780, "total_steps": 6572, "loss": 0.08532609939575195, "lr": 4.344790505090447e-06, "epoch": 1.0834094368340943, "percentage": 27.08, "elapsed_time": "1:01:31", "remaining_time": "2:45:38"}
93
+ {"current_steps": 1800, "total_steps": 6572, "loss": 0.08734336495399475, "lr": 4.327795821513195e-06, "epoch": 1.09558599695586, "percentage": 27.39, "elapsed_time": "1:02:09", "remaining_time": "2:44:46"}
94
+ {"current_steps": 1820, "total_steps": 6572, "loss": 0.0913870632648468, "lr": 4.3106177073786684e-06, "epoch": 1.1077625570776255, "percentage": 27.69, "elapsed_time": "1:02:46", "remaining_time": "2:43:53"}
95
+ {"current_steps": 1840, "total_steps": 6572, "loss": 0.08115516304969787, "lr": 4.293257886616318e-06, "epoch": 1.119939117199391, "percentage": 28.0, "elapsed_time": "1:03:22", "remaining_time": "2:42:59"}
96
+ {"current_steps": 1860, "total_steps": 6572, "loss": 0.08891176581382751, "lr": 4.275718101390975e-06, "epoch": 1.1321156773211567, "percentage": 28.3, "elapsed_time": "1:03:59", "remaining_time": "2:42:07"}
97
+ {"current_steps": 1880, "total_steps": 6572, "loss": 0.07950961589813232, "lr": 4.25800011192801e-06, "epoch": 1.1442922374429223, "percentage": 28.61, "elapsed_time": "1:04:37", "remaining_time": "2:41:16"}
98
+ {"current_steps": 1900, "total_steps": 6572, "loss": 0.08310645222663879, "lr": 4.240105696336687e-06, "epoch": 1.156468797564688, "percentage": 28.91, "elapsed_time": "1:05:13", "remaining_time": "2:40:22"}
99
+ {"current_steps": 1920, "total_steps": 6572, "loss": 0.07682961225509644, "lr": 4.222036650431715e-06, "epoch": 1.1686453576864535, "percentage": 29.21, "elapsed_time": "1:05:49", "remaining_time": "2:39:29"}
100
+ {"current_steps": 1940, "total_steps": 6572, "loss": 0.07520227432250977, "lr": 4.203794787553032e-06, "epoch": 1.180821917808219, "percentage": 29.52, "elapsed_time": "1:06:26", "remaining_time": "2:38:37"}
101
+ {"current_steps": 1960, "total_steps": 6572, "loss": 0.0754019558429718, "lr": 4.185381938383821e-06, "epoch": 1.1929984779299847, "percentage": 29.82, "elapsed_time": "1:07:02", "remaining_time": "2:37:44"}
102
+ {"current_steps": 1980, "total_steps": 6572, "loss": 0.08085885643959045, "lr": 4.166799950766793e-06, "epoch": 1.2051750380517503, "percentage": 30.13, "elapsed_time": "1:07:38", "remaining_time": "2:36:51"}
103
+ {"current_steps": 2000, "total_steps": 6572, "loss": 0.0884653627872467, "lr": 4.14805068951874e-06, "epoch": 1.217351598173516, "percentage": 30.43, "elapsed_time": "1:08:14", "remaining_time": "2:36:00"}
104
+ {"current_steps": 2000, "total_steps": 6572, "eval_loss": 0.2661186456680298, "epoch": 1.217351598173516, "percentage": 30.43, "elapsed_time": "1:09:01", "remaining_time": "2:37:47"}
105
+ {"current_steps": 2020, "total_steps": 6572, "loss": 0.06684748530387878, "lr": 4.1291360362433965e-06, "epoch": 1.2295281582952815, "percentage": 30.74, "elapsed_time": "1:11:15", "remaining_time": "2:40:34"}
106
+ {"current_steps": 2040, "total_steps": 6572, "loss": 0.0720324158668518, "lr": 4.110057889142601e-06, "epoch": 1.241704718417047, "percentage": 31.04, "elapsed_time": "1:11:52", "remaining_time": "2:39:40"}
107
+ {"current_steps": 2060, "total_steps": 6572, "loss": 0.08799988031387329, "lr": 4.090818162825804e-06, "epoch": 1.2538812785388127, "percentage": 31.35, "elapsed_time": "1:12:29", "remaining_time": "2:38:47"}
108
+ {"current_steps": 2080, "total_steps": 6572, "loss": 0.09275985956192016, "lr": 4.071418788117926e-06, "epoch": 1.2660578386605783, "percentage": 31.65, "elapsed_time": "1:13:07", "remaining_time": "2:37:55"}
109
+ {"current_steps": 2100, "total_steps": 6572, "loss": 0.08431113958358764, "lr": 4.0518617118655845e-06, "epoch": 1.278234398782344, "percentage": 31.95, "elapsed_time": "1:13:43", "remaining_time": "2:37:00"}
110
+ {"current_steps": 2120, "total_steps": 6572, "loss": 0.09995608925819396, "lr": 4.032148896741717e-06, "epoch": 1.2904109589041095, "percentage": 32.26, "elapsed_time": "1:14:20", "remaining_time": "2:36:07"}
111
+ {"current_steps": 2140, "total_steps": 6572, "loss": 0.07387629747390748, "lr": 4.012282321048618e-06, "epoch": 1.302587519025875, "percentage": 32.56, "elapsed_time": "1:14:57", "remaining_time": "2:35:14"}
112
+ {"current_steps": 2160, "total_steps": 6572, "loss": 0.07667248249053955, "lr": 3.992263978519398e-06, "epoch": 1.3147640791476407, "percentage": 32.87, "elapsed_time": "1:15:34", "remaining_time": "2:34:22"}
113
+ {"current_steps": 2180, "total_steps": 6572, "loss": 0.09203824400901794, "lr": 3.972095878117904e-06, "epoch": 1.3269406392694063, "percentage": 33.17, "elapsed_time": "1:16:11", "remaining_time": "2:33:30"}
114
+ {"current_steps": 2200, "total_steps": 6572, "loss": 0.07835246920585633, "lr": 3.951780043837107e-06, "epoch": 1.339117199391172, "percentage": 33.48, "elapsed_time": "1:16:48", "remaining_time": "2:32:37"}
115
+ {"current_steps": 2220, "total_steps": 6572, "loss": 0.08577624559402466, "lr": 3.9313185144959835e-06, "epoch": 1.3512937595129375, "percentage": 33.78, "elapsed_time": "1:17:25", "remaining_time": "2:31:46"}
116
+ {"current_steps": 2240, "total_steps": 6572, "loss": 0.0789969801902771, "lr": 3.9107133435349025e-06, "epoch": 1.363470319634703, "percentage": 34.08, "elapsed_time": "1:18:01", "remaining_time": "2:30:53"}
117
+ {"current_steps": 2260, "total_steps": 6572, "loss": 0.07151145935058593, "lr": 3.889966598809557e-06, "epoch": 1.3756468797564687, "percentage": 34.39, "elapsed_time": "1:18:38", "remaining_time": "2:30:02"}
118
+ {"current_steps": 2280, "total_steps": 6572, "loss": 0.09204544425010681, "lr": 3.869080362383437e-06, "epoch": 1.3878234398782343, "percentage": 34.69, "elapsed_time": "1:19:14", "remaining_time": "2:29:10"}
119
+ {"current_steps": 2300, "total_steps": 6572, "loss": 0.11079612970352173, "lr": 3.848056730318881e-06, "epoch": 1.4, "percentage": 35.0, "elapsed_time": "1:19:54", "remaining_time": "2:28:25"}
120
+ {"current_steps": 2320, "total_steps": 6572, "loss": 0.06770140528678895, "lr": 3.826897812466728e-06, "epoch": 1.4121765601217655, "percentage": 35.3, "elapsed_time": "1:20:30", "remaining_time": "2:27:32"}
121
+ {"current_steps": 2340, "total_steps": 6572, "loss": 0.08210510611534119, "lr": 3.8056057322545763e-06, "epoch": 1.4243531202435311, "percentage": 35.61, "elapsed_time": "1:21:07", "remaining_time": "2:26:42"}
122
+ {"current_steps": 2360, "total_steps": 6572, "loss": 0.09583572745323181, "lr": 3.7841826264736888e-06, "epoch": 1.4365296803652967, "percentage": 35.91, "elapsed_time": "1:21:43", "remaining_time": "2:25:51"}
123
+ {"current_steps": 2380, "total_steps": 6572, "loss": 0.09235450625419617, "lr": 3.762630645064547e-06, "epoch": 1.4487062404870623, "percentage": 36.21, "elapsed_time": "1:22:20", "remaining_time": "2:25:01"}
124
+ {"current_steps": 2400, "total_steps": 6572, "loss": 0.08658097982406616, "lr": 3.7409519509010985e-06, "epoch": 1.460882800608828, "percentage": 36.52, "elapsed_time": "1:22:56", "remaining_time": "2:24:11"}
125
+ {"current_steps": 2420, "total_steps": 6572, "loss": 0.08892765045166015, "lr": 3.7191487195736915e-06, "epoch": 1.4730593607305935, "percentage": 36.82, "elapsed_time": "1:23:32", "remaining_time": "2:23:20"}
126
+ {"current_steps": 2440, "total_steps": 6572, "loss": 0.07849371433258057, "lr": 3.697223139170748e-06, "epoch": 1.4852359208523591, "percentage": 37.13, "elapsed_time": "1:24:09", "remaining_time": "2:22:30"}
127
+ {"current_steps": 2460, "total_steps": 6572, "loss": 0.07469035387039184, "lr": 3.6751774100591716e-06, "epoch": 1.4974124809741247, "percentage": 37.43, "elapsed_time": "1:24:45", "remaining_time": "2:21:40"}
128
+ {"current_steps": 2480, "total_steps": 6572, "loss": 0.0782626211643219, "lr": 3.6530137446635265e-06, "epoch": 1.5095890410958903, "percentage": 37.74, "elapsed_time": "1:25:20", "remaining_time": "2:20:49"}
129
+ {"current_steps": 2500, "total_steps": 6572, "loss": 0.08304058909416198, "lr": 3.630734367244012e-06, "epoch": 1.521765601217656, "percentage": 38.04, "elapsed_time": "1:25:57", "remaining_time": "2:20:00"}
130
+ {"current_steps": 2500, "total_steps": 6572, "eval_loss": 0.2771773338317871, "epoch": 1.521765601217656, "percentage": 38.04, "elapsed_time": "1:26:44", "remaining_time": "2:21:17"}
131
+ {"current_steps": 2520, "total_steps": 6572, "loss": 0.08037537932395936, "lr": 3.6083415136732374e-06, "epoch": 1.5339421613394215, "percentage": 38.34, "elapsed_time": "1:28:46", "remaining_time": "2:22:44"}
132
+ {"current_steps": 2540, "total_steps": 6572, "loss": 0.08990358114242554, "lr": 3.585837431211845e-06, "epoch": 1.5461187214611871, "percentage": 38.65, "elapsed_time": "1:29:23", "remaining_time": "2:21:53"}
133
+ {"current_steps": 2560, "total_steps": 6572, "loss": 0.0773526132106781, "lr": 3.563224378282978e-06, "epoch": 1.5582952815829527, "percentage": 38.95, "elapsed_time": "1:29:59", "remaining_time": "2:21:02"}
134
+ {"current_steps": 2580, "total_steps": 6572, "loss": 0.0777865469455719, "lr": 3.5405046242456396e-06, "epoch": 1.5704718417047183, "percentage": 39.26, "elapsed_time": "1:30:36", "remaining_time": "2:20:11"}
135
+ {"current_steps": 2600, "total_steps": 6572, "loss": 0.08037815093994141, "lr": 3.517680449166943e-06, "epoch": 1.582648401826484, "percentage": 39.56, "elapsed_time": "1:31:11", "remaining_time": "2:19:18"}
136
+ {"current_steps": 2620, "total_steps": 6572, "loss": 0.08837634325027466, "lr": 3.4947541435932976e-06, "epoch": 1.5948249619482495, "percentage": 39.87, "elapsed_time": "1:31:49", "remaining_time": "2:18:30"}
137
+ {"current_steps": 2640, "total_steps": 6572, "loss": 0.08201563358306885, "lr": 3.471728008320532e-06, "epoch": 1.6070015220700151, "percentage": 40.17, "elapsed_time": "1:32:26", "remaining_time": "2:17:41"}
138
+ {"current_steps": 2660, "total_steps": 6572, "loss": 0.07379403114318847, "lr": 3.4486043541630066e-06, "epoch": 1.6191780821917807, "percentage": 40.47, "elapsed_time": "1:33:03", "remaining_time": "2:16:51"}
139
+ {"current_steps": 2680, "total_steps": 6572, "loss": 0.08871785402297974, "lr": 3.425385501721696e-06, "epoch": 1.6313546423135463, "percentage": 40.78, "elapsed_time": "1:33:39", "remaining_time": "2:16:00"}
140
+ {"current_steps": 2700, "total_steps": 6572, "loss": 0.07757498621940613, "lr": 3.4020737811513107e-06, "epoch": 1.643531202435312, "percentage": 41.08, "elapsed_time": "1:34:16", "remaining_time": "2:15:11"}
141
+ {"current_steps": 2720, "total_steps": 6572, "loss": 0.08565697669982911, "lr": 3.3786715319264483e-06, "epoch": 1.6557077625570775, "percentage": 41.39, "elapsed_time": "1:34:52", "remaining_time": "2:14:21"}
142
+ {"current_steps": 2740, "total_steps": 6572, "loss": 0.08129348754882812, "lr": 3.355181102606816e-06, "epoch": 1.6678843226788431, "percentage": 41.69, "elapsed_time": "1:35:28", "remaining_time": "2:13:31"}
143
+ {"current_steps": 2760, "total_steps": 6572, "loss": 0.07875375747680664, "lr": 3.331604850601533e-06, "epoch": 1.6800608828006087, "percentage": 42.0, "elapsed_time": "1:36:06", "remaining_time": "2:12:43"}
144
+ {"current_steps": 2780, "total_steps": 6572, "loss": 0.08989614248275757, "lr": 3.307945141932556e-06, "epoch": 1.6922374429223743, "percentage": 42.3, "elapsed_time": "1:36:41", "remaining_time": "2:11:53"}
145
+ {"current_steps": 2800, "total_steps": 6572, "loss": 0.08251298069953919, "lr": 3.2842043509972294e-06, "epoch": 1.70441400304414, "percentage": 42.6, "elapsed_time": "1:37:18", "remaining_time": "2:11:05"}
146
+ {"current_steps": 2820, "total_steps": 6572, "loss": 0.07760271430015564, "lr": 3.2603848603300026e-06, "epoch": 1.7165905631659055, "percentage": 42.91, "elapsed_time": "1:37:56", "remaining_time": "2:10:18"}
147
+ {"current_steps": 2840, "total_steps": 6572, "loss": 0.07395396232604981, "lr": 3.236489060363329e-06, "epoch": 1.7287671232876711, "percentage": 43.21, "elapsed_time": "1:38:33", "remaining_time": "2:09:31"}
148
+ {"current_steps": 2860, "total_steps": 6572, "loss": 0.07028600573539734, "lr": 3.212519349187766e-06, "epoch": 1.7409436834094367, "percentage": 43.52, "elapsed_time": "1:39:10", "remaining_time": "2:08:42"}
149
+ {"current_steps": 2880, "total_steps": 6572, "loss": 0.08469281196594239, "lr": 3.188478132311319e-06, "epoch": 1.7531202435312023, "percentage": 43.82, "elapsed_time": "1:39:46", "remaining_time": "2:07:54"}
150
+ {"current_steps": 2900, "total_steps": 6572, "loss": 0.09567424058914184, "lr": 3.164367822418029e-06, "epoch": 1.765296803652968, "percentage": 44.13, "elapsed_time": "1:40:22", "remaining_time": "2:07:05"}
151
+ {"current_steps": 2920, "total_steps": 6572, "loss": 0.07239987254142762, "lr": 3.1401908391258474e-06, "epoch": 1.7774733637747335, "percentage": 44.43, "elapsed_time": "1:41:00", "remaining_time": "2:06:19"}
152
+ {"current_steps": 2940, "total_steps": 6572, "loss": 0.0891954243183136, "lr": 3.1159496087438098e-06, "epoch": 1.7896499238964991, "percentage": 44.74, "elapsed_time": "1:41:36", "remaining_time": "2:05:31"}
153
+ {"current_steps": 2960, "total_steps": 6572, "loss": 0.07796849608421326, "lr": 3.0916465640285426e-06, "epoch": 1.8018264840182647, "percentage": 45.04, "elapsed_time": "1:42:12", "remaining_time": "2:04:43"}
154
+ {"current_steps": 2980, "total_steps": 6572, "loss": 0.08645985722541809, "lr": 3.0672841439401223e-06, "epoch": 1.8140030441400303, "percentage": 45.34, "elapsed_time": "1:42:49", "remaining_time": "2:03:55"}
155
+ {"current_steps": 3000, "total_steps": 6572, "loss": 0.08427774310111999, "lr": 3.0428647933973103e-06, "epoch": 1.826179604261796, "percentage": 45.65, "elapsed_time": "1:43:26", "remaining_time": "2:03:10"}
156
+ {"current_steps": 3000, "total_steps": 6572, "eval_loss": 0.2774485647678375, "epoch": 1.826179604261796, "percentage": 45.65, "elapsed_time": "1:44:13", "remaining_time": "2:04:05"}
157
+ {"current_steps": 3020, "total_steps": 6572, "loss": 0.07381275296211243, "lr": 3.0183909630321865e-06, "epoch": 1.8383561643835615, "percentage": 45.95, "elapsed_time": "1:46:17", "remaining_time": "2:05:00"}
158
+ {"current_steps": 3040, "total_steps": 6572, "loss": 0.07289664745330811, "lr": 2.9938651089442184e-06, "epoch": 1.8505327245053271, "percentage": 46.26, "elapsed_time": "1:46:55", "remaining_time": "2:04:14"}
159
+ {"current_steps": 3060, "total_steps": 6572, "loss": 0.07124295830726624, "lr": 2.969289692453773e-06, "epoch": 1.8627092846270927, "percentage": 46.56, "elapsed_time": "1:47:32", "remaining_time": "2:03:25"}
160
+ {"current_steps": 3080, "total_steps": 6572, "loss": 0.08125877976417542, "lr": 2.944667179855109e-06, "epoch": 1.8748858447488583, "percentage": 46.87, "elapsed_time": "1:48:09", "remaining_time": "2:02:37"}
161
+ {"current_steps": 3100, "total_steps": 6572, "loss": 0.0724608838558197, "lr": 2.920000042168871e-06, "epoch": 1.887062404870624, "percentage": 47.17, "elapsed_time": "1:48:47", "remaining_time": "2:01:51"}
162
+ {"current_steps": 3120, "total_steps": 6572, "loss": 0.07104775309562683, "lr": 2.8952907548941057e-06, "epoch": 1.8992389649923895, "percentage": 47.47, "elapsed_time": "1:49:23", "remaining_time": "2:01:01"}
163
+ {"current_steps": 3140, "total_steps": 6572, "loss": 0.0677955150604248, "lr": 2.8705417977598277e-06, "epoch": 1.9114155251141551, "percentage": 47.78, "elapsed_time": "1:50:00", "remaining_time": "2:00:14"}
164
+ {"current_steps": 3160, "total_steps": 6572, "loss": 0.07164496779441834, "lr": 2.8457556544761687e-06, "epoch": 1.9235920852359207, "percentage": 48.08, "elapsed_time": "1:50:36", "remaining_time": "1:59:25"}
165
+ {"current_steps": 3180, "total_steps": 6572, "loss": 0.071807599067688, "lr": 2.8209348124851187e-06, "epoch": 1.9357686453576863, "percentage": 48.39, "elapsed_time": "1:51:13", "remaining_time": "1:58:38"}
166
+ {"current_steps": 3200, "total_steps": 6572, "loss": 0.095755535364151, "lr": 2.7960817627108965e-06, "epoch": 1.947945205479452, "percentage": 48.69, "elapsed_time": "1:51:50", "remaining_time": "1:57:50"}
167
+ {"current_steps": 3220, "total_steps": 6572, "loss": 0.07055851817131042, "lr": 2.77119899930997e-06, "epoch": 1.9601217656012175, "percentage": 49.0, "elapsed_time": "1:52:26", "remaining_time": "1:57:03"}
168
+ {"current_steps": 3240, "total_steps": 6572, "loss": 0.07278798818588257, "lr": 2.7462890194207513e-06, "epoch": 1.9722983257229831, "percentage": 49.3, "elapsed_time": "1:53:03", "remaining_time": "1:56:16"}
169
+ {"current_steps": 3260, "total_steps": 6572, "loss": 0.07153088450431824, "lr": 2.7213543229129956e-06, "epoch": 1.9844748858447487, "percentage": 49.6, "elapsed_time": "1:53:40", "remaining_time": "1:55:28"}
170
+ {"current_steps": 3280, "total_steps": 6572, "loss": 0.07440360188484192, "lr": 2.6963974121369242e-06, "epoch": 1.9966514459665143, "percentage": 49.91, "elapsed_time": "1:54:16", "remaining_time": "1:54:41"}
171
+ {"current_steps": 3300, "total_steps": 6572, "loss": 0.0517767608165741, "lr": 2.671420791672093e-06, "epoch": 2.008523592085236, "percentage": 50.21, "elapsed_time": "1:54:51", "remaining_time": "1:53:53"}
172
+ {"current_steps": 3320, "total_steps": 6572, "loss": 0.03812239170074463, "lr": 2.646426968076052e-06, "epoch": 2.0207001522070014, "percentage": 50.52, "elapsed_time": "1:55:27", "remaining_time": "1:53:06"}
173
+ {"current_steps": 3340, "total_steps": 6572, "loss": 0.04107579588890076, "lr": 2.6214184496327865e-06, "epoch": 2.032876712328767, "percentage": 50.82, "elapsed_time": "1:56:05", "remaining_time": "1:52:19"}
174
+ {"current_steps": 3360, "total_steps": 6572, "loss": 0.04673115909099579, "lr": 2.5963977461010022e-06, "epoch": 2.0450532724505326, "percentage": 51.13, "elapsed_time": "1:56:41", "remaining_time": "1:51:33"}
175
+ {"current_steps": 3380, "total_steps": 6572, "loss": 0.03674449622631073, "lr": 2.5713673684622524e-06, "epoch": 2.057229832572298, "percentage": 51.43, "elapsed_time": "1:57:17", "remaining_time": "1:50:46"}
176
+ {"current_steps": 3400, "total_steps": 6572, "loss": 0.03422380387783051, "lr": 2.546329828668949e-06, "epoch": 2.069406392694064, "percentage": 51.73, "elapsed_time": "1:57:53", "remaining_time": "1:49:59"}
177
+ {"current_steps": 3420, "total_steps": 6572, "loss": 0.035878732800483704, "lr": 2.5212876393922657e-06, "epoch": 2.0815829528158294, "percentage": 52.04, "elapsed_time": "1:58:29", "remaining_time": "1:49:12"}
178
+ {"current_steps": 3440, "total_steps": 6572, "loss": 0.03577531576156616, "lr": 2.496243313769986e-06, "epoch": 2.093759512937595, "percentage": 52.34, "elapsed_time": "1:59:06", "remaining_time": "1:48:27"}
179
+ {"current_steps": 3460, "total_steps": 6572, "loss": 0.04281675517559051, "lr": 2.471199365154283e-06, "epoch": 2.1059360730593606, "percentage": 52.65, "elapsed_time": "1:59:42", "remaining_time": "1:47:40"}
180
+ {"current_steps": 3480, "total_steps": 6572, "loss": 0.042955422401428224, "lr": 2.4461583068595014e-06, "epoch": 2.118112633181126, "percentage": 52.95, "elapsed_time": "2:00:20", "remaining_time": "1:46:55"}
181
+ {"current_steps": 3500, "total_steps": 6572, "loss": 0.04432957172393799, "lr": 2.421122651909918e-06, "epoch": 2.130289193302892, "percentage": 53.26, "elapsed_time": "2:00:56", "remaining_time": "1:46:09"}
182
+ {"current_steps": 3500, "total_steps": 6572, "eval_loss": 0.3123805522918701, "epoch": 2.130289193302892, "percentage": 53.26, "elapsed_time": "2:01:42", "remaining_time": "1:46:49"}
183
+ {"current_steps": 3520, "total_steps": 6572, "loss": 0.03356837034225464, "lr": 2.3960949127875556e-06, "epoch": 2.1424657534246574, "percentage": 53.56, "elapsed_time": "2:03:47", "remaining_time": "1:47:19"}
184
+ {"current_steps": 3540, "total_steps": 6572, "loss": 0.036935809254646304, "lr": 2.371077601180031e-06, "epoch": 2.154642313546423, "percentage": 53.86, "elapsed_time": "2:04:23", "remaining_time": "1:46:32"}
185
+ {"current_steps": 3560, "total_steps": 6572, "loss": 0.0395690768957138, "lr": 2.3460732277284994e-06, "epoch": 2.1668188736681886, "percentage": 54.17, "elapsed_time": "2:05:01", "remaining_time": "1:45:46"}
186
+ {"current_steps": 3580, "total_steps": 6572, "loss": 0.044693085551261905, "lr": 2.321084301775689e-06, "epoch": 2.178995433789954, "percentage": 54.47, "elapsed_time": "2:05:39", "remaining_time": "1:45:01"}
187
+ {"current_steps": 3600, "total_steps": 6572, "loss": 0.03243565857410431, "lr": 2.29611333111408e-06, "epoch": 2.19117199391172, "percentage": 54.78, "elapsed_time": "2:06:17", "remaining_time": "1:44:15"}
188
+ {"current_steps": 3620, "total_steps": 6572, "loss": 0.04325798749923706, "lr": 2.271162821734225e-06, "epoch": 2.2033485540334854, "percentage": 55.08, "elapsed_time": "2:06:56", "remaining_time": "1:43:30"}
189
+ {"current_steps": 3640, "total_steps": 6572, "loss": 0.03856868743896484, "lr": 2.2462352775732653e-06, "epoch": 2.215525114155251, "percentage": 55.39, "elapsed_time": "2:07:33", "remaining_time": "1:42:45"}
190
+ {"current_steps": 3660, "total_steps": 6572, "loss": 0.041602414846420285, "lr": 2.221333200263637e-06, "epoch": 2.2277016742770166, "percentage": 55.69, "elapsed_time": "2:08:11", "remaining_time": "1:41:59"}
191
+ {"current_steps": 3680, "total_steps": 6572, "loss": 0.04286134541034699, "lr": 2.1964590888820233e-06, "epoch": 2.239878234398782, "percentage": 56.0, "elapsed_time": "2:08:48", "remaining_time": "1:41:13"}
192
+ {"current_steps": 3700, "total_steps": 6572, "loss": 0.041756758093833925, "lr": 2.1716154396985526e-06, "epoch": 2.252054794520548, "percentage": 56.3, "elapsed_time": "2:09:25", "remaining_time": "1:40:28"}
193
+ {"current_steps": 3720, "total_steps": 6572, "loss": 0.0359495222568512, "lr": 2.1468047459262882e-06, "epoch": 2.2642313546423134, "percentage": 56.6, "elapsed_time": "2:10:03", "remaining_time": "1:39:42"}
194
+ {"current_steps": 3740, "total_steps": 6572, "loss": 0.04322676360607147, "lr": 2.12202949747101e-06, "epoch": 2.276407914764079, "percentage": 56.91, "elapsed_time": "2:10:41", "remaining_time": "1:38:57"}
195
+ {"current_steps": 3760, "total_steps": 6572, "loss": 0.04191597998142242, "lr": 2.0972921806813468e-06, "epoch": 2.2885844748858446, "percentage": 57.21, "elapsed_time": "2:11:19", "remaining_time": "1:38:13"}
196
+ {"current_steps": 3780, "total_steps": 6572, "loss": 0.041278204321861266, "lr": 2.072595278099247e-06, "epoch": 2.30076103500761, "percentage": 57.52, "elapsed_time": "2:11:58", "remaining_time": "1:37:28"}
197
+ {"current_steps": 3800, "total_steps": 6572, "loss": 0.04312986135482788, "lr": 2.047941268210849e-06, "epoch": 2.312937595129376, "percentage": 57.82, "elapsed_time": "2:12:35", "remaining_time": "1:36:43"}
198
+ {"current_steps": 3820, "total_steps": 6572, "loss": 0.04236046075820923, "lr": 2.0233326251977426e-06, "epoch": 2.3251141552511414, "percentage": 58.13, "elapsed_time": "2:13:14", "remaining_time": "1:35:59"}
199
+ {"current_steps": 3840, "total_steps": 6572, "loss": 0.04433901011943817, "lr": 1.9987718186886724e-06, "epoch": 2.337290715372907, "percentage": 58.43, "elapsed_time": "2:13:52", "remaining_time": "1:35:14"}
200
+ {"current_steps": 3860, "total_steps": 6572, "loss": 0.04127628207206726, "lr": 1.9742613135116986e-06, "epoch": 2.3494672754946726, "percentage": 58.73, "elapsed_time": "2:14:29", "remaining_time": "1:34:29"}
201
+ {"current_steps": 3880, "total_steps": 6572, "loss": 0.04586326479911804, "lr": 1.949803569446828e-06, "epoch": 2.361643835616438, "percentage": 59.04, "elapsed_time": "2:15:07", "remaining_time": "1:33:45"}
202
+ {"current_steps": 3900, "total_steps": 6572, "loss": 0.03624185025691986, "lr": 1.925401040979171e-06, "epoch": 2.373820395738204, "percentage": 59.34, "elapsed_time": "2:15:44", "remaining_time": "1:33:00"}
203
+ {"current_steps": 3920, "total_steps": 6572, "loss": 0.035064518451690674, "lr": 1.9010561770526076e-06, "epoch": 2.3859969558599694, "percentage": 59.65, "elapsed_time": "2:16:21", "remaining_time": "1:32:15"}
204
+ {"current_steps": 3940, "total_steps": 6572, "loss": 0.042050021886825564, "lr": 1.8767714208240312e-06, "epoch": 2.398173515981735, "percentage": 59.95, "elapsed_time": "2:16:59", "remaining_time": "1:31:30"}
205
+ {"current_steps": 3960, "total_steps": 6572, "loss": 0.038166466355323794, "lr": 1.852549209418154e-06, "epoch": 2.4103500761035006, "percentage": 60.26, "elapsed_time": "2:17:37", "remaining_time": "1:30:46"}
206
+ {"current_steps": 3980, "total_steps": 6572, "loss": 0.040885674953460696, "lr": 1.8283919736829332e-06, "epoch": 2.422526636225266, "percentage": 60.56, "elapsed_time": "2:18:15", "remaining_time": "1:30:02"}
207
+ {"current_steps": 4000, "total_steps": 6572, "loss": 0.040162667632102966, "lr": 1.804302137945614e-06, "epoch": 2.434703196347032, "percentage": 60.86, "elapsed_time": "2:18:51", "remaining_time": "1:29:17"}
208
+ {"current_steps": 4000, "total_steps": 6572, "eval_loss": 0.31583163142204285, "epoch": 2.434703196347032, "percentage": 60.86, "elapsed_time": "2:19:38", "remaining_time": "1:29:47"}
209
+ {"current_steps": 4020, "total_steps": 6572, "loss": 0.04170995056629181, "lr": 1.7802821197694426e-06, "epoch": 2.4468797564687974, "percentage": 61.17, "elapsed_time": "2:21:49", "remaining_time": "1:30:02"}
210
+ {"current_steps": 4040, "total_steps": 6572, "loss": 0.03834344446659088, "lr": 1.7563343297110375e-06, "epoch": 2.459056316590563, "percentage": 61.47, "elapsed_time": "2:22:25", "remaining_time": "1:29:15"}
211
+ {"current_steps": 4060, "total_steps": 6572, "loss": 0.03928310573101044, "lr": 1.732461171078486e-06, "epoch": 2.4712328767123286, "percentage": 61.78, "elapsed_time": "2:23:01", "remaining_time": "1:28:29"}
212
+ {"current_steps": 4080, "total_steps": 6572, "loss": 0.03358933925628662, "lr": 1.7086650396901489e-06, "epoch": 2.483409436834094, "percentage": 62.08, "elapsed_time": "2:23:37", "remaining_time": "1:27:43"}
213
+ {"current_steps": 4100, "total_steps": 6572, "loss": 0.03547535240650177, "lr": 1.6849483236342322e-06, "epoch": 2.49558599695586, "percentage": 62.39, "elapsed_time": "2:24:14", "remaining_time": "1:26:58"}
214
+ {"current_steps": 4120, "total_steps": 6572, "loss": 0.03600102663040161, "lr": 1.6613134030291217e-06, "epoch": 2.5077625570776254, "percentage": 62.69, "elapsed_time": "2:24:51", "remaining_time": "1:26:12"}
215
+ {"current_steps": 4140, "total_steps": 6572, "loss": 0.04347077012062073, "lr": 1.6377626497845278e-06, "epoch": 2.519939117199391, "percentage": 62.99, "elapsed_time": "2:25:27", "remaining_time": "1:25:27"}
216
+ {"current_steps": 4160, "total_steps": 6572, "loss": 0.02908192276954651, "lr": 1.6142984273634505e-06, "epoch": 2.5321156773211566, "percentage": 63.3, "elapsed_time": "2:26:04", "remaining_time": "1:24:41"}
217
+ {"current_steps": 4180, "total_steps": 6572, "loss": 0.03611198365688324, "lr": 1.5909230905449846e-06, "epoch": 2.544292237442922, "percentage": 63.6, "elapsed_time": "2:26:40", "remaining_time": "1:23:56"}
218
+ {"current_steps": 4200, "total_steps": 6572, "loss": 0.03758668601512909, "lr": 1.567638985188012e-06, "epoch": 2.556468797564688, "percentage": 63.91, "elapsed_time": "2:27:17", "remaining_time": "1:23:10"}
219
+ {"current_steps": 4220, "total_steps": 6572, "loss": 0.033633843064308167, "lr": 1.544448447995773e-06, "epoch": 2.5686453576864534, "percentage": 64.21, "elapsed_time": "2:27:53", "remaining_time": "1:22:25"}
220
+ {"current_steps": 4240, "total_steps": 6572, "loss": 0.036797890067100526, "lr": 1.52135380628137e-06, "epoch": 2.580821917808219, "percentage": 64.52, "elapsed_time": "2:28:29", "remaining_time": "1:21:40"}
221
+ {"current_steps": 4260, "total_steps": 6572, "loss": 0.039166563749313356, "lr": 1.498357377734201e-06, "epoch": 2.5929984779299846, "percentage": 64.82, "elapsed_time": "2:29:07", "remaining_time": "1:20:55"}
222
+ {"current_steps": 4280, "total_steps": 6572, "loss": 0.03717599511146545, "lr": 1.4754614701873703e-06, "epoch": 2.60517503805175, "percentage": 65.12, "elapsed_time": "2:29:44", "remaining_time": "1:20:11"}
223
+ {"current_steps": 4300, "total_steps": 6572, "loss": 0.03962793946266174, "lr": 1.4526683813860792e-06, "epoch": 2.6173515981735163, "percentage": 65.43, "elapsed_time": "2:30:20", "remaining_time": "1:19:26"}
224
+ {"current_steps": 4320, "total_steps": 6572, "loss": 0.035475924611091614, "lr": 1.4299803987570396e-06, "epoch": 2.6295281582952814, "percentage": 65.73, "elapsed_time": "2:30:57", "remaining_time": "1:18:41"}
225
+ {"current_steps": 4340, "total_steps": 6572, "loss": 0.03256964683532715, "lr": 1.4073997991789078e-06, "epoch": 2.6417047184170475, "percentage": 66.04, "elapsed_time": "2:31:35", "remaining_time": "1:17:57"}
226
+ {"current_steps": 4360, "total_steps": 6572, "loss": 0.03712306022644043, "lr": 1.384928848753792e-06, "epoch": 2.6538812785388126, "percentage": 66.34, "elapsed_time": "2:32:10", "remaining_time": "1:17:12"}
227
+ {"current_steps": 4380, "total_steps": 6572, "loss": 0.041410398483276364, "lr": 1.3625698025798322e-06, "epoch": 2.6660578386605787, "percentage": 66.65, "elapsed_time": "2:32:47", "remaining_time": "1:16:27"}
228
+ {"current_steps": 4400, "total_steps": 6572, "loss": 0.03158504366874695, "lr": 1.3403249045248907e-06, "epoch": 2.678234398782344, "percentage": 66.95, "elapsed_time": "2:33:23", "remaining_time": "1:15:43"}
229
+ {"current_steps": 4420, "total_steps": 6572, "loss": 0.03525224924087524, "lr": 1.3181963870013604e-06, "epoch": 2.69041095890411, "percentage": 67.26, "elapsed_time": "2:33:59", "remaining_time": "1:14:58"}
230
+ {"current_steps": 4440, "total_steps": 6572, "loss": 0.03239959478378296, "lr": 1.2961864707421345e-06, "epoch": 2.702587519025875, "percentage": 67.56, "elapsed_time": "2:34:36", "remaining_time": "1:14:14"}
231
+ {"current_steps": 4460, "total_steps": 6572, "loss": 0.031032082438468934, "lr": 1.2742973645777394e-06, "epoch": 2.714764079147641, "percentage": 67.86, "elapsed_time": "2:35:13", "remaining_time": "1:13:30"}
232
+ {"current_steps": 4480, "total_steps": 6572, "loss": 0.030566230416297913, "lr": 1.252531265214662e-06, "epoch": 2.726940639269406, "percentage": 68.17, "elapsed_time": "2:35:48", "remaining_time": "1:12:45"}
233
+ {"current_steps": 4500, "total_steps": 6572, "loss": 0.041362547874450685, "lr": 1.2308903570149048e-06, "epoch": 2.7391171993911723, "percentage": 68.47, "elapsed_time": "2:36:23", "remaining_time": "1:12:00"}
234
+ {"current_steps": 4500, "total_steps": 6572, "eval_loss": 0.3391737937927246, "epoch": 2.7391171993911723, "percentage": 68.47, "elapsed_time": "2:37:10", "remaining_time": "1:12:22"}
235
+ {"current_steps": 4520, "total_steps": 6572, "loss": 0.0388390064239502, "lr": 1.2093768117767613e-06, "epoch": 2.7512937595129374, "percentage": 68.78, "elapsed_time": "2:39:12", "remaining_time": "1:12:16"}
236
+ {"current_steps": 4540, "total_steps": 6572, "loss": 0.032555675506591795, "lr": 1.1879927885168733e-06, "epoch": 2.7634703196347035, "percentage": 69.08, "elapsed_time": "2:39:48", "remaining_time": "1:11:31"}
237
+ {"current_steps": 4560, "total_steps": 6572, "loss": 0.03606459796428681, "lr": 1.1667404332535504e-06, "epoch": 2.7756468797564686, "percentage": 69.39, "elapsed_time": "2:40:25", "remaining_time": "1:10:47"}
238
+ {"current_steps": 4580, "total_steps": 6572, "loss": 0.032086309790611264, "lr": 1.1456218787914128e-06, "epoch": 2.7878234398782347, "percentage": 69.69, "elapsed_time": "2:41:01", "remaining_time": "1:10:02"}
239
+ {"current_steps": 4600, "total_steps": 6572, "loss": 0.033362787961959836, "lr": 1.1246392445073438e-06, "epoch": 2.8, "percentage": 69.99, "elapsed_time": "2:41:36", "remaining_time": "1:09:16"}
240
+ {"current_steps": 4620, "total_steps": 6572, "loss": 0.03638745844364166, "lr": 1.1037946361378027e-06, "epoch": 2.812176560121766, "percentage": 70.3, "elapsed_time": "2:42:12", "remaining_time": "1:08:32"}
241
+ {"current_steps": 4640, "total_steps": 6572, "loss": 0.030933958292007447, "lr": 1.0830901455674977e-06, "epoch": 2.824353120243531, "percentage": 70.6, "elapsed_time": "2:42:49", "remaining_time": "1:07:47"}
242
+ {"current_steps": 4660, "total_steps": 6572, "loss": 0.02879139482975006, "lr": 1.0625278506194538e-06, "epoch": 2.836529680365297, "percentage": 70.91, "elapsed_time": "2:43:26", "remaining_time": "1:07:03"}
243
+ {"current_steps": 4680, "total_steps": 6572, "loss": 0.03345020413398743, "lr": 1.04210981484649e-06, "epoch": 2.8487062404870622, "percentage": 71.21, "elapsed_time": "2:44:01", "remaining_time": "1:06:18"}
244
+ {"current_steps": 4700, "total_steps": 6572, "loss": 0.02593054175376892, "lr": 1.0218380873241314e-06, "epoch": 2.8608828006088283, "percentage": 71.52, "elapsed_time": "2:44:38", "remaining_time": "1:05:34"}
245
+ {"current_steps": 4720, "total_steps": 6572, "loss": 0.03906567096710205, "lr": 1.0017147024449674e-06, "epoch": 2.8730593607305934, "percentage": 71.82, "elapsed_time": "2:45:15", "remaining_time": "1:04:50"}
246
+ {"current_steps": 4740, "total_steps": 6572, "loss": 0.03371626436710358, "lr": 9.81741679714493e-07, "epoch": 2.8852359208523595, "percentage": 72.12, "elapsed_time": "2:45:52", "remaining_time": "1:04:06"}
247
+ {"current_steps": 4760, "total_steps": 6572, "loss": 0.03090968132019043, "lr": 9.619210235484333e-07, "epoch": 2.8974124809741246, "percentage": 72.43, "elapsed_time": "2:46:29", "remaining_time": "1:03:22"}
248
+ {"current_steps": 4780, "total_steps": 6572, "loss": 0.0322105199098587, "lr": 9.422547230715931e-07, "epoch": 2.9095890410958907, "percentage": 72.73, "elapsed_time": "2:47:05", "remaining_time": "1:02:38"}
249
+ {"current_steps": 4800, "total_steps": 6572, "loss": 0.035210177302360535, "lr": 9.227447519182353e-07, "epoch": 2.921765601217656, "percentage": 73.04, "elapsed_time": "2:47:42", "remaining_time": "1:01:54"}
250
+ {"current_steps": 4820, "total_steps": 6572, "loss": 0.026842504739761353, "lr": 9.033930680340097e-07, "epoch": 2.933942161339422, "percentage": 73.34, "elapsed_time": "2:48:18", "remaining_time": "1:01:10"}
251
+ {"current_steps": 4840, "total_steps": 6572, "loss": 0.03439584076404571, "lr": 8.842016134794682e-07, "epoch": 2.946118721461187, "percentage": 73.65, "elapsed_time": "2:48:55", "remaining_time": "1:00:27"}
252
+ {"current_steps": 4860, "total_steps": 6572, "loss": 0.04011322855949402, "lr": 8.651723142351603e-07, "epoch": 2.958295281582953, "percentage": 73.95, "elapsed_time": "2:49:32", "remaining_time": "0:59:43"}
253
+ {"current_steps": 4880, "total_steps": 6572, "loss": 0.03800423145294189, "lr": 8.463070800083562e-07, "epoch": 2.9704718417047182, "percentage": 74.25, "elapsed_time": "2:50:10", "remaining_time": "0:59:00"}
254
+ {"current_steps": 4900, "total_steps": 6572, "loss": 0.03839131891727447, "lr": 8.276078040413879e-07, "epoch": 2.9826484018264843, "percentage": 74.56, "elapsed_time": "2:50:47", "remaining_time": "0:58:16"}
255
+ {"current_steps": 4920, "total_steps": 6572, "loss": 0.02721840739250183, "lr": 8.090763629216589e-07, "epoch": 2.9948249619482494, "percentage": 74.86, "elapsed_time": "2:51:25", "remaining_time": "0:57:33"}
256
+ {"current_steps": 4940, "total_steps": 6572, "loss": 0.023991990089416503, "lr": 7.907146163933102e-07, "epoch": 3.006697108066971, "percentage": 75.17, "elapsed_time": "2:52:02", "remaining_time": "0:56:50"}
257
+ {"current_steps": 4960, "total_steps": 6572, "loss": 0.01451514959335327, "lr": 7.725244071705871e-07, "epoch": 3.0188736681887365, "percentage": 75.47, "elapsed_time": "2:52:40", "remaining_time": "0:56:07"}
258
+ {"current_steps": 4980, "total_steps": 6572, "loss": 0.014327619969844819, "lr": 7.545075607529104e-07, "epoch": 3.031050228310502, "percentage": 75.78, "elapsed_time": "2:53:20", "remaining_time": "0:55:24"}
259
+ {"current_steps": 5000, "total_steps": 6572, "loss": 0.017832010984420776, "lr": 7.366658852416788e-07, "epoch": 3.0432267884322677, "percentage": 76.08, "elapsed_time": "2:53:56", "remaining_time": "0:54:41"}
260
+ {"current_steps": 5000, "total_steps": 6572, "eval_loss": 0.39115142822265625, "epoch": 3.0432267884322677, "percentage": 76.08, "elapsed_time": "2:54:44", "remaining_time": "0:54:56"}
261
+ {"current_steps": 5020, "total_steps": 6572, "loss": 0.011674411594867706, "lr": 7.190011711588101e-07, "epoch": 3.0554033485540333, "percentage": 76.38, "elapsed_time": "2:56:57", "remaining_time": "0:54:42"}
262
+ {"current_steps": 5040, "total_steps": 6572, "loss": 0.013690856099128724, "lr": 7.015151912670562e-07, "epoch": 3.067579908675799, "percentage": 76.69, "elapsed_time": "2:57:34", "remaining_time": "0:53:58"}
263
+ {"current_steps": 5060, "total_steps": 6572, "loss": 0.011978642642498016, "lr": 6.842097003920903e-07, "epoch": 3.0797564687975645, "percentage": 76.99, "elapsed_time": "2:58:10", "remaining_time": "0:53:14"}
264
+ {"current_steps": 5080, "total_steps": 6572, "loss": 0.013893941044807434, "lr": 6.67086435246406e-07, "epoch": 3.09193302891933, "percentage": 77.3, "elapsed_time": "2:58:46", "remaining_time": "0:52:30"}
265
+ {"current_steps": 5100, "total_steps": 6572, "loss": 0.009910025447607041, "lr": 6.501471142550194e-07, "epoch": 3.1041095890410957, "percentage": 77.6, "elapsed_time": "2:59:23", "remaining_time": "0:51:46"}
266
+ {"current_steps": 5120, "total_steps": 6572, "loss": 0.008863755315542222, "lr": 6.333934373830222e-07, "epoch": 3.1162861491628613, "percentage": 77.91, "elapsed_time": "2:59:59", "remaining_time": "0:51:02"}
267
+ {"current_steps": 5140, "total_steps": 6572, "loss": 0.010502541810274124, "lr": 6.168270859649761e-07, "epoch": 3.128462709284627, "percentage": 78.21, "elapsed_time": "3:00:37", "remaining_time": "0:50:19"}
268
+ {"current_steps": 5160, "total_steps": 6572, "loss": 0.012096930295228958, "lr": 6.004497225361786e-07, "epoch": 3.1406392694063925, "percentage": 78.51, "elapsed_time": "3:01:13", "remaining_time": "0:49:35"}
269
+ {"current_steps": 5180, "total_steps": 6572, "loss": 0.013278065621852875, "lr": 5.842629906658226e-07, "epoch": 3.1528158295281585, "percentage": 78.82, "elapsed_time": "3:01:52", "remaining_time": "0:48:52"}
270
+ {"current_steps": 5200, "total_steps": 6572, "loss": 0.013548998534679413, "lr": 5.682685147920481e-07, "epoch": 3.1649923896499237, "percentage": 79.12, "elapsed_time": "3:02:28", "remaining_time": "0:48:08"}
271
+ {"current_steps": 5220, "total_steps": 6572, "loss": 0.013736458122730255, "lr": 5.524679000589256e-07, "epoch": 3.1771689497716897, "percentage": 79.43, "elapsed_time": "3:03:05", "remaining_time": "0:47:25"}
272
+ {"current_steps": 5240, "total_steps": 6572, "loss": 0.013177134096622467, "lr": 5.36862732155366e-07, "epoch": 3.189345509893455, "percentage": 79.73, "elapsed_time": "3:03:42", "remaining_time": "0:46:41"}
273
+ {"current_steps": 5260, "total_steps": 6572, "loss": 0.011971819400787353, "lr": 5.214545771559879e-07, "epoch": 3.201522070015221, "percentage": 80.04, "elapsed_time": "3:04:17", "remaining_time": "0:45:58"}
274
+ {"current_steps": 5280, "total_steps": 6572, "loss": 0.014422819018363953, "lr": 5.062449813639528e-07, "epoch": 3.213698630136986, "percentage": 80.34, "elapsed_time": "3:04:53", "remaining_time": "0:45:14"}
275
+ {"current_steps": 5300, "total_steps": 6572, "loss": 0.010663678497076034, "lr": 4.912354711557856e-07, "epoch": 3.225875190258752, "percentage": 80.65, "elapsed_time": "3:05:29", "remaining_time": "0:44:31"}
276
+ {"current_steps": 5320, "total_steps": 6572, "loss": 0.011400717496871948, "lr": 4.764275528281892e-07, "epoch": 3.2380517503805173, "percentage": 80.95, "elapsed_time": "3:06:07", "remaining_time": "0:43:48"}
277
+ {"current_steps": 5340, "total_steps": 6572, "loss": 0.008456526696681977, "lr": 4.6182271244688355e-07, "epoch": 3.2502283105022833, "percentage": 81.25, "elapsed_time": "3:06:44", "remaining_time": "0:43:04"}
278
+ {"current_steps": 5360, "total_steps": 6572, "loss": 0.014539115130901337, "lr": 4.4742241569746407e-07, "epoch": 3.2624048706240485, "percentage": 81.56, "elapsed_time": "3:07:22", "remaining_time": "0:42:22"}
279
+ {"current_steps": 5380, "total_steps": 6572, "loss": 0.017625690996646882, "lr": 4.332281077383177e-07, "epoch": 3.2745814307458145, "percentage": 81.86, "elapsed_time": "3:07:58", "remaining_time": "0:41:38"}
280
+ {"current_steps": 5400, "total_steps": 6572, "loss": 0.007641100138425827, "lr": 4.1924121305558563e-07, "epoch": 3.2867579908675797, "percentage": 82.17, "elapsed_time": "3:08:37", "remaining_time": "0:40:56"}
281
+ {"current_steps": 5420, "total_steps": 6572, "loss": 0.011799700558185577, "lr": 4.054631353202121e-07, "epoch": 3.2989345509893457, "percentage": 82.47, "elapsed_time": "3:09:14", "remaining_time": "0:40:13"}
282
+ {"current_steps": 5440, "total_steps": 6572, "loss": 0.011455408483743667, "lr": 3.9189525724707634e-07, "epoch": 3.311111111111111, "percentage": 82.78, "elapsed_time": "3:09:51", "remaining_time": "0:39:30"}
283
+ {"current_steps": 5460, "total_steps": 6572, "loss": 0.012499115616083144, "lr": 3.785389404562259e-07, "epoch": 3.323287671232877, "percentage": 83.08, "elapsed_time": "3:10:28", "remaining_time": "0:38:47"}
284
+ {"current_steps": 5480, "total_steps": 6572, "loss": 0.01148865669965744, "lr": 3.653955253362351e-07, "epoch": 3.335464231354642, "percentage": 83.38, "elapsed_time": "3:11:03", "remaining_time": "0:38:04"}
285
+ {"current_steps": 5500, "total_steps": 6572, "loss": 0.012819178402423859, "lr": 3.5246633090968205e-07, "epoch": 3.347640791476408, "percentage": 83.69, "elapsed_time": "3:11:39", "remaining_time": "0:37:21"}
286
+ {"current_steps": 5500, "total_steps": 6572, "eval_loss": 0.43404534459114075, "epoch": 3.347640791476408, "percentage": 83.69, "elapsed_time": "3:12:26", "remaining_time": "0:37:30"}
287
+ {"current_steps": 5520, "total_steps": 6572, "loss": 0.013325585424900055, "lr": 3.397526547007832e-07, "epoch": 3.3598173515981733, "percentage": 83.99, "elapsed_time": "3:14:26", "remaining_time": "0:37:03"}
288
+ {"current_steps": 5540, "total_steps": 6572, "loss": 0.011712662875652313, "lr": 3.2725577260517396e-07, "epoch": 3.3719939117199393, "percentage": 84.3, "elapsed_time": "3:15:02", "remaining_time": "0:36:19"}
289
+ {"current_steps": 5560, "total_steps": 6572, "loss": 0.01580573171377182, "lr": 3.14976938761867e-07, "epoch": 3.3841704718417045, "percentage": 84.6, "elapsed_time": "3:15:38", "remaining_time": "0:35:36"}
290
+ {"current_steps": 5580, "total_steps": 6572, "loss": 0.012312603741884231, "lr": 3.029173854273909e-07, "epoch": 3.3963470319634705, "percentage": 84.91, "elapsed_time": "3:16:16", "remaining_time": "0:34:53"}
291
+ {"current_steps": 5600, "total_steps": 6572, "loss": 0.011797953397035599, "lr": 2.910783228521269e-07, "epoch": 3.4085235920852357, "percentage": 85.21, "elapsed_time": "3:16:54", "remaining_time": "0:34:10"}
292
+ {"current_steps": 5620, "total_steps": 6572, "loss": 0.012182456254959107, "lr": 2.794609391588504e-07, "epoch": 3.4207001522070017, "percentage": 85.51, "elapsed_time": "3:17:32", "remaining_time": "0:33:27"}
293
+ {"current_steps": 5640, "total_steps": 6572, "loss": 0.013599888980388641, "lr": 2.6806640022349897e-07, "epoch": 3.432876712328767, "percentage": 85.82, "elapsed_time": "3:18:08", "remaining_time": "0:32:44"}
294
+ {"current_steps": 5660, "total_steps": 6572, "loss": 0.009272868931293487, "lr": 2.5689584955816497e-07, "epoch": 3.445053272450533, "percentage": 86.12, "elapsed_time": "3:18:45", "remaining_time": "0:32:01"}
295
+ {"current_steps": 5680, "total_steps": 6572, "loss": 0.008165979385375976, "lr": 2.459504081963421e-07, "epoch": 3.457229832572298, "percentage": 86.43, "elapsed_time": "3:19:21", "remaining_time": "0:31:18"}
296
+ {"current_steps": 5700, "total_steps": 6572, "loss": 0.009182130545377731, "lr": 2.3523117458041865e-07, "epoch": 3.469406392694064, "percentage": 86.73, "elapsed_time": "3:19:56", "remaining_time": "0:30:35"}
297
+ {"current_steps": 5720, "total_steps": 6572, "loss": 0.0107998326420784, "lr": 2.2473922445144485e-07, "epoch": 3.4815829528158293, "percentage": 87.04, "elapsed_time": "3:20:34", "remaining_time": "0:29:52"}
298
+ {"current_steps": 5740, "total_steps": 6572, "loss": 0.014469687640666962, "lr": 2.144756107411733e-07, "epoch": 3.4937595129375953, "percentage": 87.34, "elapsed_time": "3:21:11", "remaining_time": "0:29:09"}
299
+ {"current_steps": 5760, "total_steps": 6572, "loss": 0.0121701680123806, "lr": 2.0444136346639333e-07, "epoch": 3.5059360730593605, "percentage": 87.64, "elapsed_time": "3:21:48", "remaining_time": "0:28:26"}
300
+ {"current_steps": 5780, "total_steps": 6572, "loss": 0.014668506383895875, "lr": 1.9463748962556096e-07, "epoch": 3.5181126331811265, "percentage": 87.95, "elapsed_time": "3:22:25", "remaining_time": "0:27:44"}
301
+ {"current_steps": 5800, "total_steps": 6572, "loss": 0.010488402843475342, "lr": 1.8506497309773885e-07, "epoch": 3.5302891933028917, "percentage": 88.25, "elapsed_time": "3:23:02", "remaining_time": "0:27:01"}
302
+ {"current_steps": 5820, "total_steps": 6572, "loss": 0.010667071491479874, "lr": 1.7572477454386257e-07, "epoch": 3.5424657534246577, "percentage": 88.56, "elapsed_time": "3:23:37", "remaining_time": "0:26:18"}
303
+ {"current_steps": 5840, "total_steps": 6572, "loss": 0.011079683899879456, "lr": 1.6661783131032726e-07, "epoch": 3.554642313546423, "percentage": 88.86, "elapsed_time": "3:24:14", "remaining_time": "0:25:35"}
304
+ {"current_steps": 5860, "total_steps": 6572, "loss": 0.009308797866106033, "lr": 1.5774505733492263e-07, "epoch": 3.566818873668189, "percentage": 89.17, "elapsed_time": "3:24:50", "remaining_time": "0:24:53"}
305
+ {"current_steps": 5880, "total_steps": 6572, "loss": 0.012319787591695785, "lr": 1.49107343055111e-07, "epoch": 3.578995433789954, "percentage": 89.47, "elapsed_time": "3:25:27", "remaining_time": "0:24:10"}
306
+ {"current_steps": 5900, "total_steps": 6572, "loss": 0.00843576118350029, "lr": 1.407055553186701e-07, "epoch": 3.59117199391172, "percentage": 89.77, "elapsed_time": "3:26:04", "remaining_time": "0:23:28"}
307
+ {"current_steps": 5920, "total_steps": 6572, "loss": 0.00938587412238121, "lr": 1.3254053729669564e-07, "epoch": 3.6033485540334853, "percentage": 90.08, "elapsed_time": "3:26:41", "remaining_time": "0:22:45"}
308
+ {"current_steps": 5940, "total_steps": 6572, "loss": 0.011934128403663636, "lr": 1.2461310839898656e-07, "epoch": 3.6155251141552514, "percentage": 90.38, "elapsed_time": "3:27:17", "remaining_time": "0:22:03"}
309
+ {"current_steps": 5960, "total_steps": 6572, "loss": 0.013170333206653595, "lr": 1.169240641918104e-07, "epoch": 3.6277016742770165, "percentage": 90.69, "elapsed_time": "3:27:54", "remaining_time": "0:21:20"}
310
+ {"current_steps": 5980, "total_steps": 6572, "loss": 0.014534834027290344, "lr": 1.0947417631806539e-07, "epoch": 3.6398782343987826, "percentage": 90.99, "elapsed_time": "3:28:31", "remaining_time": "0:20:38"}
311
+ {"current_steps": 6000, "total_steps": 6572, "loss": 0.011021688580513, "lr": 1.0226419241983865e-07, "epoch": 3.6520547945205477, "percentage": 91.3, "elapsed_time": "3:29:07", "remaining_time": "0:19:56"}
312
+ {"current_steps": 6000, "total_steps": 6572, "eval_loss": 0.44063475728034973, "epoch": 3.6520547945205477, "percentage": 91.3, "elapsed_time": "3:29:53", "remaining_time": "0:20:00"}
313
+ {"current_steps": 6020, "total_steps": 6572, "loss": 0.010764393210411071, "lr": 9.529483606337902e-08, "epoch": 3.6642313546423138, "percentage": 91.6, "elapsed_time": "3:31:58", "remaining_time": "0:19:26"}
314
+ {"current_steps": 6040, "total_steps": 6572, "loss": 0.012128306180238723, "lr": 8.856680666647882e-08, "epoch": 3.676407914764079, "percentage": 91.91, "elapsed_time": "3:32:33", "remaining_time": "0:18:43"}
315
+ {"current_steps": 6060, "total_steps": 6572, "loss": 0.011729901283979416, "lr": 8.208077942828713e-08, "epoch": 3.688584474885845, "percentage": 92.21, "elapsed_time": "3:33:10", "remaining_time": "0:18:00"}
316
+ {"current_steps": 6080, "total_steps": 6572, "loss": 0.009119105339050294, "lr": 7.58374052615457e-08, "epoch": 3.70076103500761, "percentage": 92.51, "elapsed_time": "3:33:46", "remaining_time": "0:17:17"}
317
+ {"current_steps": 6100, "total_steps": 6572, "loss": 0.017101363837718965, "lr": 6.983731072726818e-08, "epoch": 3.712937595129376, "percentage": 92.82, "elapsed_time": "3:34:23", "remaining_time": "0:16:35"}
318
+ {"current_steps": 6120, "total_steps": 6572, "loss": 0.012368235737085342, "lr": 6.408109797186118e-08, "epoch": 3.7251141552511413, "percentage": 93.12, "elapsed_time": "3:34:59", "remaining_time": "0:15:52"}
319
+ {"current_steps": 6140, "total_steps": 6572, "loss": 0.008782628178596496, "lr": 5.856934466669212e-08, "epoch": 3.7372907153729074, "percentage": 93.43, "elapsed_time": "3:35:36", "remaining_time": "0:15:10"}
320
+ {"current_steps": 6160, "total_steps": 6572, "loss": 0.008994438499212266, "lr": 5.3302603950119994e-08, "epoch": 3.7494672754946725, "percentage": 93.73, "elapsed_time": "3:36:13", "remaining_time": "0:14:27"}
321
+ {"current_steps": 6180, "total_steps": 6572, "loss": 0.011286454647779465, "lr": 4.8281404371981755e-08, "epoch": 3.7616438356164386, "percentage": 94.04, "elapsed_time": "3:36:50", "remaining_time": "0:13:45"}
322
+ {"current_steps": 6200, "total_steps": 6572, "loss": 0.011785905063152313, "lr": 4.350624984055196e-08, "epoch": 3.7738203957382037, "percentage": 94.34, "elapsed_time": "3:37:26", "remaining_time": "0:13:02"}
323
+ {"current_steps": 6220, "total_steps": 6572, "loss": 0.013624191284179688, "lr": 3.897761957196877e-08, "epoch": 3.7859969558599698, "percentage": 94.64, "elapsed_time": "3:38:01", "remaining_time": "0:12:20"}
324
+ {"current_steps": 6240, "total_steps": 6572, "loss": 0.011700452119112015, "lr": 3.469596804214548e-08, "epoch": 3.798173515981735, "percentage": 94.95, "elapsed_time": "3:38:37", "remaining_time": "0:11:37"}
325
+ {"current_steps": 6260, "total_steps": 6572, "loss": 0.011029987037181855, "lr": 3.06617249411581e-08, "epoch": 3.810350076103501, "percentage": 95.25, "elapsed_time": "3:39:12", "remaining_time": "0:10:55"}
326
+ {"current_steps": 6280, "total_steps": 6572, "loss": 0.010965974628925323, "lr": 2.687529513012488e-08, "epoch": 3.822526636225266, "percentage": 95.56, "elapsed_time": "3:39:49", "remaining_time": "0:10:13"}
327
+ {"current_steps": 6300, "total_steps": 6572, "loss": 0.012378603965044022, "lr": 2.3337058600575722e-08, "epoch": 3.834703196347032, "percentage": 95.86, "elapsed_time": "3:40:25", "remaining_time": "0:09:30"}
328
+ {"current_steps": 6320, "total_steps": 6572, "loss": 0.011792077124118805, "lr": 2.0047370436317437e-08, "epoch": 3.8468797564687973, "percentage": 96.17, "elapsed_time": "3:41:00", "remaining_time": "0:08:48"}
329
+ {"current_steps": 6340, "total_steps": 6572, "loss": 0.01145942509174347, "lr": 1.7006560777798608e-08, "epoch": 3.8590563165905634, "percentage": 96.47, "elapsed_time": "3:41:35", "remaining_time": "0:08:06"}
330
+ {"current_steps": 6360, "total_steps": 6572, "loss": 0.011088228970766067, "lr": 1.421493478897945e-08, "epoch": 3.8712328767123285, "percentage": 96.77, "elapsed_time": "3:42:11", "remaining_time": "0:07:24"}
331
+ {"current_steps": 6380, "total_steps": 6572, "loss": 0.010828402638435364, "lr": 1.1672772626704909e-08, "epoch": 3.8834094368340946, "percentage": 97.08, "elapsed_time": "3:42:47", "remaining_time": "0:06:42"}
332
+ {"current_steps": 6400, "total_steps": 6572, "loss": 0.01165580153465271, "lr": 9.38032941258965e-09, "epoch": 3.8955859969558597, "percentage": 97.38, "elapsed_time": "3:43:22", "remaining_time": "0:06:00"}
333
+ {"current_steps": 6420, "total_steps": 6572, "loss": 0.010783226788043975, "lr": 7.3378352074163215e-09, "epoch": 3.9077625570776258, "percentage": 97.69, "elapsed_time": "3:43:58", "remaining_time": "0:05:18"}
334
+ {"current_steps": 6440, "total_steps": 6572, "loss": 0.011295531690120698, "lr": 5.545494988045963e-09, "epoch": 3.919939117199391, "percentage": 97.99, "elapsed_time": "3:44:33", "remaining_time": "0:04:36"}
335
+ {"current_steps": 6460, "total_steps": 6572, "loss": 0.013613662123680115, "lr": 4.003488626848073e-09, "epoch": 3.932115677321157, "percentage": 98.3, "elapsed_time": "3:45:10", "remaining_time": "0:03:54"}
336
+ {"current_steps": 6480, "total_steps": 6572, "loss": 0.011696261167526246, "lr": 2.7119708736486615e-09, "epoch": 3.944292237442922, "percentage": 98.6, "elapsed_time": "3:45:47", "remaining_time": "0:03:12"}
337
+ {"current_steps": 6500, "total_steps": 6572, "loss": 0.010873865336179733, "lr": 1.6710713402015577e-09, "epoch": 3.956468797564688, "percentage": 98.9, "elapsed_time": "3:46:23", "remaining_time": "0:02:30"}
338
+ {"current_steps": 6500, "total_steps": 6572, "eval_loss": 0.441041499376297, "epoch": 3.956468797564688, "percentage": 98.9, "elapsed_time": "3:47:09", "remaining_time": "0:02:30"}
339
+ {"current_steps": 6520, "total_steps": 6572, "loss": 0.012961818277835846, "lr": 8.80894487179651e-10, "epoch": 3.9686453576864533, "percentage": 99.21, "elapsed_time": "3:49:12", "remaining_time": "0:01:49"}
340
+ {"current_steps": 6540, "total_steps": 6572, "loss": 0.01224210560321808, "lr": 3.4151961369188745e-10, "epoch": 3.9808219178082194, "percentage": 99.51, "elapsed_time": "3:49:48", "remaining_time": "0:01:07"}
341
+ {"current_steps": 6560, "total_steps": 6572, "loss": 0.010245455056428909, "lr": 5.300084932574612e-11, "epoch": 3.9929984779299845, "percentage": 99.82, "elapsed_time": "3:50:24", "remaining_time": "0:00:25"}
342
+ {"current_steps": 6572, "total_steps": 6572, "epoch": 4.0, "percentage": 100.0, "elapsed_time": "3:52:13", "remaining_time": "0:00:00"}
trainer_state.json ADDED
@@ -0,0 +1,2443 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 1000,
3
+ "best_metric": 0.24901165068149567,
4
+ "best_model_checkpoint": "/root/workspace/finetune/checkpoints/train_run_06_qwen3_4b_formal/checkpoint-1000",
5
+ "epoch": 4.0,
6
+ "eval_steps": 500,
7
+ "global_step": 6572,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0121765601217656,
14
+ "grad_norm": 2.8322572708129883,
15
+ "learning_rate": 3.166666666666667e-07,
16
+ "loss": 1.119423007965088,
17
+ "step": 20
18
+ },
19
+ {
20
+ "epoch": 0.0243531202435312,
21
+ "grad_norm": 2.00534987449646,
22
+ "learning_rate": 6.5e-07,
23
+ "loss": 0.8098940849304199,
24
+ "step": 40
25
+ },
26
+ {
27
+ "epoch": 0.0365296803652968,
28
+ "grad_norm": 1.4319002628326416,
29
+ "learning_rate": 9.833333333333334e-07,
30
+ "loss": 0.6310151576995849,
31
+ "step": 60
32
+ },
33
+ {
34
+ "epoch": 0.0487062404870624,
35
+ "grad_norm": 1.3265702724456787,
36
+ "learning_rate": 1.3166666666666666e-06,
37
+ "loss": 0.5526751518249512,
38
+ "step": 80
39
+ },
40
+ {
41
+ "epoch": 0.060882800608828,
42
+ "grad_norm": 0.8091968297958374,
43
+ "learning_rate": 1.6500000000000003e-06,
44
+ "loss": 0.4834024906158447,
45
+ "step": 100
46
+ },
47
+ {
48
+ "epoch": 0.0730593607305936,
49
+ "grad_norm": 0.8584772348403931,
50
+ "learning_rate": 1.9833333333333335e-06,
51
+ "loss": 0.44873433113098143,
52
+ "step": 120
53
+ },
54
+ {
55
+ "epoch": 0.0852359208523592,
56
+ "grad_norm": 0.9426456093788147,
57
+ "learning_rate": 2.316666666666667e-06,
58
+ "loss": 0.38286705017089845,
59
+ "step": 140
60
+ },
61
+ {
62
+ "epoch": 0.0974124809741248,
63
+ "grad_norm": 0.939085066318512,
64
+ "learning_rate": 2.6500000000000005e-06,
65
+ "loss": 0.38465352058410646,
66
+ "step": 160
67
+ },
68
+ {
69
+ "epoch": 0.1095890410958904,
70
+ "grad_norm": 0.8642272353172302,
71
+ "learning_rate": 2.9833333333333337e-06,
72
+ "loss": 0.3305965900421143,
73
+ "step": 180
74
+ },
75
+ {
76
+ "epoch": 0.121765601217656,
77
+ "grad_norm": 0.7353930473327637,
78
+ "learning_rate": 3.316666666666667e-06,
79
+ "loss": 0.2875316619873047,
80
+ "step": 200
81
+ },
82
+ {
83
+ "epoch": 0.1339421613394216,
84
+ "grad_norm": 0.7686158418655396,
85
+ "learning_rate": 3.65e-06,
86
+ "loss": 0.3140716075897217,
87
+ "step": 220
88
+ },
89
+ {
90
+ "epoch": 0.1461187214611872,
91
+ "grad_norm": 1.153424859046936,
92
+ "learning_rate": 3.983333333333334e-06,
93
+ "loss": 0.2985520839691162,
94
+ "step": 240
95
+ },
96
+ {
97
+ "epoch": 0.1582952815829528,
98
+ "grad_norm": 0.7382195591926575,
99
+ "learning_rate": 4.316666666666667e-06,
100
+ "loss": 0.2449601411819458,
101
+ "step": 260
102
+ },
103
+ {
104
+ "epoch": 0.1704718417047184,
105
+ "grad_norm": 0.7017386555671692,
106
+ "learning_rate": 4.65e-06,
107
+ "loss": 0.24566495418548584,
108
+ "step": 280
109
+ },
110
+ {
111
+ "epoch": 0.182648401826484,
112
+ "grad_norm": 1.0982152223587036,
113
+ "learning_rate": 4.983333333333334e-06,
114
+ "loss": 0.20863604545593262,
115
+ "step": 300
116
+ },
117
+ {
118
+ "epoch": 0.1948249619482496,
119
+ "grad_norm": 0.6582516431808472,
120
+ "learning_rate": 4.9998867856224845e-06,
121
+ "loss": 0.19693111181259154,
122
+ "step": 320
123
+ },
124
+ {
125
+ "epoch": 0.2070015220700152,
126
+ "grad_norm": 0.6201784610748291,
127
+ "learning_rate": 4.999523005839606e-06,
128
+ "loss": 0.21833207607269287,
129
+ "step": 340
130
+ },
131
+ {
132
+ "epoch": 0.2191780821917808,
133
+ "grad_norm": 0.7551426887512207,
134
+ "learning_rate": 4.998908383543311e-06,
135
+ "loss": 0.2382877826690674,
136
+ "step": 360
137
+ },
138
+ {
139
+ "epoch": 0.2313546423135464,
140
+ "grad_norm": 0.6009605526924133,
141
+ "learning_rate": 4.9980429804147276e-06,
142
+ "loss": 0.21118538379669188,
143
+ "step": 380
144
+ },
145
+ {
146
+ "epoch": 0.243531202435312,
147
+ "grad_norm": 0.7086856365203857,
148
+ "learning_rate": 4.996926883302385e-06,
149
+ "loss": 0.18367968797683715,
150
+ "step": 400
151
+ },
152
+ {
153
+ "epoch": 0.2557077625570776,
154
+ "grad_norm": 0.964198648929596,
155
+ "learning_rate": 4.995560204213496e-06,
156
+ "loss": 0.22483460903167723,
157
+ "step": 420
158
+ },
159
+ {
160
+ "epoch": 0.2678843226788432,
161
+ "grad_norm": 0.6596850156784058,
162
+ "learning_rate": 4.993943080302715e-06,
163
+ "loss": 0.22673561573028564,
164
+ "step": 440
165
+ },
166
+ {
167
+ "epoch": 0.2800608828006088,
168
+ "grad_norm": 0.5922938585281372,
169
+ "learning_rate": 4.992075673858379e-06,
170
+ "loss": 0.185296630859375,
171
+ "step": 460
172
+ },
173
+ {
174
+ "epoch": 0.2922374429223744,
175
+ "grad_norm": 0.6671269536018372,
176
+ "learning_rate": 4.989958172286214e-06,
177
+ "loss": 0.18437937498092652,
178
+ "step": 480
179
+ },
180
+ {
181
+ "epoch": 0.30441400304414,
182
+ "grad_norm": 0.7064187526702881,
183
+ "learning_rate": 4.987590788090533e-06,
184
+ "loss": 0.1850834846496582,
185
+ "step": 500
186
+ },
187
+ {
188
+ "epoch": 0.30441400304414,
189
+ "eval_loss": 0.2562323212623596,
190
+ "eval_runtime": 47.3626,
191
+ "eval_samples_per_second": 18.58,
192
+ "eval_steps_per_second": 18.58,
193
+ "step": 500
194
+ },
195
+ {
196
+ "epoch": 0.3165905631659056,
197
+ "grad_norm": 0.7698928117752075,
198
+ "learning_rate": 4.984973758852904e-06,
199
+ "loss": 0.16346561908721924,
200
+ "step": 520
201
+ },
202
+ {
203
+ "epoch": 0.3287671232876712,
204
+ "grad_norm": 0.4859939515590668,
205
+ "learning_rate": 4.982107347208317e-06,
206
+ "loss": 0.18838067054748536,
207
+ "step": 540
208
+ },
209
+ {
210
+ "epoch": 0.3409436834094368,
211
+ "grad_norm": 0.5321416854858398,
212
+ "learning_rate": 4.978991840818816e-06,
213
+ "loss": 0.177593994140625,
214
+ "step": 560
215
+ },
216
+ {
217
+ "epoch": 0.3531202435312024,
218
+ "grad_norm": 0.593889594078064,
219
+ "learning_rate": 4.975627552344638e-06,
220
+ "loss": 0.20775914192199707,
221
+ "step": 580
222
+ },
223
+ {
224
+ "epoch": 0.365296803652968,
225
+ "grad_norm": 0.7926055192947388,
226
+ "learning_rate": 4.97201481941283e-06,
227
+ "loss": 0.16498700380325318,
228
+ "step": 600
229
+ },
230
+ {
231
+ "epoch": 0.3774733637747336,
232
+ "grad_norm": 0.2909524440765381,
233
+ "learning_rate": 4.968154004583374e-06,
234
+ "loss": 0.17565951347351075,
235
+ "step": 620
236
+ },
237
+ {
238
+ "epoch": 0.3896499238964992,
239
+ "grad_norm": 0.4575704038143158,
240
+ "learning_rate": 4.964045495312794e-06,
241
+ "loss": 0.16204673051834106,
242
+ "step": 640
243
+ },
244
+ {
245
+ "epoch": 0.4018264840182648,
246
+ "grad_norm": 0.5366008281707764,
247
+ "learning_rate": 4.959689703915272e-06,
248
+ "loss": 0.17068564891815186,
249
+ "step": 660
250
+ },
251
+ {
252
+ "epoch": 0.4140030441400304,
253
+ "grad_norm": 0.5129569172859192,
254
+ "learning_rate": 4.95508706752128e-06,
255
+ "loss": 0.1589680790901184,
256
+ "step": 680
257
+ },
258
+ {
259
+ "epoch": 0.426179604261796,
260
+ "grad_norm": 0.4709528684616089,
261
+ "learning_rate": 4.9502380480337e-06,
262
+ "loss": 0.17568455934524535,
263
+ "step": 700
264
+ },
265
+ {
266
+ "epoch": 0.4383561643835616,
267
+ "grad_norm": 0.6092886328697205,
268
+ "learning_rate": 4.9451431320814715e-06,
269
+ "loss": 0.16204804182052612,
270
+ "step": 720
271
+ },
272
+ {
273
+ "epoch": 0.4505327245053272,
274
+ "grad_norm": 0.5957323908805847,
275
+ "learning_rate": 4.939802830970762e-06,
276
+ "loss": 0.16562143564224244,
277
+ "step": 740
278
+ },
279
+ {
280
+ "epoch": 0.4627092846270928,
281
+ "grad_norm": 0.4758240580558777,
282
+ "learning_rate": 4.934217680633646e-06,
283
+ "loss": 0.17697544097900392,
284
+ "step": 760
285
+ },
286
+ {
287
+ "epoch": 0.4748858447488584,
288
+ "grad_norm": 0.865627646446228,
289
+ "learning_rate": 4.928388241574327e-06,
290
+ "loss": 0.1649466037750244,
291
+ "step": 780
292
+ },
293
+ {
294
+ "epoch": 0.487062404870624,
295
+ "grad_norm": 0.466294527053833,
296
+ "learning_rate": 4.922315098812883e-06,
297
+ "loss": 0.1602837324142456,
298
+ "step": 800
299
+ },
300
+ {
301
+ "epoch": 0.4992389649923896,
302
+ "grad_norm": 0.6357060670852661,
303
+ "learning_rate": 4.9159988618265585e-06,
304
+ "loss": 0.142719042301178,
305
+ "step": 820
306
+ },
307
+ {
308
+ "epoch": 0.5114155251141552,
309
+ "grad_norm": 0.6055647730827332,
310
+ "learning_rate": 4.9094401644886e-06,
311
+ "loss": 0.14508233070373536,
312
+ "step": 840
313
+ },
314
+ {
315
+ "epoch": 0.5235920852359208,
316
+ "grad_norm": 0.45214834809303284,
317
+ "learning_rate": 4.902639665004641e-06,
318
+ "loss": 0.1821539044380188,
319
+ "step": 860
320
+ },
321
+ {
322
+ "epoch": 0.5357686453576864,
323
+ "grad_norm": 0.5735688805580139,
324
+ "learning_rate": 4.89559804584665e-06,
325
+ "loss": 0.16131887435913086,
326
+ "step": 880
327
+ },
328
+ {
329
+ "epoch": 0.547945205479452,
330
+ "grad_norm": 0.6279348731040955,
331
+ "learning_rate": 4.888316013684435e-06,
332
+ "loss": 0.17404688596725465,
333
+ "step": 900
334
+ },
335
+ {
336
+ "epoch": 0.5601217656012176,
337
+ "grad_norm": 0.6474089026451111,
338
+ "learning_rate": 4.880794299314732e-06,
339
+ "loss": 0.14134640693664552,
340
+ "step": 920
341
+ },
342
+ {
343
+ "epoch": 0.5722983257229832,
344
+ "grad_norm": 0.5808464884757996,
345
+ "learning_rate": 4.87303365758786e-06,
346
+ "loss": 0.14891813993453978,
347
+ "step": 940
348
+ },
349
+ {
350
+ "epoch": 0.5844748858447488,
351
+ "grad_norm": 0.5440990328788757,
352
+ "learning_rate": 4.865034867331967e-06,
353
+ "loss": 0.1696299910545349,
354
+ "step": 960
355
+ },
356
+ {
357
+ "epoch": 0.5966514459665144,
358
+ "grad_norm": 0.6859214901924133,
359
+ "learning_rate": 4.856798731274874e-06,
360
+ "loss": 0.14085158109664916,
361
+ "step": 980
362
+ },
363
+ {
364
+ "epoch": 0.60882800608828,
365
+ "grad_norm": 0.3178713619709015,
366
+ "learning_rate": 4.84832607596351e-06,
367
+ "loss": 0.133053982257843,
368
+ "step": 1000
369
+ },
370
+ {
371
+ "epoch": 0.60882800608828,
372
+ "eval_loss": 0.24901165068149567,
373
+ "eval_runtime": 46.6976,
374
+ "eval_samples_per_second": 18.845,
375
+ "eval_steps_per_second": 18.845,
376
+ "step": 1000
377
+ },
378
+ {
379
+ "epoch": 0.6210045662100456,
380
+ "grad_norm": 0.3812738358974457,
381
+ "learning_rate": 4.8396177516809695e-06,
382
+ "loss": 0.12680984735488893,
383
+ "step": 1020
384
+ },
385
+ {
386
+ "epoch": 0.6331811263318112,
387
+ "grad_norm": 0.5174199342727661,
388
+ "learning_rate": 4.830674632361178e-06,
389
+ "loss": 0.14880582094192504,
390
+ "step": 1040
391
+ },
392
+ {
393
+ "epoch": 0.6453576864535768,
394
+ "grad_norm": 0.4705193042755127,
395
+ "learning_rate": 4.821497615501186e-06,
396
+ "loss": 0.1447562575340271,
397
+ "step": 1060
398
+ },
399
+ {
400
+ "epoch": 0.6575342465753424,
401
+ "grad_norm": 0.42298850417137146,
402
+ "learning_rate": 4.812087622071104e-06,
403
+ "loss": 0.15530819892883302,
404
+ "step": 1080
405
+ },
406
+ {
407
+ "epoch": 0.669710806697108,
408
+ "grad_norm": 0.30658382177352905,
409
+ "learning_rate": 4.80244559642167e-06,
410
+ "loss": 0.14426586627960206,
411
+ "step": 1100
412
+ },
413
+ {
414
+ "epoch": 0.6818873668188736,
415
+ "grad_norm": 0.4838867783546448,
416
+ "learning_rate": 4.792572506189489e-06,
417
+ "loss": 0.15436025857925414,
418
+ "step": 1120
419
+ },
420
+ {
421
+ "epoch": 0.6940639269406392,
422
+ "grad_norm": 0.716833770275116,
423
+ "learning_rate": 4.782469342199915e-06,
424
+ "loss": 0.14860854148864747,
425
+ "step": 1140
426
+ },
427
+ {
428
+ "epoch": 0.7062404870624048,
429
+ "grad_norm": 0.36538004875183105,
430
+ "learning_rate": 4.7721371183676205e-06,
431
+ "loss": 0.1313084125518799,
432
+ "step": 1160
433
+ },
434
+ {
435
+ "epoch": 0.7184170471841704,
436
+ "grad_norm": 0.5409316420555115,
437
+ "learning_rate": 4.761576871594841e-06,
438
+ "loss": 0.150812029838562,
439
+ "step": 1180
440
+ },
441
+ {
442
+ "epoch": 0.730593607305936,
443
+ "grad_norm": 0.5275493264198303,
444
+ "learning_rate": 4.750789661667318e-06,
445
+ "loss": 0.13278884887695314,
446
+ "step": 1200
447
+ },
448
+ {
449
+ "epoch": 0.7427701674277016,
450
+ "grad_norm": 0.5485584735870361,
451
+ "learning_rate": 4.739776571147943e-06,
452
+ "loss": 0.1612934350967407,
453
+ "step": 1220
454
+ },
455
+ {
456
+ "epoch": 0.7549467275494672,
457
+ "grad_norm": 0.5949460864067078,
458
+ "learning_rate": 4.728538705268116e-06,
459
+ "loss": 0.16211290359497071,
460
+ "step": 1240
461
+ },
462
+ {
463
+ "epoch": 0.7671232876712328,
464
+ "grad_norm": 0.43323376774787903,
465
+ "learning_rate": 4.717077191816824e-06,
466
+ "loss": 0.14386119842529296,
467
+ "step": 1260
468
+ },
469
+ {
470
+ "epoch": 0.7792998477929984,
471
+ "grad_norm": 0.6409174799919128,
472
+ "learning_rate": 4.705393181027463e-06,
473
+ "loss": 0.12942540645599365,
474
+ "step": 1280
475
+ },
476
+ {
477
+ "epoch": 0.791476407914764,
478
+ "grad_norm": 0.4871342182159424,
479
+ "learning_rate": 4.693487845462413e-06,
480
+ "loss": 0.14771063327789308,
481
+ "step": 1300
482
+ },
483
+ {
484
+ "epoch": 0.8036529680365296,
485
+ "grad_norm": 0.5108008980751038,
486
+ "learning_rate": 4.681362379895349e-06,
487
+ "loss": 0.1276724100112915,
488
+ "step": 1320
489
+ },
490
+ {
491
+ "epoch": 0.8158295281582952,
492
+ "grad_norm": 0.915285587310791,
493
+ "learning_rate": 4.6690180011913524e-06,
494
+ "loss": 0.1319241166114807,
495
+ "step": 1340
496
+ },
497
+ {
498
+ "epoch": 0.8280060882800608,
499
+ "grad_norm": 0.5282526612281799,
500
+ "learning_rate": 4.6564559481847795e-06,
501
+ "loss": 0.1557891011238098,
502
+ "step": 1360
503
+ },
504
+ {
505
+ "epoch": 0.8401826484018264,
506
+ "grad_norm": 0.46745216846466064,
507
+ "learning_rate": 4.643677481554947e-06,
508
+ "loss": 0.11075855493545532,
509
+ "step": 1380
510
+ },
511
+ {
512
+ "epoch": 0.852359208523592,
513
+ "grad_norm": 0.40246087312698364,
514
+ "learning_rate": 4.630683883699607e-06,
515
+ "loss": 0.1580789566040039,
516
+ "step": 1400
517
+ },
518
+ {
519
+ "epoch": 0.8645357686453576,
520
+ "grad_norm": 0.3718211352825165,
521
+ "learning_rate": 4.6174764586062556e-06,
522
+ "loss": 0.16006500720977784,
523
+ "step": 1420
524
+ },
525
+ {
526
+ "epoch": 0.8767123287671232,
527
+ "grad_norm": 0.4359384775161743,
528
+ "learning_rate": 4.6040565317212685e-06,
529
+ "loss": 0.1462727189064026,
530
+ "step": 1440
531
+ },
532
+ {
533
+ "epoch": 0.8888888888888888,
534
+ "grad_norm": 0.3503302037715912,
535
+ "learning_rate": 4.59042544981688e-06,
536
+ "loss": 0.14725338220596312,
537
+ "step": 1460
538
+ },
539
+ {
540
+ "epoch": 0.9010654490106544,
541
+ "grad_norm": 0.3662220537662506,
542
+ "learning_rate": 4.5765845808560334e-06,
543
+ "loss": 0.1304166793823242,
544
+ "step": 1480
545
+ },
546
+ {
547
+ "epoch": 0.91324200913242,
548
+ "grad_norm": 0.45357516407966614,
549
+ "learning_rate": 4.562535313855094e-06,
550
+ "loss": 0.1293134570121765,
551
+ "step": 1500
552
+ },
553
+ {
554
+ "epoch": 0.91324200913242,
555
+ "eval_loss": 0.25538697838783264,
556
+ "eval_runtime": 47.0026,
557
+ "eval_samples_per_second": 18.722,
558
+ "eval_steps_per_second": 18.722,
559
+ "step": 1500
560
+ },
561
+ {
562
+ "epoch": 0.9254185692541856,
563
+ "grad_norm": 0.31984779238700867,
564
+ "learning_rate": 4.548279058744451e-06,
565
+ "loss": 0.11359381675720215,
566
+ "step": 1520
567
+ },
568
+ {
569
+ "epoch": 0.9375951293759512,
570
+ "grad_norm": 0.6947388052940369,
571
+ "learning_rate": 4.533817246227024e-06,
572
+ "loss": 0.15145074129104613,
573
+ "step": 1540
574
+ },
575
+ {
576
+ "epoch": 0.9497716894977168,
577
+ "grad_norm": 0.5685822367668152,
578
+ "learning_rate": 4.519151327634685e-06,
579
+ "loss": 0.11953675746917725,
580
+ "step": 1560
581
+ },
582
+ {
583
+ "epoch": 0.9619482496194824,
584
+ "grad_norm": 0.2805669903755188,
585
+ "learning_rate": 4.504282774782605e-06,
586
+ "loss": 0.13977375030517578,
587
+ "step": 1580
588
+ },
589
+ {
590
+ "epoch": 0.974124809741248,
591
+ "grad_norm": 0.333103746175766,
592
+ "learning_rate": 4.489213079821551e-06,
593
+ "loss": 0.1338045358657837,
594
+ "step": 1600
595
+ },
596
+ {
597
+ "epoch": 0.9863013698630136,
598
+ "grad_norm": 0.5493115782737732,
599
+ "learning_rate": 4.4739437550881355e-06,
600
+ "loss": 0.11776142120361328,
601
+ "step": 1620
602
+ },
603
+ {
604
+ "epoch": 0.9984779299847792,
605
+ "grad_norm": 0.4903205931186676,
606
+ "learning_rate": 4.458476332953051e-06,
607
+ "loss": 0.12504475116729735,
608
+ "step": 1640
609
+ },
610
+ {
611
+ "epoch": 1.0103500761035007,
612
+ "grad_norm": 0.2682284712791443,
613
+ "learning_rate": 4.442812365667281e-06,
614
+ "loss": 0.08379222154617309,
615
+ "step": 1660
616
+ },
617
+ {
618
+ "epoch": 1.0225266362252663,
619
+ "grad_norm": 0.34869125485420227,
620
+ "learning_rate": 4.426953425206322e-06,
621
+ "loss": 0.08407147526741028,
622
+ "step": 1680
623
+ },
624
+ {
625
+ "epoch": 1.034703196347032,
626
+ "grad_norm": 0.38522422313690186,
627
+ "learning_rate": 4.410901103112434e-06,
628
+ "loss": 0.08041079640388489,
629
+ "step": 1700
630
+ },
631
+ {
632
+ "epoch": 1.0468797564687975,
633
+ "grad_norm": 0.4700411856174469,
634
+ "learning_rate": 4.394657010334908e-06,
635
+ "loss": 0.07876392006874085,
636
+ "step": 1720
637
+ },
638
+ {
639
+ "epoch": 1.059056316590563,
640
+ "grad_norm": 0.3719494938850403,
641
+ "learning_rate": 4.378222777068406e-06,
642
+ "loss": 0.10302903652191162,
643
+ "step": 1740
644
+ },
645
+ {
646
+ "epoch": 1.0712328767123287,
647
+ "grad_norm": 0.406753808259964,
648
+ "learning_rate": 4.361600052589358e-06,
649
+ "loss": 0.08733606934547425,
650
+ "step": 1760
651
+ },
652
+ {
653
+ "epoch": 1.0834094368340943,
654
+ "grad_norm": 0.19822958111763,
655
+ "learning_rate": 4.344790505090447e-06,
656
+ "loss": 0.08532609939575195,
657
+ "step": 1780
658
+ },
659
+ {
660
+ "epoch": 1.09558599695586,
661
+ "grad_norm": 0.3569728434085846,
662
+ "learning_rate": 4.327795821513195e-06,
663
+ "loss": 0.08734336495399475,
664
+ "step": 1800
665
+ },
666
+ {
667
+ "epoch": 1.1077625570776255,
668
+ "grad_norm": 0.4549162685871124,
669
+ "learning_rate": 4.3106177073786684e-06,
670
+ "loss": 0.0913870632648468,
671
+ "step": 1820
672
+ },
673
+ {
674
+ "epoch": 1.119939117199391,
675
+ "grad_norm": 0.5802178978919983,
676
+ "learning_rate": 4.293257886616318e-06,
677
+ "loss": 0.08115516304969787,
678
+ "step": 1840
679
+ },
680
+ {
681
+ "epoch": 1.1321156773211567,
682
+ "grad_norm": 0.30524641275405884,
683
+ "learning_rate": 4.275718101390975e-06,
684
+ "loss": 0.08891176581382751,
685
+ "step": 1860
686
+ },
687
+ {
688
+ "epoch": 1.1442922374429223,
689
+ "grad_norm": 0.33804091811180115,
690
+ "learning_rate": 4.25800011192801e-06,
691
+ "loss": 0.07950961589813232,
692
+ "step": 1880
693
+ },
694
+ {
695
+ "epoch": 1.156468797564688,
696
+ "grad_norm": 0.33472684025764465,
697
+ "learning_rate": 4.240105696336687e-06,
698
+ "loss": 0.08310645222663879,
699
+ "step": 1900
700
+ },
701
+ {
702
+ "epoch": 1.1686453576864535,
703
+ "grad_norm": 0.3032575845718384,
704
+ "learning_rate": 4.222036650431715e-06,
705
+ "loss": 0.07682961225509644,
706
+ "step": 1920
707
+ },
708
+ {
709
+ "epoch": 1.180821917808219,
710
+ "grad_norm": 0.47542238235473633,
711
+ "learning_rate": 4.203794787553032e-06,
712
+ "loss": 0.07520227432250977,
713
+ "step": 1940
714
+ },
715
+ {
716
+ "epoch": 1.1929984779299847,
717
+ "grad_norm": 0.39914897084236145,
718
+ "learning_rate": 4.185381938383821e-06,
719
+ "loss": 0.0754019558429718,
720
+ "step": 1960
721
+ },
722
+ {
723
+ "epoch": 1.2051750380517503,
724
+ "grad_norm": 0.4697635769844055,
725
+ "learning_rate": 4.166799950766793e-06,
726
+ "loss": 0.08085885643959045,
727
+ "step": 1980
728
+ },
729
+ {
730
+ "epoch": 1.217351598173516,
731
+ "grad_norm": 0.28078529238700867,
732
+ "learning_rate": 4.14805068951874e-06,
733
+ "loss": 0.0884653627872467,
734
+ "step": 2000
735
+ },
736
+ {
737
+ "epoch": 1.217351598173516,
738
+ "eval_loss": 0.2661186456680298,
739
+ "eval_runtime": 46.548,
740
+ "eval_samples_per_second": 18.905,
741
+ "eval_steps_per_second": 18.905,
742
+ "step": 2000
743
+ },
744
+ {
745
+ "epoch": 1.2295281582952815,
746
+ "grad_norm": 0.2885560989379883,
747
+ "learning_rate": 4.1291360362433965e-06,
748
+ "loss": 0.06684748530387878,
749
+ "step": 2020
750
+ },
751
+ {
752
+ "epoch": 1.241704718417047,
753
+ "grad_norm": 0.24163688719272614,
754
+ "learning_rate": 4.110057889142601e-06,
755
+ "loss": 0.0720324158668518,
756
+ "step": 2040
757
+ },
758
+ {
759
+ "epoch": 1.2538812785388127,
760
+ "grad_norm": 0.52589350938797,
761
+ "learning_rate": 4.090818162825804e-06,
762
+ "loss": 0.08799988031387329,
763
+ "step": 2060
764
+ },
765
+ {
766
+ "epoch": 1.2660578386605783,
767
+ "grad_norm": 0.35164448618888855,
768
+ "learning_rate": 4.071418788117926e-06,
769
+ "loss": 0.09275985956192016,
770
+ "step": 2080
771
+ },
772
+ {
773
+ "epoch": 1.278234398782344,
774
+ "grad_norm": 0.4981421232223511,
775
+ "learning_rate": 4.0518617118655845e-06,
776
+ "loss": 0.08431113958358764,
777
+ "step": 2100
778
+ },
779
+ {
780
+ "epoch": 1.2904109589041095,
781
+ "grad_norm": 0.2867731750011444,
782
+ "learning_rate": 4.032148896741717e-06,
783
+ "loss": 0.09995608925819396,
784
+ "step": 2120
785
+ },
786
+ {
787
+ "epoch": 1.302587519025875,
788
+ "grad_norm": 0.5612165331840515,
789
+ "learning_rate": 4.012282321048618e-06,
790
+ "loss": 0.07387629747390748,
791
+ "step": 2140
792
+ },
793
+ {
794
+ "epoch": 1.3147640791476407,
795
+ "grad_norm": 0.4880141317844391,
796
+ "learning_rate": 3.992263978519398e-06,
797
+ "loss": 0.07667248249053955,
798
+ "step": 2160
799
+ },
800
+ {
801
+ "epoch": 1.3269406392694063,
802
+ "grad_norm": 0.3143049478530884,
803
+ "learning_rate": 3.972095878117904e-06,
804
+ "loss": 0.09203824400901794,
805
+ "step": 2180
806
+ },
807
+ {
808
+ "epoch": 1.339117199391172,
809
+ "grad_norm": 0.47859013080596924,
810
+ "learning_rate": 3.951780043837107e-06,
811
+ "loss": 0.07835246920585633,
812
+ "step": 2200
813
+ },
814
+ {
815
+ "epoch": 1.3512937595129375,
816
+ "grad_norm": 0.28448912501335144,
817
+ "learning_rate": 3.9313185144959835e-06,
818
+ "loss": 0.08577624559402466,
819
+ "step": 2220
820
+ },
821
+ {
822
+ "epoch": 1.363470319634703,
823
+ "grad_norm": 0.32139304280281067,
824
+ "learning_rate": 3.9107133435349025e-06,
825
+ "loss": 0.0789969801902771,
826
+ "step": 2240
827
+ },
828
+ {
829
+ "epoch": 1.3756468797564687,
830
+ "grad_norm": 0.4797567129135132,
831
+ "learning_rate": 3.889966598809557e-06,
832
+ "loss": 0.07151145935058593,
833
+ "step": 2260
834
+ },
835
+ {
836
+ "epoch": 1.3878234398782343,
837
+ "grad_norm": 0.2404891699552536,
838
+ "learning_rate": 3.869080362383437e-06,
839
+ "loss": 0.09204544425010681,
840
+ "step": 2280
841
+ },
842
+ {
843
+ "epoch": 1.4,
844
+ "grad_norm": 0.328392893075943,
845
+ "learning_rate": 3.848056730318881e-06,
846
+ "loss": 0.11079612970352173,
847
+ "step": 2300
848
+ },
849
+ {
850
+ "epoch": 1.4121765601217655,
851
+ "grad_norm": 0.2993980646133423,
852
+ "learning_rate": 3.826897812466728e-06,
853
+ "loss": 0.06770140528678895,
854
+ "step": 2320
855
+ },
856
+ {
857
+ "epoch": 1.4243531202435311,
858
+ "grad_norm": 0.47816380858421326,
859
+ "learning_rate": 3.8056057322545763e-06,
860
+ "loss": 0.08210510611534119,
861
+ "step": 2340
862
+ },
863
+ {
864
+ "epoch": 1.4365296803652967,
865
+ "grad_norm": 0.38082119822502136,
866
+ "learning_rate": 3.7841826264736888e-06,
867
+ "loss": 0.09583572745323181,
868
+ "step": 2360
869
+ },
870
+ {
871
+ "epoch": 1.4487062404870623,
872
+ "grad_norm": 0.3811774253845215,
873
+ "learning_rate": 3.762630645064547e-06,
874
+ "loss": 0.09235450625419617,
875
+ "step": 2380
876
+ },
877
+ {
878
+ "epoch": 1.460882800608828,
879
+ "grad_norm": 0.3869916498661041,
880
+ "learning_rate": 3.7409519509010985e-06,
881
+ "loss": 0.08658097982406616,
882
+ "step": 2400
883
+ },
884
+ {
885
+ "epoch": 1.4730593607305935,
886
+ "grad_norm": 0.5042543411254883,
887
+ "learning_rate": 3.7191487195736915e-06,
888
+ "loss": 0.08892765045166015,
889
+ "step": 2420
890
+ },
891
+ {
892
+ "epoch": 1.4852359208523591,
893
+ "grad_norm": 0.4064173996448517,
894
+ "learning_rate": 3.697223139170748e-06,
895
+ "loss": 0.07849371433258057,
896
+ "step": 2440
897
+ },
898
+ {
899
+ "epoch": 1.4974124809741247,
900
+ "grad_norm": 0.3958194851875305,
901
+ "learning_rate": 3.6751774100591716e-06,
902
+ "loss": 0.07469035387039184,
903
+ "step": 2460
904
+ },
905
+ {
906
+ "epoch": 1.5095890410958903,
907
+ "grad_norm": 0.39424121379852295,
908
+ "learning_rate": 3.6530137446635265e-06,
909
+ "loss": 0.0782626211643219,
910
+ "step": 2480
911
+ },
912
+ {
913
+ "epoch": 1.521765601217656,
914
+ "grad_norm": 0.46179333329200745,
915
+ "learning_rate": 3.630734367244012e-06,
916
+ "loss": 0.08304058909416198,
917
+ "step": 2500
918
+ },
919
+ {
920
+ "epoch": 1.521765601217656,
921
+ "eval_loss": 0.2771773338317871,
922
+ "eval_runtime": 46.9989,
923
+ "eval_samples_per_second": 18.724,
924
+ "eval_steps_per_second": 18.724,
925
+ "step": 2500
926
+ },
927
+ {
928
+ "epoch": 1.5339421613394215,
929
+ "grad_norm": 0.3447970151901245,
930
+ "learning_rate": 3.6083415136732374e-06,
931
+ "loss": 0.08037537932395936,
932
+ "step": 2520
933
+ },
934
+ {
935
+ "epoch": 1.5461187214611871,
936
+ "grad_norm": 0.544224202632904,
937
+ "learning_rate": 3.585837431211845e-06,
938
+ "loss": 0.08990358114242554,
939
+ "step": 2540
940
+ },
941
+ {
942
+ "epoch": 1.5582952815829527,
943
+ "grad_norm": 0.2808915376663208,
944
+ "learning_rate": 3.563224378282978e-06,
945
+ "loss": 0.0773526132106781,
946
+ "step": 2560
947
+ },
948
+ {
949
+ "epoch": 1.5704718417047183,
950
+ "grad_norm": 0.27800413966178894,
951
+ "learning_rate": 3.5405046242456396e-06,
952
+ "loss": 0.0777865469455719,
953
+ "step": 2580
954
+ },
955
+ {
956
+ "epoch": 1.582648401826484,
957
+ "grad_norm": 0.4247933328151703,
958
+ "learning_rate": 3.517680449166943e-06,
959
+ "loss": 0.08037815093994141,
960
+ "step": 2600
961
+ },
962
+ {
963
+ "epoch": 1.5948249619482495,
964
+ "grad_norm": 0.748912513256073,
965
+ "learning_rate": 3.4947541435932976e-06,
966
+ "loss": 0.08837634325027466,
967
+ "step": 2620
968
+ },
969
+ {
970
+ "epoch": 1.6070015220700151,
971
+ "grad_norm": 0.29709526896476746,
972
+ "learning_rate": 3.471728008320532e-06,
973
+ "loss": 0.08201563358306885,
974
+ "step": 2640
975
+ },
976
+ {
977
+ "epoch": 1.6191780821917807,
978
+ "grad_norm": 0.3287765681743622,
979
+ "learning_rate": 3.4486043541630066e-06,
980
+ "loss": 0.07379403114318847,
981
+ "step": 2660
982
+ },
983
+ {
984
+ "epoch": 1.6313546423135463,
985
+ "grad_norm": 0.4224306643009186,
986
+ "learning_rate": 3.425385501721696e-06,
987
+ "loss": 0.08871785402297974,
988
+ "step": 2680
989
+ },
990
+ {
991
+ "epoch": 1.643531202435312,
992
+ "grad_norm": 0.5213542580604553,
993
+ "learning_rate": 3.4020737811513107e-06,
994
+ "loss": 0.07757498621940613,
995
+ "step": 2700
996
+ },
997
+ {
998
+ "epoch": 1.6557077625570775,
999
+ "grad_norm": 0.21830426156520844,
1000
+ "learning_rate": 3.3786715319264483e-06,
1001
+ "loss": 0.08565697669982911,
1002
+ "step": 2720
1003
+ },
1004
+ {
1005
+ "epoch": 1.6678843226788431,
1006
+ "grad_norm": 0.30557703971862793,
1007
+ "learning_rate": 3.355181102606816e-06,
1008
+ "loss": 0.08129348754882812,
1009
+ "step": 2740
1010
+ },
1011
+ {
1012
+ "epoch": 1.6800608828006087,
1013
+ "grad_norm": 0.453703910112381,
1014
+ "learning_rate": 3.331604850601533e-06,
1015
+ "loss": 0.07875375747680664,
1016
+ "step": 2760
1017
+ },
1018
+ {
1019
+ "epoch": 1.6922374429223743,
1020
+ "grad_norm": 0.33889397978782654,
1021
+ "learning_rate": 3.307945141932556e-06,
1022
+ "loss": 0.08989614248275757,
1023
+ "step": 2780
1024
+ },
1025
+ {
1026
+ "epoch": 1.70441400304414,
1027
+ "grad_norm": 0.21036894619464874,
1028
+ "learning_rate": 3.2842043509972294e-06,
1029
+ "loss": 0.08251298069953919,
1030
+ "step": 2800
1031
+ },
1032
+ {
1033
+ "epoch": 1.7165905631659055,
1034
+ "grad_norm": 0.2764931321144104,
1035
+ "learning_rate": 3.2603848603300026e-06,
1036
+ "loss": 0.07760271430015564,
1037
+ "step": 2820
1038
+ },
1039
+ {
1040
+ "epoch": 1.7287671232876711,
1041
+ "grad_norm": 0.3566988706588745,
1042
+ "learning_rate": 3.236489060363329e-06,
1043
+ "loss": 0.07395396232604981,
1044
+ "step": 2840
1045
+ },
1046
+ {
1047
+ "epoch": 1.7409436834094367,
1048
+ "grad_norm": 0.3616081476211548,
1049
+ "learning_rate": 3.212519349187766e-06,
1050
+ "loss": 0.07028600573539734,
1051
+ "step": 2860
1052
+ },
1053
+ {
1054
+ "epoch": 1.7531202435312023,
1055
+ "grad_norm": 0.41925984621047974,
1056
+ "learning_rate": 3.188478132311319e-06,
1057
+ "loss": 0.08469281196594239,
1058
+ "step": 2880
1059
+ },
1060
+ {
1061
+ "epoch": 1.765296803652968,
1062
+ "grad_norm": 0.4866960346698761,
1063
+ "learning_rate": 3.164367822418029e-06,
1064
+ "loss": 0.09567424058914184,
1065
+ "step": 2900
1066
+ },
1067
+ {
1068
+ "epoch": 1.7774733637747335,
1069
+ "grad_norm": 0.2977049648761749,
1070
+ "learning_rate": 3.1401908391258474e-06,
1071
+ "loss": 0.07239987254142762,
1072
+ "step": 2920
1073
+ },
1074
+ {
1075
+ "epoch": 1.7896499238964991,
1076
+ "grad_norm": 0.3723915219306946,
1077
+ "learning_rate": 3.1159496087438098e-06,
1078
+ "loss": 0.0891954243183136,
1079
+ "step": 2940
1080
+ },
1081
+ {
1082
+ "epoch": 1.8018264840182647,
1083
+ "grad_norm": 0.22685140371322632,
1084
+ "learning_rate": 3.0916465640285426e-06,
1085
+ "loss": 0.07796849608421326,
1086
+ "step": 2960
1087
+ },
1088
+ {
1089
+ "epoch": 1.8140030441400303,
1090
+ "grad_norm": 0.23534800112247467,
1091
+ "learning_rate": 3.0672841439401223e-06,
1092
+ "loss": 0.08645985722541809,
1093
+ "step": 2980
1094
+ },
1095
+ {
1096
+ "epoch": 1.826179604261796,
1097
+ "grad_norm": 0.40188542008399963,
1098
+ "learning_rate": 3.0428647933973103e-06,
1099
+ "loss": 0.08427774310111999,
1100
+ "step": 3000
1101
+ },
1102
+ {
1103
+ "epoch": 1.826179604261796,
1104
+ "eval_loss": 0.2774485647678375,
1105
+ "eval_runtime": 46.6449,
1106
+ "eval_samples_per_second": 18.866,
1107
+ "eval_steps_per_second": 18.866,
1108
+ "step": 3000
1109
+ },
1110
+ {
1111
+ "epoch": 1.8383561643835615,
1112
+ "grad_norm": 0.35732510685920715,
1113
+ "learning_rate": 3.0183909630321865e-06,
1114
+ "loss": 0.07381275296211243,
1115
+ "step": 3020
1116
+ },
1117
+ {
1118
+ "epoch": 1.8505327245053271,
1119
+ "grad_norm": 0.3167949914932251,
1120
+ "learning_rate": 2.9938651089442184e-06,
1121
+ "loss": 0.07289664745330811,
1122
+ "step": 3040
1123
+ },
1124
+ {
1125
+ "epoch": 1.8627092846270927,
1126
+ "grad_norm": 0.4893674850463867,
1127
+ "learning_rate": 2.969289692453773e-06,
1128
+ "loss": 0.07124295830726624,
1129
+ "step": 3060
1130
+ },
1131
+ {
1132
+ "epoch": 1.8748858447488583,
1133
+ "grad_norm": 0.3017306327819824,
1134
+ "learning_rate": 2.944667179855109e-06,
1135
+ "loss": 0.08125877976417542,
1136
+ "step": 3080
1137
+ },
1138
+ {
1139
+ "epoch": 1.887062404870624,
1140
+ "grad_norm": 0.3442900776863098,
1141
+ "learning_rate": 2.920000042168871e-06,
1142
+ "loss": 0.0724608838558197,
1143
+ "step": 3100
1144
+ },
1145
+ {
1146
+ "epoch": 1.8992389649923895,
1147
+ "grad_norm": 0.27901750802993774,
1148
+ "learning_rate": 2.8952907548941057e-06,
1149
+ "loss": 0.07104775309562683,
1150
+ "step": 3120
1151
+ },
1152
+ {
1153
+ "epoch": 1.9114155251141551,
1154
+ "grad_norm": 0.35838621854782104,
1155
+ "learning_rate": 2.8705417977598277e-06,
1156
+ "loss": 0.0677955150604248,
1157
+ "step": 3140
1158
+ },
1159
+ {
1160
+ "epoch": 1.9235920852359207,
1161
+ "grad_norm": 0.3615752160549164,
1162
+ "learning_rate": 2.8457556544761687e-06,
1163
+ "loss": 0.07164496779441834,
1164
+ "step": 3160
1165
+ },
1166
+ {
1167
+ "epoch": 1.9357686453576863,
1168
+ "grad_norm": 0.5117827653884888,
1169
+ "learning_rate": 2.8209348124851187e-06,
1170
+ "loss": 0.071807599067688,
1171
+ "step": 3180
1172
+ },
1173
+ {
1174
+ "epoch": 1.947945205479452,
1175
+ "grad_norm": 0.36082401871681213,
1176
+ "learning_rate": 2.7960817627108965e-06,
1177
+ "loss": 0.095755535364151,
1178
+ "step": 3200
1179
+ },
1180
+ {
1181
+ "epoch": 1.9601217656012175,
1182
+ "grad_norm": 0.27905145287513733,
1183
+ "learning_rate": 2.77119899930997e-06,
1184
+ "loss": 0.07055851817131042,
1185
+ "step": 3220
1186
+ },
1187
+ {
1188
+ "epoch": 1.9722983257229831,
1189
+ "grad_norm": 0.5642575621604919,
1190
+ "learning_rate": 2.7462890194207513e-06,
1191
+ "loss": 0.07278798818588257,
1192
+ "step": 3240
1193
+ },
1194
+ {
1195
+ "epoch": 1.9844748858447487,
1196
+ "grad_norm": 0.2286670207977295,
1197
+ "learning_rate": 2.7213543229129956e-06,
1198
+ "loss": 0.07153088450431824,
1199
+ "step": 3260
1200
+ },
1201
+ {
1202
+ "epoch": 1.9966514459665143,
1203
+ "grad_norm": 0.3228706121444702,
1204
+ "learning_rate": 2.6963974121369242e-06,
1205
+ "loss": 0.07440360188484192,
1206
+ "step": 3280
1207
+ },
1208
+ {
1209
+ "epoch": 2.008523592085236,
1210
+ "grad_norm": 0.21613839268684387,
1211
+ "learning_rate": 2.671420791672093e-06,
1212
+ "loss": 0.0517767608165741,
1213
+ "step": 3300
1214
+ },
1215
+ {
1216
+ "epoch": 2.0207001522070014,
1217
+ "grad_norm": 0.19271881878376007,
1218
+ "learning_rate": 2.646426968076052e-06,
1219
+ "loss": 0.03812239170074463,
1220
+ "step": 3320
1221
+ },
1222
+ {
1223
+ "epoch": 2.032876712328767,
1224
+ "grad_norm": 0.12863056361675262,
1225
+ "learning_rate": 2.6214184496327865e-06,
1226
+ "loss": 0.04107579588890076,
1227
+ "step": 3340
1228
+ },
1229
+ {
1230
+ "epoch": 2.0450532724505326,
1231
+ "grad_norm": 0.26679477095603943,
1232
+ "learning_rate": 2.5963977461010022e-06,
1233
+ "loss": 0.04673115909099579,
1234
+ "step": 3360
1235
+ },
1236
+ {
1237
+ "epoch": 2.057229832572298,
1238
+ "grad_norm": 0.2743483781814575,
1239
+ "learning_rate": 2.5713673684622524e-06,
1240
+ "loss": 0.03674449622631073,
1241
+ "step": 3380
1242
+ },
1243
+ {
1244
+ "epoch": 2.069406392694064,
1245
+ "grad_norm": 0.16999909281730652,
1246
+ "learning_rate": 2.546329828668949e-06,
1247
+ "loss": 0.03422380387783051,
1248
+ "step": 3400
1249
+ },
1250
+ {
1251
+ "epoch": 2.0815829528158294,
1252
+ "grad_norm": 0.2931291460990906,
1253
+ "learning_rate": 2.5212876393922657e-06,
1254
+ "loss": 0.035878732800483704,
1255
+ "step": 3420
1256
+ },
1257
+ {
1258
+ "epoch": 2.093759512937595,
1259
+ "grad_norm": 0.34781521558761597,
1260
+ "learning_rate": 2.496243313769986e-06,
1261
+ "loss": 0.03577531576156616,
1262
+ "step": 3440
1263
+ },
1264
+ {
1265
+ "epoch": 2.1059360730593606,
1266
+ "grad_norm": 0.472351998090744,
1267
+ "learning_rate": 2.471199365154283e-06,
1268
+ "loss": 0.04281675517559051,
1269
+ "step": 3460
1270
+ },
1271
+ {
1272
+ "epoch": 2.118112633181126,
1273
+ "grad_norm": 0.38320988416671753,
1274
+ "learning_rate": 2.4461583068595014e-06,
1275
+ "loss": 0.042955422401428224,
1276
+ "step": 3480
1277
+ },
1278
+ {
1279
+ "epoch": 2.130289193302892,
1280
+ "grad_norm": 0.12197626382112503,
1281
+ "learning_rate": 2.421122651909918e-06,
1282
+ "loss": 0.04432957172393799,
1283
+ "step": 3500
1284
+ },
1285
+ {
1286
+ "epoch": 2.130289193302892,
1287
+ "eval_loss": 0.3123805522918701,
1288
+ "eval_runtime": 46.5258,
1289
+ "eval_samples_per_second": 18.914,
1290
+ "eval_steps_per_second": 18.914,
1291
+ "step": 3500
1292
+ },
1293
+ {
1294
+ "epoch": 2.1424657534246574,
1295
+ "grad_norm": 0.21291697025299072,
1296
+ "learning_rate": 2.3960949127875556e-06,
1297
+ "loss": 0.03356837034225464,
1298
+ "step": 3520
1299
+ },
1300
+ {
1301
+ "epoch": 2.154642313546423,
1302
+ "grad_norm": 0.44130077958106995,
1303
+ "learning_rate": 2.371077601180031e-06,
1304
+ "loss": 0.036935809254646304,
1305
+ "step": 3540
1306
+ },
1307
+ {
1308
+ "epoch": 2.1668188736681886,
1309
+ "grad_norm": 0.49069875478744507,
1310
+ "learning_rate": 2.3460732277284994e-06,
1311
+ "loss": 0.0395690768957138,
1312
+ "step": 3560
1313
+ },
1314
+ {
1315
+ "epoch": 2.178995433789954,
1316
+ "grad_norm": 0.2824937701225281,
1317
+ "learning_rate": 2.321084301775689e-06,
1318
+ "loss": 0.044693085551261905,
1319
+ "step": 3580
1320
+ },
1321
+ {
1322
+ "epoch": 2.19117199391172,
1323
+ "grad_norm": 0.35114097595214844,
1324
+ "learning_rate": 2.29611333111408e-06,
1325
+ "loss": 0.03243565857410431,
1326
+ "step": 3600
1327
+ },
1328
+ {
1329
+ "epoch": 2.2033485540334854,
1330
+ "grad_norm": 0.47931790351867676,
1331
+ "learning_rate": 2.271162821734225e-06,
1332
+ "loss": 0.04325798749923706,
1333
+ "step": 3620
1334
+ },
1335
+ {
1336
+ "epoch": 2.215525114155251,
1337
+ "grad_norm": 0.12716218829154968,
1338
+ "learning_rate": 2.2462352775732653e-06,
1339
+ "loss": 0.03856868743896484,
1340
+ "step": 3640
1341
+ },
1342
+ {
1343
+ "epoch": 2.2277016742770166,
1344
+ "grad_norm": 0.3522437512874603,
1345
+ "learning_rate": 2.221333200263637e-06,
1346
+ "loss": 0.041602414846420285,
1347
+ "step": 3660
1348
+ },
1349
+ {
1350
+ "epoch": 2.239878234398782,
1351
+ "grad_norm": 0.234590083360672,
1352
+ "learning_rate": 2.1964590888820233e-06,
1353
+ "loss": 0.04286134541034699,
1354
+ "step": 3680
1355
+ },
1356
+ {
1357
+ "epoch": 2.252054794520548,
1358
+ "grad_norm": 0.2972453534603119,
1359
+ "learning_rate": 2.1716154396985526e-06,
1360
+ "loss": 0.041756758093833925,
1361
+ "step": 3700
1362
+ },
1363
+ {
1364
+ "epoch": 2.2642313546423134,
1365
+ "grad_norm": 0.3236760199069977,
1366
+ "learning_rate": 2.1468047459262882e-06,
1367
+ "loss": 0.0359495222568512,
1368
+ "step": 3720
1369
+ },
1370
+ {
1371
+ "epoch": 2.276407914764079,
1372
+ "grad_norm": 0.32255831360816956,
1373
+ "learning_rate": 2.12202949747101e-06,
1374
+ "loss": 0.04322676360607147,
1375
+ "step": 3740
1376
+ },
1377
+ {
1378
+ "epoch": 2.2885844748858446,
1379
+ "grad_norm": 0.2753404378890991,
1380
+ "learning_rate": 2.0972921806813468e-06,
1381
+ "loss": 0.04191597998142242,
1382
+ "step": 3760
1383
+ },
1384
+ {
1385
+ "epoch": 2.30076103500761,
1386
+ "grad_norm": 0.23182159662246704,
1387
+ "learning_rate": 2.072595278099247e-06,
1388
+ "loss": 0.041278204321861266,
1389
+ "step": 3780
1390
+ },
1391
+ {
1392
+ "epoch": 2.312937595129376,
1393
+ "grad_norm": 0.25987961888313293,
1394
+ "learning_rate": 2.047941268210849e-06,
1395
+ "loss": 0.04312986135482788,
1396
+ "step": 3800
1397
+ },
1398
+ {
1399
+ "epoch": 2.3251141552511414,
1400
+ "grad_norm": 0.3683331310749054,
1401
+ "learning_rate": 2.0233326251977426e-06,
1402
+ "loss": 0.04236046075820923,
1403
+ "step": 3820
1404
+ },
1405
+ {
1406
+ "epoch": 2.337290715372907,
1407
+ "grad_norm": 0.2520082890987396,
1408
+ "learning_rate": 1.9987718186886724e-06,
1409
+ "loss": 0.04433901011943817,
1410
+ "step": 3840
1411
+ },
1412
+ {
1413
+ "epoch": 2.3494672754946726,
1414
+ "grad_norm": 0.16200299561023712,
1415
+ "learning_rate": 1.9742613135116986e-06,
1416
+ "loss": 0.04127628207206726,
1417
+ "step": 3860
1418
+ },
1419
+ {
1420
+ "epoch": 2.361643835616438,
1421
+ "grad_norm": 0.37064701318740845,
1422
+ "learning_rate": 1.949803569446828e-06,
1423
+ "loss": 0.04586326479911804,
1424
+ "step": 3880
1425
+ },
1426
+ {
1427
+ "epoch": 2.373820395738204,
1428
+ "grad_norm": 0.416020005941391,
1429
+ "learning_rate": 1.925401040979171e-06,
1430
+ "loss": 0.03624185025691986,
1431
+ "step": 3900
1432
+ },
1433
+ {
1434
+ "epoch": 2.3859969558599694,
1435
+ "grad_norm": 0.17131179571151733,
1436
+ "learning_rate": 1.9010561770526076e-06,
1437
+ "loss": 0.035064518451690674,
1438
+ "step": 3920
1439
+ },
1440
+ {
1441
+ "epoch": 2.398173515981735,
1442
+ "grad_norm": 0.5188226103782654,
1443
+ "learning_rate": 1.8767714208240312e-06,
1444
+ "loss": 0.042050021886825564,
1445
+ "step": 3940
1446
+ },
1447
+ {
1448
+ "epoch": 2.4103500761035006,
1449
+ "grad_norm": 0.3398009240627289,
1450
+ "learning_rate": 1.852549209418154e-06,
1451
+ "loss": 0.038166466355323794,
1452
+ "step": 3960
1453
+ },
1454
+ {
1455
+ "epoch": 2.422526636225266,
1456
+ "grad_norm": 0.2541758418083191,
1457
+ "learning_rate": 1.8283919736829332e-06,
1458
+ "loss": 0.040885674953460696,
1459
+ "step": 3980
1460
+ },
1461
+ {
1462
+ "epoch": 2.434703196347032,
1463
+ "grad_norm": 0.4074256122112274,
1464
+ "learning_rate": 1.804302137945614e-06,
1465
+ "loss": 0.040162667632102966,
1466
+ "step": 4000
1467
+ },
1468
+ {
1469
+ "epoch": 2.434703196347032,
1470
+ "eval_loss": 0.31583163142204285,
1471
+ "eval_runtime": 47.319,
1472
+ "eval_samples_per_second": 18.597,
1473
+ "eval_steps_per_second": 18.597,
1474
+ "step": 4000
1475
+ },
1476
+ {
1477
+ "epoch": 2.4468797564687974,
1478
+ "grad_norm": 0.2724802494049072,
1479
+ "learning_rate": 1.7802821197694426e-06,
1480
+ "loss": 0.04170995056629181,
1481
+ "step": 4020
1482
+ },
1483
+ {
1484
+ "epoch": 2.459056316590563,
1485
+ "grad_norm": 0.21376508474349976,
1486
+ "learning_rate": 1.7563343297110375e-06,
1487
+ "loss": 0.03834344446659088,
1488
+ "step": 4040
1489
+ },
1490
+ {
1491
+ "epoch": 2.4712328767123286,
1492
+ "grad_norm": 0.2933412492275238,
1493
+ "learning_rate": 1.732461171078486e-06,
1494
+ "loss": 0.03928310573101044,
1495
+ "step": 4060
1496
+ },
1497
+ {
1498
+ "epoch": 2.483409436834094,
1499
+ "grad_norm": 0.46805083751678467,
1500
+ "learning_rate": 1.7086650396901489e-06,
1501
+ "loss": 0.03358933925628662,
1502
+ "step": 4080
1503
+ },
1504
+ {
1505
+ "epoch": 2.49558599695586,
1506
+ "grad_norm": 0.45667552947998047,
1507
+ "learning_rate": 1.6849483236342322e-06,
1508
+ "loss": 0.03547535240650177,
1509
+ "step": 4100
1510
+ },
1511
+ {
1512
+ "epoch": 2.5077625570776254,
1513
+ "grad_norm": 0.24512450397014618,
1514
+ "learning_rate": 1.6613134030291217e-06,
1515
+ "loss": 0.03600102663040161,
1516
+ "step": 4120
1517
+ },
1518
+ {
1519
+ "epoch": 2.519939117199391,
1520
+ "grad_norm": 0.20636337995529175,
1521
+ "learning_rate": 1.6377626497845278e-06,
1522
+ "loss": 0.04347077012062073,
1523
+ "step": 4140
1524
+ },
1525
+ {
1526
+ "epoch": 2.5321156773211566,
1527
+ "grad_norm": 0.4584953486919403,
1528
+ "learning_rate": 1.6142984273634505e-06,
1529
+ "loss": 0.02908192276954651,
1530
+ "step": 4160
1531
+ },
1532
+ {
1533
+ "epoch": 2.544292237442922,
1534
+ "grad_norm": 0.26193487644195557,
1535
+ "learning_rate": 1.5909230905449846e-06,
1536
+ "loss": 0.03611198365688324,
1537
+ "step": 4180
1538
+ },
1539
+ {
1540
+ "epoch": 2.556468797564688,
1541
+ "grad_norm": 0.20813828706741333,
1542
+ "learning_rate": 1.567638985188012e-06,
1543
+ "loss": 0.03758668601512909,
1544
+ "step": 4200
1545
+ },
1546
+ {
1547
+ "epoch": 2.5686453576864534,
1548
+ "grad_norm": 0.3395022749900818,
1549
+ "learning_rate": 1.544448447995773e-06,
1550
+ "loss": 0.033633843064308167,
1551
+ "step": 4220
1552
+ },
1553
+ {
1554
+ "epoch": 2.580821917808219,
1555
+ "grad_norm": 0.1472434103488922,
1556
+ "learning_rate": 1.52135380628137e-06,
1557
+ "loss": 0.036797890067100526,
1558
+ "step": 4240
1559
+ },
1560
+ {
1561
+ "epoch": 2.5929984779299846,
1562
+ "grad_norm": 0.5788060426712036,
1563
+ "learning_rate": 1.498357377734201e-06,
1564
+ "loss": 0.039166563749313356,
1565
+ "step": 4260
1566
+ },
1567
+ {
1568
+ "epoch": 2.60517503805175,
1569
+ "grad_norm": 0.7623679637908936,
1570
+ "learning_rate": 1.4754614701873703e-06,
1571
+ "loss": 0.03717599511146545,
1572
+ "step": 4280
1573
+ },
1574
+ {
1575
+ "epoch": 2.6173515981735163,
1576
+ "grad_norm": 0.16205403208732605,
1577
+ "learning_rate": 1.4526683813860792e-06,
1578
+ "loss": 0.03962793946266174,
1579
+ "step": 4300
1580
+ },
1581
+ {
1582
+ "epoch": 2.6295281582952814,
1583
+ "grad_norm": 0.11986076086759567,
1584
+ "learning_rate": 1.4299803987570396e-06,
1585
+ "loss": 0.035475924611091614,
1586
+ "step": 4320
1587
+ },
1588
+ {
1589
+ "epoch": 2.6417047184170475,
1590
+ "grad_norm": 0.15573006868362427,
1591
+ "learning_rate": 1.4073997991789078e-06,
1592
+ "loss": 0.03256964683532715,
1593
+ "step": 4340
1594
+ },
1595
+ {
1596
+ "epoch": 2.6538812785388126,
1597
+ "grad_norm": 0.25151512026786804,
1598
+ "learning_rate": 1.384928848753792e-06,
1599
+ "loss": 0.03712306022644043,
1600
+ "step": 4360
1601
+ },
1602
+ {
1603
+ "epoch": 2.6660578386605787,
1604
+ "grad_norm": 0.20408153533935547,
1605
+ "learning_rate": 1.3625698025798322e-06,
1606
+ "loss": 0.041410398483276364,
1607
+ "step": 4380
1608
+ },
1609
+ {
1610
+ "epoch": 2.678234398782344,
1611
+ "grad_norm": 0.3156696856021881,
1612
+ "learning_rate": 1.3403249045248907e-06,
1613
+ "loss": 0.03158504366874695,
1614
+ "step": 4400
1615
+ },
1616
+ {
1617
+ "epoch": 2.69041095890411,
1618
+ "grad_norm": 0.3835665285587311,
1619
+ "learning_rate": 1.3181963870013604e-06,
1620
+ "loss": 0.03525224924087524,
1621
+ "step": 4420
1622
+ },
1623
+ {
1624
+ "epoch": 2.702587519025875,
1625
+ "grad_norm": 0.45423486828804016,
1626
+ "learning_rate": 1.2961864707421345e-06,
1627
+ "loss": 0.03239959478378296,
1628
+ "step": 4440
1629
+ },
1630
+ {
1631
+ "epoch": 2.714764079147641,
1632
+ "grad_norm": 0.15982766449451447,
1633
+ "learning_rate": 1.2742973645777394e-06,
1634
+ "loss": 0.031032082438468934,
1635
+ "step": 4460
1636
+ },
1637
+ {
1638
+ "epoch": 2.726940639269406,
1639
+ "grad_norm": 0.2770426869392395,
1640
+ "learning_rate": 1.252531265214662e-06,
1641
+ "loss": 0.030566230416297913,
1642
+ "step": 4480
1643
+ },
1644
+ {
1645
+ "epoch": 2.7391171993911723,
1646
+ "grad_norm": 0.3693839907646179,
1647
+ "learning_rate": 1.2308903570149048e-06,
1648
+ "loss": 0.041362547874450685,
1649
+ "step": 4500
1650
+ },
1651
+ {
1652
+ "epoch": 2.7391171993911723,
1653
+ "eval_loss": 0.3391737937927246,
1654
+ "eval_runtime": 46.9306,
1655
+ "eval_samples_per_second": 18.751,
1656
+ "eval_steps_per_second": 18.751,
1657
+ "step": 4500
1658
+ },
1659
+ {
1660
+ "epoch": 2.7512937595129374,
1661
+ "grad_norm": 0.3229275643825531,
1662
+ "learning_rate": 1.2093768117767613e-06,
1663
+ "loss": 0.0388390064239502,
1664
+ "step": 4520
1665
+ },
1666
+ {
1667
+ "epoch": 2.7634703196347035,
1668
+ "grad_norm": 0.6786078214645386,
1669
+ "learning_rate": 1.1879927885168733e-06,
1670
+ "loss": 0.032555675506591795,
1671
+ "step": 4540
1672
+ },
1673
+ {
1674
+ "epoch": 2.7756468797564686,
1675
+ "grad_norm": 0.32371029257774353,
1676
+ "learning_rate": 1.1667404332535504e-06,
1677
+ "loss": 0.03606459796428681,
1678
+ "step": 4560
1679
+ },
1680
+ {
1681
+ "epoch": 2.7878234398782347,
1682
+ "grad_norm": 0.44066882133483887,
1683
+ "learning_rate": 1.1456218787914128e-06,
1684
+ "loss": 0.032086309790611264,
1685
+ "step": 4580
1686
+ },
1687
+ {
1688
+ "epoch": 2.8,
1689
+ "grad_norm": 0.5005165338516235,
1690
+ "learning_rate": 1.1246392445073438e-06,
1691
+ "loss": 0.033362787961959836,
1692
+ "step": 4600
1693
+ },
1694
+ {
1695
+ "epoch": 2.812176560121766,
1696
+ "grad_norm": 0.22586822509765625,
1697
+ "learning_rate": 1.1037946361378027e-06,
1698
+ "loss": 0.03638745844364166,
1699
+ "step": 4620
1700
+ },
1701
+ {
1702
+ "epoch": 2.824353120243531,
1703
+ "grad_norm": 0.2905796766281128,
1704
+ "learning_rate": 1.0830901455674977e-06,
1705
+ "loss": 0.030933958292007447,
1706
+ "step": 4640
1707
+ },
1708
+ {
1709
+ "epoch": 2.836529680365297,
1710
+ "grad_norm": 0.10796497762203217,
1711
+ "learning_rate": 1.0625278506194538e-06,
1712
+ "loss": 0.02879139482975006,
1713
+ "step": 4660
1714
+ },
1715
+ {
1716
+ "epoch": 2.8487062404870622,
1717
+ "grad_norm": 0.2545916438102722,
1718
+ "learning_rate": 1.04210981484649e-06,
1719
+ "loss": 0.03345020413398743,
1720
+ "step": 4680
1721
+ },
1722
+ {
1723
+ "epoch": 2.8608828006088283,
1724
+ "grad_norm": 0.2986568808555603,
1725
+ "learning_rate": 1.0218380873241314e-06,
1726
+ "loss": 0.02593054175376892,
1727
+ "step": 4700
1728
+ },
1729
+ {
1730
+ "epoch": 2.8730593607305934,
1731
+ "grad_norm": 0.1755756139755249,
1732
+ "learning_rate": 1.0017147024449674e-06,
1733
+ "loss": 0.03906567096710205,
1734
+ "step": 4720
1735
+ },
1736
+ {
1737
+ "epoch": 2.8852359208523595,
1738
+ "grad_norm": 0.18288888037204742,
1739
+ "learning_rate": 9.81741679714493e-07,
1740
+ "loss": 0.03371626436710358,
1741
+ "step": 4740
1742
+ },
1743
+ {
1744
+ "epoch": 2.8974124809741246,
1745
+ "grad_norm": 0.368429958820343,
1746
+ "learning_rate": 9.619210235484333e-07,
1747
+ "loss": 0.03090968132019043,
1748
+ "step": 4760
1749
+ },
1750
+ {
1751
+ "epoch": 2.9095890410958907,
1752
+ "grad_norm": 0.17118144035339355,
1753
+ "learning_rate": 9.422547230715931e-07,
1754
+ "loss": 0.0322105199098587,
1755
+ "step": 4780
1756
+ },
1757
+ {
1758
+ "epoch": 2.921765601217656,
1759
+ "grad_norm": 0.41911277174949646,
1760
+ "learning_rate": 9.227447519182353e-07,
1761
+ "loss": 0.035210177302360535,
1762
+ "step": 4800
1763
+ },
1764
+ {
1765
+ "epoch": 2.933942161339422,
1766
+ "grad_norm": 0.3521968722343445,
1767
+ "learning_rate": 9.033930680340097e-07,
1768
+ "loss": 0.026842504739761353,
1769
+ "step": 4820
1770
+ },
1771
+ {
1772
+ "epoch": 2.946118721461187,
1773
+ "grad_norm": 0.20812013745307922,
1774
+ "learning_rate": 8.842016134794682e-07,
1775
+ "loss": 0.03439584076404571,
1776
+ "step": 4840
1777
+ },
1778
+ {
1779
+ "epoch": 2.958295281582953,
1780
+ "grad_norm": 0.2796875834465027,
1781
+ "learning_rate": 8.651723142351603e-07,
1782
+ "loss": 0.04011322855949402,
1783
+ "step": 4860
1784
+ },
1785
+ {
1786
+ "epoch": 2.9704718417047182,
1787
+ "grad_norm": 0.20960687100887299,
1788
+ "learning_rate": 8.463070800083562e-07,
1789
+ "loss": 0.03800423145294189,
1790
+ "step": 4880
1791
+ },
1792
+ {
1793
+ "epoch": 2.9826484018264843,
1794
+ "grad_norm": 0.2586495876312256,
1795
+ "learning_rate": 8.276078040413879e-07,
1796
+ "loss": 0.03839131891727447,
1797
+ "step": 4900
1798
+ },
1799
+ {
1800
+ "epoch": 2.9948249619482494,
1801
+ "grad_norm": 0.37137141823768616,
1802
+ "learning_rate": 8.090763629216589e-07,
1803
+ "loss": 0.02721840739250183,
1804
+ "step": 4920
1805
+ },
1806
+ {
1807
+ "epoch": 3.006697108066971,
1808
+ "grad_norm": 0.3677407503128052,
1809
+ "learning_rate": 7.907146163933102e-07,
1810
+ "loss": 0.023991990089416503,
1811
+ "step": 4940
1812
+ },
1813
+ {
1814
+ "epoch": 3.0188736681887365,
1815
+ "grad_norm": 0.11811063438653946,
1816
+ "learning_rate": 7.725244071705871e-07,
1817
+ "loss": 0.01451514959335327,
1818
+ "step": 4960
1819
+ },
1820
+ {
1821
+ "epoch": 3.031050228310502,
1822
+ "grad_norm": 0.3449067771434784,
1823
+ "learning_rate": 7.545075607529104e-07,
1824
+ "loss": 0.014327619969844819,
1825
+ "step": 4980
1826
+ },
1827
+ {
1828
+ "epoch": 3.0432267884322677,
1829
+ "grad_norm": 0.3237353265285492,
1830
+ "learning_rate": 7.366658852416788e-07,
1831
+ "loss": 0.017832010984420776,
1832
+ "step": 5000
1833
+ },
1834
+ {
1835
+ "epoch": 3.0432267884322677,
1836
+ "eval_loss": 0.39115142822265625,
1837
+ "eval_runtime": 47.6431,
1838
+ "eval_samples_per_second": 18.471,
1839
+ "eval_steps_per_second": 18.471,
1840
+ "step": 5000
1841
+ },
1842
+ {
1843
+ "epoch": 3.0554033485540333,
1844
+ "grad_norm": 0.2226281613111496,
1845
+ "learning_rate": 7.190011711588101e-07,
1846
+ "loss": 0.011674411594867706,
1847
+ "step": 5020
1848
+ },
1849
+ {
1850
+ "epoch": 3.067579908675799,
1851
+ "grad_norm": 0.08729376643896103,
1852
+ "learning_rate": 7.015151912670562e-07,
1853
+ "loss": 0.013690856099128724,
1854
+ "step": 5040
1855
+ },
1856
+ {
1857
+ "epoch": 3.0797564687975645,
1858
+ "grad_norm": 0.2745465636253357,
1859
+ "learning_rate": 6.842097003920903e-07,
1860
+ "loss": 0.011978642642498016,
1861
+ "step": 5060
1862
+ },
1863
+ {
1864
+ "epoch": 3.09193302891933,
1865
+ "grad_norm": 0.06905842572450638,
1866
+ "learning_rate": 6.67086435246406e-07,
1867
+ "loss": 0.013893941044807434,
1868
+ "step": 5080
1869
+ },
1870
+ {
1871
+ "epoch": 3.1041095890410957,
1872
+ "grad_norm": 0.07840315997600555,
1873
+ "learning_rate": 6.501471142550194e-07,
1874
+ "loss": 0.009910025447607041,
1875
+ "step": 5100
1876
+ },
1877
+ {
1878
+ "epoch": 3.1162861491628613,
1879
+ "grad_norm": 0.19672124087810516,
1880
+ "learning_rate": 6.333934373830222e-07,
1881
+ "loss": 0.008863755315542222,
1882
+ "step": 5120
1883
+ },
1884
+ {
1885
+ "epoch": 3.128462709284627,
1886
+ "grad_norm": 0.37645605206489563,
1887
+ "learning_rate": 6.168270859649761e-07,
1888
+ "loss": 0.010502541810274124,
1889
+ "step": 5140
1890
+ },
1891
+ {
1892
+ "epoch": 3.1406392694063925,
1893
+ "grad_norm": 0.2069159746170044,
1894
+ "learning_rate": 6.004497225361786e-07,
1895
+ "loss": 0.012096930295228958,
1896
+ "step": 5160
1897
+ },
1898
+ {
1899
+ "epoch": 3.1528158295281585,
1900
+ "grad_norm": 0.2584017217159271,
1901
+ "learning_rate": 5.842629906658226e-07,
1902
+ "loss": 0.013278065621852875,
1903
+ "step": 5180
1904
+ },
1905
+ {
1906
+ "epoch": 3.1649923896499237,
1907
+ "grad_norm": 0.2050527036190033,
1908
+ "learning_rate": 5.682685147920481e-07,
1909
+ "loss": 0.013548998534679413,
1910
+ "step": 5200
1911
+ },
1912
+ {
1913
+ "epoch": 3.1771689497716897,
1914
+ "grad_norm": 0.13838107883930206,
1915
+ "learning_rate": 5.524679000589256e-07,
1916
+ "loss": 0.013736458122730255,
1917
+ "step": 5220
1918
+ },
1919
+ {
1920
+ "epoch": 3.189345509893455,
1921
+ "grad_norm": 0.06378225982189178,
1922
+ "learning_rate": 5.36862732155366e-07,
1923
+ "loss": 0.013177134096622467,
1924
+ "step": 5240
1925
+ },
1926
+ {
1927
+ "epoch": 3.201522070015221,
1928
+ "grad_norm": 0.27431613206863403,
1929
+ "learning_rate": 5.214545771559879e-07,
1930
+ "loss": 0.011971819400787353,
1931
+ "step": 5260
1932
+ },
1933
+ {
1934
+ "epoch": 3.213698630136986,
1935
+ "grad_norm": 0.529901921749115,
1936
+ "learning_rate": 5.062449813639528e-07,
1937
+ "loss": 0.014422819018363953,
1938
+ "step": 5280
1939
+ },
1940
+ {
1941
+ "epoch": 3.225875190258752,
1942
+ "grad_norm": 0.19417761266231537,
1943
+ "learning_rate": 4.912354711557856e-07,
1944
+ "loss": 0.010663678497076034,
1945
+ "step": 5300
1946
+ },
1947
+ {
1948
+ "epoch": 3.2380517503805173,
1949
+ "grad_norm": 0.044735077768564224,
1950
+ "learning_rate": 4.764275528281892e-07,
1951
+ "loss": 0.011400717496871948,
1952
+ "step": 5320
1953
+ },
1954
+ {
1955
+ "epoch": 3.2502283105022833,
1956
+ "grad_norm": 0.057179443538188934,
1957
+ "learning_rate": 4.6182271244688355e-07,
1958
+ "loss": 0.008456526696681977,
1959
+ "step": 5340
1960
+ },
1961
+ {
1962
+ "epoch": 3.2624048706240485,
1963
+ "grad_norm": 0.10396906733512878,
1964
+ "learning_rate": 4.4742241569746407e-07,
1965
+ "loss": 0.014539115130901337,
1966
+ "step": 5360
1967
+ },
1968
+ {
1969
+ "epoch": 3.2745814307458145,
1970
+ "grad_norm": 0.32904428243637085,
1971
+ "learning_rate": 4.332281077383177e-07,
1972
+ "loss": 0.017625690996646882,
1973
+ "step": 5380
1974
+ },
1975
+ {
1976
+ "epoch": 3.2867579908675797,
1977
+ "grad_norm": 0.20823979377746582,
1978
+ "learning_rate": 4.1924121305558563e-07,
1979
+ "loss": 0.007641100138425827,
1980
+ "step": 5400
1981
+ },
1982
+ {
1983
+ "epoch": 3.2989345509893457,
1984
+ "grad_norm": 0.25470009446144104,
1985
+ "learning_rate": 4.054631353202121e-07,
1986
+ "loss": 0.011799700558185577,
1987
+ "step": 5420
1988
+ },
1989
+ {
1990
+ "epoch": 3.311111111111111,
1991
+ "grad_norm": 0.3968588709831238,
1992
+ "learning_rate": 3.9189525724707634e-07,
1993
+ "loss": 0.011455408483743667,
1994
+ "step": 5440
1995
+ },
1996
+ {
1997
+ "epoch": 3.323287671232877,
1998
+ "grad_norm": 0.10818332433700562,
1999
+ "learning_rate": 3.785389404562259e-07,
2000
+ "loss": 0.012499115616083144,
2001
+ "step": 5460
2002
+ },
2003
+ {
2004
+ "epoch": 3.335464231354642,
2005
+ "grad_norm": 0.1818460375070572,
2006
+ "learning_rate": 3.653955253362351e-07,
2007
+ "loss": 0.01148865669965744,
2008
+ "step": 5480
2009
+ },
2010
+ {
2011
+ "epoch": 3.347640791476408,
2012
+ "grad_norm": 0.3504088521003723,
2013
+ "learning_rate": 3.5246633090968205e-07,
2014
+ "loss": 0.012819178402423859,
2015
+ "step": 5500
2016
+ },
2017
+ {
2018
+ "epoch": 3.347640791476408,
2019
+ "eval_loss": 0.43404534459114075,
2020
+ "eval_runtime": 46.4882,
2021
+ "eval_samples_per_second": 18.93,
2022
+ "eval_steps_per_second": 18.93,
2023
+ "step": 5500
2024
+ },
2025
+ {
2026
+ "epoch": 3.3598173515981733,
2027
+ "grad_norm": 0.4551874101161957,
2028
+ "learning_rate": 3.397526547007832e-07,
2029
+ "loss": 0.013325585424900055,
2030
+ "step": 5520
2031
+ },
2032
+ {
2033
+ "epoch": 3.3719939117199393,
2034
+ "grad_norm": 0.35187825560569763,
2035
+ "learning_rate": 3.2725577260517396e-07,
2036
+ "loss": 0.011712662875652313,
2037
+ "step": 5540
2038
+ },
2039
+ {
2040
+ "epoch": 3.3841704718417045,
2041
+ "grad_norm": 0.6071529984474182,
2042
+ "learning_rate": 3.14976938761867e-07,
2043
+ "loss": 0.01580573171377182,
2044
+ "step": 5560
2045
+ },
2046
+ {
2047
+ "epoch": 3.3963470319634705,
2048
+ "grad_norm": 0.18844422698020935,
2049
+ "learning_rate": 3.029173854273909e-07,
2050
+ "loss": 0.012312603741884231,
2051
+ "step": 5580
2052
+ },
2053
+ {
2054
+ "epoch": 3.4085235920852357,
2055
+ "grad_norm": 0.13131535053253174,
2056
+ "learning_rate": 2.910783228521269e-07,
2057
+ "loss": 0.011797953397035599,
2058
+ "step": 5600
2059
+ },
2060
+ {
2061
+ "epoch": 3.4207001522070017,
2062
+ "grad_norm": 0.4402364492416382,
2063
+ "learning_rate": 2.794609391588504e-07,
2064
+ "loss": 0.012182456254959107,
2065
+ "step": 5620
2066
+ },
2067
+ {
2068
+ "epoch": 3.432876712328767,
2069
+ "grad_norm": 0.3497592508792877,
2070
+ "learning_rate": 2.6806640022349897e-07,
2071
+ "loss": 0.013599888980388641,
2072
+ "step": 5640
2073
+ },
2074
+ {
2075
+ "epoch": 3.445053272450533,
2076
+ "grad_norm": 0.2316354215145111,
2077
+ "learning_rate": 2.5689584955816497e-07,
2078
+ "loss": 0.009272868931293487,
2079
+ "step": 5660
2080
+ },
2081
+ {
2082
+ "epoch": 3.457229832572298,
2083
+ "grad_norm": 0.3858301341533661,
2084
+ "learning_rate": 2.459504081963421e-07,
2085
+ "loss": 0.008165979385375976,
2086
+ "step": 5680
2087
+ },
2088
+ {
2089
+ "epoch": 3.469406392694064,
2090
+ "grad_norm": 0.14734333753585815,
2091
+ "learning_rate": 2.3523117458041865e-07,
2092
+ "loss": 0.009182130545377731,
2093
+ "step": 5700
2094
+ },
2095
+ {
2096
+ "epoch": 3.4815829528158293,
2097
+ "grad_norm": 0.03280401974916458,
2098
+ "learning_rate": 2.2473922445144485e-07,
2099
+ "loss": 0.0107998326420784,
2100
+ "step": 5720
2101
+ },
2102
+ {
2103
+ "epoch": 3.4937595129375953,
2104
+ "grad_norm": 0.1505511999130249,
2105
+ "learning_rate": 2.144756107411733e-07,
2106
+ "loss": 0.014469687640666962,
2107
+ "step": 5740
2108
+ },
2109
+ {
2110
+ "epoch": 3.5059360730593605,
2111
+ "grad_norm": 0.2366904318332672,
2112
+ "learning_rate": 2.0444136346639333e-07,
2113
+ "loss": 0.0121701680123806,
2114
+ "step": 5760
2115
+ },
2116
+ {
2117
+ "epoch": 3.5181126331811265,
2118
+ "grad_norm": 0.1468425989151001,
2119
+ "learning_rate": 1.9463748962556096e-07,
2120
+ "loss": 0.014668506383895875,
2121
+ "step": 5780
2122
+ },
2123
+ {
2124
+ "epoch": 3.5302891933028917,
2125
+ "grad_norm": 0.14534050226211548,
2126
+ "learning_rate": 1.8506497309773885e-07,
2127
+ "loss": 0.010488402843475342,
2128
+ "step": 5800
2129
+ },
2130
+ {
2131
+ "epoch": 3.5424657534246577,
2132
+ "grad_norm": 0.15501493215560913,
2133
+ "learning_rate": 1.7572477454386257e-07,
2134
+ "loss": 0.010667071491479874,
2135
+ "step": 5820
2136
+ },
2137
+ {
2138
+ "epoch": 3.554642313546423,
2139
+ "grad_norm": 0.26535800099372864,
2140
+ "learning_rate": 1.6661783131032726e-07,
2141
+ "loss": 0.011079683899879456,
2142
+ "step": 5840
2143
+ },
2144
+ {
2145
+ "epoch": 3.566818873668189,
2146
+ "grad_norm": 0.24390950798988342,
2147
+ "learning_rate": 1.5774505733492263e-07,
2148
+ "loss": 0.009308797866106033,
2149
+ "step": 5860
2150
+ },
2151
+ {
2152
+ "epoch": 3.578995433789954,
2153
+ "grad_norm": 0.3409421443939209,
2154
+ "learning_rate": 1.49107343055111e-07,
2155
+ "loss": 0.012319787591695785,
2156
+ "step": 5880
2157
+ },
2158
+ {
2159
+ "epoch": 3.59117199391172,
2160
+ "grad_norm": 0.4800300896167755,
2161
+ "learning_rate": 1.407055553186701e-07,
2162
+ "loss": 0.00843576118350029,
2163
+ "step": 5900
2164
+ },
2165
+ {
2166
+ "epoch": 3.6033485540334853,
2167
+ "grad_norm": 0.11663182079792023,
2168
+ "learning_rate": 1.3254053729669564e-07,
2169
+ "loss": 0.00938587412238121,
2170
+ "step": 5920
2171
+ },
2172
+ {
2173
+ "epoch": 3.6155251141552514,
2174
+ "grad_norm": 0.29512378573417664,
2175
+ "learning_rate": 1.2461310839898656e-07,
2176
+ "loss": 0.011934128403663636,
2177
+ "step": 5940
2178
+ },
2179
+ {
2180
+ "epoch": 3.6277016742770165,
2181
+ "grad_norm": 0.2641650140285492,
2182
+ "learning_rate": 1.169240641918104e-07,
2183
+ "loss": 0.013170333206653595,
2184
+ "step": 5960
2185
+ },
2186
+ {
2187
+ "epoch": 3.6398782343987826,
2188
+ "grad_norm": 0.47704726457595825,
2189
+ "learning_rate": 1.0947417631806539e-07,
2190
+ "loss": 0.014534834027290344,
2191
+ "step": 5980
2192
+ },
2193
+ {
2194
+ "epoch": 3.6520547945205477,
2195
+ "grad_norm": 0.10114685446023941,
2196
+ "learning_rate": 1.0226419241983865e-07,
2197
+ "loss": 0.011021688580513,
2198
+ "step": 6000
2199
+ },
2200
+ {
2201
+ "epoch": 3.6520547945205477,
2202
+ "eval_loss": 0.44063475728034973,
2203
+ "eval_runtime": 46.1734,
2204
+ "eval_samples_per_second": 19.059,
2205
+ "eval_steps_per_second": 19.059,
2206
+ "step": 6000
2207
+ },
2208
+ {
2209
+ "epoch": 3.6642313546423138,
2210
+ "grad_norm": 0.2619183659553528,
2211
+ "learning_rate": 9.529483606337902e-08,
2212
+ "loss": 0.010764393210411071,
2213
+ "step": 6020
2214
+ },
2215
+ {
2216
+ "epoch": 3.676407914764079,
2217
+ "grad_norm": 0.05733129009604454,
2218
+ "learning_rate": 8.856680666647882e-08,
2219
+ "loss": 0.012128306180238723,
2220
+ "step": 6040
2221
+ },
2222
+ {
2223
+ "epoch": 3.688584474885845,
2224
+ "grad_norm": 0.19483673572540283,
2225
+ "learning_rate": 8.208077942828713e-08,
2226
+ "loss": 0.011729901283979416,
2227
+ "step": 6060
2228
+ },
2229
+ {
2230
+ "epoch": 3.70076103500761,
2231
+ "grad_norm": 0.2111903578042984,
2232
+ "learning_rate": 7.58374052615457e-08,
2233
+ "loss": 0.009119105339050294,
2234
+ "step": 6080
2235
+ },
2236
+ {
2237
+ "epoch": 3.712937595129376,
2238
+ "grad_norm": 0.04995311424136162,
2239
+ "learning_rate": 6.983731072726818e-08,
2240
+ "loss": 0.017101363837718965,
2241
+ "step": 6100
2242
+ },
2243
+ {
2244
+ "epoch": 3.7251141552511413,
2245
+ "grad_norm": 0.5839787125587463,
2246
+ "learning_rate": 6.408109797186118e-08,
2247
+ "loss": 0.012368235737085342,
2248
+ "step": 6120
2249
+ },
2250
+ {
2251
+ "epoch": 3.7372907153729074,
2252
+ "grad_norm": 0.4685717523097992,
2253
+ "learning_rate": 5.856934466669212e-08,
2254
+ "loss": 0.008782628178596496,
2255
+ "step": 6140
2256
+ },
2257
+ {
2258
+ "epoch": 3.7494672754946725,
2259
+ "grad_norm": 0.17204681038856506,
2260
+ "learning_rate": 5.3302603950119994e-08,
2261
+ "loss": 0.008994438499212266,
2262
+ "step": 6160
2263
+ },
2264
+ {
2265
+ "epoch": 3.7616438356164386,
2266
+ "grad_norm": 0.07392167299985886,
2267
+ "learning_rate": 4.8281404371981755e-08,
2268
+ "loss": 0.011286454647779465,
2269
+ "step": 6180
2270
+ },
2271
+ {
2272
+ "epoch": 3.7738203957382037,
2273
+ "grad_norm": 0.3728208541870117,
2274
+ "learning_rate": 4.350624984055196e-08,
2275
+ "loss": 0.011785905063152313,
2276
+ "step": 6200
2277
+ },
2278
+ {
2279
+ "epoch": 3.7859969558599698,
2280
+ "grad_norm": 0.25468680262565613,
2281
+ "learning_rate": 3.897761957196877e-08,
2282
+ "loss": 0.013624191284179688,
2283
+ "step": 6220
2284
+ },
2285
+ {
2286
+ "epoch": 3.798173515981735,
2287
+ "grad_norm": 0.09725204110145569,
2288
+ "learning_rate": 3.469596804214548e-08,
2289
+ "loss": 0.011700452119112015,
2290
+ "step": 6240
2291
+ },
2292
+ {
2293
+ "epoch": 3.810350076103501,
2294
+ "grad_norm": 0.07126162946224213,
2295
+ "learning_rate": 3.06617249411581e-08,
2296
+ "loss": 0.011029987037181855,
2297
+ "step": 6260
2298
+ },
2299
+ {
2300
+ "epoch": 3.822526636225266,
2301
+ "grad_norm": 0.08542267978191376,
2302
+ "learning_rate": 2.687529513012488e-08,
2303
+ "loss": 0.010965974628925323,
2304
+ "step": 6280
2305
+ },
2306
+ {
2307
+ "epoch": 3.834703196347032,
2308
+ "grad_norm": 0.2627331018447876,
2309
+ "learning_rate": 2.3337058600575722e-08,
2310
+ "loss": 0.012378603965044022,
2311
+ "step": 6300
2312
+ },
2313
+ {
2314
+ "epoch": 3.8468797564687973,
2315
+ "grad_norm": 0.19707690179347992,
2316
+ "learning_rate": 2.0047370436317437e-08,
2317
+ "loss": 0.011792077124118805,
2318
+ "step": 6320
2319
+ },
2320
+ {
2321
+ "epoch": 3.8590563165905634,
2322
+ "grad_norm": 0.47547003626823425,
2323
+ "learning_rate": 1.7006560777798608e-08,
2324
+ "loss": 0.01145942509174347,
2325
+ "step": 6340
2326
+ },
2327
+ {
2328
+ "epoch": 3.8712328767123285,
2329
+ "grad_norm": 0.3591565489768982,
2330
+ "learning_rate": 1.421493478897945e-08,
2331
+ "loss": 0.011088228970766067,
2332
+ "step": 6360
2333
+ },
2334
+ {
2335
+ "epoch": 3.8834094368340946,
2336
+ "grad_norm": 0.20619548857212067,
2337
+ "learning_rate": 1.1672772626704909e-08,
2338
+ "loss": 0.010828402638435364,
2339
+ "step": 6380
2340
+ },
2341
+ {
2342
+ "epoch": 3.8955859969558597,
2343
+ "grad_norm": 0.2822403311729431,
2344
+ "learning_rate": 9.38032941258965e-09,
2345
+ "loss": 0.01165580153465271,
2346
+ "step": 6400
2347
+ },
2348
+ {
2349
+ "epoch": 3.9077625570776258,
2350
+ "grad_norm": 0.15682674944400787,
2351
+ "learning_rate": 7.3378352074163215e-09,
2352
+ "loss": 0.010783226788043975,
2353
+ "step": 6420
2354
+ },
2355
+ {
2356
+ "epoch": 3.919939117199391,
2357
+ "grad_norm": 0.34253379702568054,
2358
+ "learning_rate": 5.545494988045963e-09,
2359
+ "loss": 0.011295531690120698,
2360
+ "step": 6440
2361
+ },
2362
+ {
2363
+ "epoch": 3.932115677321157,
2364
+ "grad_norm": 0.27684664726257324,
2365
+ "learning_rate": 4.003488626848073e-09,
2366
+ "loss": 0.013613662123680115,
2367
+ "step": 6460
2368
+ },
2369
+ {
2370
+ "epoch": 3.944292237442922,
2371
+ "grad_norm": 0.69688880443573,
2372
+ "learning_rate": 2.7119708736486615e-09,
2373
+ "loss": 0.011696261167526246,
2374
+ "step": 6480
2375
+ },
2376
+ {
2377
+ "epoch": 3.956468797564688,
2378
+ "grad_norm": 0.2617769241333008,
2379
+ "learning_rate": 1.6710713402015577e-09,
2380
+ "loss": 0.010873865336179733,
2381
+ "step": 6500
2382
+ },
2383
+ {
2384
+ "epoch": 3.956468797564688,
2385
+ "eval_loss": 0.441041499376297,
2386
+ "eval_runtime": 46.1946,
2387
+ "eval_samples_per_second": 19.05,
2388
+ "eval_steps_per_second": 19.05,
2389
+ "step": 6500
2390
+ },
2391
+ {
2392
+ "epoch": 3.9686453576864533,
2393
+ "grad_norm": 0.23670868575572968,
2394
+ "learning_rate": 8.80894487179651e-10,
2395
+ "loss": 0.012961818277835846,
2396
+ "step": 6520
2397
+ },
2398
+ {
2399
+ "epoch": 3.9808219178082194,
2400
+ "grad_norm": 1.036125659942627,
2401
+ "learning_rate": 3.4151961369188745e-10,
2402
+ "loss": 0.01224210560321808,
2403
+ "step": 6540
2404
+ },
2405
+ {
2406
+ "epoch": 3.9929984779299845,
2407
+ "grad_norm": 0.1431870311498642,
2408
+ "learning_rate": 5.300084932574612e-11,
2409
+ "loss": 0.010245455056428909,
2410
+ "step": 6560
2411
+ },
2412
+ {
2413
+ "epoch": 4.0,
2414
+ "step": 6572,
2415
+ "total_flos": 3.545203061907456e+17,
2416
+ "train_loss": 0.08566030774240586,
2417
+ "train_runtime": 13933.4985,
2418
+ "train_samples_per_second": 3.772,
2419
+ "train_steps_per_second": 0.472
2420
+ }
2421
+ ],
2422
+ "logging_steps": 20,
2423
+ "max_steps": 6572,
2424
+ "num_input_tokens_seen": 0,
2425
+ "num_train_epochs": 4,
2426
+ "save_steps": 500,
2427
+ "stateful_callbacks": {
2428
+ "TrainerControl": {
2429
+ "args": {
2430
+ "should_epoch_stop": false,
2431
+ "should_evaluate": false,
2432
+ "should_log": false,
2433
+ "should_save": true,
2434
+ "should_training_stop": true
2435
+ },
2436
+ "attributes": {}
2437
+ }
2438
+ },
2439
+ "total_flos": 3.545203061907456e+17,
2440
+ "train_batch_size": 1,
2441
+ "trial_name": null,
2442
+ "trial_params": null
2443
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1c8f0bbe96b1bd3b42d30bb419fc0c08037dc014a4a8cea153b553983982dd1
3
+ size 5585
training_eval_loss.png ADDED
training_loss.png ADDED