furproxy committed on
Commit
b2f24e1
·
verified ·
1 Parent(s): 62f3803

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
.ipynb_checkpoints/README-checkpoint.md ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: other
4
+ base_model: Qwen3.5-9B
5
+ tags:
6
+ - llama-factory
7
+ - full
8
+ - generated_from_trainer
9
+ model-index:
10
+ - name: qwen35_caption_galore
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # qwen35_caption_galore
18
+
19
+ This model is a fine-tuned version of Qwen3.5-9B (loaded from the local checkpoint path `/workspace/models/Qwen3.5-9B`) on the my_caption dataset.
20
+
21
+ ## Model description
22
+
23
+ More information needed
24
+
25
+ ## Intended uses & limitations
26
+
27
+ More information needed
28
+
29
+ ## Training and evaluation data
30
+
31
+ More information needed
32
+
33
+ ## Training procedure
34
+
35
+ ### Training hyperparameters
36
+
37
+ The following hyperparameters were used during training:
38
+ - learning_rate (resolved per parameter family and per optimizer by this code): family_to_muon_lr = {
39
+ "language": _fallback(getattr(training_args, "language_muon_lr", 3e-5), language_lr),
40
+ "vision": _fallback(getattr(training_args, "vision_muon_lr", 3e-5), vision_lr),
41
+ "merger": _fallback(getattr(training_args, "merger_muon_lr", 3e-5), merger_lr),
42
+ }
43
+
44
+ family_to_adamw_lr = {
45
+ "language": _fallback(getattr(training_args, "language_adamw_lr", 3e-5), language_lr),
46
+ "vision": _fallback(getattr(training_args, "vision_adamw_lr", 3e-6), vision_lr),
47
+ "merger": _fallback(getattr(training_args, "merger_adamw_lr", 3e-5), merger_lr),
48
+ }
49
+ - train_batch_size: 1
50
+ - eval_batch_size: 8
51
+ - seed: 42
52
+ - distributed_type: multi-GPU
53
+ - gradient_accumulation_steps: 64
54
+ - total_train_batch_size: 64
55
+ - optimizer: Use OptimizerNames.ADAMW_TORCH_FUSED with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
56
+ - lr_scheduler_type: cosine_with_min_lr
57
+ - lr_scheduler_warmup_steps: 0.05
58
+ - num_epochs: 3
59
+
60
+ ### Training results
61
+
62
+
63
+
64
+ ### Framework versions
65
+
66
+ - Transformers 5.5.3
67
+ - Pytorch 2.11.0+cu130
68
+ - Datasets 4.0.0
69
+ - Tokenizers 0.22.2
README.md ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: other
4
+ base_model: Qwen3.5-9B
5
+ tags:
6
+ - llama-factory
7
+ - full
8
+ - generated_from_trainer
9
+ model-index:
10
+ - name: qwen35_caption_galore
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # qwen35_caption_galore
18
+
19
+ This model is a fine-tuned version of Qwen3.5-9B (loaded from the local checkpoint path `/workspace/models/Qwen3.5-9B`) on the my_caption dataset.
20
+
21
+ ## Model description
22
+
23
+ More information needed
24
+
25
+ ## Intended uses & limitations
26
+
27
+ More information needed
28
+
29
+ ## Training and evaluation data
30
+
31
+ More information needed
32
+
33
+ ## Training procedure
34
+
35
+ ### Training hyperparameters
36
+
37
+ The following hyperparameters were used during training:
38
+ - learning_rate (resolved per parameter family and per optimizer by this code): family_to_muon_lr = {
39
+ "language": _fallback(getattr(training_args, "language_muon_lr", 3e-5), language_lr),
40
+ "vision": _fallback(getattr(training_args, "vision_muon_lr", 3e-5), vision_lr),
41
+ "merger": _fallback(getattr(training_args, "merger_muon_lr", 3e-5), merger_lr),
42
+ }
43
+
44
+ family_to_adamw_lr = {
45
+ "language": _fallback(getattr(training_args, "language_adamw_lr", 3e-5), language_lr),
46
+ "vision": _fallback(getattr(training_args, "vision_adamw_lr", 3e-6), vision_lr),
47
+ "merger": _fallback(getattr(training_args, "merger_adamw_lr", 3e-5), merger_lr),
48
+ }
49
+ - train_batch_size: 1
50
+ - eval_batch_size: 8
51
+ - seed: 42
52
+ - distributed_type: multi-GPU
53
+ - gradient_accumulation_steps: 64
54
+ - total_train_batch_size: 64
55
+ - optimizer: Use OptimizerNames.ADAMW_TORCH_FUSED with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
56
+ - lr_scheduler_type: cosine_with_min_lr
57
+ - lr_scheduler_warmup_steps: 0.05
58
+ - num_epochs: 3
59
+
60
+ ### Training results
61
+
62
+
63
+
64
+ ### Framework versions
65
+
66
+ - Transformers 5.5.3
67
+ - Pytorch 2.11.0+cu130
68
+ - Datasets 4.0.0
69
+ - Tokenizers 0.22.2
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "effective_tokens_per_sec": 4156.816967960705,
3
+ "epoch": 3.0,
4
+ "total_flos": 3.2487544184132076e+18,
5
+ "train_loss": 0.8073726282178277,
6
+ "train_runtime": 15483.8831,
7
+ "train_samples_per_second": 3.199,
8
+ "train_steps_per_second": 0.05
9
+ }
chat_template.jinja ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- set image_count = namespace(value=0) %}
2
+ {%- set video_count = namespace(value=0) %}
3
+ {%- macro render_content(content, do_vision_count, is_system_content=false) %}
4
+ {%- if content is string %}
5
+ {{- content }}
6
+ {%- elif content is iterable and content is not mapping %}
7
+ {%- for item in content %}
8
+ {%- if 'image' in item or 'image_url' in item or item.type == 'image' %}
9
+ {%- if is_system_content %}
10
+ {{- raise_exception('System message cannot contain images.') }}
11
+ {%- endif %}
12
+ {%- if do_vision_count %}
13
+ {%- set image_count.value = image_count.value + 1 %}
14
+ {%- endif %}
15
+ {%- if add_vision_id %}
16
+ {{- 'Picture ' ~ image_count.value ~ ': ' }}
17
+ {%- endif %}
18
+ {{- '<|vision_start|><|image_pad|><|vision_end|>' }}
19
+ {%- elif 'video' in item or item.type == 'video' %}
20
+ {%- if is_system_content %}
21
+ {{- raise_exception('System message cannot contain videos.') }}
22
+ {%- endif %}
23
+ {%- if do_vision_count %}
24
+ {%- set video_count.value = video_count.value + 1 %}
25
+ {%- endif %}
26
+ {%- if add_vision_id %}
27
+ {{- 'Video ' ~ video_count.value ~ ': ' }}
28
+ {%- endif %}
29
+ {{- '<|vision_start|><|video_pad|><|vision_end|>' }}
30
+ {%- elif 'text' in item %}
31
+ {{- item.text }}
32
+ {%- else %}
33
+ {{- raise_exception('Unexpected item type in content.') }}
34
+ {%- endif %}
35
+ {%- endfor %}
36
+ {%- elif content is none or content is undefined %}
37
+ {{- '' }}
38
+ {%- else %}
39
+ {{- raise_exception('Unexpected content type.') }}
40
+ {%- endif %}
41
+ {%- endmacro %}
42
+ {%- if not messages %}
43
+ {{- raise_exception('No messages provided.') }}
44
+ {%- endif %}
45
+ {%- if tools and tools is iterable and tools is not mapping %}
46
+ {{- '<|im_start|>system\n' }}
47
+ {{- "# Tools\n\nYou have access to the following functions:\n\n<tools>" }}
48
+ {%- for tool in tools %}
49
+ {{- "\n" }}
50
+ {{- tool | tojson }}
51
+ {%- endfor %}
52
+ {{- "\n</tools>" }}
53
+ {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n</IMPORTANT>' }}
54
+ {%- if messages[0].role == 'system' %}
55
+ {%- set content = render_content(messages[0].content, false, true)|trim %}
56
+ {%- if content %}
57
+ {{- '\n\n' + content }}
58
+ {%- endif %}
59
+ {%- endif %}
60
+ {{- '<|im_end|>\n' }}
61
+ {%- else %}
62
+ {%- if messages[0].role == 'system' %}
63
+ {%- set content = render_content(messages[0].content, false, true)|trim %}
64
+ {{- '<|im_start|>system\n' + content + '<|im_end|>\n' }}
65
+ {%- endif %}
66
+ {%- endif %}
67
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
68
+ {%- for message in messages[::-1] %}
69
+ {%- set index = (messages|length - 1) - loop.index0 %}
70
+ {%- if ns.multi_step_tool and message.role == "user" %}
71
+ {%- set content = render_content(message.content, false)|trim %}
72
+ {%- if not(content.startswith('<tool_response>') and content.endswith('</tool_response>')) %}
73
+ {%- set ns.multi_step_tool = false %}
74
+ {%- set ns.last_query_index = index %}
75
+ {%- endif %}
76
+ {%- endif %}
77
+ {%- endfor %}
78
+ {%- if ns.multi_step_tool %}
79
+ {{- raise_exception('No user query found in messages.') }}
80
+ {%- endif %}
81
+ {%- for message in messages %}
82
+ {%- set content = render_content(message.content, true)|trim %}
83
+ {%- if message.role == "system" %}
84
+ {%- if not loop.first %}
85
+ {{- raise_exception('System message must be at the beginning.') }}
86
+ {%- endif %}
87
+ {%- elif message.role == "user" %}
88
+ {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
89
+ {%- elif message.role == "assistant" %}
90
+ {%- set reasoning_content = '' %}
91
+ {%- if message.reasoning_content is string %}
92
+ {%- set reasoning_content = message.reasoning_content %}
93
+ {%- else %}
94
+ {%- if '</think>' in content %}
95
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
96
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
97
+ {%- endif %}
98
+ {%- endif %}
99
+ {%- set reasoning_content = reasoning_content|trim %}
100
+ {%- if loop.index0 > ns.last_query_index %}
101
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content + '\n</think>\n\n' + content }}
102
+ {%- else %}
103
+ {{- '<|im_start|>' + message.role + '\n' + content }}
104
+ {%- endif %}
105
+ {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %}
106
+ {%- for tool_call in message.tool_calls %}
107
+ {%- if tool_call.function is defined %}
108
+ {%- set tool_call = tool_call.function %}
109
+ {%- endif %}
110
+ {%- if loop.first %}
111
+ {%- if content|trim %}
112
+ {{- '\n\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
113
+ {%- else %}
114
+ {{- '<tool_call>\n<function=' + tool_call.name + '>\n' }}
115
+ {%- endif %}
116
+ {%- else %}
117
+ {{- '\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
118
+ {%- endif %}
119
+ {%- if tool_call.arguments is defined %}
120
+ {%- for args_name, args_value in tool_call.arguments|items %}
121
+ {{- '<parameter=' + args_name + '>\n' }}
122
+ {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}
123
+ {{- args_value }}
124
+ {{- '\n</parameter>\n' }}
125
+ {%- endfor %}
126
+ {%- endif %}
127
+ {{- '</function>\n</tool_call>' }}
128
+ {%- endfor %}
129
+ {%- endif %}
130
+ {{- '<|im_end|>\n' }}
131
+ {%- elif message.role == "tool" %}
132
+ {%- if loop.previtem and loop.previtem.role != "tool" %}
133
+ {{- '<|im_start|>user' }}
134
+ {%- endif %}
135
+ {{- '\n<tool_response>\n' }}
136
+ {{- content }}
137
+ {{- '\n</tool_response>' }}
138
+ {%- if not loop.last and loop.nextitem.role != "tool" %}
139
+ {{- '<|im_end|>\n' }}
140
+ {%- elif loop.last %}
141
+ {{- '<|im_end|>\n' }}
142
+ {%- endif %}
143
+ {%- else %}
144
+ {{- raise_exception('Unexpected message role.') }}
145
+ {%- endif %}
146
+ {%- endfor %}
147
+ {%- if add_generation_prompt %}
148
+ {{- '<|im_start|>assistant\n' }}
149
+ {%- if enable_thinking is defined and enable_thinking is false %}
150
+ {{- '<think>\n\n</think>\n\n' }}
151
+ {%- else %}
152
+ {{- '<think>\n' }}
153
+ {%- endif %}
154
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3_5ForConditionalGeneration"
4
+ ],
5
+ "dtype": "float32",
6
+ "eos_token_id": 248046,
7
+ "hidden_size": 4096,
8
+ "image_token_id": 248056,
9
+ "model_type": "qwen3_5",
10
+ "pad_token_id": 248044,
11
+ "text_config": {
12
+ "attention_bias": false,
13
+ "attention_dropout": 0.0,
14
+ "attn_output_gate": true,
15
+ "bos_token_id": null,
16
+ "dtype": "bfloat16",
17
+ "eos_token_id": 248044,
18
+ "full_attention_interval": 4,
19
+ "head_dim": 256,
20
+ "hidden_act": "silu",
21
+ "hidden_size": 4096,
22
+ "initializer_range": 0.02,
23
+ "intermediate_size": 12288,
24
+ "layer_types": [
25
+ "linear_attention",
26
+ "linear_attention",
27
+ "linear_attention",
28
+ "full_attention",
29
+ "linear_attention",
30
+ "linear_attention",
31
+ "linear_attention",
32
+ "full_attention",
33
+ "linear_attention",
34
+ "linear_attention",
35
+ "linear_attention",
36
+ "full_attention",
37
+ "linear_attention",
38
+ "linear_attention",
39
+ "linear_attention",
40
+ "full_attention",
41
+ "linear_attention",
42
+ "linear_attention",
43
+ "linear_attention",
44
+ "full_attention",
45
+ "linear_attention",
46
+ "linear_attention",
47
+ "linear_attention",
48
+ "full_attention",
49
+ "linear_attention",
50
+ "linear_attention",
51
+ "linear_attention",
52
+ "full_attention",
53
+ "linear_attention",
54
+ "linear_attention",
55
+ "linear_attention",
56
+ "full_attention"
57
+ ],
58
+ "linear_conv_kernel_dim": 4,
59
+ "linear_key_head_dim": 128,
60
+ "linear_num_key_heads": 16,
61
+ "linear_num_value_heads": 32,
62
+ "linear_value_head_dim": 128,
63
+ "mamba_ssm_dtype": "float32",
64
+ "max_position_embeddings": 262144,
65
+ "mlp_only_layers": [],
66
+ "model_type": "qwen3_5_text",
67
+ "mtp_num_hidden_layers": 1,
68
+ "mtp_use_dedicated_embeddings": false,
69
+ "num_attention_heads": 16,
70
+ "num_hidden_layers": 32,
71
+ "num_key_value_heads": 4,
72
+ "pad_token_id": null,
73
+ "partial_rotary_factor": 0.25,
74
+ "rms_norm_eps": 1e-06,
75
+ "rope_parameters": {
76
+ "mrope_interleaved": true,
77
+ "mrope_section": [
78
+ 11,
79
+ 11,
80
+ 10
81
+ ],
82
+ "partial_rotary_factor": 0.25,
83
+ "rope_theta": 10000000,
84
+ "rope_type": "default"
85
+ },
86
+ "tie_word_embeddings": false,
87
+ "use_cache": false,
88
+ "vocab_size": 248320
89
+ },
90
+ "tie_word_embeddings": false,
91
+ "transformers_version": "5.5.3",
92
+ "use_cache": false,
93
+ "video_token_id": 248057,
94
+ "vision_config": {
95
+ "deepstack_visual_indexes": [],
96
+ "depth": 27,
97
+ "dtype": "bfloat16",
98
+ "hidden_act": "gelu_pytorch_tanh",
99
+ "hidden_size": 1152,
100
+ "in_channels": 3,
101
+ "initializer_range": 0.02,
102
+ "intermediate_size": 4304,
103
+ "model_type": "qwen3_5",
104
+ "num_heads": 16,
105
+ "num_position_embeddings": 2304,
106
+ "out_hidden_size": 4096,
107
+ "patch_size": 16,
108
+ "spatial_merge_size": 2,
109
+ "temporal_patch_size": 2
110
+ },
111
+ "vision_end_token_id": 248054,
112
+ "vision_start_token_id": 248053
113
+ }
generation_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "eos_token_id": [
4
+ 248046,
5
+ 248044
6
+ ],
7
+ "pad_token_id": 248044,
8
+ "transformers_version": "5.5.3",
9
+ "use_cache": true
10
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e333ff5036ee636f4db4ec6caa1c67562af37eaf4bcb9909fc5fc19a2680b51
3
+ size 37639354416
processor_config.json ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "image_processor": {
3
+ "do_convert_rgb": true,
4
+ "do_normalize": true,
5
+ "do_rescale": true,
6
+ "do_resize": true,
7
+ "image_mean": [
8
+ 0.5,
9
+ 0.5,
10
+ 0.5
11
+ ],
12
+ "image_processor_type": "Qwen2VLImageProcessor",
13
+ "image_std": [
14
+ 0.5,
15
+ 0.5,
16
+ 0.5
17
+ ],
18
+ "merge_size": 2,
19
+ "patch_size": 16,
20
+ "resample": 3,
21
+ "rescale_factor": 0.00392156862745098,
22
+ "size": {
23
+ "longest_edge": 16777216,
24
+ "shortest_edge": 65536
25
+ },
26
+ "temporal_patch_size": 2
27
+ },
28
+ "processor_class": "Qwen3VLProcessor",
29
+ "video_processor": {
30
+ "do_convert_rgb": true,
31
+ "do_normalize": true,
32
+ "do_rescale": true,
33
+ "do_resize": true,
34
+ "do_sample_frames": true,
35
+ "fps": 2,
36
+ "image_mean": [
37
+ 0.5,
38
+ 0.5,
39
+ 0.5
40
+ ],
41
+ "image_std": [
42
+ 0.5,
43
+ 0.5,
44
+ 0.5
45
+ ],
46
+ "max_frames": 768,
47
+ "merge_size": 2,
48
+ "min_frames": 4,
49
+ "patch_size": 16,
50
+ "resample": 3,
51
+ "rescale_factor": 0.00392156862745098,
52
+ "return_metadata": false,
53
+ "size": {
54
+ "longest_edge": 25165824,
55
+ "shortest_edge": 4096
56
+ },
57
+ "temporal_patch_size": 2,
58
+ "video_processor_type": "Qwen3VLVideoProcessor"
59
+ }
60
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87a7830d63fcf43bf241c3c5242e96e62dd3fdc29224ca26fed8ea333db72de4
3
+ size 19989343
tokenizer_config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "audio_bos_token": "<|audio_start|>",
4
+ "audio_eos_token": "<|audio_end|>",
5
+ "audio_token": "<|audio_pad|>",
6
+ "backend": "tokenizers",
7
+ "bos_token": null,
8
+ "clean_up_tokenization_spaces": false,
9
+ "eos_token": "<|im_end|>",
10
+ "errors": "replace",
11
+ "image_token": "<|image_pad|>",
12
+ "is_local": true,
13
+ "model_max_length": 262144,
14
+ "model_specific_special_tokens": {
15
+ "audio_bos_token": "<|audio_start|>",
16
+ "audio_eos_token": "<|audio_end|>",
17
+ "audio_token": "<|audio_pad|>",
18
+ "image_token": "<|image_pad|>",
19
+ "video_token": "<|video_pad|>",
20
+ "vision_bos_token": "<|vision_start|>",
21
+ "vision_eos_token": "<|vision_end|>"
22
+ },
23
+ "pad_token": "<|endoftext|>",
24
+ "padding_side": "right",
25
+ "pretokenize_regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?[\\p{L}\\p{M}]+|\\p{N}| ?[^\\s\\p{L}\\p{M}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
26
+ "processor_class": "Qwen3VLProcessor",
27
+ "split_special_tokens": false,
28
+ "tokenizer_class": "TokenizersBackend",
29
+ "unk_token": null,
30
+ "video_token": "<|video_pad|>",
31
+ "vision_bos_token": "<|vision_start|>",
32
+ "vision_eos_token": "<|vision_end|>"
33
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "effective_tokens_per_sec": 4156.816967960705,
3
+ "epoch": 3.0,
4
+ "total_flos": 3.2487544184132076e+18,
5
+ "train_loss": 0.8073726282178277,
6
+ "train_runtime": 15483.8831,
7
+ "train_samples_per_second": 3.199,
8
+ "train_steps_per_second": 0.05
9
+ }
trainer_log.jsonl ADDED
@@ -0,0 +1,388 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"current_steps": 2, "total_steps": 774, "loss": 4.068140983581543, "lr": 7.692307692307693e-07, "epoch": 0.007751937984496124, "percentage": 0.26, "elapsed_time": "0:00:39", "remaining_time": "4:12:29"}
2
+ {"current_steps": 4, "total_steps": 774, "loss": 2.0239908695220947, "lr": 2.307692307692308e-06, "epoch": 0.015503875968992248, "percentage": 0.52, "elapsed_time": "0:01:22", "remaining_time": "4:25:32"}
3
+ {"current_steps": 6, "total_steps": 774, "loss": 1.9337211847305298, "lr": 3.846153846153846e-06, "epoch": 0.023255813953488372, "percentage": 0.78, "elapsed_time": "0:02:04", "remaining_time": "4:26:39"}
4
+ {"current_steps": 8, "total_steps": 774, "loss": 1.9213242530822754, "lr": 5.384615384615385e-06, "epoch": 0.031007751937984496, "percentage": 1.03, "elapsed_time": "0:02:44", "remaining_time": "4:22:05"}
5
+ {"current_steps": 10, "total_steps": 774, "loss": 2.1911349296569824, "lr": 6.923076923076923e-06, "epoch": 0.03875968992248062, "percentage": 1.29, "elapsed_time": "0:03:24", "remaining_time": "4:20:59"}
6
+ {"current_steps": 12, "total_steps": 774, "loss": 3.268449306488037, "lr": 8.461538461538462e-06, "epoch": 0.046511627906976744, "percentage": 1.55, "elapsed_time": "0:04:01", "remaining_time": "4:15:56"}
7
+ {"current_steps": 14, "total_steps": 774, "loss": 1.7003194093704224, "lr": 9.999999999999999e-06, "epoch": 0.05426356589147287, "percentage": 1.81, "elapsed_time": "0:04:47", "remaining_time": "4:19:48"}
8
+ {"current_steps": 16, "total_steps": 774, "loss": 1.8064090013504028, "lr": 1.153846153846154e-05, "epoch": 0.06201550387596899, "percentage": 2.07, "elapsed_time": "0:05:23", "remaining_time": "4:15:24"}
9
+ {"current_steps": 18, "total_steps": 774, "loss": 1.3964051008224487, "lr": 1.3076923076923078e-05, "epoch": 0.06976744186046512, "percentage": 2.33, "elapsed_time": "0:06:00", "remaining_time": "4:12:40"}
10
+ {"current_steps": 20, "total_steps": 774, "loss": 1.59793221950531, "lr": 1.4615384615384615e-05, "epoch": 0.07751937984496124, "percentage": 2.58, "elapsed_time": "0:06:39", "remaining_time": "4:10:53"}
11
+ {"current_steps": 22, "total_steps": 774, "loss": 1.7358227968215942, "lr": 1.6153846153846154e-05, "epoch": 0.08527131782945736, "percentage": 2.84, "elapsed_time": "0:07:17", "remaining_time": "4:09:14"}
12
+ {"current_steps": 24, "total_steps": 774, "loss": 1.3447600603103638, "lr": 1.7692307692307694e-05, "epoch": 0.09302325581395349, "percentage": 3.1, "elapsed_time": "0:07:59", "remaining_time": "4:09:35"}
13
+ {"current_steps": 26, "total_steps": 774, "loss": 1.4682307243347168, "lr": 1.923076923076923e-05, "epoch": 0.10077519379844961, "percentage": 3.36, "elapsed_time": "0:08:44", "remaining_time": "4:11:29"}
14
+ {"current_steps": 28, "total_steps": 774, "loss": 1.2035093307495117, "lr": 2.076923076923077e-05, "epoch": 0.10852713178294573, "percentage": 3.62, "elapsed_time": "0:09:23", "remaining_time": "4:10:20"}
15
+ {"current_steps": 30, "total_steps": 774, "loss": 1.1427452564239502, "lr": 2.230769230769231e-05, "epoch": 0.11627906976744186, "percentage": 3.88, "elapsed_time": "0:10:04", "remaining_time": "4:09:52"}
16
+ {"current_steps": 32, "total_steps": 774, "loss": 1.3711202144622803, "lr": 2.3846153846153846e-05, "epoch": 0.12403100775193798, "percentage": 4.13, "elapsed_time": "0:10:51", "remaining_time": "4:11:36"}
17
+ {"current_steps": 34, "total_steps": 774, "loss": 1.2189266681671143, "lr": 2.5384615384615386e-05, "epoch": 0.13178294573643412, "percentage": 4.39, "elapsed_time": "0:11:31", "remaining_time": "4:10:51"}
18
+ {"current_steps": 36, "total_steps": 774, "loss": 1.3252586126327515, "lr": 2.6923076923076923e-05, "epoch": 0.13953488372093023, "percentage": 4.65, "elapsed_time": "0:12:11", "remaining_time": "4:10:05"}
19
+ {"current_steps": 38, "total_steps": 774, "loss": 1.0033904314041138, "lr": 2.846153846153846e-05, "epoch": 0.14728682170542637, "percentage": 4.91, "elapsed_time": "0:12:44", "remaining_time": "4:06:46"}
20
+ {"current_steps": 40, "total_steps": 774, "loss": 1.4535468816757202, "lr": 3e-05, "epoch": 0.15503875968992248, "percentage": 5.17, "elapsed_time": "0:13:27", "remaining_time": "4:06:55"}
21
+ {"current_steps": 42, "total_steps": 774, "loss": 1.0325186252593994, "lr": 2.999580739494117e-05, "epoch": 0.16279069767441862, "percentage": 5.43, "elapsed_time": "0:14:06", "remaining_time": "4:05:46"}
22
+ {"current_steps": 44, "total_steps": 774, "loss": 1.2467223405838013, "lr": 2.998323233708815e-05, "epoch": 0.17054263565891473, "percentage": 5.68, "elapsed_time": "0:14:45", "remaining_time": "4:04:46"}
23
+ {"current_steps": 46, "total_steps": 774, "loss": 1.6686618328094482, "lr": 2.9962283096597995e-05, "epoch": 0.17829457364341086, "percentage": 5.94, "elapsed_time": "0:15:28", "remaining_time": "4:04:53"}
24
+ {"current_steps": 48, "total_steps": 774, "loss": 0.8405603170394897, "lr": 2.9932973451022333e-05, "epoch": 0.18604651162790697, "percentage": 6.2, "elapsed_time": "0:16:06", "remaining_time": "4:03:36"}
25
+ {"current_steps": 50, "total_steps": 774, "loss": 0.6372175812721252, "lr": 2.9895322676246387e-05, "epoch": 0.1937984496124031, "percentage": 6.46, "elapsed_time": "0:16:40", "remaining_time": "4:01:27"}
26
+ {"current_steps": 52, "total_steps": 774, "loss": 1.0768086910247803, "lr": 2.9849355533811937e-05, "epoch": 0.20155038759689922, "percentage": 6.72, "elapsed_time": "0:17:16", "remaining_time": "3:59:50"}
27
+ {"current_steps": 54, "total_steps": 774, "loss": 0.58198082447052, "lr": 2.9795102254632528e-05, "epoch": 0.20930232558139536, "percentage": 6.98, "elapsed_time": "0:17:52", "remaining_time": "3:58:22"}
28
+ {"current_steps": 56, "total_steps": 774, "loss": 1.3517603874206543, "lr": 2.9732598519111736e-05, "epoch": 0.21705426356589147, "percentage": 7.24, "elapsed_time": "0:18:35", "remaining_time": "3:58:16"}
29
+ {"current_steps": 58, "total_steps": 774, "loss": 1.340335488319397, "lr": 2.9661885433677437e-05, "epoch": 0.2248062015503876, "percentage": 7.49, "elapsed_time": "0:19:17", "remaining_time": "3:58:06"}
30
+ {"current_steps": 60, "total_steps": 774, "loss": 1.1451056003570557, "lr": 2.9583009503747627e-05, "epoch": 0.23255813953488372, "percentage": 7.75, "elapsed_time": "0:19:55", "remaining_time": "3:57:04"}
31
+ {"current_steps": 62, "total_steps": 774, "loss": 1.2255440950393677, "lr": 2.9496022603145497e-05, "epoch": 0.24031007751937986, "percentage": 8.01, "elapsed_time": "0:20:36", "remaining_time": "3:56:41"}
32
+ {"current_steps": 64, "total_steps": 774, "loss": 1.2778782844543457, "lr": 2.940098193998391e-05, "epoch": 0.24806201550387597, "percentage": 8.27, "elapsed_time": "0:21:16", "remaining_time": "3:56:04"}
33
+ {"current_steps": 66, "total_steps": 774, "loss": 1.178369402885437, "lr": 2.9297950019041724e-05, "epoch": 0.2558139534883721, "percentage": 8.53, "elapsed_time": "0:21:52", "remaining_time": "3:54:43"}
34
+ {"current_steps": 68, "total_steps": 774, "loss": 1.1788100004196167, "lr": 2.918699460065665e-05, "epoch": 0.26356589147286824, "percentage": 8.79, "elapsed_time": "0:22:35", "remaining_time": "3:54:29"}
35
+ {"current_steps": 70, "total_steps": 774, "loss": 1.306922435760498, "lr": 2.906818865616178e-05, "epoch": 0.2713178294573643, "percentage": 9.04, "elapsed_time": "0:23:17", "remaining_time": "3:54:17"}
36
+ {"current_steps": 72, "total_steps": 774, "loss": 1.0475130081176758, "lr": 2.8941610319894977e-05, "epoch": 0.27906976744186046, "percentage": 9.3, "elapsed_time": "0:23:58", "remaining_time": "3:53:46"}
37
+ {"current_steps": 74, "total_steps": 774, "loss": 1.1680102348327637, "lr": 2.8807342837812783e-05, "epoch": 0.2868217054263566, "percentage": 9.56, "elapsed_time": "0:24:38", "remaining_time": "3:53:03"}
38
+ {"current_steps": 76, "total_steps": 774, "loss": 1.0921390056610107, "lr": 2.8665474512742603e-05, "epoch": 0.29457364341085274, "percentage": 9.82, "elapsed_time": "0:25:14", "remaining_time": "3:51:53"}
39
+ {"current_steps": 78, "total_steps": 774, "loss": 1.1694703102111816, "lr": 2.8516098646309108e-05, "epoch": 0.3023255813953488, "percentage": 10.08, "elapsed_time": "0:25:54", "remaining_time": "3:51:08"}
40
+ {"current_steps": 80, "total_steps": 774, "loss": 1.1712660789489746, "lr": 2.8359313477573215e-05, "epoch": 0.31007751937984496, "percentage": 10.34, "elapsed_time": "0:26:37", "remaining_time": "3:50:54"}
41
+ {"current_steps": 82, "total_steps": 774, "loss": 1.32106351852417, "lr": 2.8195222118423792e-05, "epoch": 0.3178294573643411, "percentage": 10.59, "elapsed_time": "0:27:22", "remaining_time": "3:50:57"}
42
+ {"current_steps": 84, "total_steps": 774, "loss": 1.002191424369812, "lr": 2.8023932485764768e-05, "epoch": 0.32558139534883723, "percentage": 10.85, "elapsed_time": "0:28:00", "remaining_time": "3:50:02"}
43
+ {"current_steps": 86, "total_steps": 774, "loss": 1.1279501914978027, "lr": 2.7845557230542076e-05, "epoch": 0.3333333333333333, "percentage": 11.11, "elapsed_time": "0:28:37", "remaining_time": "3:48:59"}
44
+ {"current_steps": 88, "total_steps": 774, "loss": 1.3432408571243286, "lr": 2.7660213663657282e-05, "epoch": 0.34108527131782945, "percentage": 11.37, "elapsed_time": "0:29:21", "remaining_time": "3:48:52"}
45
+ {"current_steps": 90, "total_steps": 774, "loss": 0.8359699249267578, "lr": 2.7468023678816447e-05, "epoch": 0.3488372093023256, "percentage": 11.63, "elapsed_time": "0:30:00", "remaining_time": "3:48:05"}
46
+ {"current_steps": 92, "total_steps": 774, "loss": 1.1406829357147217, "lr": 2.726911367236509e-05, "epoch": 0.35658914728682173, "percentage": 11.89, "elapsed_time": "0:30:41", "remaining_time": "3:47:28"}
47
+ {"current_steps": 94, "total_steps": 774, "loss": 1.142421841621399, "lr": 2.706361446016193e-05, "epoch": 0.3643410852713178, "percentage": 12.14, "elapsed_time": "0:31:15", "remaining_time": "3:46:05"}
48
+ {"current_steps": 96, "total_steps": 774, "loss": 1.2204563617706299, "lr": 2.6851661191546038e-05, "epoch": 0.37209302325581395, "percentage": 12.4, "elapsed_time": "0:31:55", "remaining_time": "3:45:29"}
49
+ {"current_steps": 98, "total_steps": 774, "loss": 0.7862477898597717, "lr": 2.6633393260454096e-05, "epoch": 0.3798449612403101, "percentage": 12.66, "elapsed_time": "0:32:34", "remaining_time": "3:44:42"}
50
+ {"current_steps": 100, "total_steps": 774, "loss": 0.7346755862236023, "lr": 2.6408954213746028e-05, "epoch": 0.3875968992248062, "percentage": 12.92, "elapsed_time": "0:33:10", "remaining_time": "3:43:35"}
51
+ {"current_steps": 102, "total_steps": 774, "loss": 1.3203626871109009, "lr": 2.61784916567995e-05, "epoch": 0.3953488372093023, "percentage": 13.18, "elapsed_time": "0:33:55", "remaining_time": "3:43:27"}
52
+ {"current_steps": 104, "total_steps": 774, "loss": 1.2376055717468262, "lr": 2.5942157156435248e-05, "epoch": 0.40310077519379844, "percentage": 13.44, "elapsed_time": "0:34:35", "remaining_time": "3:42:53"}
53
+ {"current_steps": 106, "total_steps": 774, "loss": 1.0368235111236572, "lr": 2.570010614123707e-05, "epoch": 0.4108527131782946, "percentage": 13.7, "elapsed_time": "0:35:15", "remaining_time": "3:42:12"}
54
+ {"current_steps": 108, "total_steps": 774, "loss": 1.105363130569458, "lr": 2.545249779933216e-05, "epoch": 0.4186046511627907, "percentage": 13.95, "elapsed_time": "0:35:47", "remaining_time": "3:40:43"}
55
+ {"current_steps": 110, "total_steps": 774, "loss": 1.0211938619613647, "lr": 2.5199494973698856e-05, "epoch": 0.4263565891472868, "percentage": 14.21, "elapsed_time": "0:36:29", "remaining_time": "3:40:19"}
56
+ {"current_steps": 112, "total_steps": 774, "loss": 0.9343675971031189, "lr": 2.494126405507074e-05, "epoch": 0.43410852713178294, "percentage": 14.47, "elapsed_time": "0:37:08", "remaining_time": "3:39:32"}
57
+ {"current_steps": 114, "total_steps": 774, "loss": 1.0941760540008545, "lr": 2.4677974872507553e-05, "epoch": 0.4418604651162791, "percentage": 14.73, "elapsed_time": "0:37:46", "remaining_time": "3:38:41"}
58
+ {"current_steps": 116, "total_steps": 774, "loss": 1.0486119985580444, "lr": 2.440980058170478e-05, "epoch": 0.4496124031007752, "percentage": 14.99, "elapsed_time": "0:38:27", "remaining_time": "3:38:09"}
59
+ {"current_steps": 118, "total_steps": 774, "loss": 0.9473840594291687, "lr": 2.4136917551115484e-05, "epoch": 0.4573643410852713, "percentage": 15.25, "elapsed_time": "0:39:06", "remaining_time": "3:37:22"}
60
+ {"current_steps": 120, "total_steps": 774, "loss": 1.2813960313796997, "lr": 2.38595052459592e-05, "epoch": 0.46511627906976744, "percentage": 15.5, "elapsed_time": "0:39:51", "remaining_time": "3:37:11"}
61
+ {"current_steps": 122, "total_steps": 774, "loss": 1.0586227178573608, "lr": 2.357774611019419e-05, "epoch": 0.4728682170542636, "percentage": 15.76, "elapsed_time": "0:40:27", "remaining_time": "3:36:13"}
62
+ {"current_steps": 124, "total_steps": 774, "loss": 1.2756110429763794, "lr": 2.3291825446530736e-05, "epoch": 0.4806201550387597, "percentage": 16.02, "elapsed_time": "0:41:09", "remaining_time": "3:35:46"}
63
+ {"current_steps": 126, "total_steps": 774, "loss": 1.168853759765625, "lr": 2.3001931294564265e-05, "epoch": 0.4883720930232558, "percentage": 16.28, "elapsed_time": "0:41:48", "remaining_time": "3:35:00"}
64
+ {"current_steps": 128, "total_steps": 774, "loss": 1.181935429573059, "lr": 2.27082543071086e-05, "epoch": 0.49612403100775193, "percentage": 16.54, "elapsed_time": "0:42:29", "remaining_time": "3:34:28"}
65
+ {"current_steps": 130, "total_steps": 774, "loss": 1.1901732683181763, "lr": 2.2410987624810524e-05, "epoch": 0.5038759689922481, "percentage": 16.8, "elapsed_time": "0:43:06", "remaining_time": "3:33:32"}
66
+ {"current_steps": 132, "total_steps": 774, "loss": 0.7289036512374878, "lr": 2.2110326749128233e-05, "epoch": 0.5116279069767442, "percentage": 17.05, "elapsed_time": "0:43:40", "remaining_time": "3:32:27"}
67
+ {"current_steps": 134, "total_steps": 774, "loss": 1.161149024963379, "lr": 2.1806469413757164e-05, "epoch": 0.5193798449612403, "percentage": 17.31, "elapsed_time": "0:44:17", "remaining_time": "3:31:34"}
68
+ {"current_steps": 136, "total_steps": 774, "loss": 1.1283718347549438, "lr": 2.149961545458773e-05, "epoch": 0.5271317829457365, "percentage": 17.57, "elapsed_time": "0:45:00", "remaining_time": "3:31:10"}
69
+ {"current_steps": 138, "total_steps": 774, "loss": 1.362121343612671, "lr": 2.118996667828058e-05, "epoch": 0.5348837209302325, "percentage": 17.83, "elapsed_time": "0:45:40", "remaining_time": "3:30:29"}
70
+ {"current_steps": 140, "total_steps": 774, "loss": 1.2608673572540283, "lr": 2.0877726729545665e-05, "epoch": 0.5426356589147286, "percentage": 18.09, "elapsed_time": "0:46:24", "remaining_time": "3:30:10"}
71
+ {"current_steps": 142, "total_steps": 774, "loss": 0.5950201153755188, "lr": 2.0563100957212577e-05, "epoch": 0.5503875968992248, "percentage": 18.35, "elapsed_time": "0:46:59", "remaining_time": "3:29:08"}
72
+ {"current_steps": 144, "total_steps": 774, "loss": 1.3639545440673828, "lr": 2.0246296279180093e-05, "epoch": 0.5581395348837209, "percentage": 18.6, "elapsed_time": "0:47:44", "remaining_time": "3:28:51"}
73
+ {"current_steps": 146, "total_steps": 774, "loss": 1.0145015716552734, "lr": 1.9927521046333833e-05, "epoch": 0.5658914728682171, "percentage": 18.86, "elapsed_time": "0:48:21", "remaining_time": "3:28:01"}
74
+ {"current_steps": 148, "total_steps": 774, "loss": 0.9937471151351929, "lr": 1.960698490552145e-05, "epoch": 0.5736434108527132, "percentage": 19.12, "elapsed_time": "0:49:00", "remaining_time": "3:27:16"}
75
+ {"current_steps": 150, "total_steps": 774, "loss": 1.0032529830932617, "lr": 1.9284898661675586e-05, "epoch": 0.5813953488372093, "percentage": 19.38, "elapsed_time": "0:49:36", "remaining_time": "3:26:22"}
76
+ {"current_steps": 152, "total_steps": 774, "loss": 1.2299753427505493, "lr": 1.8961474139175106e-05, "epoch": 0.5891472868217055, "percentage": 19.64, "elapsed_time": "0:50:21", "remaining_time": "3:26:03"}
77
+ {"current_steps": 154, "total_steps": 774, "loss": 1.2138370275497437, "lr": 1.863692404253597e-05, "epoch": 0.5968992248062015, "percentage": 19.9, "elapsed_time": "0:51:07", "remaining_time": "3:25:49"}
78
+ {"current_steps": 156, "total_steps": 774, "loss": 0.7944934964179993, "lr": 1.8311461816523192e-05, "epoch": 0.6046511627906976, "percentage": 20.16, "elapsed_time": "0:51:41", "remaining_time": "3:24:48"}
79
+ {"current_steps": 158, "total_steps": 774, "loss": 0.8701238036155701, "lr": 1.7985301505776026e-05, "epoch": 0.6124031007751938, "percentage": 20.41, "elapsed_time": "0:52:15", "remaining_time": "3:23:44"}
80
+ {"current_steps": 160, "total_steps": 774, "loss": 1.279708981513977, "lr": 1.765865761403861e-05, "epoch": 0.6201550387596899, "percentage": 20.67, "elapsed_time": "0:53:00", "remaining_time": "3:23:24"}
81
+ {"current_steps": 162, "total_steps": 774, "loss": 1.020676612854004, "lr": 1.733174496308864e-05, "epoch": 0.627906976744186, "percentage": 20.93, "elapsed_time": "0:53:42", "remaining_time": "3:22:52"}
82
+ {"current_steps": 164, "total_steps": 774, "loss": 1.2313765287399292, "lr": 1.700477855145699e-05, "epoch": 0.6356589147286822, "percentage": 21.19, "elapsed_time": "0:54:27", "remaining_time": "3:22:32"}
83
+ {"current_steps": 166, "total_steps": 774, "loss": 0.9673617482185364, "lr": 1.6677973413030936e-05, "epoch": 0.6434108527131783, "percentage": 21.45, "elapsed_time": "0:55:09", "remaining_time": "3:22:00"}
84
+ {"current_steps": 168, "total_steps": 774, "loss": 1.194890022277832, "lr": 1.6351544475634266e-05, "epoch": 0.6511627906976745, "percentage": 21.71, "elapsed_time": "0:55:55", "remaining_time": "3:21:44"}
85
+ {"current_steps": 170, "total_steps": 774, "loss": 0.5818596482276917, "lr": 1.6025706419677057e-05, "epoch": 0.6589147286821705, "percentage": 21.96, "elapsed_time": "0:56:26", "remaining_time": "3:20:32"}
86
+ {"current_steps": 172, "total_steps": 774, "loss": 1.1378095149993896, "lr": 1.5700673536968222e-05, "epoch": 0.6666666666666666, "percentage": 22.22, "elapsed_time": "0:57:05", "remaining_time": "3:19:50"}
87
+ {"current_steps": 174, "total_steps": 774, "loss": 0.864031970500946, "lr": 1.5376659589783572e-05, "epoch": 0.6744186046511628, "percentage": 22.48, "elapsed_time": "0:57:43", "remaining_time": "3:19:04"}
88
+ {"current_steps": 176, "total_steps": 774, "loss": 0.9113052487373352, "lr": 1.5053877670282186e-05, "epoch": 0.6821705426356589, "percentage": 22.74, "elapsed_time": "0:58:20", "remaining_time": "3:18:14"}
89
+ {"current_steps": 178, "total_steps": 774, "loss": 0.9309589862823486, "lr": 1.4732540060363447e-05, "epoch": 0.689922480620155, "percentage": 23.0, "elapsed_time": "0:58:52", "remaining_time": "3:17:07"}
90
+ {"current_steps": 180, "total_steps": 774, "loss": 1.002301573753357, "lr": 1.4412858092056991e-05, "epoch": 0.6976744186046512, "percentage": 23.26, "elapsed_time": "0:59:32", "remaining_time": "3:16:28"}
91
+ {"current_steps": 182, "total_steps": 774, "loss": 1.0712729692459106, "lr": 1.4095042008537343e-05, "epoch": 0.7054263565891473, "percentage": 23.51, "elapsed_time": "1:00:11", "remaining_time": "3:15:47"}
92
+ {"current_steps": 184, "total_steps": 774, "loss": 0.9123468995094299, "lr": 1.3779300825854622e-05, "epoch": 0.7131782945736435, "percentage": 23.77, "elapsed_time": "1:00:46", "remaining_time": "3:14:52"}
93
+ {"current_steps": 186, "total_steps": 774, "loss": 1.2733235359191895, "lr": 1.3465842195472321e-05, "epoch": 0.7209302325581395, "percentage": 24.03, "elapsed_time": "1:01:31", "remaining_time": "3:14:29"}
94
+ {"current_steps": 188, "total_steps": 774, "loss": 0.9789453148841858, "lr": 1.3154872267702522e-05, "epoch": 0.7286821705426356, "percentage": 24.29, "elapsed_time": "1:02:10", "remaining_time": "3:13:48"}
95
+ {"current_steps": 190, "total_steps": 774, "loss": 1.0140795707702637, "lr": 1.2846595556128331e-05, "epoch": 0.7364341085271318, "percentage": 24.55, "elapsed_time": "1:02:51", "remaining_time": "3:13:11"}
96
+ {"current_steps": 192, "total_steps": 774, "loss": 1.1332778930664062, "lr": 1.254121480310276e-05, "epoch": 0.7441860465116279, "percentage": 24.81, "elapsed_time": "1:03:30", "remaining_time": "3:12:30"}
97
+ {"current_steps": 194, "total_steps": 774, "loss": 1.201830506324768, "lr": 1.2238930846412475e-05, "epoch": 0.751937984496124, "percentage": 25.06, "elapsed_time": "1:04:15", "remaining_time": "3:12:07"}
98
+ {"current_steps": 196, "total_steps": 774, "loss": 1.2011100053787231, "lr": 1.1939942487194116e-05, "epoch": 0.7596899224806202, "percentage": 25.32, "elapsed_time": "1:05:01", "remaining_time": "3:11:46"}
99
+ {"current_steps": 198, "total_steps": 774, "loss": 0.5936653017997742, "lr": 1.1644446359190004e-05, "epoch": 0.7674418604651163, "percentage": 25.58, "elapsed_time": "1:05:37", "remaining_time": "3:10:54"}
100
+ {"current_steps": 200, "total_steps": 774, "loss": 1.3216241598129272, "lr": 1.1352636799429354e-05, "epoch": 0.7751937984496124, "percentage": 25.84, "elapsed_time": "1:06:14", "remaining_time": "3:10:08"}
101
+ {"current_steps": 202, "total_steps": 774, "loss": 1.084835171699524, "lr": 1.1064705720419829e-05, "epoch": 0.7829457364341085, "percentage": 26.1, "elapsed_time": "1:06:49", "remaining_time": "3:09:12"}
102
+ {"current_steps": 204, "total_steps": 774, "loss": 1.2125266790390015, "lr": 1.0780842483933755e-05, "epoch": 0.7906976744186046, "percentage": 26.36, "elapsed_time": "1:07:32", "remaining_time": "3:08:43"}
103
+ {"current_steps": 206, "total_steps": 774, "loss": 1.0225963592529297, "lr": 1.050123377647171e-05, "epoch": 0.7984496124031008, "percentage": 26.61, "elapsed_time": "1:08:10", "remaining_time": "3:07:57"}
104
+ {"current_steps": 208, "total_steps": 774, "loss": 0.7963980436325073, "lr": 1.0226063486485695e-05, "epoch": 0.8062015503875969, "percentage": 26.87, "elapsed_time": "1:08:44", "remaining_time": "3:07:02"}
105
+ {"current_steps": 210, "total_steps": 774, "loss": 1.2788116931915283, "lr": 9.955512583442334e-06, "epoch": 0.813953488372093, "percentage": 27.13, "elapsed_time": "1:09:25", "remaining_time": "3:06:28"}
106
+ {"current_steps": 212, "total_steps": 774, "loss": 1.1842073202133179, "lr": 9.68975899880592e-06, "epoch": 0.8217054263565892, "percentage": 27.39, "elapsed_time": "1:10:06", "remaining_time": "3:05:51"}
107
+ {"current_steps": 214, "total_steps": 774, "loss": 0.9420091509819031, "lr": 9.42897750901933e-06, "epoch": 0.8294573643410853, "percentage": 27.65, "elapsed_time": "1:10:47", "remaining_time": "3:05:14"}
108
+ {"current_steps": 216, "total_steps": 774, "loss": 1.0436409711837769, "lr": 9.173339620559935e-06, "epoch": 0.8372093023255814, "percentage": 27.91, "elapsed_time": "1:11:26", "remaining_time": "3:04:32"}
109
+ {"current_steps": 218, "total_steps": 774, "loss": 1.2834446430206299, "lr": 8.923013457146082e-06, "epoch": 0.8449612403100775, "percentage": 28.17, "elapsed_time": "1:12:07", "remaining_time": "3:03:57"}
110
+ {"current_steps": 220, "total_steps": 774, "loss": 1.1693506240844727, "lr": 8.678163649168214e-06, "epoch": 0.8527131782945736, "percentage": 28.42, "elapsed_time": "1:12:52", "remaining_time": "3:03:29"}
111
+ {"current_steps": 222, "total_steps": 774, "loss": 0.49415066838264465, "lr": 8.438951225417476e-06, "epoch": 0.8604651162790697, "percentage": 28.68, "elapsed_time": "1:13:25", "remaining_time": "3:02:35"}
112
+ {"current_steps": 224, "total_steps": 774, "loss": 1.1654852628707886, "lr": 8.205533507182963e-06, "epoch": 0.8682170542635659, "percentage": 28.94, "elapsed_time": "1:14:06", "remaining_time": "3:01:57"}
113
+ {"current_steps": 226, "total_steps": 774, "loss": 1.2648242712020874, "lr": 7.978064004787238e-06, "epoch": 0.875968992248062, "percentage": 29.2, "elapsed_time": "1:14:47", "remaining_time": "3:01:21"}
114
+ {"current_steps": 228, "total_steps": 774, "loss": 0.8766679167747498, "lr": 7.756692316628162e-06, "epoch": 0.8837209302325582, "percentage": 29.46, "elapsed_time": "1:15:25", "remaining_time": "3:00:37"}
115
+ {"current_steps": 230, "total_steps": 774, "loss": 0.9922328591346741, "lr": 7.541564030793536e-06, "epoch": 0.8914728682170543, "percentage": 29.72, "elapsed_time": "1:16:03", "remaining_time": "2:59:53"}
116
+ {"current_steps": 232, "total_steps": 774, "loss": 0.837881863117218, "lr": 7.33282062931308e-06, "epoch": 0.8992248062015504, "percentage": 29.97, "elapsed_time": "1:16:41", "remaining_time": "2:59:09"}
117
+ {"current_steps": 234, "total_steps": 774, "loss": 1.272527813911438, "lr": 7.13059939511089e-06, "epoch": 0.9069767441860465, "percentage": 30.23, "elapsed_time": "1:17:24", "remaining_time": "2:58:37"}
118
+ {"current_steps": 236, "total_steps": 774, "loss": 0.6637862920761108, "lr": 6.935033321719421e-06, "epoch": 0.9147286821705426, "percentage": 30.49, "elapsed_time": "1:17:59", "remaining_time": "2:57:48"}
119
+ {"current_steps": 238, "total_steps": 774, "loss": 1.2028839588165283, "lr": 6.746251025814548e-06, "epoch": 0.9224806201550387, "percentage": 30.75, "elapsed_time": "1:18:45", "remaining_time": "2:57:23"}
120
+ {"current_steps": 240, "total_steps": 774, "loss": 1.0310890674591064, "lr": 6.564376662629032e-06, "epoch": 0.9302325581395349, "percentage": 31.01, "elapsed_time": "1:19:23", "remaining_time": "2:56:39"}
121
+ {"current_steps": 242, "total_steps": 774, "loss": 1.129476547241211, "lr": 6.389529844300147e-06, "epoch": 0.937984496124031, "percentage": 31.27, "elapsed_time": "1:20:03", "remaining_time": "2:56:00"}
122
+ {"current_steps": 244, "total_steps": 774, "loss": 0.9788402915000916, "lr": 6.2218255612051575e-06, "epoch": 0.9457364341085271, "percentage": 31.52, "elapsed_time": "1:20:43", "remaining_time": "2:55:19"}
123
+ {"current_steps": 246, "total_steps": 774, "loss": 0.7472362518310547, "lr": 6.061374106336329e-06, "epoch": 0.9534883720930233, "percentage": 31.78, "elapsed_time": "1:21:20", "remaining_time": "2:54:34"}
124
+ {"current_steps": 248, "total_steps": 774, "loss": 0.7408154606819153, "lr": 5.9082810027652495e-06, "epoch": 0.9612403100775194, "percentage": 32.04, "elapsed_time": "1:21:58", "remaining_time": "2:53:51"}
125
+ {"current_steps": 250, "total_steps": 774, "loss": 1.1912089586257935, "lr": 5.762646934244157e-06, "epoch": 0.9689922480620154, "percentage": 32.3, "elapsed_time": "1:22:40", "remaining_time": "2:53:17"}
126
+ {"current_steps": 252, "total_steps": 774, "loss": 0.970727264881134, "lr": 5.6245676789899e-06, "epoch": 0.9767441860465116, "percentage": 32.56, "elapsed_time": "1:23:19", "remaining_time": "2:52:35"}
127
+ {"current_steps": 254, "total_steps": 774, "loss": 0.9474197626113892, "lr": 5.494134046694101e-06, "epoch": 0.9844961240310077, "percentage": 32.82, "elapsed_time": "1:24:00", "remaining_time": "2:51:59"}
128
+ {"current_steps": 256, "total_steps": 774, "loss": 0.7675265073776245, "lr": 5.371431818800934e-06, "epoch": 0.9922480620155039, "percentage": 33.07, "elapsed_time": "1:24:34", "remaining_time": "2:51:07"}
129
+ {"current_steps": 258, "total_steps": 774, "loss": 1.151860237121582, "lr": 5.256541692091799e-06, "epoch": 1.0, "percentage": 33.33, "elapsed_time": "1:25:16", "remaining_time": "2:50:32"}
130
+ {"current_steps": 260, "total_steps": 774, "loss": 0.6956380605697632, "lr": 5.149539225613974e-06, "epoch": 1.0077519379844961, "percentage": 33.59, "elapsed_time": "1:25:54", "remaining_time": "2:49:49"}
131
+ {"current_steps": 262, "total_steps": 774, "loss": 0.9135383367538452, "lr": 5.050494790988212e-06, "epoch": 1.0155038759689923, "percentage": 33.85, "elapsed_time": "1:26:37", "remaining_time": "2:49:16"}
132
+ {"current_steps": 264, "total_steps": 774, "loss": 0.721315324306488, "lr": 4.95947352612787e-06, "epoch": 1.0232558139534884, "percentage": 34.11, "elapsed_time": "1:27:12", "remaining_time": "2:48:29"}
133
+ {"current_steps": 266, "total_steps": 774, "loss": 0.4410458207130432, "lr": 4.876535292400089e-06, "epoch": 1.0310077519379846, "percentage": 34.37, "elapsed_time": "1:27:47", "remaining_time": "2:47:39"}
134
+ {"current_steps": 268, "total_steps": 774, "loss": 0.8536827564239502, "lr": 4.801734635257148e-06, "epoch": 1.0387596899224807, "percentage": 34.63, "elapsed_time": "1:28:26", "remaining_time": "2:46:59"}
135
+ {"current_steps": 270, "total_steps": 774, "loss": 0.903506875038147, "lr": 4.735120748363916e-06, "epoch": 1.0465116279069768, "percentage": 34.88, "elapsed_time": "1:29:09", "remaining_time": "2:46:25"}
136
+ {"current_steps": 272, "total_steps": 774, "loss": 0.48186248540878296, "lr": 4.676737441244975e-06, "epoch": 1.054263565891473, "percentage": 35.14, "elapsed_time": "1:29:44", "remaining_time": "2:45:38"}
137
+ {"current_steps": 274, "total_steps": 774, "loss": 0.8960871696472168, "lr": 4.626623110472677e-06, "epoch": 1.062015503875969, "percentage": 35.4, "elapsed_time": "1:30:31", "remaining_time": "2:45:10"}
138
+ {"current_steps": 276, "total_steps": 774, "loss": 0.8507243990898132, "lr": 4.584810714415135e-06, "epoch": 1.069767441860465, "percentage": 35.66, "elapsed_time": "1:31:13", "remaining_time": "2:44:36"}
139
+ {"current_steps": 278, "total_steps": 774, "loss": 0.9197998642921448, "lr": 4.5513277515607014e-06, "epoch": 1.0775193798449612, "percentage": 35.92, "elapsed_time": "1:31:58", "remaining_time": "2:44:06"}
140
+ {"current_steps": 280, "total_steps": 774, "loss": 0.778313398361206, "lr": 4.526196242433211e-06, "epoch": 1.0852713178294573, "percentage": 36.18, "elapsed_time": "1:32:40", "remaining_time": "2:43:30"}
141
+ {"current_steps": 282, "total_steps": 774, "loss": 0.5479567050933838, "lr": 4.509432715109887e-06, "epoch": 1.0930232558139534, "percentage": 36.43, "elapsed_time": "1:33:19", "remaining_time": "2:42:49"}
142
+ {"current_steps": 284, "total_steps": 774, "loss": 0.6334800720214844, "lr": 4.50104819435143e-06, "epoch": 1.1007751937984496, "percentage": 36.69, "elapsed_time": "1:33:55", "remaining_time": "2:42:02"}
143
+ {"current_steps": 286, "total_steps": 774, "loss": 0.8215212225914001, "lr": 4.50104819435143e-06, "epoch": 1.1085271317829457, "percentage": 36.95, "elapsed_time": "1:34:31", "remaining_time": "2:41:18"}
144
+ {"current_steps": 288, "total_steps": 774, "loss": 0.5245926976203918, "lr": 4.509432715109887e-06, "epoch": 1.1162790697674418, "percentage": 37.21, "elapsed_time": "1:35:05", "remaining_time": "2:40:28"}
145
+ {"current_steps": 290, "total_steps": 774, "loss": 1.0330955982208252, "lr": 4.526196242433211e-06, "epoch": 1.124031007751938, "percentage": 37.47, "elapsed_time": "1:35:50", "remaining_time": "2:39:58"}
146
+ {"current_steps": 292, "total_steps": 774, "loss": 0.5526050329208374, "lr": 4.5513277515607014e-06, "epoch": 1.1317829457364341, "percentage": 37.73, "elapsed_time": "1:36:28", "remaining_time": "2:39:15"}
147
+ {"current_steps": 294, "total_steps": 774, "loss": 1.046125888824463, "lr": 4.584810714415136e-06, "epoch": 1.1395348837209303, "percentage": 37.98, "elapsed_time": "1:37:14", "remaining_time": "2:38:45"}
148
+ {"current_steps": 296, "total_steps": 774, "loss": 0.3840217590332031, "lr": 4.626623110472676e-06, "epoch": 1.1472868217054264, "percentage": 38.24, "elapsed_time": "1:37:47", "remaining_time": "2:37:56"}
149
+ {"current_steps": 298, "total_steps": 774, "loss": 0.6799867153167725, "lr": 4.676737441244973e-06, "epoch": 1.1550387596899225, "percentage": 38.5, "elapsed_time": "1:38:29", "remaining_time": "2:37:19"}
150
+ {"current_steps": 300, "total_steps": 774, "loss": 0.6748986840248108, "lr": 4.735120748363917e-06, "epoch": 1.1627906976744187, "percentage": 38.76, "elapsed_time": "1:39:09", "remaining_time": "2:36:40"}
151
+ {"current_steps": 302, "total_steps": 774, "loss": 0.8421810865402222, "lr": 4.801734635257148e-06, "epoch": 1.1705426356589148, "percentage": 39.02, "elapsed_time": "1:39:52", "remaining_time": "2:36:05"}
152
+ {"current_steps": 304, "total_steps": 774, "loss": 0.5402819514274597, "lr": 4.876535292400087e-06, "epoch": 1.178294573643411, "percentage": 39.28, "elapsed_time": "1:40:26", "remaining_time": "2:35:17"}
153
+ {"current_steps": 306, "total_steps": 774, "loss": 0.9019787311553955, "lr": 4.95947352612787e-06, "epoch": 1.1860465116279069, "percentage": 39.53, "elapsed_time": "1:41:11", "remaining_time": "2:34:45"}
154
+ {"current_steps": 308, "total_steps": 774, "loss": 0.8330530524253845, "lr": 5.050494790988212e-06, "epoch": 1.193798449612403, "percentage": 39.79, "elapsed_time": "1:41:52", "remaining_time": "2:34:08"}
155
+ {"current_steps": 310, "total_steps": 774, "loss": 1.0060863494873047, "lr": 5.149539225613974e-06, "epoch": 1.2015503875968991, "percentage": 40.05, "elapsed_time": "1:42:37", "remaining_time": "2:33:35"}
156
+ {"current_steps": 312, "total_steps": 774, "loss": 0.5403499007225037, "lr": 5.256541692091797e-06, "epoch": 1.2093023255813953, "percentage": 40.31, "elapsed_time": "1:43:16", "remaining_time": "2:32:55"}
157
+ {"current_steps": 314, "total_steps": 774, "loss": 0.37406668066978455, "lr": 5.371431818800936e-06, "epoch": 1.2170542635658914, "percentage": 40.57, "elapsed_time": "1:43:50", "remaining_time": "2:32:08"}
158
+ {"current_steps": 316, "total_steps": 774, "loss": 0.6960604786872864, "lr": 5.494134046694099e-06, "epoch": 1.2248062015503876, "percentage": 40.83, "elapsed_time": "1:44:32", "remaining_time": "2:31:30"}
159
+ {"current_steps": 318, "total_steps": 774, "loss": 0.7832977771759033, "lr": 5.624567678989899e-06, "epoch": 1.2325581395348837, "percentage": 41.09, "elapsed_time": "1:45:13", "remaining_time": "2:30:53"}
160
+ {"current_steps": 320, "total_steps": 774, "loss": 0.9501113295555115, "lr": 5.762646934244156e-06, "epoch": 1.2403100775193798, "percentage": 41.34, "elapsed_time": "1:45:57", "remaining_time": "2:30:19"}
161
+ {"current_steps": 322, "total_steps": 774, "loss": 1.0130536556243896, "lr": 5.908281002765248e-06, "epoch": 1.248062015503876, "percentage": 41.6, "elapsed_time": "1:46:43", "remaining_time": "2:29:49"}
162
+ {"current_steps": 324, "total_steps": 774, "loss": 0.631900429725647, "lr": 6.061374106336328e-06, "epoch": 1.255813953488372, "percentage": 41.86, "elapsed_time": "1:47:21", "remaining_time": "2:29:06"}
163
+ {"current_steps": 326, "total_steps": 774, "loss": 0.8754401803016663, "lr": 6.2218255612051575e-06, "epoch": 1.2635658914728682, "percentage": 42.12, "elapsed_time": "1:48:05", "remaining_time": "2:28:33"}
164
+ {"current_steps": 328, "total_steps": 774, "loss": 0.7127947807312012, "lr": 6.389529844300143e-06, "epoch": 1.2713178294573644, "percentage": 42.38, "elapsed_time": "1:48:47", "remaining_time": "2:27:55"}
165
+ {"current_steps": 330, "total_steps": 774, "loss": 0.4656026363372803, "lr": 6.564376662629029e-06, "epoch": 1.2790697674418605, "percentage": 42.64, "elapsed_time": "1:49:22", "remaining_time": "2:27:09"}
166
+ {"current_steps": 332, "total_steps": 774, "loss": 0.8079378008842468, "lr": 6.74625102581455e-06, "epoch": 1.2868217054263567, "percentage": 42.89, "elapsed_time": "1:50:04", "remaining_time": "2:26:32"}
167
+ {"current_steps": 334, "total_steps": 774, "loss": 0.5637804865837097, "lr": 6.935033321719419e-06, "epoch": 1.2945736434108528, "percentage": 43.15, "elapsed_time": "1:50:39", "remaining_time": "2:25:46"}
168
+ {"current_steps": 336, "total_steps": 774, "loss": 0.8007771968841553, "lr": 7.130599395110884e-06, "epoch": 1.302325581395349, "percentage": 43.41, "elapsed_time": "1:51:21", "remaining_time": "2:25:09"}
169
+ {"current_steps": 338, "total_steps": 774, "loss": 0.551106333732605, "lr": 7.332820629313082e-06, "epoch": 1.310077519379845, "percentage": 43.67, "elapsed_time": "1:52:00", "remaining_time": "2:24:28"}
170
+ {"current_steps": 340, "total_steps": 774, "loss": 0.7754759788513184, "lr": 7.541564030793533e-06, "epoch": 1.3178294573643412, "percentage": 43.93, "elapsed_time": "1:52:45", "remaining_time": "2:23:56"}
171
+ {"current_steps": 342, "total_steps": 774, "loss": 0.7786872982978821, "lr": 7.75669231662816e-06, "epoch": 1.3255813953488373, "percentage": 44.19, "elapsed_time": "1:53:29", "remaining_time": "2:23:21"}
172
+ {"current_steps": 344, "total_steps": 774, "loss": 0.7895460724830627, "lr": 7.978064004787231e-06, "epoch": 1.3333333333333333, "percentage": 44.44, "elapsed_time": "1:54:09", "remaining_time": "2:22:41"}
173
+ {"current_steps": 346, "total_steps": 774, "loss": 0.20940443873405457, "lr": 8.205533507182961e-06, "epoch": 1.3410852713178294, "percentage": 44.7, "elapsed_time": "1:54:44", "remaining_time": "2:21:56"}
174
+ {"current_steps": 348, "total_steps": 774, "loss": 0.819771409034729, "lr": 8.438951225417474e-06, "epoch": 1.3488372093023255, "percentage": 44.96, "elapsed_time": "1:55:31", "remaining_time": "2:21:25"}
175
+ {"current_steps": 350, "total_steps": 774, "loss": 0.9801982641220093, "lr": 8.678163649168212e-06, "epoch": 1.3565891472868217, "percentage": 45.22, "elapsed_time": "1:56:20", "remaining_time": "2:20:56"}
176
+ {"current_steps": 352, "total_steps": 774, "loss": 0.7718797326087952, "lr": 8.923013457146075e-06, "epoch": 1.3643410852713178, "percentage": 45.48, "elapsed_time": "1:57:06", "remaining_time": "2:20:24"}
177
+ {"current_steps": 354, "total_steps": 774, "loss": 0.40787971019744873, "lr": 9.173339620559931e-06, "epoch": 1.372093023255814, "percentage": 45.74, "elapsed_time": "1:57:40", "remaining_time": "2:19:37"}
178
+ {"current_steps": 356, "total_steps": 774, "loss": 0.797160804271698, "lr": 9.428977509019326e-06, "epoch": 1.37984496124031, "percentage": 45.99, "elapsed_time": "1:58:23", "remaining_time": "2:19:00"}
179
+ {"current_steps": 358, "total_steps": 774, "loss": 0.6483190059661865, "lr": 9.689758998805924e-06, "epoch": 1.3875968992248062, "percentage": 46.25, "elapsed_time": "1:59:01", "remaining_time": "2:18:18"}
180
+ {"current_steps": 360, "total_steps": 774, "loss": 0.7835768461227417, "lr": 9.955512583442333e-06, "epoch": 1.3953488372093024, "percentage": 46.51, "elapsed_time": "1:59:42", "remaining_time": "2:17:40"}
181
+ {"current_steps": 362, "total_steps": 774, "loss": 0.6386092901229858, "lr": 1.0226063486485691e-05, "epoch": 1.4031007751937985, "percentage": 46.77, "elapsed_time": "2:00:23", "remaining_time": "2:17:01"}
182
+ {"current_steps": 364, "total_steps": 774, "loss": 0.8520874977111816, "lr": 1.0501233776471714e-05, "epoch": 1.4108527131782946, "percentage": 47.03, "elapsed_time": "2:01:08", "remaining_time": "2:16:27"}
183
+ {"current_steps": 366, "total_steps": 774, "loss": 0.37374499440193176, "lr": 1.0780842483933755e-05, "epoch": 1.4186046511627908, "percentage": 47.29, "elapsed_time": "2:01:44", "remaining_time": "2:15:43"}
184
+ {"current_steps": 368, "total_steps": 774, "loss": 0.3320968449115753, "lr": 1.1064705720419827e-05, "epoch": 1.4263565891472867, "percentage": 47.55, "elapsed_time": "2:02:21", "remaining_time": "2:14:59"}
185
+ {"current_steps": 370, "total_steps": 774, "loss": 0.7746375799179077, "lr": 1.135263679942935e-05, "epoch": 1.4341085271317828, "percentage": 47.8, "elapsed_time": "2:03:04", "remaining_time": "2:14:23"}
186
+ {"current_steps": 372, "total_steps": 774, "loss": 0.6704602241516113, "lr": 1.1644446359190006e-05, "epoch": 1.441860465116279, "percentage": 48.06, "elapsed_time": "2:03:44", "remaining_time": "2:13:42"}
187
+ {"current_steps": 374, "total_steps": 774, "loss": 0.9213350415229797, "lr": 1.1939942487194116e-05, "epoch": 1.449612403100775, "percentage": 48.32, "elapsed_time": "2:04:23", "remaining_time": "2:13:02"}
188
+ {"current_steps": 376, "total_steps": 774, "loss": 0.7233853936195374, "lr": 1.2238930846412471e-05, "epoch": 1.4573643410852712, "percentage": 48.58, "elapsed_time": "2:05:02", "remaining_time": "2:12:20"}
189
+ {"current_steps": 378, "total_steps": 774, "loss": 0.5185383558273315, "lr": 1.2541214803102757e-05, "epoch": 1.4651162790697674, "percentage": 48.84, "elapsed_time": "2:05:39", "remaining_time": "2:11:38"}
190
+ {"current_steps": 380, "total_steps": 774, "loss": 0.7751470804214478, "lr": 1.2846595556128331e-05, "epoch": 1.4728682170542635, "percentage": 49.1, "elapsed_time": "2:06:18", "remaining_time": "2:10:57"}
191
+ {"current_steps": 382, "total_steps": 774, "loss": 0.7363438010215759, "lr": 1.3154872267702518e-05, "epoch": 1.4806201550387597, "percentage": 49.35, "elapsed_time": "2:06:57", "remaining_time": "2:10:16"}
192
+ {"current_steps": 384, "total_steps": 774, "loss": 0.697909951210022, "lr": 1.3465842195472318e-05, "epoch": 1.4883720930232558, "percentage": 49.61, "elapsed_time": "2:07:37", "remaining_time": "2:09:37"}
193
+ {"current_steps": 386, "total_steps": 774, "loss": 0.5058455467224121, "lr": 1.3779300825854622e-05, "epoch": 1.496124031007752, "percentage": 49.87, "elapsed_time": "2:08:14", "remaining_time": "2:08:54"}
194
+ {"current_steps": 388, "total_steps": 774, "loss": 0.6899944543838501, "lr": 1.4095042008537336e-05, "epoch": 1.503875968992248, "percentage": 50.13, "elapsed_time": "2:08:56", "remaining_time": "2:08:16"}
195
+ {"current_steps": 390, "total_steps": 774, "loss": 0.5844802856445312, "lr": 1.4412858092056988e-05, "epoch": 1.5116279069767442, "percentage": 50.39, "elapsed_time": "2:09:34", "remaining_time": "2:07:35"}
196
+ {"current_steps": 392, "total_steps": 774, "loss": 0.6977730393409729, "lr": 1.4732540060363447e-05, "epoch": 1.5193798449612403, "percentage": 50.65, "elapsed_time": "2:10:12", "remaining_time": "2:06:53"}
197
+ {"current_steps": 394, "total_steps": 774, "loss": 0.7261441349983215, "lr": 1.5053877670282176e-05, "epoch": 1.5271317829457365, "percentage": 50.9, "elapsed_time": "2:10:49", "remaining_time": "2:06:10"}
198
+ {"current_steps": 396, "total_steps": 774, "loss": 0.7607800960540771, "lr": 1.537665958978357e-05, "epoch": 1.5348837209302326, "percentage": 51.16, "elapsed_time": "2:11:30", "remaining_time": "2:05:32"}
199
+ {"current_steps": 398, "total_steps": 774, "loss": 0.5964785218238831, "lr": 1.5700673536968222e-05, "epoch": 1.5426356589147288, "percentage": 51.42, "elapsed_time": "2:12:05", "remaining_time": "2:04:47"}
200
+ {"current_steps": 400, "total_steps": 774, "loss": 0.7581831812858582, "lr": 1.6025706419677047e-05, "epoch": 1.550387596899225, "percentage": 51.68, "elapsed_time": "2:12:50", "remaining_time": "2:04:12"}
201
+ {"current_steps": 402, "total_steps": 774, "loss": 0.5359363555908203, "lr": 1.6351544475634256e-05, "epoch": 1.558139534883721, "percentage": 51.94, "elapsed_time": "2:13:28", "remaining_time": "2:03:30"}
202
+ {"current_steps": 404, "total_steps": 774, "loss": 0.9142735004425049, "lr": 1.6677973413030932e-05, "epoch": 1.5658914728682172, "percentage": 52.2, "elapsed_time": "2:14:10", "remaining_time": "2:02:53"}
203
+ {"current_steps": 406, "total_steps": 774, "loss": 0.7637568712234497, "lr": 1.7004778551456975e-05, "epoch": 1.5736434108527133, "percentage": 52.45, "elapsed_time": "2:14:53", "remaining_time": "2:02:15"}
204
+ {"current_steps": 408, "total_steps": 774, "loss": 0.31641456484794617, "lr": 1.7331744963088644e-05, "epoch": 1.5813953488372094, "percentage": 52.71, "elapsed_time": "2:15:26", "remaining_time": "2:01:30"}
205
+ {"current_steps": 410, "total_steps": 774, "loss": 0.780099630355835, "lr": 1.7658657614038598e-05, "epoch": 1.5891472868217056, "percentage": 52.97, "elapsed_time": "2:16:07", "remaining_time": "2:00:50"}
206
+ {"current_steps": 412, "total_steps": 774, "loss": 0.7998414635658264, "lr": 1.7985301505776015e-05, "epoch": 1.5968992248062015, "percentage": 53.23, "elapsed_time": "2:16:44", "remaining_time": "2:00:08"}
207
+ {"current_steps": 414, "total_steps": 774, "loss": 0.5864279866218567, "lr": 1.8311461816523192e-05, "epoch": 1.6046511627906976, "percentage": 53.49, "elapsed_time": "2:17:21", "remaining_time": "1:59:26"}
208
+ {"current_steps": 416, "total_steps": 774, "loss": 0.47105392813682556, "lr": 1.8636924042535962e-05, "epoch": 1.6124031007751938, "percentage": 53.75, "elapsed_time": "2:17:55", "remaining_time": "1:58:41"}
209
+ {"current_steps": 418, "total_steps": 774, "loss": 0.8024092316627502, "lr": 1.8961474139175093e-05, "epoch": 1.62015503875969, "percentage": 54.01, "elapsed_time": "2:18:36", "remaining_time": "1:58:03"}
210
+ {"current_steps": 420, "total_steps": 774, "loss": 0.810451090335846, "lr": 1.9284898661675586e-05, "epoch": 1.627906976744186, "percentage": 54.26, "elapsed_time": "2:19:20", "remaining_time": "1:57:26"}
211
+ {"current_steps": 422, "total_steps": 774, "loss": 0.4688906967639923, "lr": 1.9606984905521443e-05, "epoch": 1.6356589147286822, "percentage": 54.52, "elapsed_time": "2:19:53", "remaining_time": "1:56:41"}
212
+ {"current_steps": 424, "total_steps": 774, "loss": 0.7383279204368591, "lr": 1.9927521046333837e-05, "epoch": 1.6434108527131783, "percentage": 54.78, "elapsed_time": "2:20:33", "remaining_time": "1:56:01"}
213
+ {"current_steps": 426, "total_steps": 774, "loss": 0.8395543694496155, "lr": 2.0246296279180093e-05, "epoch": 1.6511627906976745, "percentage": 55.04, "elapsed_time": "2:21:11", "remaining_time": "1:55:20"}
214
+ {"current_steps": 428, "total_steps": 774, "loss": 0.8986775875091553, "lr": 2.0563100957212567e-05, "epoch": 1.6589147286821704, "percentage": 55.3, "elapsed_time": "2:21:51", "remaining_time": "1:54:40"}
215
+ {"current_steps": 430, "total_steps": 774, "loss": 0.8169777393341064, "lr": 2.0877726729545672e-05, "epoch": 1.6666666666666665, "percentage": 55.56, "elapsed_time": "2:22:30", "remaining_time": "1:54:00"}
216
+ {"current_steps": 432, "total_steps": 774, "loss": 1.033119559288025, "lr": 2.1189966678280578e-05, "epoch": 1.6744186046511627, "percentage": 55.81, "elapsed_time": "2:23:15", "remaining_time": "1:53:24"}
217
+ {"current_steps": 434, "total_steps": 774, "loss": 0.5892492532730103, "lr": 2.149961545458772e-05, "epoch": 1.6821705426356588, "percentage": 56.07, "elapsed_time": "2:23:54", "remaining_time": "1:52:44"}
218
+ {"current_steps": 436, "total_steps": 774, "loss": 0.7995302081108093, "lr": 2.1806469413757164e-05, "epoch": 1.689922480620155, "percentage": 56.33, "elapsed_time": "2:24:32", "remaining_time": "1:52:03"}
219
+ {"current_steps": 438, "total_steps": 774, "loss": 0.8415105938911438, "lr": 2.211032674912823e-05, "epoch": 1.697674418604651, "percentage": 56.59, "elapsed_time": "2:25:14", "remaining_time": "1:51:25"}
220
+ {"current_steps": 440, "total_steps": 774, "loss": 0.6350277066230774, "lr": 2.241098762481052e-05, "epoch": 1.7054263565891472, "percentage": 56.85, "elapsed_time": "2:25:52", "remaining_time": "1:50:44"}
221
+ {"current_steps": 442, "total_steps": 774, "loss": 0.8463593125343323, "lr": 2.27082543071086e-05, "epoch": 1.7131782945736433, "percentage": 57.11, "elapsed_time": "2:26:35", "remaining_time": "1:50:06"}
222
+ {"current_steps": 444, "total_steps": 774, "loss": 0.5609403252601624, "lr": 2.3001931294564265e-05, "epoch": 1.7209302325581395, "percentage": 57.36, "elapsed_time": "2:27:11", "remaining_time": "1:49:24"}
223
+ {"current_steps": 446, "total_steps": 774, "loss": 0.8690592050552368, "lr": 2.3291825446530733e-05, "epoch": 1.7286821705426356, "percentage": 57.62, "elapsed_time": "2:27:52", "remaining_time": "1:48:45"}
224
+ {"current_steps": 448, "total_steps": 774, "loss": 0.8064720630645752, "lr": 2.357774611019419e-05, "epoch": 1.7364341085271318, "percentage": 57.88, "elapsed_time": "2:28:30", "remaining_time": "1:48:03"}
225
+ {"current_steps": 450, "total_steps": 774, "loss": 1.0067108869552612, "lr": 2.385950524595919e-05, "epoch": 1.744186046511628, "percentage": 58.14, "elapsed_time": "2:29:14", "remaining_time": "1:47:27"}
226
+ {"current_steps": 452, "total_steps": 774, "loss": 0.967079222202301, "lr": 2.4136917551115478e-05, "epoch": 1.751937984496124, "percentage": 58.4, "elapsed_time": "2:30:00", "remaining_time": "1:46:51"}
227
+ {"current_steps": 454, "total_steps": 774, "loss": 0.6444424986839294, "lr": 2.4409800581704777e-05, "epoch": 1.7596899224806202, "percentage": 58.66, "elapsed_time": "2:30:38", "remaining_time": "1:46:10"}
228
+ {"current_steps": 456, "total_steps": 774, "loss": 0.8322298526763916, "lr": 2.4677974872507553e-05, "epoch": 1.7674418604651163, "percentage": 58.91, "elapsed_time": "2:31:19", "remaining_time": "1:45:31"}
229
+ {"current_steps": 458, "total_steps": 774, "loss": 0.4230212867259979, "lr": 2.4941264055070734e-05, "epoch": 1.7751937984496124, "percentage": 59.17, "elapsed_time": "2:31:53", "remaining_time": "1:44:48"}
230
+ {"current_steps": 460, "total_steps": 774, "loss": 0.6065483093261719, "lr": 2.5199494973698852e-05, "epoch": 1.7829457364341086, "percentage": 59.43, "elapsed_time": "2:32:31", "remaining_time": "1:44:06"}
231
+ {"current_steps": 462, "total_steps": 774, "loss": 0.8183580040931702, "lr": 2.545249779933216e-05, "epoch": 1.7906976744186047, "percentage": 59.69, "elapsed_time": "2:33:08", "remaining_time": "1:43:25"}
232
+ {"current_steps": 464, "total_steps": 774, "loss": 0.9282822608947754, "lr": 2.5700106141237063e-05, "epoch": 1.7984496124031009, "percentage": 59.95, "elapsed_time": "2:33:53", "remaining_time": "1:42:49"}
233
+ {"current_steps": 466, "total_steps": 774, "loss": 0.8734548687934875, "lr": 2.594215715643524e-05, "epoch": 1.806201550387597, "percentage": 60.21, "elapsed_time": "2:34:35", "remaining_time": "1:42:10"}
234
+ {"current_steps": 468, "total_steps": 774, "loss": 0.8903089165687561, "lr": 2.6178491656799497e-05, "epoch": 1.8139534883720931, "percentage": 60.47, "elapsed_time": "2:35:15", "remaining_time": "1:41:30"}
235
+ {"current_steps": 470, "total_steps": 774, "loss": 0.4710087180137634, "lr": 2.640895421374602e-05, "epoch": 1.8217054263565893, "percentage": 60.72, "elapsed_time": "2:35:51", "remaining_time": "1:40:48"}
236
+ {"current_steps": 472, "total_steps": 774, "loss": 1.1290743350982666, "lr": 2.6633393260454096e-05, "epoch": 1.8294573643410854, "percentage": 60.98, "elapsed_time": "2:36:33", "remaining_time": "1:40:10"}
237
+ {"current_steps": 474, "total_steps": 774, "loss": 0.6608400344848633, "lr": 2.6851661191546034e-05, "epoch": 1.8372093023255816, "percentage": 61.24, "elapsed_time": "2:37:09", "remaining_time": "1:39:27"}
238
+ {"current_steps": 476, "total_steps": 774, "loss": 0.850265383720398, "lr": 2.706361446016192e-05, "epoch": 1.8449612403100775, "percentage": 61.5, "elapsed_time": "2:37:50", "remaining_time": "1:38:48"}
239
+ {"current_steps": 478, "total_steps": 774, "loss": 0.6361703872680664, "lr": 2.7269113672365096e-05, "epoch": 1.8527131782945736, "percentage": 61.76, "elapsed_time": "2:38:25", "remaining_time": "1:38:06"}
240
+ {"current_steps": 480, "total_steps": 774, "loss": 1.0639129877090454, "lr": 2.7468023678816444e-05, "epoch": 1.8604651162790697, "percentage": 62.02, "elapsed_time": "2:39:09", "remaining_time": "1:37:29"}
241
+ {"current_steps": 482, "total_steps": 774, "loss": 0.6422796845436096, "lr": 2.766021366365728e-05, "epoch": 1.8682170542635659, "percentage": 62.27, "elapsed_time": "2:39:47", "remaining_time": "1:36:48"}
242
+ {"current_steps": 484, "total_steps": 774, "loss": 0.7208263874053955, "lr": 2.784555723054208e-05, "epoch": 1.875968992248062, "percentage": 62.53, "elapsed_time": "2:40:26", "remaining_time": "1:36:08"}
243
+ {"current_steps": 486, "total_steps": 774, "loss": 0.8420804738998413, "lr": 2.8023932485764764e-05, "epoch": 1.8837209302325582, "percentage": 62.79, "elapsed_time": "2:41:06", "remaining_time": "1:35:28"}
244
+ {"current_steps": 488, "total_steps": 774, "loss": 0.5533670783042908, "lr": 2.81952221184238e-05, "epoch": 1.8914728682170543, "percentage": 63.05, "elapsed_time": "2:41:39", "remaining_time": "1:34:44"}
245
+ {"current_steps": 490, "total_steps": 774, "loss": 0.688605785369873, "lr": 2.8359313477573215e-05, "epoch": 1.8992248062015504, "percentage": 63.31, "elapsed_time": "2:42:18", "remaining_time": "1:34:04"}
246
+ {"current_steps": 492, "total_steps": 774, "loss": 0.5789573192596436, "lr": 2.8516098646309108e-05, "epoch": 1.9069767441860463, "percentage": 63.57, "elapsed_time": "2:42:56", "remaining_time": "1:33:23"}
247
+ {"current_steps": 494, "total_steps": 774, "loss": 0.6448074579238892, "lr": 2.8665474512742607e-05, "epoch": 1.9147286821705425, "percentage": 63.82, "elapsed_time": "2:43:33", "remaining_time": "1:32:42"}
248
+ {"current_steps": 496, "total_steps": 774, "loss": 0.6479641199111938, "lr": 2.8807342837812783e-05, "epoch": 1.9224806201550386, "percentage": 64.08, "elapsed_time": "2:44:12", "remaining_time": "1:32:02"}
249
+ {"current_steps": 498, "total_steps": 774, "loss": 0.4521400034427643, "lr": 2.894161031989497e-05, "epoch": 1.9302325581395348, "percentage": 64.34, "elapsed_time": "2:44:44", "remaining_time": "1:31:18"}
250
+ {"current_steps": 500, "total_steps": 774, "loss": 0.9132779240608215, "lr": 2.906818865616178e-05, "epoch": 1.937984496124031, "percentage": 64.6, "elapsed_time": "2:45:29", "remaining_time": "1:30:41"}
251
+ {"current_steps": 502, "total_steps": 774, "loss": 0.6908618807792664, "lr": 2.9186994600656647e-05, "epoch": 1.945736434108527, "percentage": 64.86, "elapsed_time": "2:46:05", "remaining_time": "1:29:59"}
252
+ {"current_steps": 504, "total_steps": 774, "loss": 0.6676538586616516, "lr": 2.929795001904172e-05, "epoch": 1.9534883720930232, "percentage": 65.12, "elapsed_time": "2:46:45", "remaining_time": "1:29:19"}
253
+ {"current_steps": 506, "total_steps": 774, "loss": 1.0052788257598877, "lr": 2.9400981939983914e-05, "epoch": 1.9612403100775193, "percentage": 65.37, "elapsed_time": "2:47:30", "remaining_time": "1:28:43"}
254
+ {"current_steps": 508, "total_steps": 774, "loss": 0.7913935780525208, "lr": 2.9496022603145494e-05, "epoch": 1.9689922480620154, "percentage": 65.63, "elapsed_time": "2:48:11", "remaining_time": "1:28:04"}
255
+ {"current_steps": 510, "total_steps": 774, "loss": 0.9280475974082947, "lr": 2.9583009503747627e-05, "epoch": 1.9767441860465116, "percentage": 65.89, "elapsed_time": "2:48:56", "remaining_time": "1:27:27"}
256
+ {"current_steps": 512, "total_steps": 774, "loss": 0.7493736743927002, "lr": 2.9661885433677437e-05, "epoch": 1.9844961240310077, "percentage": 66.15, "elapsed_time": "2:49:34", "remaining_time": "1:26:46"}
257
+ {"current_steps": 514, "total_steps": 774, "loss": 1.0501880645751953, "lr": 2.9732598519111736e-05, "epoch": 1.9922480620155039, "percentage": 66.41, "elapsed_time": "2:50:14", "remaining_time": "1:26:06"}
258
+ {"current_steps": 516, "total_steps": 774, "loss": 1.011595368385315, "lr": 2.9795102254632528e-05, "epoch": 2.0, "percentage": 66.67, "elapsed_time": "2:50:57", "remaining_time": "1:25:28"}
259
+ {"current_steps": 518, "total_steps": 774, "loss": 0.5705936551094055, "lr": 2.9849355533811937e-05, "epoch": 2.007751937984496, "percentage": 66.93, "elapsed_time": "2:51:37", "remaining_time": "1:24:48"}
260
+ {"current_steps": 520, "total_steps": 774, "loss": 0.7379302978515625, "lr": 2.9895322676246387e-05, "epoch": 2.0155038759689923, "percentage": 67.18, "elapsed_time": "2:52:20", "remaining_time": "1:24:10"}
261
+ {"current_steps": 522, "total_steps": 774, "loss": 0.46209296584129333, "lr": 2.993297345102233e-05, "epoch": 2.0232558139534884, "percentage": 67.44, "elapsed_time": "2:52:56", "remaining_time": "1:23:29"}
262
+ {"current_steps": 524, "total_steps": 774, "loss": 0.773676335811615, "lr": 2.9962283096597995e-05, "epoch": 2.0310077519379846, "percentage": 67.7, "elapsed_time": "2:53:43", "remaining_time": "1:22:53"}
263
+ {"current_steps": 526, "total_steps": 774, "loss": 0.6592158675193787, "lr": 2.998323233708815e-05, "epoch": 2.0387596899224807, "percentage": 67.96, "elapsed_time": "2:54:24", "remaining_time": "1:22:13"}
264
+ {"current_steps": 528, "total_steps": 774, "loss": 0.7777129411697388, "lr": 2.999580739494117e-05, "epoch": 2.046511627906977, "percentage": 68.22, "elapsed_time": "2:55:08", "remaining_time": "1:21:35"}
265
+ {"current_steps": 530, "total_steps": 774, "loss": 0.385895311832428, "lr": 3e-05, "epoch": 2.054263565891473, "percentage": 68.48, "elapsed_time": "2:55:49", "remaining_time": "1:20:56"}
266
+ {"current_steps": 532, "total_steps": 774, "loss": 0.7748541235923767, "lr": 2.999580739494117e-05, "epoch": 2.062015503875969, "percentage": 68.73, "elapsed_time": "2:56:33", "remaining_time": "1:20:18"}
267
+ {"current_steps": 534, "total_steps": 774, "loss": 0.407875120639801, "lr": 2.998323233708815e-05, "epoch": 2.0697674418604652, "percentage": 68.99, "elapsed_time": "2:57:11", "remaining_time": "1:19:38"}
268
+ {"current_steps": 536, "total_steps": 774, "loss": 0.41405466198921204, "lr": 2.9962283096598e-05, "epoch": 2.0775193798449614, "percentage": 69.25, "elapsed_time": "2:57:49", "remaining_time": "1:18:57"}
269
+ {"current_steps": 538, "total_steps": 774, "loss": 0.701027512550354, "lr": 2.9932973451022333e-05, "epoch": 2.0852713178294575, "percentage": 69.51, "elapsed_time": "2:58:32", "remaining_time": "1:18:19"}
270
+ {"current_steps": 540, "total_steps": 774, "loss": 0.4735100567340851, "lr": 2.9895322676246387e-05, "epoch": 2.0930232558139537, "percentage": 69.77, "elapsed_time": "2:59:11", "remaining_time": "1:17:39"}
271
+ {"current_steps": 542, "total_steps": 774, "loss": 0.27081194519996643, "lr": 2.9849355533811937e-05, "epoch": 2.10077519379845, "percentage": 70.03, "elapsed_time": "2:59:45", "remaining_time": "1:16:56"}
272
+ {"current_steps": 544, "total_steps": 774, "loss": 0.6002092957496643, "lr": 2.9795102254632528e-05, "epoch": 2.108527131782946, "percentage": 70.28, "elapsed_time": "3:00:28", "remaining_time": "1:16:18"}
273
+ {"current_steps": 546, "total_steps": 774, "loss": 0.4636404514312744, "lr": 2.973259851911174e-05, "epoch": 2.116279069767442, "percentage": 70.54, "elapsed_time": "3:01:11", "remaining_time": "1:15:39"}
274
+ {"current_steps": 548, "total_steps": 774, "loss": 0.4923861026763916, "lr": 2.9661885433677434e-05, "epoch": 2.124031007751938, "percentage": 70.8, "elapsed_time": "3:01:53", "remaining_time": "1:15:00"}
275
+ {"current_steps": 550, "total_steps": 774, "loss": 0.3250856101512909, "lr": 2.9583009503747627e-05, "epoch": 2.1317829457364343, "percentage": 71.06, "elapsed_time": "3:02:28", "remaining_time": "1:14:18"}
276
+ {"current_steps": 552, "total_steps": 774, "loss": 0.7897784113883972, "lr": 2.9496022603145497e-05, "epoch": 2.13953488372093, "percentage": 71.32, "elapsed_time": "3:03:11", "remaining_time": "1:13:40"}
277
+ {"current_steps": 554, "total_steps": 774, "loss": 0.8441802859306335, "lr": 2.940098193998391e-05, "epoch": 2.147286821705426, "percentage": 71.58, "elapsed_time": "3:03:53", "remaining_time": "1:13:01"}
278
+ {"current_steps": 556, "total_steps": 774, "loss": 0.4028940498828888, "lr": 2.9297950019041724e-05, "epoch": 2.1550387596899223, "percentage": 71.83, "elapsed_time": "3:04:32", "remaining_time": "1:12:21"}
279
+ {"current_steps": 558, "total_steps": 774, "loss": 0.6657426953315735, "lr": 2.9186994600656647e-05, "epoch": 2.1627906976744184, "percentage": 72.09, "elapsed_time": "3:05:10", "remaining_time": "1:11:40"}
280
+ {"current_steps": 560, "total_steps": 774, "loss": 0.5439774990081787, "lr": 2.906818865616178e-05, "epoch": 2.1705426356589146, "percentage": 72.35, "elapsed_time": "3:05:49", "remaining_time": "1:11:00"}
281
+ {"current_steps": 562, "total_steps": 774, "loss": 0.7213448882102966, "lr": 2.8941610319894977e-05, "epoch": 2.1782945736434107, "percentage": 72.61, "elapsed_time": "3:06:29", "remaining_time": "1:10:21"}
282
+ {"current_steps": 564, "total_steps": 774, "loss": 0.38557326793670654, "lr": 2.8807342837812783e-05, "epoch": 2.186046511627907, "percentage": 72.87, "elapsed_time": "3:07:06", "remaining_time": "1:09:40"}
283
+ {"current_steps": 566, "total_steps": 774, "loss": 0.41664543747901917, "lr": 2.8665474512742603e-05, "epoch": 2.193798449612403, "percentage": 73.13, "elapsed_time": "3:07:46", "remaining_time": "1:09:00"}
284
+ {"current_steps": 568, "total_steps": 774, "loss": 0.4579377770423889, "lr": 2.851609864630911e-05, "epoch": 2.201550387596899, "percentage": 73.39, "elapsed_time": "3:08:24", "remaining_time": "1:08:19"}
285
+ {"current_steps": 570, "total_steps": 774, "loss": 0.3196179270744324, "lr": 2.8359313477573215e-05, "epoch": 2.2093023255813953, "percentage": 73.64, "elapsed_time": "3:08:59", "remaining_time": "1:07:38"}
286
+ {"current_steps": 572, "total_steps": 774, "loss": 0.5369107127189636, "lr": 2.8195222118423792e-05, "epoch": 2.2170542635658914, "percentage": 73.9, "elapsed_time": "3:09:41", "remaining_time": "1:06:59"}
287
+ {"current_steps": 574, "total_steps": 774, "loss": 0.23676389455795288, "lr": 2.8023932485764768e-05, "epoch": 2.2248062015503876, "percentage": 74.16, "elapsed_time": "3:10:12", "remaining_time": "1:06:16"}
288
+ {"current_steps": 576, "total_steps": 774, "loss": 0.44901129603385925, "lr": 2.7845557230542076e-05, "epoch": 2.2325581395348837, "percentage": 74.42, "elapsed_time": "3:10:46", "remaining_time": "1:05:34"}
289
+ {"current_steps": 578, "total_steps": 774, "loss": 0.5859266519546509, "lr": 2.766021366365729e-05, "epoch": 2.24031007751938, "percentage": 74.68, "elapsed_time": "3:11:27", "remaining_time": "1:04:55"}
290
+ {"current_steps": 580, "total_steps": 774, "loss": 0.6005488038063049, "lr": 2.746802367881645e-05, "epoch": 2.248062015503876, "percentage": 74.94, "elapsed_time": "3:12:08", "remaining_time": "1:04:16"}
291
+ {"current_steps": 582, "total_steps": 774, "loss": 0.32260704040527344, "lr": 2.726911367236509e-05, "epoch": 2.255813953488372, "percentage": 75.19, "elapsed_time": "3:12:45", "remaining_time": "1:03:35"}
292
+ {"current_steps": 584, "total_steps": 774, "loss": 0.8233704566955566, "lr": 2.706361446016193e-05, "epoch": 2.2635658914728682, "percentage": 75.45, "elapsed_time": "3:13:29", "remaining_time": "1:02:57"}
293
+ {"current_steps": 586, "total_steps": 774, "loss": 0.4317566156387329, "lr": 2.685166119154604e-05, "epoch": 2.2713178294573644, "percentage": 75.71, "elapsed_time": "3:14:05", "remaining_time": "1:02:16"}
294
+ {"current_steps": 588, "total_steps": 774, "loss": 0.8105683326721191, "lr": 2.6633393260454096e-05, "epoch": 2.2790697674418605, "percentage": 75.97, "elapsed_time": "3:14:46", "remaining_time": "1:01:36"}
295
+ {"current_steps": 590, "total_steps": 774, "loss": 0.4510256350040436, "lr": 2.6408954213746025e-05, "epoch": 2.2868217054263567, "percentage": 76.23, "elapsed_time": "3:15:24", "remaining_time": "1:00:56"}
296
+ {"current_steps": 592, "total_steps": 774, "loss": 0.7199202179908752, "lr": 2.6178491656799504e-05, "epoch": 2.294573643410853, "percentage": 76.49, "elapsed_time": "3:16:08", "remaining_time": "1:00:17"}
297
+ {"current_steps": 594, "total_steps": 774, "loss": 0.47333112359046936, "lr": 2.5942157156435248e-05, "epoch": 2.302325581395349, "percentage": 76.74, "elapsed_time": "3:16:48", "remaining_time": "0:59:38"}
298
+ {"current_steps": 596, "total_steps": 774, "loss": 0.4947061836719513, "lr": 2.570010614123707e-05, "epoch": 2.310077519379845, "percentage": 77.0, "elapsed_time": "3:17:27", "remaining_time": "0:58:58"}
299
+ {"current_steps": 598, "total_steps": 774, "loss": 0.6046218872070312, "lr": 2.5452497799332167e-05, "epoch": 2.317829457364341, "percentage": 77.26, "elapsed_time": "3:18:07", "remaining_time": "0:58:18"}
300
+ {"current_steps": 600, "total_steps": 774, "loss": 0.37087422609329224, "lr": 2.519949497369886e-05, "epoch": 2.3255813953488373, "percentage": 77.52, "elapsed_time": "3:18:42", "remaining_time": "0:57:37"}
301
+ {"current_steps": 602, "total_steps": 774, "loss": 0.579389214515686, "lr": 2.494126405507074e-05, "epoch": 2.3333333333333335, "percentage": 77.78, "elapsed_time": "3:19:17", "remaining_time": "0:56:56"}
302
+ {"current_steps": 604, "total_steps": 774, "loss": 0.7329738736152649, "lr": 2.467797487250756e-05, "epoch": 2.3410852713178296, "percentage": 78.04, "elapsed_time": "3:20:01", "remaining_time": "0:56:17"}
303
+ {"current_steps": 606, "total_steps": 774, "loss": 0.5676310658454895, "lr": 2.4409800581704784e-05, "epoch": 2.3488372093023258, "percentage": 78.29, "elapsed_time": "3:20:36", "remaining_time": "0:55:36"}
304
+ {"current_steps": 608, "total_steps": 774, "loss": 0.6383396983146667, "lr": 2.4136917551115484e-05, "epoch": 2.356589147286822, "percentage": 78.55, "elapsed_time": "3:21:18", "remaining_time": "0:54:57"}
305
+ {"current_steps": 610, "total_steps": 774, "loss": 0.6663593053817749, "lr": 2.3859505245959206e-05, "epoch": 2.3643410852713176, "percentage": 78.81, "elapsed_time": "3:21:57", "remaining_time": "0:54:17"}
306
+ {"current_steps": 612, "total_steps": 774, "loss": 0.32523995637893677, "lr": 2.3577746110194188e-05, "epoch": 2.3720930232558137, "percentage": 79.07, "elapsed_time": "3:22:32", "remaining_time": "0:53:36"}
307
+ {"current_steps": 614, "total_steps": 774, "loss": 0.5087898373603821, "lr": 2.329182544653074e-05, "epoch": 2.37984496124031, "percentage": 79.33, "elapsed_time": "3:23:11", "remaining_time": "0:52:56"}
308
+ {"current_steps": 616, "total_steps": 774, "loss": 0.5215730667114258, "lr": 2.3001931294564278e-05, "epoch": 2.387596899224806, "percentage": 79.59, "elapsed_time": "3:23:56", "remaining_time": "0:52:18"}
309
+ {"current_steps": 618, "total_steps": 774, "loss": 0.7069303393363953, "lr": 2.27082543071086e-05, "epoch": 2.395348837209302, "percentage": 79.84, "elapsed_time": "3:24:39", "remaining_time": "0:51:39"}
310
+ {"current_steps": 620, "total_steps": 774, "loss": 0.6097102165222168, "lr": 2.2410987624810527e-05, "epoch": 2.4031007751937983, "percentage": 80.1, "elapsed_time": "3:25:21", "remaining_time": "0:51:00"}
311
+ {"current_steps": 622, "total_steps": 774, "loss": 0.28449299931526184, "lr": 2.2110326749128246e-05, "epoch": 2.4108527131782944, "percentage": 80.36, "elapsed_time": "3:25:53", "remaining_time": "0:50:18"}
312
+ {"current_steps": 624, "total_steps": 774, "loss": 0.5394483208656311, "lr": 2.180646941375716e-05, "epoch": 2.4186046511627906, "percentage": 80.62, "elapsed_time": "3:26:32", "remaining_time": "0:49:38"}
313
+ {"current_steps": 626, "total_steps": 774, "loss": 0.351560115814209, "lr": 2.149961545458774e-05, "epoch": 2.4263565891472867, "percentage": 80.88, "elapsed_time": "3:27:07", "remaining_time": "0:48:58"}
314
+ {"current_steps": 628, "total_steps": 774, "loss": 0.6790451407432556, "lr": 2.1189966678280585e-05, "epoch": 2.434108527131783, "percentage": 81.14, "elapsed_time": "3:27:53", "remaining_time": "0:48:19"}
315
+ {"current_steps": 630, "total_steps": 774, "loss": 0.34560778737068176, "lr": 2.0877726729545665e-05, "epoch": 2.441860465116279, "percentage": 81.4, "elapsed_time": "3:28:30", "remaining_time": "0:47:39"}
316
+ {"current_steps": 632, "total_steps": 774, "loss": 0.35299909114837646, "lr": 2.0563100957212584e-05, "epoch": 2.449612403100775, "percentage": 81.65, "elapsed_time": "3:29:08", "remaining_time": "0:46:59"}
317
+ {"current_steps": 634, "total_steps": 774, "loss": 0.45075708627700806, "lr": 2.02462962791801e-05, "epoch": 2.4573643410852712, "percentage": 81.91, "elapsed_time": "3:29:47", "remaining_time": "0:46:19"}
318
+ {"current_steps": 636, "total_steps": 774, "loss": 0.4892677664756775, "lr": 1.9927521046333833e-05, "epoch": 2.4651162790697674, "percentage": 82.17, "elapsed_time": "3:30:29", "remaining_time": "0:45:40"}
319
+ {"current_steps": 638, "total_steps": 774, "loss": 0.6066938042640686, "lr": 1.9606984905521463e-05, "epoch": 2.4728682170542635, "percentage": 82.43, "elapsed_time": "3:31:08", "remaining_time": "0:45:00"}
320
+ {"current_steps": 640, "total_steps": 774, "loss": 0.3974202275276184, "lr": 1.928489866167559e-05, "epoch": 2.4806201550387597, "percentage": 82.69, "elapsed_time": "3:31:46", "remaining_time": "0:44:20"}
321
+ {"current_steps": 642, "total_steps": 774, "loss": 0.43941450119018555, "lr": 1.896147413917511e-05, "epoch": 2.488372093023256, "percentage": 82.95, "elapsed_time": "3:32:26", "remaining_time": "0:43:40"}
322
+ {"current_steps": 644, "total_steps": 774, "loss": 0.5508748888969421, "lr": 1.863692404253597e-05, "epoch": 2.496124031007752, "percentage": 83.2, "elapsed_time": "3:33:10", "remaining_time": "0:43:01"}
323
+ {"current_steps": 646, "total_steps": 774, "loss": 0.5954611897468567, "lr": 1.83114618165232e-05, "epoch": 2.503875968992248, "percentage": 83.46, "elapsed_time": "3:33:52", "remaining_time": "0:42:22"}
324
+ {"current_steps": 648, "total_steps": 774, "loss": 0.7873520851135254, "lr": 1.798530150577603e-05, "epoch": 2.511627906976744, "percentage": 83.72, "elapsed_time": "3:34:35", "remaining_time": "0:41:43"}
325
+ {"current_steps": 650, "total_steps": 774, "loss": 0.27345526218414307, "lr": 1.765865761403861e-05, "epoch": 2.5193798449612403, "percentage": 83.98, "elapsed_time": "3:35:08", "remaining_time": "0:41:02"}
326
+ {"current_steps": 652, "total_steps": 774, "loss": 0.5833812355995178, "lr": 1.7331744963088654e-05, "epoch": 2.5271317829457365, "percentage": 84.24, "elapsed_time": "3:35:51", "remaining_time": "0:40:23"}
327
+ {"current_steps": 654, "total_steps": 774, "loss": 0.3762988746166229, "lr": 1.7004778551456995e-05, "epoch": 2.5348837209302326, "percentage": 84.5, "elapsed_time": "3:36:30", "remaining_time": "0:39:43"}
328
+ {"current_steps": 656, "total_steps": 774, "loss": 0.5067244172096252, "lr": 1.667797341303094e-05, "epoch": 2.5426356589147288, "percentage": 84.75, "elapsed_time": "3:37:10", "remaining_time": "0:39:03"}
329
+ {"current_steps": 658, "total_steps": 774, "loss": 0.42985814809799194, "lr": 1.6351544475634277e-05, "epoch": 2.550387596899225, "percentage": 85.01, "elapsed_time": "3:37:51", "remaining_time": "0:38:24"}
330
+ {"current_steps": 660, "total_steps": 774, "loss": 0.8337141871452332, "lr": 1.6025706419677054e-05, "epoch": 2.558139534883721, "percentage": 85.27, "elapsed_time": "3:38:33", "remaining_time": "0:37:45"}
331
+ {"current_steps": 662, "total_steps": 774, "loss": 0.5003541707992554, "lr": 1.570067353696823e-05, "epoch": 2.565891472868217, "percentage": 85.53, "elapsed_time": "3:39:15", "remaining_time": "0:37:05"}
332
+ {"current_steps": 664, "total_steps": 774, "loss": 0.3022569715976715, "lr": 1.5376659589783585e-05, "epoch": 2.5736434108527133, "percentage": 85.79, "elapsed_time": "3:39:50", "remaining_time": "0:36:25"}
333
+ {"current_steps": 666, "total_steps": 774, "loss": 0.4718426465988159, "lr": 1.5053877670282193e-05, "epoch": 2.5813953488372094, "percentage": 86.05, "elapsed_time": "3:40:28", "remaining_time": "0:35:45"}
334
+ {"current_steps": 668, "total_steps": 774, "loss": 0.4901648163795471, "lr": 1.473254006036345e-05, "epoch": 2.5891472868217056, "percentage": 86.3, "elapsed_time": "3:41:11", "remaining_time": "0:35:05"}
335
+ {"current_steps": 670, "total_steps": 774, "loss": 0.6914687156677246, "lr": 1.4412858092056995e-05, "epoch": 2.5968992248062017, "percentage": 86.56, "elapsed_time": "3:41:52", "remaining_time": "0:34:26"}
336
+ {"current_steps": 672, "total_steps": 774, "loss": 0.4894769787788391, "lr": 1.4095042008537343e-05, "epoch": 2.604651162790698, "percentage": 86.82, "elapsed_time": "3:42:33", "remaining_time": "0:33:46"}
337
+ {"current_steps": 674, "total_steps": 774, "loss": 0.7514118552207947, "lr": 1.3779300825854615e-05, "epoch": 2.612403100775194, "percentage": 87.08, "elapsed_time": "3:43:17", "remaining_time": "0:33:07"}
338
+ {"current_steps": 676, "total_steps": 774, "loss": 0.7393191456794739, "lr": 1.3465842195472315e-05, "epoch": 2.62015503875969, "percentage": 87.34, "elapsed_time": "3:44:01", "remaining_time": "0:32:28"}
339
+ {"current_steps": 678, "total_steps": 774, "loss": 0.8212107419967651, "lr": 1.3154872267702535e-05, "epoch": 2.6279069767441863, "percentage": 87.6, "elapsed_time": "3:44:48", "remaining_time": "0:31:49"}
340
+ {"current_steps": 680, "total_steps": 774, "loss": 0.7656596302986145, "lr": 1.2846595556128338e-05, "epoch": 2.6356589147286824, "percentage": 87.86, "elapsed_time": "3:45:35", "remaining_time": "0:31:11"}
341
+ {"current_steps": 682, "total_steps": 774, "loss": 0.39778298139572144, "lr": 1.2541214803102764e-05, "epoch": 2.6434108527131785, "percentage": 88.11, "elapsed_time": "3:46:15", "remaining_time": "0:30:31"}
342
+ {"current_steps": 684, "total_steps": 774, "loss": 0.4492897689342499, "lr": 1.2238930846412478e-05, "epoch": 2.6511627906976747, "percentage": 88.37, "elapsed_time": "3:46:54", "remaining_time": "0:29:51"}
343
+ {"current_steps": 686, "total_steps": 774, "loss": 0.5477796792984009, "lr": 1.1939942487194114e-05, "epoch": 2.6589147286821704, "percentage": 88.63, "elapsed_time": "3:47:31", "remaining_time": "0:29:11"}
344
+ {"current_steps": 688, "total_steps": 774, "loss": 0.28585392236709595, "lr": 1.1644446359190002e-05, "epoch": 2.6666666666666665, "percentage": 88.89, "elapsed_time": "3:48:04", "remaining_time": "0:28:30"}
345
+ {"current_steps": 690, "total_steps": 774, "loss": 0.5053625106811523, "lr": 1.1352636799429364e-05, "epoch": 2.6744186046511627, "percentage": 89.15, "elapsed_time": "3:48:41", "remaining_time": "0:27:50"}
346
+ {"current_steps": 692, "total_steps": 774, "loss": 0.567241370677948, "lr": 1.1064705720419824e-05, "epoch": 2.682170542635659, "percentage": 89.41, "elapsed_time": "3:49:25", "remaining_time": "0:27:11"}
347
+ {"current_steps": 694, "total_steps": 774, "loss": 0.6684018969535828, "lr": 1.0780842483933762e-05, "epoch": 2.689922480620155, "percentage": 89.66, "elapsed_time": "3:50:07", "remaining_time": "0:26:31"}
348
+ {"current_steps": 696, "total_steps": 774, "loss": 0.33106908202171326, "lr": 1.0501233776471719e-05, "epoch": 2.697674418604651, "percentage": 89.92, "elapsed_time": "3:50:44", "remaining_time": "0:25:51"}
349
+ {"current_steps": 698, "total_steps": 774, "loss": 0.566682755947113, "lr": 1.0226063486485696e-05, "epoch": 2.705426356589147, "percentage": 90.18, "elapsed_time": "3:51:29", "remaining_time": "0:25:12"}
350
+ {"current_steps": 700, "total_steps": 774, "loss": 0.4341398775577545, "lr": 9.955512583442338e-06, "epoch": 2.7131782945736433, "percentage": 90.44, "elapsed_time": "3:52:05", "remaining_time": "0:24:32"}
351
+ {"current_steps": 702, "total_steps": 774, "loss": 0.4164765775203705, "lr": 9.689758998805937e-06, "epoch": 2.7209302325581395, "percentage": 90.7, "elapsed_time": "3:52:44", "remaining_time": "0:23:52"}
352
+ {"current_steps": 704, "total_steps": 774, "loss": 0.40749120712280273, "lr": 9.428977509019321e-06, "epoch": 2.7286821705426356, "percentage": 90.96, "elapsed_time": "3:53:25", "remaining_time": "0:23:12"}
353
+ {"current_steps": 706, "total_steps": 774, "loss": 0.28900110721588135, "lr": 9.173339620559945e-06, "epoch": 2.7364341085271318, "percentage": 91.21, "elapsed_time": "3:53:58", "remaining_time": "0:22:32"}
354
+ {"current_steps": 708, "total_steps": 774, "loss": 0.41211241483688354, "lr": 8.923013457146072e-06, "epoch": 2.744186046511628, "percentage": 91.47, "elapsed_time": "3:54:34", "remaining_time": "0:21:51"}
355
+ {"current_steps": 710, "total_steps": 774, "loss": 0.5537896156311035, "lr": 8.678163649168217e-06, "epoch": 2.751937984496124, "percentage": 91.73, "elapsed_time": "3:55:12", "remaining_time": "0:21:12"}
356
+ {"current_steps": 712, "total_steps": 774, "loss": 0.5788278579711914, "lr": 8.43895122541748e-06, "epoch": 2.75968992248062, "percentage": 91.99, "elapsed_time": "3:55:54", "remaining_time": "0:20:32"}
357
+ {"current_steps": 714, "total_steps": 774, "loss": 0.37125617265701294, "lr": 8.205533507182964e-06, "epoch": 2.7674418604651163, "percentage": 92.25, "elapsed_time": "3:56:31", "remaining_time": "0:19:52"}
358
+ {"current_steps": 716, "total_steps": 774, "loss": 0.3962320387363434, "lr": 7.978064004787233e-06, "epoch": 2.7751937984496124, "percentage": 92.51, "elapsed_time": "3:57:08", "remaining_time": "0:19:12"}
359
+ {"current_steps": 718, "total_steps": 774, "loss": 0.6869024634361267, "lr": 7.756692316628171e-06, "epoch": 2.7829457364341086, "percentage": 92.76, "elapsed_time": "3:57:51", "remaining_time": "0:18:33"}
360
+ {"current_steps": 720, "total_steps": 774, "loss": 0.5122371912002563, "lr": 7.541564030793529e-06, "epoch": 2.7906976744186047, "percentage": 93.02, "elapsed_time": "3:58:33", "remaining_time": "0:17:53"}
361
+ {"current_steps": 722, "total_steps": 774, "loss": 0.4030957818031311, "lr": 7.332820629313089e-06, "epoch": 2.798449612403101, "percentage": 93.28, "elapsed_time": "3:59:10", "remaining_time": "0:17:13"}
362
+ {"current_steps": 724, "total_steps": 774, "loss": 0.4579683840274811, "lr": 7.1305993951108914e-06, "epoch": 2.806201550387597, "percentage": 93.54, "elapsed_time": "3:59:49", "remaining_time": "0:16:33"}
363
+ {"current_steps": 726, "total_steps": 774, "loss": 0.4582154452800751, "lr": 6.935033321719423e-06, "epoch": 2.813953488372093, "percentage": 93.8, "elapsed_time": "4:00:25", "remaining_time": "0:15:53"}
364
+ {"current_steps": 728, "total_steps": 774, "loss": 0.46171411871910095, "lr": 6.74625102581455e-06, "epoch": 2.8217054263565893, "percentage": 94.06, "elapsed_time": "4:01:09", "remaining_time": "0:15:14"}
365
+ {"current_steps": 730, "total_steps": 774, "loss": 0.4829785227775574, "lr": 6.56437666262903e-06, "epoch": 2.8294573643410854, "percentage": 94.32, "elapsed_time": "4:01:49", "remaining_time": "0:14:34"}
366
+ {"current_steps": 732, "total_steps": 774, "loss": 0.4446869194507599, "lr": 6.389529844300143e-06, "epoch": 2.8372093023255816, "percentage": 94.57, "elapsed_time": "4:02:31", "remaining_time": "0:13:54"}
367
+ {"current_steps": 734, "total_steps": 774, "loss": 0.5170708298683167, "lr": 6.221825561205165e-06, "epoch": 2.8449612403100772, "percentage": 94.83, "elapsed_time": "4:03:10", "remaining_time": "0:13:15"}
368
+ {"current_steps": 736, "total_steps": 774, "loss": 0.6230844259262085, "lr": 6.061374106336333e-06, "epoch": 2.8527131782945734, "percentage": 95.09, "elapsed_time": "4:03:53", "remaining_time": "0:12:35"}
369
+ {"current_steps": 738, "total_steps": 774, "loss": 0.35932058095932007, "lr": 5.908281002765252e-06, "epoch": 2.8604651162790695, "percentage": 95.35, "elapsed_time": "4:04:30", "remaining_time": "0:11:55"}
370
+ {"current_steps": 740, "total_steps": 774, "loss": 0.3806362748146057, "lr": 5.762646934244159e-06, "epoch": 2.8682170542635657, "percentage": 95.61, "elapsed_time": "4:05:09", "remaining_time": "0:11:15"}
371
+ {"current_steps": 742, "total_steps": 774, "loss": 0.513190507888794, "lr": 5.624567678989899e-06, "epoch": 2.875968992248062, "percentage": 95.87, "elapsed_time": "4:05:49", "remaining_time": "0:10:36"}
372
+ {"current_steps": 744, "total_steps": 774, "loss": 0.6526894569396973, "lr": 5.494134046694099e-06, "epoch": 2.883720930232558, "percentage": 96.12, "elapsed_time": "4:06:33", "remaining_time": "0:09:56"}
373
+ {"current_steps": 746, "total_steps": 774, "loss": 0.4791458249092102, "lr": 5.371431818800933e-06, "epoch": 2.891472868217054, "percentage": 96.38, "elapsed_time": "4:07:09", "remaining_time": "0:09:16"}
374
+ {"current_steps": 748, "total_steps": 774, "loss": 0.5770004987716675, "lr": 5.256541692091802e-06, "epoch": 2.89922480620155, "percentage": 96.64, "elapsed_time": "4:07:49", "remaining_time": "0:08:36"}
375
+ {"current_steps": 750, "total_steps": 774, "loss": 0.3434167802333832, "lr": 5.149539225613978e-06, "epoch": 2.9069767441860463, "percentage": 96.9, "elapsed_time": "4:08:29", "remaining_time": "0:07:57"}
376
+ {"current_steps": 752, "total_steps": 774, "loss": 0.4575299322605133, "lr": 5.050494790988215e-06, "epoch": 2.9147286821705425, "percentage": 97.16, "elapsed_time": "4:09:10", "remaining_time": "0:07:17"}
377
+ {"current_steps": 754, "total_steps": 774, "loss": 0.3564453721046448, "lr": 4.959473526127871e-06, "epoch": 2.9224806201550386, "percentage": 97.42, "elapsed_time": "4:09:53", "remaining_time": "0:06:37"}
378
+ {"current_steps": 756, "total_steps": 774, "loss": 0.7428521513938904, "lr": 4.876535292400089e-06, "epoch": 2.9302325581395348, "percentage": 97.67, "elapsed_time": "4:10:39", "remaining_time": "0:05:58"}
379
+ {"current_steps": 758, "total_steps": 774, "loss": 0.4571719169616699, "lr": 4.801734635257146e-06, "epoch": 2.937984496124031, "percentage": 97.93, "elapsed_time": "4:11:22", "remaining_time": "0:05:18"}
380
+ {"current_steps": 760, "total_steps": 774, "loss": 0.5132399797439575, "lr": 4.73512074836392e-06, "epoch": 2.945736434108527, "percentage": 98.19, "elapsed_time": "4:12:03", "remaining_time": "0:04:38"}
381
+ {"current_steps": 762, "total_steps": 774, "loss": 0.814540445804596, "lr": 4.676737441244973e-06, "epoch": 2.953488372093023, "percentage": 98.45, "elapsed_time": "4:12:48", "remaining_time": "0:03:58"}
382
+ {"current_steps": 764, "total_steps": 774, "loss": 0.5996021628379822, "lr": 4.626623110472678e-06, "epoch": 2.9612403100775193, "percentage": 98.71, "elapsed_time": "4:13:32", "remaining_time": "0:03:19"}
383
+ {"current_steps": 766, "total_steps": 774, "loss": 0.2337801605463028, "lr": 4.584810714415136e-06, "epoch": 2.9689922480620154, "percentage": 98.97, "elapsed_time": "4:14:04", "remaining_time": "0:02:39"}
384
+ {"current_steps": 768, "total_steps": 774, "loss": 0.43569573760032654, "lr": 4.551327751560703e-06, "epoch": 2.9767441860465116, "percentage": 99.22, "elapsed_time": "4:14:43", "remaining_time": "0:01:59"}
385
+ {"current_steps": 770, "total_steps": 774, "loss": 0.42782190442085266, "lr": 4.526196242433211e-06, "epoch": 2.9844961240310077, "percentage": 99.48, "elapsed_time": "4:15:18", "remaining_time": "0:01:19"}
386
+ {"current_steps": 772, "total_steps": 774, "loss": 0.516304612159729, "lr": 4.509432715109889e-06, "epoch": 2.992248062015504, "percentage": 99.74, "elapsed_time": "4:16:00", "remaining_time": "0:00:39"}
387
+ {"current_steps": 774, "total_steps": 774, "loss": 0.15898612141609192, "lr": 4.50104819435143e-06, "epoch": 3.0, "percentage": 100.0, "elapsed_time": "4:16:33", "remaining_time": "0:00:00"}
388
+ {"current_steps": 774, "total_steps": 774, "epoch": 3.0, "percentage": 100.0, "elapsed_time": "4:18:03", "remaining_time": "0:00:00"}
trainer_state.json ADDED
@@ -0,0 +1,2752 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
+ "eval_steps": 500,
7
+ "global_step": 774,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.007751937984496124,
14
+ "grad_norm": 2.3754148483276367,
15
+ "learning_rate": 7.692307692307693e-07,
16
+ "loss": 4.068140983581543,
17
+ "step": 2
18
+ },
19
+ {
20
+ "epoch": 0.015503875968992248,
21
+ "grad_norm": 0.20276732742786407,
22
+ "learning_rate": 2.307692307692308e-06,
23
+ "loss": 2.0239908695220947,
24
+ "step": 4
25
+ },
26
+ {
27
+ "epoch": 0.023255813953488372,
28
+ "grad_norm": 0.35163113474845886,
29
+ "learning_rate": 3.846153846153846e-06,
30
+ "loss": 1.9337211847305298,
31
+ "step": 6
32
+ },
33
+ {
34
+ "epoch": 0.031007751937984496,
35
+ "grad_norm": 0.19401516020298004,
36
+ "learning_rate": 5.384615384615385e-06,
37
+ "loss": 1.9213242530822754,
38
+ "step": 8
39
+ },
40
+ {
41
+ "epoch": 0.03875968992248062,
42
+ "grad_norm": 0.11387048661708832,
43
+ "learning_rate": 6.923076923076923e-06,
44
+ "loss": 2.1911349296569824,
45
+ "step": 10
46
+ },
47
+ {
48
+ "epoch": 0.046511627906976744,
49
+ "grad_norm": 2.326526165008545,
50
+ "learning_rate": 8.461538461538462e-06,
51
+ "loss": 3.268449306488037,
52
+ "step": 12
53
+ },
54
+ {
55
+ "epoch": 0.05426356589147287,
56
+ "grad_norm": 0.15703660249710083,
57
+ "learning_rate": 9.999999999999999e-06,
58
+ "loss": 1.7003194093704224,
59
+ "step": 14
60
+ },
61
+ {
62
+ "epoch": 0.06201550387596899,
63
+ "grad_norm": 0.6381284594535828,
64
+ "learning_rate": 1.153846153846154e-05,
65
+ "loss": 1.8064090013504028,
66
+ "step": 16
67
+ },
68
+ {
69
+ "epoch": 0.06976744186046512,
70
+ "grad_norm": 0.5508278608322144,
71
+ "learning_rate": 1.3076923076923078e-05,
72
+ "loss": 1.3964051008224487,
73
+ "step": 18
74
+ },
75
+ {
76
+ "epoch": 0.07751937984496124,
77
+ "grad_norm": 0.3715132772922516,
78
+ "learning_rate": 1.4615384615384615e-05,
79
+ "loss": 1.59793221950531,
80
+ "step": 20
81
+ },
82
+ {
83
+ "epoch": 0.08527131782945736,
84
+ "grad_norm": 7.088710308074951,
85
+ "learning_rate": 1.6153846153846154e-05,
86
+ "loss": 1.7358227968215942,
87
+ "step": 22
88
+ },
89
+ {
90
+ "epoch": 0.09302325581395349,
91
+ "grad_norm": 0.14647279679775238,
92
+ "learning_rate": 1.7692307692307694e-05,
93
+ "loss": 1.3447600603103638,
94
+ "step": 24
95
+ },
96
+ {
97
+ "epoch": 0.10077519379844961,
98
+ "grad_norm": 0.22108672559261322,
99
+ "learning_rate": 1.923076923076923e-05,
100
+ "loss": 1.4682307243347168,
101
+ "step": 26
102
+ },
103
+ {
104
+ "epoch": 0.10852713178294573,
105
+ "grad_norm": 0.3697395622730255,
106
+ "learning_rate": 2.076923076923077e-05,
107
+ "loss": 1.2035093307495117,
108
+ "step": 28
109
+ },
110
+ {
111
+ "epoch": 0.11627906976744186,
112
+ "grad_norm": 0.24884682893753052,
113
+ "learning_rate": 2.230769230769231e-05,
114
+ "loss": 1.1427452564239502,
115
+ "step": 30
116
+ },
117
+ {
118
+ "epoch": 0.12403100775193798,
119
+ "grad_norm": 0.18956558406352997,
120
+ "learning_rate": 2.3846153846153846e-05,
121
+ "loss": 1.3711202144622803,
122
+ "step": 32
123
+ },
124
+ {
125
+ "epoch": 0.13178294573643412,
126
+ "grad_norm": 0.18877695500850677,
127
+ "learning_rate": 2.5384615384615386e-05,
128
+ "loss": 1.2189266681671143,
129
+ "step": 34
130
+ },
131
+ {
132
+ "epoch": 0.13953488372093023,
133
+ "grad_norm": 0.10388179123401642,
134
+ "learning_rate": 2.6923076923076923e-05,
135
+ "loss": 1.3252586126327515,
136
+ "step": 36
137
+ },
138
+ {
139
+ "epoch": 0.14728682170542637,
140
+ "grad_norm": 0.2508637309074402,
141
+ "learning_rate": 2.846153846153846e-05,
142
+ "loss": 1.0033904314041138,
143
+ "step": 38
144
+ },
145
+ {
146
+ "epoch": 0.15503875968992248,
147
+ "grad_norm": 0.09986624866724014,
148
+ "learning_rate": 3e-05,
149
+ "loss": 1.4535468816757202,
150
+ "step": 40
151
+ },
152
+ {
153
+ "epoch": 0.16279069767441862,
154
+ "grad_norm": 0.12885728478431702,
155
+ "learning_rate": 2.999580739494117e-05,
156
+ "loss": 1.0325186252593994,
157
+ "step": 42
158
+ },
159
+ {
160
+ "epoch": 0.17054263565891473,
161
+ "grad_norm": 0.10903146117925644,
162
+ "learning_rate": 2.998323233708815e-05,
163
+ "loss": 1.2467223405838013,
164
+ "step": 44
165
+ },
166
+ {
167
+ "epoch": 0.17829457364341086,
168
+ "grad_norm": 0.11212802678346634,
169
+ "learning_rate": 2.9962283096597995e-05,
170
+ "loss": 1.6686618328094482,
171
+ "step": 46
172
+ },
173
+ {
174
+ "epoch": 0.18604651162790697,
175
+ "grad_norm": 0.2613708972930908,
176
+ "learning_rate": 2.9932973451022333e-05,
177
+ "loss": 0.8405603170394897,
178
+ "step": 48
179
+ },
180
+ {
181
+ "epoch": 0.1937984496124031,
182
+ "grad_norm": 0.8741084337234497,
183
+ "learning_rate": 2.9895322676246387e-05,
184
+ "loss": 0.6372175812721252,
185
+ "step": 50
186
+ },
187
+ {
188
+ "epoch": 0.20155038759689922,
189
+ "grad_norm": 0.20064491033554077,
190
+ "learning_rate": 2.9849355533811937e-05,
191
+ "loss": 1.0768086910247803,
192
+ "step": 52
193
+ },
194
+ {
195
+ "epoch": 0.20930232558139536,
196
+ "grad_norm": 0.20809470117092133,
197
+ "learning_rate": 2.9795102254632528e-05,
198
+ "loss": 0.58198082447052,
199
+ "step": 54
200
+ },
201
+ {
202
+ "epoch": 0.21705426356589147,
203
+ "grad_norm": 0.10565000772476196,
204
+ "learning_rate": 2.9732598519111736e-05,
205
+ "loss": 1.3517603874206543,
206
+ "step": 56
207
+ },
208
+ {
209
+ "epoch": 0.2248062015503876,
210
+ "grad_norm": 0.10898104310035706,
211
+ "learning_rate": 2.9661885433677437e-05,
212
+ "loss": 1.340335488319397,
213
+ "step": 58
214
+ },
215
+ {
216
+ "epoch": 0.23255813953488372,
217
+ "grad_norm": 0.3277449309825897,
218
+ "learning_rate": 2.9583009503747627e-05,
219
+ "loss": 1.1451056003570557,
220
+ "step": 60
221
+ },
222
+ {
223
+ "epoch": 0.24031007751937986,
224
+ "grad_norm": 0.11206506192684174,
225
+ "learning_rate": 2.9496022603145497e-05,
226
+ "loss": 1.2255440950393677,
227
+ "step": 62
228
+ },
229
+ {
230
+ "epoch": 0.24806201550387597,
231
+ "grad_norm": 0.14122240245342255,
232
+ "learning_rate": 2.940098193998391e-05,
233
+ "loss": 1.2778782844543457,
234
+ "step": 64
235
+ },
236
+ {
237
+ "epoch": 0.2558139534883721,
238
+ "grad_norm": 0.17153455317020416,
239
+ "learning_rate": 2.9297950019041724e-05,
240
+ "loss": 1.178369402885437,
241
+ "step": 66
242
+ },
243
+ {
244
+ "epoch": 0.26356589147286824,
245
+ "grad_norm": 0.2940099239349365,
246
+ "learning_rate": 2.918699460065665e-05,
247
+ "loss": 1.1788100004196167,
248
+ "step": 68
249
+ },
250
+ {
251
+ "epoch": 0.2713178294573643,
252
+ "grad_norm": 0.07703827321529388,
253
+ "learning_rate": 2.906818865616178e-05,
254
+ "loss": 1.306922435760498,
255
+ "step": 70
256
+ },
257
+ {
258
+ "epoch": 0.27906976744186046,
259
+ "grad_norm": 0.24490903317928314,
260
+ "learning_rate": 2.8941610319894977e-05,
261
+ "loss": 1.0475130081176758,
262
+ "step": 72
263
+ },
264
+ {
265
+ "epoch": 0.2868217054263566,
266
+ "grad_norm": 0.13828147947788239,
267
+ "learning_rate": 2.8807342837812783e-05,
268
+ "loss": 1.1680102348327637,
269
+ "step": 74
270
+ },
271
+ {
272
+ "epoch": 0.29457364341085274,
273
+ "grad_norm": 0.13997578620910645,
274
+ "learning_rate": 2.8665474512742603e-05,
275
+ "loss": 1.0921390056610107,
276
+ "step": 76
277
+ },
278
+ {
279
+ "epoch": 0.3023255813953488,
280
+ "grad_norm": 0.08565083891153336,
281
+ "learning_rate": 2.8516098646309108e-05,
282
+ "loss": 1.1694703102111816,
283
+ "step": 78
284
+ },
285
+ {
286
+ "epoch": 0.31007751937984496,
287
+ "grad_norm": 0.054738063365221024,
288
+ "learning_rate": 2.8359313477573215e-05,
289
+ "loss": 1.1712660789489746,
290
+ "step": 80
291
+ },
292
+ {
293
+ "epoch": 0.3178294573643411,
294
+ "grad_norm": 0.06600243598222733,
295
+ "learning_rate": 2.8195222118423792e-05,
296
+ "loss": 1.32106351852417,
297
+ "step": 82
298
+ },
299
+ {
300
+ "epoch": 0.32558139534883723,
301
+ "grad_norm": 0.1856859177350998,
302
+ "learning_rate": 2.8023932485764768e-05,
303
+ "loss": 1.002191424369812,
304
+ "step": 84
305
+ },
306
+ {
307
+ "epoch": 0.3333333333333333,
308
+ "grad_norm": 0.1050916314125061,
309
+ "learning_rate": 2.7845557230542076e-05,
310
+ "loss": 1.1279501914978027,
311
+ "step": 86
312
+ },
313
+ {
314
+ "epoch": 0.34108527131782945,
315
+ "grad_norm": 0.09759809821844101,
316
+ "learning_rate": 2.7660213663657282e-05,
317
+ "loss": 1.3432408571243286,
318
+ "step": 88
319
+ },
320
+ {
321
+ "epoch": 0.3488372093023256,
322
+ "grad_norm": 0.09217233955860138,
323
+ "learning_rate": 2.7468023678816447e-05,
324
+ "loss": 0.8359699249267578,
325
+ "step": 90
326
+ },
327
+ {
328
+ "epoch": 0.35658914728682173,
329
+ "grad_norm": 0.11377432197332382,
330
+ "learning_rate": 2.726911367236509e-05,
331
+ "loss": 1.1406829357147217,
332
+ "step": 92
333
+ },
334
+ {
335
+ "epoch": 0.3643410852713178,
336
+ "grad_norm": 0.14340269565582275,
337
+ "learning_rate": 2.706361446016193e-05,
338
+ "loss": 1.142421841621399,
339
+ "step": 94
340
+ },
341
+ {
342
+ "epoch": 0.37209302325581395,
343
+ "grad_norm": 0.39369332790374756,
344
+ "learning_rate": 2.6851661191546038e-05,
345
+ "loss": 1.2204563617706299,
346
+ "step": 96
347
+ },
348
+ {
349
+ "epoch": 0.3798449612403101,
350
+ "grad_norm": 0.1904468685388565,
351
+ "learning_rate": 2.6633393260454096e-05,
352
+ "loss": 0.7862477898597717,
353
+ "step": 98
354
+ },
355
+ {
356
+ "epoch": 0.3875968992248062,
357
+ "grad_norm": 0.13884544372558594,
358
+ "learning_rate": 2.6408954213746028e-05,
359
+ "loss": 0.7346755862236023,
360
+ "step": 100
361
+ },
362
+ {
363
+ "epoch": 0.3953488372093023,
364
+ "grad_norm": 0.07463762909173965,
365
+ "learning_rate": 2.61784916567995e-05,
366
+ "loss": 1.3203626871109009,
367
+ "step": 102
368
+ },
369
+ {
370
+ "epoch": 0.40310077519379844,
371
+ "grad_norm": 0.1436045616865158,
372
+ "learning_rate": 2.5942157156435248e-05,
373
+ "loss": 1.2376055717468262,
374
+ "step": 104
375
+ },
376
+ {
377
+ "epoch": 0.4108527131782946,
378
+ "grad_norm": 0.14889220893383026,
379
+ "learning_rate": 2.570010614123707e-05,
380
+ "loss": 1.0368235111236572,
381
+ "step": 106
382
+ },
383
+ {
384
+ "epoch": 0.4186046511627907,
385
+ "grad_norm": 0.073515884578228,
386
+ "learning_rate": 2.545249779933216e-05,
387
+ "loss": 1.105363130569458,
388
+ "step": 108
389
+ },
390
+ {
391
+ "epoch": 0.4263565891472868,
392
+ "grad_norm": 0.0721718966960907,
393
+ "learning_rate": 2.5199494973698856e-05,
394
+ "loss": 1.0211938619613647,
395
+ "step": 110
396
+ },
397
+ {
398
+ "epoch": 0.43410852713178294,
399
+ "grad_norm": 0.08736886829137802,
400
+ "learning_rate": 2.494126405507074e-05,
401
+ "loss": 0.9343675971031189,
402
+ "step": 112
403
+ },
404
+ {
405
+ "epoch": 0.4418604651162791,
406
+ "grad_norm": 0.6035546064376831,
407
+ "learning_rate": 2.4677974872507553e-05,
408
+ "loss": 1.0941760540008545,
409
+ "step": 114
410
+ },
411
+ {
412
+ "epoch": 0.4496124031007752,
413
+ "grad_norm": 0.10358745604753494,
414
+ "learning_rate": 2.440980058170478e-05,
415
+ "loss": 1.0486119985580444,
416
+ "step": 116
417
+ },
418
+ {
419
+ "epoch": 0.4573643410852713,
420
+ "grad_norm": 0.41432875394821167,
421
+ "learning_rate": 2.4136917551115484e-05,
422
+ "loss": 0.9473840594291687,
423
+ "step": 118
424
+ },
425
+ {
426
+ "epoch": 0.46511627906976744,
427
+ "grad_norm": 0.06210324168205261,
428
+ "learning_rate": 2.38595052459592e-05,
429
+ "loss": 1.2813960313796997,
430
+ "step": 120
431
+ },
432
+ {
433
+ "epoch": 0.4728682170542636,
434
+ "grad_norm": 0.21510902047157288,
435
+ "learning_rate": 2.357774611019419e-05,
436
+ "loss": 1.0586227178573608,
437
+ "step": 122
438
+ },
439
+ {
440
+ "epoch": 0.4806201550387597,
441
+ "grad_norm": 0.10468325763940811,
442
+ "learning_rate": 2.3291825446530736e-05,
443
+ "loss": 1.2756110429763794,
444
+ "step": 124
445
+ },
446
+ {
447
+ "epoch": 0.4883720930232558,
448
+ "grad_norm": 1.1782441139221191,
449
+ "learning_rate": 2.3001931294564265e-05,
450
+ "loss": 1.168853759765625,
451
+ "step": 126
452
+ },
453
+ {
454
+ "epoch": 0.49612403100775193,
455
+ "grad_norm": 0.11016172915697098,
456
+ "learning_rate": 2.27082543071086e-05,
457
+ "loss": 1.181935429573059,
458
+ "step": 128
459
+ },
460
+ {
461
+ "epoch": 0.5038759689922481,
462
+ "grad_norm": 0.6895468831062317,
463
+ "learning_rate": 2.2410987624810524e-05,
464
+ "loss": 1.1901732683181763,
465
+ "step": 130
466
+ },
467
+ {
468
+ "epoch": 0.5116279069767442,
469
+ "grad_norm": 0.18288742005825043,
470
+ "learning_rate": 2.2110326749128233e-05,
471
+ "loss": 0.7289036512374878,
472
+ "step": 132
473
+ },
474
+ {
475
+ "epoch": 0.5193798449612403,
476
+ "grad_norm": 0.11772674322128296,
477
+ "learning_rate": 2.1806469413757164e-05,
478
+ "loss": 1.161149024963379,
479
+ "step": 134
480
+ },
481
+ {
482
+ "epoch": 0.5271317829457365,
483
+ "grad_norm": 0.2412514090538025,
484
+ "learning_rate": 2.149961545458773e-05,
485
+ "loss": 1.1283718347549438,
486
+ "step": 136
487
+ },
488
+ {
489
+ "epoch": 0.5348837209302325,
490
+ "grad_norm": 0.08974076807498932,
491
+ "learning_rate": 2.118996667828058e-05,
492
+ "loss": 1.362121343612671,
493
+ "step": 138
494
+ },
495
+ {
496
+ "epoch": 0.5426356589147286,
497
+ "grad_norm": 0.12378139048814774,
498
+ "learning_rate": 2.0877726729545665e-05,
499
+ "loss": 1.2608673572540283,
500
+ "step": 140
501
+ },
502
+ {
503
+ "epoch": 0.5503875968992248,
504
+ "grad_norm": 0.41343384981155396,
505
+ "learning_rate": 2.0563100957212577e-05,
506
+ "loss": 0.5950201153755188,
507
+ "step": 142
508
+ },
509
+ {
510
+ "epoch": 0.5581395348837209,
511
+ "grad_norm": 0.09795749187469482,
512
+ "learning_rate": 2.0246296279180093e-05,
513
+ "loss": 1.3639545440673828,
514
+ "step": 144
515
+ },
516
+ {
517
+ "epoch": 0.5658914728682171,
518
+ "grad_norm": 0.09332104027271271,
519
+ "learning_rate": 1.9927521046333833e-05,
520
+ "loss": 1.0145015716552734,
521
+ "step": 146
522
+ },
523
+ {
524
+ "epoch": 0.5736434108527132,
525
+ "grad_norm": 0.15143655240535736,
526
+ "learning_rate": 1.960698490552145e-05,
527
+ "loss": 0.9937471151351929,
528
+ "step": 148
529
+ },
530
+ {
531
+ "epoch": 0.5813953488372093,
532
+ "grad_norm": 0.13751712441444397,
533
+ "learning_rate": 1.9284898661675586e-05,
534
+ "loss": 1.0032529830932617,
535
+ "step": 150
536
+ },
537
+ {
538
+ "epoch": 0.5891472868217055,
539
+ "grad_norm": 0.0935468077659607,
540
+ "learning_rate": 1.8961474139175106e-05,
541
+ "loss": 1.2299753427505493,
542
+ "step": 152
543
+ },
544
+ {
545
+ "epoch": 0.5968992248062015,
546
+ "grad_norm": 0.06415323913097382,
547
+ "learning_rate": 1.863692404253597e-05,
548
+ "loss": 1.2138370275497437,
549
+ "step": 154
550
+ },
551
+ {
552
+ "epoch": 0.6046511627906976,
553
+ "grad_norm": 0.17963920533657074,
554
+ "learning_rate": 1.8311461816523192e-05,
555
+ "loss": 0.7944934964179993,
556
+ "step": 156
557
+ },
558
+ {
559
+ "epoch": 0.6124031007751938,
560
+ "grad_norm": 0.13642992079257965,
561
+ "learning_rate": 1.7985301505776026e-05,
562
+ "loss": 0.8701238036155701,
563
+ "step": 158
564
+ },
565
+ {
566
+ "epoch": 0.6201550387596899,
567
+ "grad_norm": 0.09022583067417145,
568
+ "learning_rate": 1.765865761403861e-05,
569
+ "loss": 1.279708981513977,
570
+ "step": 160
571
+ },
572
+ {
573
+ "epoch": 0.627906976744186,
574
+ "grad_norm": 0.310406357049942,
575
+ "learning_rate": 1.733174496308864e-05,
576
+ "loss": 1.020676612854004,
577
+ "step": 162
578
+ },
579
+ {
580
+ "epoch": 0.6356589147286822,
581
+ "grad_norm": 0.0761994794011116,
582
+ "learning_rate": 1.700477855145699e-05,
583
+ "loss": 1.2313765287399292,
584
+ "step": 164
585
+ },
586
+ {
587
+ "epoch": 0.6434108527131783,
588
+ "grad_norm": 0.09753235429525375,
589
+ "learning_rate": 1.6677973413030936e-05,
590
+ "loss": 0.9673617482185364,
591
+ "step": 166
592
+ },
593
+ {
594
+ "epoch": 0.6511627906976745,
595
+ "grad_norm": 0.16590853035449982,
596
+ "learning_rate": 1.6351544475634266e-05,
597
+ "loss": 1.194890022277832,
598
+ "step": 168
599
+ },
600
+ {
601
+ "epoch": 0.6589147286821705,
602
+ "grad_norm": 0.14948436617851257,
603
+ "learning_rate": 1.6025706419677057e-05,
604
+ "loss": 0.5818596482276917,
605
+ "step": 170
606
+ },
607
+ {
608
+ "epoch": 0.6666666666666666,
609
+ "grad_norm": 0.1858793944120407,
610
+ "learning_rate": 1.5700673536968222e-05,
611
+ "loss": 1.1378095149993896,
612
+ "step": 172
613
+ },
614
+ {
615
+ "epoch": 0.6744186046511628,
616
+ "grad_norm": 0.2438780963420868,
617
+ "learning_rate": 1.5376659589783572e-05,
618
+ "loss": 0.864031970500946,
619
+ "step": 174
620
+ },
621
+ {
622
+ "epoch": 0.6821705426356589,
623
+ "grad_norm": 0.13808684051036835,
624
+ "learning_rate": 1.5053877670282186e-05,
625
+ "loss": 0.9113052487373352,
626
+ "step": 176
627
+ },
628
+ {
629
+ "epoch": 0.689922480620155,
630
+ "grad_norm": 0.211846262216568,
631
+ "learning_rate": 1.4732540060363447e-05,
632
+ "loss": 0.9309589862823486,
633
+ "step": 178
634
+ },
635
+ {
636
+ "epoch": 0.6976744186046512,
637
+ "grad_norm": 0.13679386675357819,
638
+ "learning_rate": 1.4412858092056991e-05,
639
+ "loss": 1.002301573753357,
640
+ "step": 180
641
+ },
642
+ {
643
+ "epoch": 0.7054263565891473,
644
+ "grad_norm": 0.09943073987960815,
645
+ "learning_rate": 1.4095042008537343e-05,
646
+ "loss": 1.0712729692459106,
647
+ "step": 182
648
+ },
649
+ {
650
+ "epoch": 0.7131782945736435,
651
+ "grad_norm": 0.18878647685050964,
652
+ "learning_rate": 1.3779300825854622e-05,
653
+ "loss": 0.9123468995094299,
654
+ "step": 184
655
+ },
656
+ {
657
+ "epoch": 0.7209302325581395,
658
+ "grad_norm": 0.08719319850206375,
659
+ "learning_rate": 1.3465842195472321e-05,
660
+ "loss": 1.2733235359191895,
661
+ "step": 186
662
+ },
663
+ {
664
+ "epoch": 0.7286821705426356,
665
+ "grad_norm": 0.06448693573474884,
666
+ "learning_rate": 1.3154872267702522e-05,
667
+ "loss": 0.9789453148841858,
668
+ "step": 188
669
+ },
670
+ {
671
+ "epoch": 0.7364341085271318,
672
+ "grad_norm": 0.12241167575120926,
673
+ "learning_rate": 1.2846595556128331e-05,
674
+ "loss": 1.0140795707702637,
675
+ "step": 190
676
+ },
677
+ {
678
+ "epoch": 0.7441860465116279,
679
+ "grad_norm": 0.14142774045467377,
680
+ "learning_rate": 1.254121480310276e-05,
681
+ "loss": 1.1332778930664062,
682
+ "step": 192
683
+ },
684
+ {
685
+ "epoch": 0.751937984496124,
686
+ "grad_norm": 0.07337574660778046,
687
+ "learning_rate": 1.2238930846412475e-05,
688
+ "loss": 1.201830506324768,
689
+ "step": 194
690
+ },
691
+ {
692
+ "epoch": 0.7596899224806202,
693
+ "grad_norm": 0.07899358868598938,
694
+ "learning_rate": 1.1939942487194116e-05,
695
+ "loss": 1.2011100053787231,
696
+ "step": 196
697
+ },
698
+ {
699
+ "epoch": 0.7674418604651163,
700
+ "grad_norm": 0.10521137714385986,
701
+ "learning_rate": 1.1644446359190004e-05,
702
+ "loss": 0.5936653017997742,
703
+ "step": 198
704
+ },
705
+ {
706
+ "epoch": 0.7751937984496124,
707
+ "grad_norm": 0.16837139427661896,
708
+ "learning_rate": 1.1352636799429354e-05,
709
+ "loss": 1.3216241598129272,
710
+ "step": 200
711
+ },
712
+ {
713
+ "epoch": 0.7829457364341085,
714
+ "grad_norm": 0.11404802650213242,
715
+ "learning_rate": 1.1064705720419829e-05,
716
+ "loss": 1.084835171699524,
717
+ "step": 202
718
+ },
719
+ {
720
+ "epoch": 0.7906976744186046,
721
+ "grad_norm": 0.24780981242656708,
722
+ "learning_rate": 1.0780842483933755e-05,
723
+ "loss": 1.2125266790390015,
724
+ "step": 204
725
+ },
726
+ {
727
+ "epoch": 0.7984496124031008,
728
+ "grad_norm": 0.12619031965732574,
729
+ "learning_rate": 1.050123377647171e-05,
730
+ "loss": 1.0225963592529297,
731
+ "step": 206
732
+ },
733
+ {
734
+ "epoch": 0.8062015503875969,
735
+ "grad_norm": 1.412670612335205,
736
+ "learning_rate": 1.0226063486485695e-05,
737
+ "loss": 0.7963980436325073,
738
+ "step": 208
739
+ },
740
+ {
741
+ "epoch": 0.813953488372093,
742
+ "grad_norm": 0.18459799885749817,
743
+ "learning_rate": 9.955512583442334e-06,
744
+ "loss": 1.2788116931915283,
745
+ "step": 210
746
+ },
747
+ {
748
+ "epoch": 0.8217054263565892,
749
+ "grad_norm": 0.058253731578588486,
750
+ "learning_rate": 9.68975899880592e-06,
751
+ "loss": 1.1842073202133179,
752
+ "step": 212
753
+ },
754
+ {
755
+ "epoch": 0.8294573643410853,
756
+ "grad_norm": 0.09324084967374802,
757
+ "learning_rate": 9.42897750901933e-06,
758
+ "loss": 0.9420091509819031,
759
+ "step": 214
760
+ },
761
+ {
762
+ "epoch": 0.8372093023255814,
763
+ "grad_norm": 0.14589789509773254,
764
+ "learning_rate": 9.173339620559935e-06,
765
+ "loss": 1.0436409711837769,
766
+ "step": 216
767
+ },
768
+ {
769
+ "epoch": 0.8449612403100775,
770
+ "grad_norm": 0.08236993849277496,
771
+ "learning_rate": 8.923013457146082e-06,
772
+ "loss": 1.2834446430206299,
773
+ "step": 218
774
+ },
775
+ {
776
+ "epoch": 0.8527131782945736,
777
+ "grad_norm": 0.07797209173440933,
778
+ "learning_rate": 8.678163649168214e-06,
779
+ "loss": 1.1693506240844727,
780
+ "step": 220
781
+ },
782
+ {
783
+ "epoch": 0.8604651162790697,
784
+ "grad_norm": 0.21979407966136932,
785
+ "learning_rate": 8.438951225417476e-06,
786
+ "loss": 0.49415066838264465,
787
+ "step": 222
788
+ },
789
+ {
790
+ "epoch": 0.8682170542635659,
791
+ "grad_norm": 0.16792796552181244,
792
+ "learning_rate": 8.205533507182963e-06,
793
+ "loss": 1.1654852628707886,
794
+ "step": 224
795
+ },
796
+ {
797
+ "epoch": 0.875968992248062,
798
+ "grad_norm": 0.1074092760682106,
799
+ "learning_rate": 7.978064004787238e-06,
800
+ "loss": 1.2648242712020874,
801
+ "step": 226
802
+ },
803
+ {
804
+ "epoch": 0.8837209302325582,
805
+ "grad_norm": 0.12686721980571747,
806
+ "learning_rate": 7.756692316628162e-06,
807
+ "loss": 0.8766679167747498,
808
+ "step": 228
809
+ },
810
+ {
811
+ "epoch": 0.8914728682170543,
812
+ "grad_norm": 0.10413216799497604,
813
+ "learning_rate": 7.541564030793536e-06,
814
+ "loss": 0.9922328591346741,
815
+ "step": 230
816
+ },
817
+ {
818
+ "epoch": 0.8992248062015504,
819
+ "grad_norm": 0.07999309152364731,
820
+ "learning_rate": 7.33282062931308e-06,
821
+ "loss": 0.837881863117218,
822
+ "step": 232
823
+ },
824
+ {
825
+ "epoch": 0.9069767441860465,
826
+ "grad_norm": 0.16637900471687317,
827
+ "learning_rate": 7.13059939511089e-06,
828
+ "loss": 1.272527813911438,
829
+ "step": 234
830
+ },
831
+ {
832
+ "epoch": 0.9147286821705426,
833
+ "grad_norm": 0.13920988142490387,
834
+ "learning_rate": 6.935033321719421e-06,
835
+ "loss": 0.6637862920761108,
836
+ "step": 236
837
+ },
838
+ {
839
+ "epoch": 0.9224806201550387,
840
+ "grad_norm": 0.07921171188354492,
841
+ "learning_rate": 6.746251025814548e-06,
842
+ "loss": 1.2028839588165283,
843
+ "step": 238
844
+ },
845
+ {
846
+ "epoch": 0.9302325581395349,
847
+ "grad_norm": 0.11715266853570938,
848
+ "learning_rate": 6.564376662629032e-06,
849
+ "loss": 1.0310890674591064,
850
+ "step": 240
851
+ },
852
+ {
853
+ "epoch": 0.937984496124031,
854
+ "grad_norm": 0.1706083118915558,
855
+ "learning_rate": 6.389529844300147e-06,
856
+ "loss": 1.129476547241211,
857
+ "step": 242
858
+ },
859
+ {
860
+ "epoch": 0.9457364341085271,
861
+ "grad_norm": 0.09015638381242752,
862
+ "learning_rate": 6.2218255612051575e-06,
863
+ "loss": 0.9788402915000916,
864
+ "step": 244
865
+ },
866
+ {
867
+ "epoch": 0.9534883720930233,
868
+ "grad_norm": 0.09626635164022446,
869
+ "learning_rate": 6.061374106336329e-06,
870
+ "loss": 0.7472362518310547,
871
+ "step": 246
872
+ },
873
+ {
874
+ "epoch": 0.9612403100775194,
875
+ "grad_norm": 0.17239803075790405,
876
+ "learning_rate": 5.9082810027652495e-06,
877
+ "loss": 0.7408154606819153,
878
+ "step": 248
879
+ },
880
+ {
881
+ "epoch": 0.9689922480620154,
882
+ "grad_norm": 0.07973187416791916,
883
+ "learning_rate": 5.762646934244157e-06,
884
+ "loss": 1.1912089586257935,
885
+ "step": 250
886
+ },
887
+ {
888
+ "epoch": 0.9767441860465116,
889
+ "grad_norm": 0.08109164237976074,
890
+ "learning_rate": 5.6245676789899e-06,
891
+ "loss": 0.970727264881134,
892
+ "step": 252
893
+ },
894
+ {
895
+ "epoch": 0.9844961240310077,
896
+ "grad_norm": 0.2656784951686859,
897
+ "learning_rate": 5.494134046694101e-06,
898
+ "loss": 0.9474197626113892,
899
+ "step": 254
900
+ },
901
+ {
902
+ "epoch": 0.9922480620155039,
903
+ "grad_norm": 0.09388367086648941,
904
+ "learning_rate": 5.371431818800934e-06,
905
+ "loss": 0.7675265073776245,
906
+ "step": 256
907
+ },
908
+ {
909
+ "epoch": 1.0,
910
+ "grad_norm": 0.07208788394927979,
911
+ "learning_rate": 5.256541692091799e-06,
912
+ "loss": 1.151860237121582,
913
+ "step": 258
914
+ },
915
+ {
916
+ "epoch": 1.0077519379844961,
917
+ "grad_norm": 0.07177931815385818,
918
+ "learning_rate": 5.149539225613974e-06,
919
+ "loss": 0.6956380605697632,
920
+ "step": 260
921
+ },
922
+ {
923
+ "epoch": 1.0155038759689923,
924
+ "grad_norm": 0.06252402067184448,
925
+ "learning_rate": 5.050494790988212e-06,
926
+ "loss": 0.9135383367538452,
927
+ "step": 262
928
+ },
929
+ {
930
+ "epoch": 1.0232558139534884,
931
+ "grad_norm": 0.17128507792949677,
932
+ "learning_rate": 4.95947352612787e-06,
933
+ "loss": 0.721315324306488,
934
+ "step": 264
935
+ },
936
+ {
937
+ "epoch": 1.0310077519379846,
938
+ "grad_norm": 0.08467314392328262,
939
+ "learning_rate": 4.876535292400089e-06,
940
+ "loss": 0.4410458207130432,
941
+ "step": 266
942
+ },
943
+ {
944
+ "epoch": 1.0387596899224807,
945
+ "grad_norm": 0.10766004770994186,
946
+ "learning_rate": 4.801734635257148e-06,
947
+ "loss": 0.8536827564239502,
948
+ "step": 268
949
+ },
950
+ {
951
+ "epoch": 1.0465116279069768,
952
+ "grad_norm": 0.15451736748218536,
953
+ "learning_rate": 4.735120748363916e-06,
954
+ "loss": 0.903506875038147,
955
+ "step": 270
956
+ },
957
+ {
958
+ "epoch": 1.054263565891473,
959
+ "grad_norm": 0.06612464040517807,
960
+ "learning_rate": 4.676737441244975e-06,
961
+ "loss": 0.48186248540878296,
962
+ "step": 272
963
+ },
964
+ {
965
+ "epoch": 1.062015503875969,
966
+ "grad_norm": 0.10002221167087555,
967
+ "learning_rate": 4.626623110472677e-06,
968
+ "loss": 0.8960871696472168,
969
+ "step": 274
970
+ },
971
+ {
972
+ "epoch": 1.069767441860465,
973
+ "grad_norm": 0.07858562469482422,
974
+ "learning_rate": 4.584810714415135e-06,
975
+ "loss": 0.8507243990898132,
976
+ "step": 276
977
+ },
978
+ {
979
+ "epoch": 1.0775193798449612,
980
+ "grad_norm": 0.06665261089801788,
981
+ "learning_rate": 4.5513277515607014e-06,
982
+ "loss": 0.9197998642921448,
983
+ "step": 278
984
+ },
985
+ {
986
+ "epoch": 1.0852713178294573,
987
+ "grad_norm": 0.16345328092575073,
988
+ "learning_rate": 4.526196242433211e-06,
989
+ "loss": 0.778313398361206,
990
+ "step": 280
991
+ },
992
+ {
993
+ "epoch": 1.0930232558139534,
994
+ "grad_norm": 0.10489022731781006,
995
+ "learning_rate": 4.509432715109887e-06,
996
+ "loss": 0.5479567050933838,
997
+ "step": 282
998
+ },
999
+ {
1000
+ "epoch": 1.1007751937984496,
1001
+ "grad_norm": 0.05080074071884155,
1002
+ "learning_rate": 4.50104819435143e-06,
1003
+ "loss": 0.6334800720214844,
1004
+ "step": 284
1005
+ },
1006
+ {
1007
+ "epoch": 1.1085271317829457,
1008
+ "grad_norm": 0.12185381352901459,
1009
+ "learning_rate": 4.50104819435143e-06,
1010
+ "loss": 0.8215212225914001,
1011
+ "step": 286
1012
+ },
1013
+ {
1014
+ "epoch": 1.1162790697674418,
1015
+ "grad_norm": 0.08784171938896179,
1016
+ "learning_rate": 4.509432715109887e-06,
1017
+ "loss": 0.5245926976203918,
1018
+ "step": 288
1019
+ },
1020
+ {
1021
+ "epoch": 1.124031007751938,
1022
+ "grad_norm": 0.09065528959035873,
1023
+ "learning_rate": 4.526196242433211e-06,
1024
+ "loss": 1.0330955982208252,
1025
+ "step": 290
1026
+ },
1027
+ {
1028
+ "epoch": 1.1317829457364341,
1029
+ "grad_norm": 0.04104357957839966,
1030
+ "learning_rate": 4.5513277515607014e-06,
1031
+ "loss": 0.5526050329208374,
1032
+ "step": 292
1033
+ },
1034
+ {
1035
+ "epoch": 1.1395348837209303,
1036
+ "grad_norm": 0.31155890226364136,
1037
+ "learning_rate": 4.584810714415136e-06,
1038
+ "loss": 1.046125888824463,
1039
+ "step": 294
1040
+ },
1041
+ {
1042
+ "epoch": 1.1472868217054264,
1043
+ "grad_norm": 0.15053009986877441,
1044
+ "learning_rate": 4.626623110472676e-06,
1045
+ "loss": 0.3840217590332031,
1046
+ "step": 296
1047
+ },
1048
+ {
1049
+ "epoch": 1.1550387596899225,
1050
+ "grad_norm": 0.08694499731063843,
1051
+ "learning_rate": 4.676737441244973e-06,
1052
+ "loss": 0.6799867153167725,
1053
+ "step": 298
1054
+ },
1055
+ {
1056
+ "epoch": 1.1627906976744187,
1057
+ "grad_norm": 0.07247356325387955,
1058
+ "learning_rate": 4.735120748363917e-06,
1059
+ "loss": 0.6748986840248108,
1060
+ "step": 300
1061
+ },
1062
+ {
1063
+ "epoch": 1.1705426356589148,
1064
+ "grad_norm": 0.06139397993683815,
1065
+ "learning_rate": 4.801734635257148e-06,
1066
+ "loss": 0.8421810865402222,
1067
+ "step": 302
1068
+ },
1069
+ {
1070
+ "epoch": 1.178294573643411,
1071
+ "grad_norm": 0.08629012107849121,
1072
+ "learning_rate": 4.876535292400087e-06,
1073
+ "loss": 0.5402819514274597,
1074
+ "step": 304
1075
+ },
1076
+ {
1077
+ "epoch": 1.1860465116279069,
1078
+ "grad_norm": 0.17801746726036072,
1079
+ "learning_rate": 4.95947352612787e-06,
1080
+ "loss": 0.9019787311553955,
1081
+ "step": 306
1082
+ },
1083
+ {
1084
+ "epoch": 1.193798449612403,
1085
+ "grad_norm": 0.13845831155776978,
1086
+ "learning_rate": 5.050494790988212e-06,
1087
+ "loss": 0.8330530524253845,
1088
+ "step": 308
1089
+ },
1090
+ {
1091
+ "epoch": 1.2015503875968991,
1092
+ "grad_norm": 0.0652163103222847,
1093
+ "learning_rate": 5.149539225613974e-06,
1094
+ "loss": 1.0060863494873047,
1095
+ "step": 310
1096
+ },
1097
+ {
1098
+ "epoch": 1.2093023255813953,
1099
+ "grad_norm": 0.0937967598438263,
1100
+ "learning_rate": 5.256541692091797e-06,
1101
+ "loss": 0.5403499007225037,
1102
+ "step": 312
1103
+ },
1104
+ {
1105
+ "epoch": 1.2170542635658914,
1106
+ "grad_norm": 0.19726139307022095,
1107
+ "learning_rate": 5.371431818800936e-06,
1108
+ "loss": 0.37406668066978455,
1109
+ "step": 314
1110
+ },
1111
+ {
1112
+ "epoch": 1.2248062015503876,
1113
+ "grad_norm": 0.11253905296325684,
1114
+ "learning_rate": 5.494134046694099e-06,
1115
+ "loss": 0.6960604786872864,
1116
+ "step": 316
1117
+ },
1118
+ {
1119
+ "epoch": 1.2325581395348837,
1120
+ "grad_norm": 0.08688368648290634,
1121
+ "learning_rate": 5.624567678989899e-06,
1122
+ "loss": 0.7832977771759033,
1123
+ "step": 318
1124
+ },
1125
+ {
1126
+ "epoch": 1.2403100775193798,
1127
+ "grad_norm": 0.19534340500831604,
1128
+ "learning_rate": 5.762646934244156e-06,
1129
+ "loss": 0.9501113295555115,
1130
+ "step": 320
1131
+ },
1132
+ {
1133
+ "epoch": 1.248062015503876,
1134
+ "grad_norm": 0.06447340548038483,
1135
+ "learning_rate": 5.908281002765248e-06,
1136
+ "loss": 1.0130536556243896,
1137
+ "step": 322
1138
+ },
1139
+ {
1140
+ "epoch": 1.255813953488372,
1141
+ "grad_norm": 0.11461887508630753,
1142
+ "learning_rate": 6.061374106336328e-06,
1143
+ "loss": 0.631900429725647,
1144
+ "step": 324
1145
+ },
1146
+ {
1147
+ "epoch": 1.2635658914728682,
1148
+ "grad_norm": 0.09350797533988953,
1149
+ "learning_rate": 6.2218255612051575e-06,
1150
+ "loss": 0.8754401803016663,
1151
+ "step": 326
1152
+ },
1153
+ {
1154
+ "epoch": 1.2713178294573644,
1155
+ "grad_norm": 0.11175557225942612,
1156
+ "learning_rate": 6.389529844300143e-06,
1157
+ "loss": 0.7127947807312012,
1158
+ "step": 328
1159
+ },
1160
+ {
1161
+ "epoch": 1.2790697674418605,
1162
+ "grad_norm": 0.09055038541555405,
1163
+ "learning_rate": 6.564376662629029e-06,
1164
+ "loss": 0.4656026363372803,
1165
+ "step": 330
1166
+ },
1167
+ {
1168
+ "epoch": 1.2868217054263567,
1169
+ "grad_norm": 0.09712733328342438,
1170
+ "learning_rate": 6.74625102581455e-06,
1171
+ "loss": 0.8079378008842468,
1172
+ "step": 332
1173
+ },
1174
+ {
1175
+ "epoch": 1.2945736434108528,
1176
+ "grad_norm": 0.18206307291984558,
1177
+ "learning_rate": 6.935033321719419e-06,
1178
+ "loss": 0.5637804865837097,
1179
+ "step": 334
1180
+ },
1181
+ {
1182
+ "epoch": 1.302325581395349,
1183
+ "grad_norm": 0.23368722200393677,
1184
+ "learning_rate": 7.130599395110884e-06,
1185
+ "loss": 0.8007771968841553,
1186
+ "step": 336
1187
+ },
1188
+ {
1189
+ "epoch": 1.310077519379845,
1190
+ "grad_norm": 0.05224426090717316,
1191
+ "learning_rate": 7.332820629313082e-06,
1192
+ "loss": 0.551106333732605,
1193
+ "step": 338
1194
+ },
1195
+ {
1196
+ "epoch": 1.3178294573643412,
1197
+ "grad_norm": 0.07984264940023422,
1198
+ "learning_rate": 7.541564030793533e-06,
1199
+ "loss": 0.7754759788513184,
1200
+ "step": 340
1201
+ },
1202
+ {
1203
+ "epoch": 1.3255813953488373,
1204
+ "grad_norm": 0.22976501286029816,
1205
+ "learning_rate": 7.75669231662816e-06,
1206
+ "loss": 0.7786872982978821,
1207
+ "step": 342
1208
+ },
1209
+ {
1210
+ "epoch": 1.3333333333333333,
1211
+ "grad_norm": 0.17023955285549164,
1212
+ "learning_rate": 7.978064004787231e-06,
1213
+ "loss": 0.7895460724830627,
1214
+ "step": 344
1215
+ },
1216
+ {
1217
+ "epoch": 1.3410852713178294,
1218
+ "grad_norm": 0.12108391523361206,
1219
+ "learning_rate": 8.205533507182961e-06,
1220
+ "loss": 0.20940443873405457,
1221
+ "step": 346
1222
+ },
1223
+ {
1224
+ "epoch": 1.3488372093023255,
1225
+ "grad_norm": 0.07635517418384552,
1226
+ "learning_rate": 8.438951225417474e-06,
1227
+ "loss": 0.819771409034729,
1228
+ "step": 348
1229
+ },
1230
+ {
1231
+ "epoch": 1.3565891472868217,
1232
+ "grad_norm": 0.11260077357292175,
1233
+ "learning_rate": 8.678163649168212e-06,
1234
+ "loss": 0.9801982641220093,
1235
+ "step": 350
1236
+ },
1237
+ {
1238
+ "epoch": 1.3643410852713178,
1239
+ "grad_norm": 0.09885291010141373,
1240
+ "learning_rate": 8.923013457146075e-06,
1241
+ "loss": 0.7718797326087952,
1242
+ "step": 352
1243
+ },
1244
+ {
1245
+ "epoch": 1.372093023255814,
1246
+ "grad_norm": 0.09329655021429062,
1247
+ "learning_rate": 9.173339620559931e-06,
1248
+ "loss": 0.40787971019744873,
1249
+ "step": 354
1250
+ },
1251
+ {
1252
+ "epoch": 1.37984496124031,
1253
+ "grad_norm": 0.11724522709846497,
1254
+ "learning_rate": 9.428977509019326e-06,
1255
+ "loss": 0.797160804271698,
1256
+ "step": 356
1257
+ },
1258
+ {
1259
+ "epoch": 1.3875968992248062,
1260
+ "grad_norm": 0.11735495924949646,
1261
+ "learning_rate": 9.689758998805924e-06,
1262
+ "loss": 0.6483190059661865,
1263
+ "step": 358
1264
+ },
1265
+ {
1266
+ "epoch": 1.3953488372093024,
1267
+ "grad_norm": 0.08914632350206375,
1268
+ "learning_rate": 9.955512583442333e-06,
1269
+ "loss": 0.7835768461227417,
1270
+ "step": 360
1271
+ },
1272
+ {
1273
+ "epoch": 1.4031007751937985,
1274
+ "grad_norm": 0.07666268944740295,
1275
+ "learning_rate": 1.0226063486485691e-05,
1276
+ "loss": 0.6386092901229858,
1277
+ "step": 362
1278
+ },
1279
+ {
1280
+ "epoch": 1.4108527131782946,
1281
+ "grad_norm": 0.08281254768371582,
1282
+ "learning_rate": 1.0501233776471714e-05,
1283
+ "loss": 0.8520874977111816,
1284
+ "step": 364
1285
+ },
1286
+ {
1287
+ "epoch": 1.4186046511627908,
1288
+ "grad_norm": 0.14842084050178528,
1289
+ "learning_rate": 1.0780842483933755e-05,
1290
+ "loss": 0.37374499440193176,
1291
+ "step": 366
1292
+ },
1293
+ {
1294
+ "epoch": 1.4263565891472867,
1295
+ "grad_norm": 0.24841120839118958,
1296
+ "learning_rate": 1.1064705720419827e-05,
1297
+ "loss": 0.3320968449115753,
1298
+ "step": 368
1299
+ },
1300
+ {
1301
+ "epoch": 1.4341085271317828,
1302
+ "grad_norm": 0.11581484228372574,
1303
+ "learning_rate": 1.135263679942935e-05,
1304
+ "loss": 0.7746375799179077,
1305
+ "step": 370
1306
+ },
1307
+ {
1308
+ "epoch": 1.441860465116279,
1309
+ "grad_norm": 0.0945417657494545,
1310
+ "learning_rate": 1.1644446359190006e-05,
1311
+ "loss": 0.6704602241516113,
1312
+ "step": 372
1313
+ },
1314
+ {
1315
+ "epoch": 1.449612403100775,
1316
+ "grad_norm": 0.06997057050466537,
1317
+ "learning_rate": 1.1939942487194116e-05,
1318
+ "loss": 0.9213350415229797,
1319
+ "step": 374
1320
+ },
1321
+ {
1322
+ "epoch": 1.4573643410852712,
1323
+ "grad_norm": 0.07435750216245651,
1324
+ "learning_rate": 1.2238930846412471e-05,
1325
+ "loss": 0.7233853936195374,
1326
+ "step": 376
1327
+ },
1328
+ {
1329
+ "epoch": 1.4651162790697674,
1330
+ "grad_norm": 0.18093754351139069,
1331
+ "learning_rate": 1.2541214803102757e-05,
1332
+ "loss": 0.5185383558273315,
1333
+ "step": 378
1334
+ },
1335
+ {
1336
+ "epoch": 1.4728682170542635,
1337
+ "grad_norm": 0.052637044340372086,
1338
+ "learning_rate": 1.2846595556128331e-05,
1339
+ "loss": 0.7751470804214478,
1340
+ "step": 380
1341
+ },
1342
+ {
1343
+ "epoch": 1.4806201550387597,
1344
+ "grad_norm": 0.10150747746229172,
1345
+ "learning_rate": 1.3154872267702518e-05,
1346
+ "loss": 0.7363438010215759,
1347
+ "step": 382
1348
+ },
1349
+ {
1350
+ "epoch": 1.4883720930232558,
1351
+ "grad_norm": 0.08896318078041077,
1352
+ "learning_rate": 1.3465842195472318e-05,
1353
+ "loss": 0.697909951210022,
1354
+ "step": 384
1355
+ },
1356
+ {
1357
+ "epoch": 1.496124031007752,
1358
+ "grad_norm": 0.09349460154771805,
1359
+ "learning_rate": 1.3779300825854622e-05,
1360
+ "loss": 0.5058455467224121,
1361
+ "step": 386
1362
+ },
1363
+ {
1364
+ "epoch": 1.503875968992248,
1365
+ "grad_norm": 0.0640454888343811,
1366
+ "learning_rate": 1.4095042008537336e-05,
1367
+ "loss": 0.6899944543838501,
1368
+ "step": 388
1369
+ },
1370
+ {
1371
+ "epoch": 1.5116279069767442,
1372
+ "grad_norm": 0.08342494815587997,
1373
+ "learning_rate": 1.4412858092056988e-05,
1374
+ "loss": 0.5844802856445312,
1375
+ "step": 390
1376
+ },
1377
+ {
1378
+ "epoch": 1.5193798449612403,
1379
+ "grad_norm": 0.14086060225963593,
1380
+ "learning_rate": 1.4732540060363447e-05,
1381
+ "loss": 0.6977730393409729,
1382
+ "step": 392
1383
+ },
1384
+ {
1385
+ "epoch": 1.5271317829457365,
1386
+ "grad_norm": 0.135100856423378,
1387
+ "learning_rate": 1.5053877670282176e-05,
1388
+ "loss": 0.7261441349983215,
1389
+ "step": 394
1390
+ },
1391
+ {
1392
+ "epoch": 1.5348837209302326,
1393
+ "grad_norm": 0.1089802235364914,
1394
+ "learning_rate": 1.537665958978357e-05,
1395
+ "loss": 0.7607800960540771,
1396
+ "step": 396
1397
+ },
1398
+ {
1399
+ "epoch": 1.5426356589147288,
1400
+ "grad_norm": 0.17686955630779266,
1401
+ "learning_rate": 1.5700673536968222e-05,
1402
+ "loss": 0.5964785218238831,
1403
+ "step": 398
1404
+ },
1405
+ {
1406
+ "epoch": 1.550387596899225,
1407
+ "grad_norm": 0.06133165583014488,
1408
+ "learning_rate": 1.6025706419677047e-05,
1409
+ "loss": 0.7581831812858582,
1410
+ "step": 400
1411
+ },
1412
+ {
1413
+ "epoch": 1.558139534883721,
1414
+ "grad_norm": 0.11867906898260117,
1415
+ "learning_rate": 1.6351544475634256e-05,
1416
+ "loss": 0.5359363555908203,
1417
+ "step": 402
1418
+ },
1419
+ {
1420
+ "epoch": 1.5658914728682172,
1421
+ "grad_norm": 0.08171830326318741,
1422
+ "learning_rate": 1.6677973413030932e-05,
1423
+ "loss": 0.9142735004425049,
1424
+ "step": 404
1425
+ },
1426
+ {
1427
+ "epoch": 1.5736434108527133,
1428
+ "grad_norm": 0.11325247585773468,
1429
+ "learning_rate": 1.7004778551456975e-05,
1430
+ "loss": 0.7637568712234497,
1431
+ "step": 406
1432
+ },
1433
+ {
1434
+ "epoch": 1.5813953488372094,
1435
+ "grad_norm": 0.054144054651260376,
1436
+ "learning_rate": 1.7331744963088644e-05,
1437
+ "loss": 0.31641456484794617,
1438
+ "step": 408
1439
+ },
1440
+ {
1441
+ "epoch": 1.5891472868217056,
1442
+ "grad_norm": 0.051439665257930756,
1443
+ "learning_rate": 1.7658657614038598e-05,
1444
+ "loss": 0.780099630355835,
1445
+ "step": 410
1446
+ },
1447
+ {
1448
+ "epoch": 1.5968992248062015,
1449
+ "grad_norm": 0.07669004052877426,
1450
+ "learning_rate": 1.7985301505776015e-05,
1451
+ "loss": 0.7998414635658264,
1452
+ "step": 412
1453
+ },
1454
+ {
1455
+ "epoch": 1.6046511627906976,
1456
+ "grad_norm": 0.08620447665452957,
1457
+ "learning_rate": 1.8311461816523192e-05,
1458
+ "loss": 0.5864279866218567,
1459
+ "step": 414
1460
+ },
1461
+ {
1462
+ "epoch": 1.6124031007751938,
1463
+ "grad_norm": 0.0925377830862999,
1464
+ "learning_rate": 1.8636924042535962e-05,
1465
+ "loss": 0.47105392813682556,
1466
+ "step": 416
1467
+ },
1468
+ {
1469
+ "epoch": 1.62015503875969,
1470
+ "grad_norm": 0.08717449009418488,
1471
+ "learning_rate": 1.8961474139175093e-05,
1472
+ "loss": 0.8024092316627502,
1473
+ "step": 418
1474
+ },
1475
+ {
1476
+ "epoch": 1.627906976744186,
1477
+ "grad_norm": 0.12033627182245255,
1478
+ "learning_rate": 1.9284898661675586e-05,
1479
+ "loss": 0.810451090335846,
1480
+ "step": 420
1481
+ },
1482
+ {
1483
+ "epoch": 1.6356589147286822,
1484
+ "grad_norm": 0.07522077113389969,
1485
+ "learning_rate": 1.9606984905521443e-05,
1486
+ "loss": 0.4688906967639923,
1487
+ "step": 422
1488
+ },
1489
+ {
1490
+ "epoch": 1.6434108527131783,
1491
+ "grad_norm": 0.07208564877510071,
1492
+ "learning_rate": 1.9927521046333837e-05,
1493
+ "loss": 0.7383279204368591,
1494
+ "step": 424
1495
+ },
1496
+ {
1497
+ "epoch": 1.6511627906976745,
1498
+ "grad_norm": 0.09510686248540878,
1499
+ "learning_rate": 2.0246296279180093e-05,
1500
+ "loss": 0.8395543694496155,
1501
+ "step": 426
1502
+ },
1503
+ {
1504
+ "epoch": 1.6589147286821704,
1505
+ "grad_norm": 0.15472382307052612,
1506
+ "learning_rate": 2.0563100957212567e-05,
1507
+ "loss": 0.8986775875091553,
1508
+ "step": 428
1509
+ },
1510
+ {
1511
+ "epoch": 1.6666666666666665,
1512
+ "grad_norm": 0.09020368754863739,
1513
+ "learning_rate": 2.0877726729545672e-05,
1514
+ "loss": 0.8169777393341064,
1515
+ "step": 430
1516
+ },
1517
+ {
1518
+ "epoch": 1.6744186046511627,
1519
+ "grad_norm": 0.198333740234375,
1520
+ "learning_rate": 2.1189966678280578e-05,
1521
+ "loss": 1.033119559288025,
1522
+ "step": 432
1523
+ },
1524
+ {
1525
+ "epoch": 1.6821705426356588,
1526
+ "grad_norm": 0.08684570342302322,
1527
+ "learning_rate": 2.149961545458772e-05,
1528
+ "loss": 0.5892492532730103,
1529
+ "step": 434
1530
+ },
1531
+ {
1532
+ "epoch": 1.689922480620155,
1533
+ "grad_norm": 0.0764966830611229,
1534
+ "learning_rate": 2.1806469413757164e-05,
1535
+ "loss": 0.7995302081108093,
1536
+ "step": 436
1537
+ },
1538
+ {
1539
+ "epoch": 1.697674418604651,
1540
+ "grad_norm": 0.13916683197021484,
1541
+ "learning_rate": 2.211032674912823e-05,
1542
+ "loss": 0.8415105938911438,
1543
+ "step": 438
1544
+ },
1545
+ {
1546
+ "epoch": 1.7054263565891472,
1547
+ "grad_norm": 0.24585378170013428,
1548
+ "learning_rate": 2.241098762481052e-05,
1549
+ "loss": 0.6350277066230774,
1550
+ "step": 440
1551
+ },
1552
+ {
1553
+ "epoch": 1.7131782945736433,
1554
+ "grad_norm": 0.050845544785261154,
1555
+ "learning_rate": 2.27082543071086e-05,
1556
+ "loss": 0.8463593125343323,
1557
+ "step": 442
1558
+ },
1559
+ {
1560
+ "epoch": 1.7209302325581395,
1561
+ "grad_norm": 0.07698489725589752,
1562
+ "learning_rate": 2.3001931294564265e-05,
1563
+ "loss": 0.5609403252601624,
1564
+ "step": 444
1565
+ },
1566
+ {
1567
+ "epoch": 1.7286821705426356,
1568
+ "grad_norm": 0.06638149172067642,
1569
+ "learning_rate": 2.3291825446530733e-05,
1570
+ "loss": 0.8690592050552368,
1571
+ "step": 446
1572
+ },
1573
+ {
1574
+ "epoch": 1.7364341085271318,
1575
+ "grad_norm": 0.08811336010694504,
1576
+ "learning_rate": 2.357774611019419e-05,
1577
+ "loss": 0.8064720630645752,
1578
+ "step": 448
1579
+ },
1580
+ {
1581
+ "epoch": 1.744186046511628,
1582
+ "grad_norm": 0.0755743682384491,
1583
+ "learning_rate": 2.385950524595919e-05,
1584
+ "loss": 1.0067108869552612,
1585
+ "step": 450
1586
+ },
1587
+ {
1588
+ "epoch": 1.751937984496124,
1589
+ "grad_norm": 0.06093823164701462,
1590
+ "learning_rate": 2.4136917551115478e-05,
1591
+ "loss": 0.967079222202301,
1592
+ "step": 452
1593
+ },
1594
+ {
1595
+ "epoch": 1.7596899224806202,
1596
+ "grad_norm": 0.09034255892038345,
1597
+ "learning_rate": 2.4409800581704777e-05,
1598
+ "loss": 0.6444424986839294,
1599
+ "step": 454
1600
+ },
1601
+ {
1602
+ "epoch": 1.7674418604651163,
1603
+ "grad_norm": 0.1733829230070114,
1604
+ "learning_rate": 2.4677974872507553e-05,
1605
+ "loss": 0.8322298526763916,
1606
+ "step": 456
1607
+ },
1608
+ {
1609
+ "epoch": 1.7751937984496124,
1610
+ "grad_norm": 0.23445071280002594,
1611
+ "learning_rate": 2.4941264055070734e-05,
1612
+ "loss": 0.4230212867259979,
1613
+ "step": 458
1614
+ },
1615
+ {
1616
+ "epoch": 1.7829457364341086,
1617
+ "grad_norm": 0.1249038353562355,
1618
+ "learning_rate": 2.5199494973698852e-05,
1619
+ "loss": 0.6065483093261719,
1620
+ "step": 460
1621
+ },
1622
+ {
1623
+ "epoch": 1.7906976744186047,
1624
+ "grad_norm": 0.08323405683040619,
1625
+ "learning_rate": 2.545249779933216e-05,
1626
+ "loss": 0.8183580040931702,
1627
+ "step": 462
1628
+ },
1629
+ {
1630
+ "epoch": 1.7984496124031009,
1631
+ "grad_norm": 0.10287293046712875,
1632
+ "learning_rate": 2.5700106141237063e-05,
1633
+ "loss": 0.9282822608947754,
1634
+ "step": 464
1635
+ },
1636
+ {
1637
+ "epoch": 1.806201550387597,
1638
+ "grad_norm": 0.053924717009067535,
1639
+ "learning_rate": 2.594215715643524e-05,
1640
+ "loss": 0.8734548687934875,
1641
+ "step": 466
1642
+ },
1643
+ {
1644
+ "epoch": 1.8139534883720931,
1645
+ "grad_norm": 0.10388979315757751,
1646
+ "learning_rate": 2.6178491656799497e-05,
1647
+ "loss": 0.8903089165687561,
1648
+ "step": 468
1649
+ },
1650
+ {
1651
+ "epoch": 1.8217054263565893,
1652
+ "grad_norm": 0.06755795329809189,
1653
+ "learning_rate": 2.640895421374602e-05,
1654
+ "loss": 0.4710087180137634,
1655
+ "step": 470
1656
+ },
1657
+ {
1658
+ "epoch": 1.8294573643410854,
1659
+ "grad_norm": 0.08703745901584625,
1660
+ "learning_rate": 2.6633393260454096e-05,
1661
+ "loss": 1.1290743350982666,
1662
+ "step": 472
1663
+ },
1664
+ {
1665
+ "epoch": 1.8372093023255816,
1666
+ "grad_norm": 0.10183677822351456,
1667
+ "learning_rate": 2.6851661191546034e-05,
1668
+ "loss": 0.6608400344848633,
1669
+ "step": 474
1670
+ },
1671
+ {
1672
+ "epoch": 1.8449612403100775,
1673
+ "grad_norm": 0.11454630643129349,
1674
+ "learning_rate": 2.706361446016192e-05,
1675
+ "loss": 0.850265383720398,
1676
+ "step": 476
1677
+ },
1678
+ {
1679
+ "epoch": 1.8527131782945736,
1680
+ "grad_norm": 0.06782646477222443,
1681
+ "learning_rate": 2.7269113672365096e-05,
1682
+ "loss": 0.6361703872680664,
1683
+ "step": 478
1684
+ },
1685
+ {
1686
+ "epoch": 1.8604651162790697,
1687
+ "grad_norm": 0.08559778332710266,
1688
+ "learning_rate": 2.7468023678816444e-05,
1689
+ "loss": 1.0639129877090454,
1690
+ "step": 480
1691
+ },
1692
+ {
1693
+ "epoch": 1.8682170542635659,
1694
+ "grad_norm": 0.06762553006410599,
1695
+ "learning_rate": 2.766021366365728e-05,
1696
+ "loss": 0.6422796845436096,
1697
+ "step": 482
1698
+ },
1699
+ {
1700
+ "epoch": 1.875968992248062,
1701
+ "grad_norm": 0.07438317686319351,
1702
+ "learning_rate": 2.784555723054208e-05,
1703
+ "loss": 0.7208263874053955,
1704
+ "step": 484
1705
+ },
1706
+ {
1707
+ "epoch": 1.8837209302325582,
1708
+ "grad_norm": 0.07318796217441559,
1709
+ "learning_rate": 2.8023932485764764e-05,
1710
+ "loss": 0.8420804738998413,
1711
+ "step": 486
1712
+ },
1713
+ {
1714
+ "epoch": 1.8914728682170543,
1715
+ "grad_norm": 0.10379486531019211,
1716
+ "learning_rate": 2.81952221184238e-05,
1717
+ "loss": 0.5533670783042908,
1718
+ "step": 488
1719
+ },
1720
+ {
1721
+ "epoch": 1.8992248062015504,
1722
+ "grad_norm": 1.0894800424575806,
1723
+ "learning_rate": 2.8359313477573215e-05,
1724
+ "loss": 0.688605785369873,
1725
+ "step": 490
1726
+ },
1727
+ {
1728
+ "epoch": 1.9069767441860463,
1729
+ "grad_norm": 0.23758739233016968,
1730
+ "learning_rate": 2.8516098646309108e-05,
1731
+ "loss": 0.5789573192596436,
1732
+ "step": 492
1733
+ },
1734
+ {
1735
+ "epoch": 1.9147286821705425,
1736
+ "grad_norm": 0.06857667863368988,
1737
+ "learning_rate": 2.8665474512742607e-05,
1738
+ "loss": 0.6448074579238892,
1739
+ "step": 494
1740
+ },
1741
+ {
1742
+ "epoch": 1.9224806201550386,
1743
+ "grad_norm": 0.08650626242160797,
1744
+ "learning_rate": 2.8807342837812783e-05,
1745
+ "loss": 0.6479641199111938,
1746
+ "step": 496
1747
+ },
1748
+ {
1749
+ "epoch": 1.9302325581395348,
1750
+ "grad_norm": 0.07275024801492691,
1751
+ "learning_rate": 2.894161031989497e-05,
1752
+ "loss": 0.4521400034427643,
1753
+ "step": 498
1754
+ },
1755
+ {
1756
+ "epoch": 1.937984496124031,
1757
+ "grad_norm": 0.05953352525830269,
1758
+ "learning_rate": 2.906818865616178e-05,
1759
+ "loss": 0.9132779240608215,
1760
+ "step": 500
1761
+ },
1762
+ {
1763
+ "epoch": 1.945736434108527,
1764
+ "grad_norm": 0.12861226499080658,
1765
+ "learning_rate": 2.9186994600656647e-05,
1766
+ "loss": 0.6908618807792664,
1767
+ "step": 502
1768
+ },
1769
+ {
1770
+ "epoch": 1.9534883720930232,
1771
+ "grad_norm": 0.07091208547353745,
1772
+ "learning_rate": 2.929795001904172e-05,
1773
+ "loss": 0.6676538586616516,
1774
+ "step": 504
1775
+ },
1776
+ {
1777
+ "epoch": 1.9612403100775193,
1778
+ "grad_norm": 0.11093394458293915,
1779
+ "learning_rate": 2.9400981939983914e-05,
1780
+ "loss": 1.0052788257598877,
1781
+ "step": 506
1782
+ },
1783
+ {
1784
+ "epoch": 1.9689922480620154,
1785
+ "grad_norm": 0.05772824585437775,
1786
+ "learning_rate": 2.9496022603145494e-05,
1787
+ "loss": 0.7913935780525208,
1788
+ "step": 508
1789
+ },
1790
+ {
1791
+ "epoch": 1.9767441860465116,
1792
+ "grad_norm": 0.0762370154261589,
1793
+ "learning_rate": 2.9583009503747627e-05,
1794
+ "loss": 0.9280475974082947,
1795
+ "step": 510
1796
+ },
1797
+ {
1798
+ "epoch": 1.9844961240310077,
1799
+ "grad_norm": 0.18662315607070923,
1800
+ "learning_rate": 2.9661885433677437e-05,
1801
+ "loss": 0.7493736743927002,
1802
+ "step": 512
1803
+ },
1804
+ {
1805
+ "epoch": 1.9922480620155039,
1806
+ "grad_norm": 0.07221183180809021,
1807
+ "learning_rate": 2.9732598519111736e-05,
1808
+ "loss": 1.0501880645751953,
1809
+ "step": 514
1810
+ },
1811
+ {
1812
+ "epoch": 2.0,
1813
+ "grad_norm": 0.08491652458906174,
1814
+ "learning_rate": 2.9795102254632528e-05,
1815
+ "loss": 1.011595368385315,
1816
+ "step": 516
1817
+ },
1818
+ {
1819
+ "epoch": 2.007751937984496,
1820
+ "grad_norm": 0.09373293071985245,
1821
+ "learning_rate": 2.9849355533811937e-05,
1822
+ "loss": 0.5705936551094055,
1823
+ "step": 518
1824
+ },
1825
+ {
1826
+ "epoch": 2.0155038759689923,
1827
+ "grad_norm": 0.06463813781738281,
1828
+ "learning_rate": 2.9895322676246387e-05,
1829
+ "loss": 0.7379302978515625,
1830
+ "step": 520
1831
+ },
1832
+ {
1833
+ "epoch": 2.0232558139534884,
1834
+ "grad_norm": 0.09566348791122437,
1835
+ "learning_rate": 2.993297345102233e-05,
1836
+ "loss": 0.46209296584129333,
1837
+ "step": 522
1838
+ },
1839
+ {
1840
+ "epoch": 2.0310077519379846,
1841
+ "grad_norm": 0.05616720765829086,
1842
+ "learning_rate": 2.9962283096597995e-05,
1843
+ "loss": 0.773676335811615,
1844
+ "step": 524
1845
+ },
1846
+ {
1847
+ "epoch": 2.0387596899224807,
1848
+ "grad_norm": 0.09464262425899506,
1849
+ "learning_rate": 2.998323233708815e-05,
1850
+ "loss": 0.6592158675193787,
1851
+ "step": 526
1852
+ },
1853
+ {
1854
+ "epoch": 2.046511627906977,
1855
+ "grad_norm": 0.09258489310741425,
1856
+ "learning_rate": 2.999580739494117e-05,
1857
+ "loss": 0.7777129411697388,
1858
+ "step": 528
1859
+ },
1860
+ {
1861
+ "epoch": 2.054263565891473,
1862
+ "grad_norm": 0.13635995984077454,
1863
+ "learning_rate": 3e-05,
1864
+ "loss": 0.385895311832428,
1865
+ "step": 530
1866
+ },
1867
+ {
1868
+ "epoch": 2.062015503875969,
1869
+ "grad_norm": 0.1054837629199028,
1870
+ "learning_rate": 2.999580739494117e-05,
1871
+ "loss": 0.7748541235923767,
1872
+ "step": 532
1873
+ },
1874
+ {
1875
+ "epoch": 2.0697674418604652,
1876
+ "grad_norm": 0.08860507607460022,
1877
+ "learning_rate": 2.998323233708815e-05,
1878
+ "loss": 0.407875120639801,
1879
+ "step": 534
1880
+ },
1881
+ {
1882
+ "epoch": 2.0775193798449614,
1883
+ "grad_norm": 0.07644882053136826,
1884
+ "learning_rate": 2.9962283096598e-05,
1885
+ "loss": 0.41405466198921204,
1886
+ "step": 536
1887
+ },
1888
+ {
1889
+ "epoch": 2.0852713178294575,
1890
+ "grad_norm": 0.20681916177272797,
1891
+ "learning_rate": 2.9932973451022333e-05,
1892
+ "loss": 0.701027512550354,
1893
+ "step": 538
1894
+ },
1895
+ {
1896
+ "epoch": 2.0930232558139537,
1897
+ "grad_norm": 0.07310563325881958,
1898
+ "learning_rate": 2.9895322676246387e-05,
1899
+ "loss": 0.4735100567340851,
1900
+ "step": 540
1901
+ },
1902
+ {
1903
+ "epoch": 2.10077519379845,
1904
+ "grad_norm": 0.0755162462592125,
1905
+ "learning_rate": 2.9849355533811937e-05,
1906
+ "loss": 0.27081194519996643,
1907
+ "step": 542
1908
+ },
1909
+ {
1910
+ "epoch": 2.108527131782946,
1911
+ "grad_norm": 0.07929737865924835,
1912
+ "learning_rate": 2.9795102254632528e-05,
1913
+ "loss": 0.6002092957496643,
1914
+ "step": 544
1915
+ },
1916
+ {
1917
+ "epoch": 2.116279069767442,
1918
+ "grad_norm": 0.25740522146224976,
1919
+ "learning_rate": 2.973259851911174e-05,
1920
+ "loss": 0.4636404514312744,
1921
+ "step": 546
1922
+ },
1923
+ {
1924
+ "epoch": 2.124031007751938,
1925
+ "grad_norm": 0.07688764482736588,
1926
+ "learning_rate": 2.9661885433677434e-05,
1927
+ "loss": 0.4923861026763916,
1928
+ "step": 548
1929
+ },
1930
+ {
1931
+ "epoch": 2.1317829457364343,
1932
+ "grad_norm": 0.24001885950565338,
1933
+ "learning_rate": 2.9583009503747627e-05,
1934
+ "loss": 0.3250856101512909,
1935
+ "step": 550
1936
+ },
1937
+ {
1938
+ "epoch": 2.13953488372093,
1939
+ "grad_norm": 0.09132993221282959,
1940
+ "learning_rate": 2.9496022603145497e-05,
1941
+ "loss": 0.7897784113883972,
1942
+ "step": 552
1943
+ },
1944
+ {
1945
+ "epoch": 2.147286821705426,
1946
+ "grad_norm": 0.09284122288227081,
1947
+ "learning_rate": 2.940098193998391e-05,
1948
+ "loss": 0.8441802859306335,
1949
+ "step": 554
1950
+ },
1951
+ {
1952
+ "epoch": 2.1550387596899223,
1953
+ "grad_norm": 0.07503140717744827,
1954
+ "learning_rate": 2.9297950019041724e-05,
1955
+ "loss": 0.4028940498828888,
1956
+ "step": 556
1957
+ },
1958
+ {
1959
+ "epoch": 2.1627906976744184,
1960
+ "grad_norm": 0.11651087552309036,
1961
+ "learning_rate": 2.9186994600656647e-05,
1962
+ "loss": 0.6657426953315735,
1963
+ "step": 558
1964
+ },
1965
+ {
1966
+ "epoch": 2.1705426356589146,
1967
+ "grad_norm": 0.06494183093309402,
1968
+ "learning_rate": 2.906818865616178e-05,
1969
+ "loss": 0.5439774990081787,
1970
+ "step": 560
1971
+ },
1972
+ {
1973
+ "epoch": 2.1782945736434107,
1974
+ "grad_norm": 0.05145857110619545,
1975
+ "learning_rate": 2.8941610319894977e-05,
1976
+ "loss": 0.7213448882102966,
1977
+ "step": 562
1978
+ },
1979
+ {
1980
+ "epoch": 2.186046511627907,
1981
+ "grad_norm": 0.1473415493965149,
1982
+ "learning_rate": 2.8807342837812783e-05,
1983
+ "loss": 0.38557326793670654,
1984
+ "step": 564
1985
+ },
1986
+ {
1987
+ "epoch": 2.193798449612403,
1988
+ "grad_norm": 0.2709689438343048,
1989
+ "learning_rate": 2.8665474512742603e-05,
1990
+ "loss": 0.41664543747901917,
1991
+ "step": 566
1992
+ },
1993
+ {
1994
+ "epoch": 2.201550387596899,
1995
+ "grad_norm": 0.06767801940441132,
1996
+ "learning_rate": 2.851609864630911e-05,
1997
+ "loss": 0.4579377770423889,
1998
+ "step": 568
1999
+ },
2000
+ {
2001
+ "epoch": 2.2093023255813953,
2002
+ "grad_norm": 0.3255118727684021,
2003
+ "learning_rate": 2.8359313477573215e-05,
2004
+ "loss": 0.3196179270744324,
2005
+ "step": 570
2006
+ },
2007
+ {
2008
+ "epoch": 2.2170542635658914,
2009
+ "grad_norm": 0.1096249520778656,
2010
+ "learning_rate": 2.8195222118423792e-05,
2011
+ "loss": 0.5369107127189636,
2012
+ "step": 572
2013
+ },
2014
+ {
2015
+ "epoch": 2.2248062015503876,
2016
+ "grad_norm": 0.2894248068332672,
2017
+ "learning_rate": 2.8023932485764768e-05,
2018
+ "loss": 0.23676389455795288,
2019
+ "step": 574
2020
+ },
2021
+ {
2022
+ "epoch": 2.2325581395348837,
2023
+ "grad_norm": 0.19947735965251923,
2024
+ "learning_rate": 2.7845557230542076e-05,
2025
+ "loss": 0.44901129603385925,
2026
+ "step": 576
2027
+ },
2028
+ {
2029
+ "epoch": 2.24031007751938,
2030
+ "grad_norm": 0.06506390869617462,
2031
+ "learning_rate": 2.766021366365729e-05,
2032
+ "loss": 0.5859266519546509,
2033
+ "step": 578
2034
+ },
2035
+ {
2036
+ "epoch": 2.248062015503876,
2037
+ "grad_norm": 0.10611079633235931,
2038
+ "learning_rate": 2.746802367881645e-05,
2039
+ "loss": 0.6005488038063049,
2040
+ "step": 580
2041
+ },
2042
+ {
2043
+ "epoch": 2.255813953488372,
2044
+ "grad_norm": 0.05949712544679642,
2045
+ "learning_rate": 2.726911367236509e-05,
2046
+ "loss": 0.32260704040527344,
2047
+ "step": 582
2048
+ },
2049
+ {
2050
+ "epoch": 2.2635658914728682,
2051
+ "grad_norm": 0.09240850806236267,
2052
+ "learning_rate": 2.706361446016193e-05,
2053
+ "loss": 0.8233704566955566,
2054
+ "step": 584
2055
+ },
2056
+ {
2057
+ "epoch": 2.2713178294573644,
2058
+ "grad_norm": 0.08874181658029556,
2059
+ "learning_rate": 2.685166119154604e-05,
2060
+ "loss": 0.4317566156387329,
2061
+ "step": 586
2062
+ },
2063
+ {
2064
+ "epoch": 2.2790697674418605,
2065
+ "grad_norm": 0.05373215302824974,
2066
+ "learning_rate": 2.6633393260454096e-05,
2067
+ "loss": 0.8105683326721191,
2068
+ "step": 588
2069
+ },
2070
+ {
2071
+ "epoch": 2.2868217054263567,
2072
+ "grad_norm": 0.05979755148291588,
2073
+ "learning_rate": 2.6408954213746025e-05,
2074
+ "loss": 0.4510256350040436,
2075
+ "step": 590
2076
+ },
2077
+ {
2078
+ "epoch": 2.294573643410853,
2079
+ "grad_norm": 0.056298933923244476,
2080
+ "learning_rate": 2.6178491656799504e-05,
2081
+ "loss": 0.7199202179908752,
2082
+ "step": 592
2083
+ },
2084
+ {
2085
+ "epoch": 2.302325581395349,
2086
+ "grad_norm": 0.06022209674119949,
2087
+ "learning_rate": 2.5942157156435248e-05,
2088
+ "loss": 0.47333112359046936,
2089
+ "step": 594
2090
+ },
2091
+ {
2092
+ "epoch": 2.310077519379845,
2093
+ "grad_norm": 0.1291632205247879,
2094
+ "learning_rate": 2.570010614123707e-05,
2095
+ "loss": 0.4947061836719513,
2096
+ "step": 596
2097
+ },
2098
+ {
2099
+ "epoch": 2.317829457364341,
2100
+ "grad_norm": 0.7499107718467712,
2101
+ "learning_rate": 2.5452497799332167e-05,
2102
+ "loss": 0.6046218872070312,
2103
+ "step": 598
2104
+ },
2105
+ {
2106
+ "epoch": 2.3255813953488373,
2107
+ "grad_norm": 0.05242902785539627,
2108
+ "learning_rate": 2.519949497369886e-05,
2109
+ "loss": 0.37087422609329224,
2110
+ "step": 600
2111
+ },
2112
+ {
2113
+ "epoch": 2.3333333333333335,
2114
+ "grad_norm": 0.4367184340953827,
2115
+ "learning_rate": 2.494126405507074e-05,
2116
+ "loss": 0.579389214515686,
2117
+ "step": 602
2118
+ },
2119
+ {
2120
+ "epoch": 2.3410852713178296,
2121
+ "grad_norm": 0.0486953966319561,
2122
+ "learning_rate": 2.467797487250756e-05,
2123
+ "loss": 0.7329738736152649,
2124
+ "step": 604
2125
+ },
2126
+ {
2127
+ "epoch": 2.3488372093023258,
2128
+ "grad_norm": 0.09891688823699951,
2129
+ "learning_rate": 2.4409800581704784e-05,
2130
+ "loss": 0.5676310658454895,
2131
+ "step": 606
2132
+ },
2133
+ {
2134
+ "epoch": 2.356589147286822,
2135
+ "grad_norm": 0.12641683220863342,
2136
+ "learning_rate": 2.4136917551115484e-05,
2137
+ "loss": 0.6383396983146667,
2138
+ "step": 608
2139
+ },
2140
+ {
2141
+ "epoch": 2.3643410852713176,
2142
+ "grad_norm": 0.07655075937509537,
2143
+ "learning_rate": 2.3859505245959206e-05,
2144
+ "loss": 0.6663593053817749,
2145
+ "step": 610
2146
+ },
2147
+ {
2148
+ "epoch": 2.3720930232558137,
2149
+ "grad_norm": 0.062206994742155075,
2150
+ "learning_rate": 2.3577746110194188e-05,
2151
+ "loss": 0.32523995637893677,
2152
+ "step": 612
2153
+ },
2154
+ {
2155
+ "epoch": 2.37984496124031,
2156
+ "grad_norm": 0.13163718581199646,
2157
+ "learning_rate": 2.329182544653074e-05,
2158
+ "loss": 0.5087898373603821,
2159
+ "step": 614
2160
+ },
2161
+ {
2162
+ "epoch": 2.387596899224806,
2163
+ "grad_norm": 0.04577813297510147,
2164
+ "learning_rate": 2.3001931294564278e-05,
2165
+ "loss": 0.5215730667114258,
2166
+ "step": 616
2167
+ },
2168
+ {
2169
+ "epoch": 2.395348837209302,
2170
+ "grad_norm": 0.06540275365114212,
2171
+ "learning_rate": 2.27082543071086e-05,
2172
+ "loss": 0.7069303393363953,
2173
+ "step": 618
2174
+ },
2175
+ {
2176
+ "epoch": 2.4031007751937983,
2177
+ "grad_norm": 0.04587893187999725,
2178
+ "learning_rate": 2.2410987624810527e-05,
2179
+ "loss": 0.6097102165222168,
2180
+ "step": 620
2181
+ },
2182
+ {
2183
+ "epoch": 2.4108527131782944,
2184
+ "grad_norm": 0.18531644344329834,
2185
+ "learning_rate": 2.2110326749128246e-05,
2186
+ "loss": 0.28449299931526184,
2187
+ "step": 622
2188
+ },
2189
+ {
2190
+ "epoch": 2.4186046511627906,
2191
+ "grad_norm": 0.06915592402219772,
2192
+ "learning_rate": 2.180646941375716e-05,
2193
+ "loss": 0.5394483208656311,
2194
+ "step": 624
2195
+ },
2196
+ {
2197
+ "epoch": 2.4263565891472867,
2198
+ "grad_norm": 0.0683450847864151,
2199
+ "learning_rate": 2.149961545458774e-05,
2200
+ "loss": 0.351560115814209,
2201
+ "step": 626
2202
+ },
2203
+ {
2204
+ "epoch": 2.434108527131783,
2205
+ "grad_norm": 0.0661771222949028,
2206
+ "learning_rate": 2.1189966678280585e-05,
2207
+ "loss": 0.6790451407432556,
2208
+ "step": 628
2209
+ },
2210
+ {
2211
+ "epoch": 2.441860465116279,
2212
+ "grad_norm": 0.2682180106639862,
2213
+ "learning_rate": 2.0877726729545665e-05,
2214
+ "loss": 0.34560778737068176,
2215
+ "step": 630
2216
+ },
2217
+ {
2218
+ "epoch": 2.449612403100775,
2219
+ "grad_norm": 0.05607810616493225,
2220
+ "learning_rate": 2.0563100957212584e-05,
2221
+ "loss": 0.35299909114837646,
2222
+ "step": 632
2223
+ },
2224
+ {
2225
+ "epoch": 2.4573643410852712,
2226
+ "grad_norm": 0.1276787519454956,
2227
+ "learning_rate": 2.02462962791801e-05,
2228
+ "loss": 0.45075708627700806,
2229
+ "step": 634
2230
+ },
2231
+ {
2232
+ "epoch": 2.4651162790697674,
2233
+ "grad_norm": 0.07231509685516357,
2234
+ "learning_rate": 1.9927521046333833e-05,
2235
+ "loss": 0.4892677664756775,
2236
+ "step": 636
2237
+ },
2238
+ {
2239
+ "epoch": 2.4728682170542635,
2240
+ "grad_norm": 0.12232723832130432,
2241
+ "learning_rate": 1.9606984905521463e-05,
2242
+ "loss": 0.6066938042640686,
2243
+ "step": 638
2244
+ },
2245
+ {
2246
+ "epoch": 2.4806201550387597,
2247
+ "grad_norm": 0.054693497717380524,
2248
+ "learning_rate": 1.928489866167559e-05,
2249
+ "loss": 0.3974202275276184,
2250
+ "step": 640
2251
+ },
2252
+ {
2253
+ "epoch": 2.488372093023256,
2254
+ "grad_norm": 0.07348073273897171,
2255
+ "learning_rate": 1.896147413917511e-05,
2256
+ "loss": 0.43941450119018555,
2257
+ "step": 642
2258
+ },
2259
+ {
2260
+ "epoch": 2.496124031007752,
2261
+ "grad_norm": 0.05807847902178764,
2262
+ "learning_rate": 1.863692404253597e-05,
2263
+ "loss": 0.5508748888969421,
2264
+ "step": 644
2265
+ },
2266
+ {
2267
+ "epoch": 2.503875968992248,
2268
+ "grad_norm": 0.08628101646900177,
2269
+ "learning_rate": 1.83114618165232e-05,
2270
+ "loss": 0.5954611897468567,
2271
+ "step": 646
2272
+ },
2273
+ {
2274
+ "epoch": 2.511627906976744,
2275
+ "grad_norm": 0.08698024600744247,
2276
+ "learning_rate": 1.798530150577603e-05,
2277
+ "loss": 0.7873520851135254,
2278
+ "step": 648
2279
+ },
2280
+ {
2281
+ "epoch": 2.5193798449612403,
2282
+ "grad_norm": 0.0802086666226387,
2283
+ "learning_rate": 1.765865761403861e-05,
2284
+ "loss": 0.27345526218414307,
2285
+ "step": 650
2286
+ },
2287
+ {
2288
+ "epoch": 2.5271317829457365,
2289
+ "grad_norm": 0.058408260345458984,
2290
+ "learning_rate": 1.7331744963088654e-05,
2291
+ "loss": 0.5833812355995178,
2292
+ "step": 652
2293
+ },
2294
+ {
2295
+ "epoch": 2.5348837209302326,
2296
+ "grad_norm": 0.10947899520397186,
2297
+ "learning_rate": 1.7004778551456995e-05,
2298
+ "loss": 0.3762988746166229,
2299
+ "step": 654
2300
+ },
2301
+ {
2302
+ "epoch": 2.5426356589147288,
2303
+ "grad_norm": 0.08571284264326096,
2304
+ "learning_rate": 1.667797341303094e-05,
2305
+ "loss": 0.5067244172096252,
2306
+ "step": 656
2307
+ },
2308
+ {
2309
+ "epoch": 2.550387596899225,
2310
+ "grad_norm": 0.06394554674625397,
2311
+ "learning_rate": 1.6351544475634277e-05,
2312
+ "loss": 0.42985814809799194,
2313
+ "step": 658
2314
+ },
2315
+ {
2316
+ "epoch": 2.558139534883721,
2317
+ "grad_norm": 0.1848040074110031,
2318
+ "learning_rate": 1.6025706419677054e-05,
2319
+ "loss": 0.8337141871452332,
2320
+ "step": 660
2321
+ },
2322
+ {
2323
+ "epoch": 2.565891472868217,
2324
+ "grad_norm": 0.04375322908163071,
2325
+ "learning_rate": 1.570067353696823e-05,
2326
+ "loss": 0.5003541707992554,
2327
+ "step": 662
2328
+ },
2329
+ {
2330
+ "epoch": 2.5736434108527133,
2331
+ "grad_norm": 0.04600893706083298,
2332
+ "learning_rate": 1.5376659589783585e-05,
2333
+ "loss": 0.3022569715976715,
2334
+ "step": 664
2335
+ },
2336
+ {
2337
+ "epoch": 2.5813953488372094,
2338
+ "grad_norm": 0.05343756452202797,
2339
+ "learning_rate": 1.5053877670282193e-05,
2340
+ "loss": 0.4718426465988159,
2341
+ "step": 666
2342
+ },
2343
+ {
2344
+ "epoch": 2.5891472868217056,
2345
+ "grad_norm": 0.09041419625282288,
2346
+ "learning_rate": 1.473254006036345e-05,
2347
+ "loss": 0.4901648163795471,
2348
+ "step": 668
2349
+ },
2350
+ {
2351
+ "epoch": 2.5968992248062017,
2352
+ "grad_norm": 0.06343325972557068,
2353
+ "learning_rate": 1.4412858092056995e-05,
2354
+ "loss": 0.6914687156677246,
2355
+ "step": 670
2356
+ },
2357
+ {
2358
+ "epoch": 2.604651162790698,
2359
+ "grad_norm": 0.06813778728246689,
2360
+ "learning_rate": 1.4095042008537343e-05,
2361
+ "loss": 0.4894769787788391,
2362
+ "step": 672
2363
+ },
2364
+ {
2365
+ "epoch": 2.612403100775194,
2366
+ "grad_norm": 0.1439589262008667,
2367
+ "learning_rate": 1.3779300825854615e-05,
2368
+ "loss": 0.7514118552207947,
2369
+ "step": 674
2370
+ },
2371
+ {
2372
+ "epoch": 2.62015503875969,
2373
+ "grad_norm": 0.07022061944007874,
2374
+ "learning_rate": 1.3465842195472315e-05,
2375
+ "loss": 0.7393191456794739,
2376
+ "step": 676
2377
+ },
2378
+ {
2379
+ "epoch": 2.6279069767441863,
2380
+ "grad_norm": 0.07147393375635147,
2381
+ "learning_rate": 1.3154872267702535e-05,
2382
+ "loss": 0.8212107419967651,
2383
+ "step": 678
2384
+ },
2385
+ {
2386
+ "epoch": 2.6356589147286824,
2387
+ "grad_norm": 0.06350687146186829,
2388
+ "learning_rate": 1.2846595556128338e-05,
2389
+ "loss": 0.7656596302986145,
2390
+ "step": 680
2391
+ },
2392
+ {
2393
+ "epoch": 2.6434108527131785,
2394
+ "grad_norm": 0.1861017942428589,
2395
+ "learning_rate": 1.2541214803102764e-05,
2396
+ "loss": 0.39778298139572144,
2397
+ "step": 682
2398
+ },
2399
+ {
2400
+ "epoch": 2.6511627906976747,
2401
+ "grad_norm": 0.12844400107860565,
2402
+ "learning_rate": 1.2238930846412478e-05,
2403
+ "loss": 0.4492897689342499,
2404
+ "step": 684
2405
+ },
2406
+ {
2407
+ "epoch": 2.6589147286821704,
2408
+ "grad_norm": 0.09717841446399689,
2409
+ "learning_rate": 1.1939942487194114e-05,
2410
+ "loss": 0.5477796792984009,
2411
+ "step": 686
2412
+ },
2413
+ {
2414
+ "epoch": 2.6666666666666665,
2415
+ "grad_norm": 0.0593947097659111,
2416
+ "learning_rate": 1.1644446359190002e-05,
2417
+ "loss": 0.28585392236709595,
2418
+ "step": 688
2419
+ },
2420
+ {
2421
+ "epoch": 2.6744186046511627,
2422
+ "grad_norm": 0.045567888766527176,
2423
+ "learning_rate": 1.1352636799429364e-05,
2424
+ "loss": 0.5053625106811523,
2425
+ "step": 690
2426
+ },
2427
+ {
2428
+ "epoch": 2.682170542635659,
2429
+ "grad_norm": 0.05959075689315796,
2430
+ "learning_rate": 1.1064705720419824e-05,
2431
+ "loss": 0.567241370677948,
2432
+ "step": 692
2433
+ },
2434
+ {
2435
+ "epoch": 2.689922480620155,
2436
+ "grad_norm": 0.15132786333560944,
2437
+ "learning_rate": 1.0780842483933762e-05,
2438
+ "loss": 0.6684018969535828,
2439
+ "step": 694
2440
+ },
2441
+ {
2442
+ "epoch": 2.697674418604651,
2443
+ "grad_norm": 0.08239150047302246,
2444
+ "learning_rate": 1.0501233776471719e-05,
2445
+ "loss": 0.33106908202171326,
2446
+ "step": 696
2447
+ },
2448
+ {
2449
+ "epoch": 2.705426356589147,
2450
+ "grad_norm": 0.06124155595898628,
2451
+ "learning_rate": 1.0226063486485696e-05,
2452
+ "loss": 0.566682755947113,
2453
+ "step": 698
2454
+ },
2455
+ {
2456
+ "epoch": 2.7131782945736433,
2457
+ "grad_norm": 0.07035624980926514,
2458
+ "learning_rate": 9.955512583442338e-06,
2459
+ "loss": 0.4341398775577545,
2460
+ "step": 700
2461
+ },
2462
+ {
2463
+ "epoch": 2.7209302325581395,
2464
+ "grad_norm": 0.05901051685214043,
2465
+ "learning_rate": 9.689758998805937e-06,
2466
+ "loss": 0.4164765775203705,
2467
+ "step": 702
2468
+ },
2469
+ {
2470
+ "epoch": 2.7286821705426356,
2471
+ "grad_norm": 0.04497726634144783,
2472
+ "learning_rate": 9.428977509019321e-06,
2473
+ "loss": 0.40749120712280273,
2474
+ "step": 704
2475
+ },
2476
+ {
2477
+ "epoch": 2.7364341085271318,
2478
+ "grad_norm": 0.15438151359558105,
2479
+ "learning_rate": 9.173339620559945e-06,
2480
+ "loss": 0.28900110721588135,
2481
+ "step": 706
2482
+ },
2483
+ {
2484
+ "epoch": 2.744186046511628,
2485
+ "grad_norm": 0.05592001974582672,
2486
+ "learning_rate": 8.923013457146072e-06,
2487
+ "loss": 0.41211241483688354,
2488
+ "step": 708
2489
+ },
2490
+ {
2491
+ "epoch": 2.751937984496124,
2492
+ "grad_norm": 0.0629056990146637,
2493
+ "learning_rate": 8.678163649168217e-06,
2494
+ "loss": 0.5537896156311035,
2495
+ "step": 710
2496
+ },
2497
+ {
2498
+ "epoch": 2.75968992248062,
2499
+ "grad_norm": 0.06699282675981522,
2500
+ "learning_rate": 8.43895122541748e-06,
2501
+ "loss": 0.5788278579711914,
2502
+ "step": 712
2503
+ },
2504
+ {
2505
+ "epoch": 2.7674418604651163,
2506
+ "grad_norm": 0.13577990233898163,
2507
+ "learning_rate": 8.205533507182964e-06,
2508
+ "loss": 0.37125617265701294,
2509
+ "step": 714
2510
+ },
2511
+ {
2512
+ "epoch": 2.7751937984496124,
2513
+ "grad_norm": 0.16210103034973145,
2514
+ "learning_rate": 7.978064004787233e-06,
2515
+ "loss": 0.3962320387363434,
2516
+ "step": 716
2517
+ },
2518
+ {
2519
+ "epoch": 2.7829457364341086,
2520
+ "grad_norm": 0.06700747460126877,
2521
+ "learning_rate": 7.756692316628171e-06,
2522
+ "loss": 0.6869024634361267,
2523
+ "step": 718
2524
+ },
2525
+ {
2526
+ "epoch": 2.7906976744186047,
2527
+ "grad_norm": 0.06708226352930069,
2528
+ "learning_rate": 7.541564030793529e-06,
2529
+ "loss": 0.5122371912002563,
2530
+ "step": 720
2531
+ },
2532
+ {
2533
+ "epoch": 2.798449612403101,
2534
+ "grad_norm": 0.0619942843914032,
2535
+ "learning_rate": 7.332820629313089e-06,
2536
+ "loss": 0.4030957818031311,
2537
+ "step": 722
2538
+ },
2539
+ {
2540
+ "epoch": 2.806201550387597,
2541
+ "grad_norm": 0.08853983879089355,
2542
+ "learning_rate": 7.1305993951108914e-06,
2543
+ "loss": 0.4579683840274811,
2544
+ "step": 724
2545
+ },
2546
+ {
2547
+ "epoch": 2.813953488372093,
2548
+ "grad_norm": 0.1136261448264122,
2549
+ "learning_rate": 6.935033321719423e-06,
2550
+ "loss": 0.4582154452800751,
2551
+ "step": 726
2552
+ },
2553
+ {
2554
+ "epoch": 2.8217054263565893,
2555
+ "grad_norm": 0.03374806419014931,
2556
+ "learning_rate": 6.74625102581455e-06,
2557
+ "loss": 0.46171411871910095,
2558
+ "step": 728
2559
+ },
2560
+ {
2561
+ "epoch": 2.8294573643410854,
2562
+ "grad_norm": 0.05085311084985733,
2563
+ "learning_rate": 6.56437666262903e-06,
2564
+ "loss": 0.4829785227775574,
2565
+ "step": 730
2566
+ },
2567
+ {
2568
+ "epoch": 2.8372093023255816,
2569
+ "grad_norm": 0.051897477358579636,
2570
+ "learning_rate": 6.389529844300143e-06,
2571
+ "loss": 0.4446869194507599,
2572
+ "step": 732
2573
+ },
2574
+ {
2575
+ "epoch": 2.8449612403100772,
2576
+ "grad_norm": 0.06380399316549301,
2577
+ "learning_rate": 6.221825561205165e-06,
2578
+ "loss": 0.5170708298683167,
2579
+ "step": 734
2580
+ },
2581
+ {
2582
+ "epoch": 2.8527131782945734,
2583
+ "grad_norm": 0.07282527536153793,
2584
+ "learning_rate": 6.061374106336333e-06,
2585
+ "loss": 0.6230844259262085,
2586
+ "step": 736
2587
+ },
2588
+ {
2589
+ "epoch": 2.8604651162790695,
2590
+ "grad_norm": 0.09063038229942322,
2591
+ "learning_rate": 5.908281002765252e-06,
2592
+ "loss": 0.35932058095932007,
2593
+ "step": 738
2594
+ },
2595
+ {
2596
+ "epoch": 2.8682170542635657,
2597
+ "grad_norm": 1.006274938583374,
2598
+ "learning_rate": 5.762646934244159e-06,
2599
+ "loss": 0.3806362748146057,
2600
+ "step": 740
2601
+ },
2602
+ {
2603
+ "epoch": 2.875968992248062,
2604
+ "grad_norm": 0.3232397139072418,
2605
+ "learning_rate": 5.624567678989899e-06,
2606
+ "loss": 0.513190507888794,
2607
+ "step": 742
2608
+ },
2609
+ {
2610
+ "epoch": 2.883720930232558,
2611
+ "grad_norm": 0.120949886739254,
2612
+ "learning_rate": 5.494134046694099e-06,
2613
+ "loss": 0.6526894569396973,
2614
+ "step": 744
2615
+ },
2616
+ {
2617
+ "epoch": 2.891472868217054,
2618
+ "grad_norm": 0.12644588947296143,
2619
+ "learning_rate": 5.371431818800933e-06,
2620
+ "loss": 0.4791458249092102,
2621
+ "step": 746
2622
+ },
2623
+ {
2624
+ "epoch": 2.89922480620155,
2625
+ "grad_norm": 0.06687454879283905,
2626
+ "learning_rate": 5.256541692091802e-06,
2627
+ "loss": 0.5770004987716675,
2628
+ "step": 748
2629
+ },
2630
+ {
2631
+ "epoch": 2.9069767441860463,
2632
+ "grad_norm": 0.08843245357275009,
2633
+ "learning_rate": 5.149539225613978e-06,
2634
+ "loss": 0.3434167802333832,
2635
+ "step": 750
2636
+ },
2637
+ {
2638
+ "epoch": 2.9147286821705425,
2639
+ "grad_norm": 0.07200266420841217,
2640
+ "learning_rate": 5.050494790988215e-06,
2641
+ "loss": 0.4575299322605133,
2642
+ "step": 752
2643
+ },
2644
+ {
2645
+ "epoch": 2.9224806201550386,
2646
+ "grad_norm": 0.07060275971889496,
2647
+ "learning_rate": 4.959473526127871e-06,
2648
+ "loss": 0.3564453721046448,
2649
+ "step": 754
2650
+ },
2651
+ {
2652
+ "epoch": 2.9302325581395348,
2653
+ "grad_norm": 0.06662537902593613,
2654
+ "learning_rate": 4.876535292400089e-06,
2655
+ "loss": 0.7428521513938904,
2656
+ "step": 756
2657
+ },
2658
+ {
2659
+ "epoch": 2.937984496124031,
2660
+ "grad_norm": 0.05963343381881714,
2661
+ "learning_rate": 4.801734635257146e-06,
2662
+ "loss": 0.4571719169616699,
2663
+ "step": 758
2664
+ },
2665
+ {
2666
+ "epoch": 2.945736434108527,
2667
+ "grad_norm": 0.05507909134030342,
2668
+ "learning_rate": 4.73512074836392e-06,
2669
+ "loss": 0.5132399797439575,
2670
+ "step": 760
2671
+ },
2672
+ {
2673
+ "epoch": 2.953488372093023,
2674
+ "grad_norm": 0.05289539694786072,
2675
+ "learning_rate": 4.676737441244973e-06,
2676
+ "loss": 0.814540445804596,
2677
+ "step": 762
2678
+ },
2679
+ {
2680
+ "epoch": 2.9612403100775193,
2681
+ "grad_norm": 0.05587043985724449,
2682
+ "learning_rate": 4.626623110472678e-06,
2683
+ "loss": 0.5996021628379822,
2684
+ "step": 764
2685
+ },
2686
+ {
2687
+ "epoch": 2.9689922480620154,
2688
+ "grad_norm": 0.0820513367652893,
2689
+ "learning_rate": 4.584810714415136e-06,
2690
+ "loss": 0.2337801605463028,
2691
+ "step": 766
2692
+ },
2693
+ {
2694
+ "epoch": 2.9767441860465116,
2695
+ "grad_norm": 0.08467745780944824,
2696
+ "learning_rate": 4.551327751560703e-06,
2697
+ "loss": 0.43569573760032654,
2698
+ "step": 768
2699
+ },
2700
+ {
2701
+ "epoch": 2.9844961240310077,
2702
+ "grad_norm": 0.09394794702529907,
2703
+ "learning_rate": 4.526196242433211e-06,
2704
+ "loss": 0.42782190442085266,
2705
+ "step": 770
2706
+ },
2707
+ {
2708
+ "epoch": 2.992248062015504,
2709
+ "grad_norm": 0.05439142882823944,
2710
+ "learning_rate": 4.509432715109889e-06,
2711
+ "loss": 0.516304612159729,
2712
+ "step": 772
2713
+ },
2714
+ {
2715
+ "epoch": 3.0,
2716
+ "grad_norm": 0.03667069226503372,
2717
+ "learning_rate": 4.50104819435143e-06,
2718
+ "loss": 0.15898612141609192,
2719
+ "step": 774
2720
+ },
2721
+ {
2722
+ "epoch": 3.0,
2723
+ "step": 774,
2724
+ "total_flos": 3.2487544184132076e+18,
2725
+ "train_loss": 0.8073726282178277,
2726
+ "train_runtime": 15483.8831,
2727
+ "train_samples_per_second": 3.199,
2728
+ "train_steps_per_second": 0.05
2729
+ }
2730
+ ],
2731
+ "logging_steps": 2,
2732
+ "max_steps": 774,
2733
+ "num_input_tokens_seen": 0,
2734
+ "num_train_epochs": 3,
2735
+ "save_steps": 99999,
2736
+ "stateful_callbacks": {
2737
+ "TrainerControl": {
2738
+ "args": {
2739
+ "should_epoch_stop": false,
2740
+ "should_evaluate": false,
2741
+ "should_log": false,
2742
+ "should_save": true,
2743
+ "should_training_stop": true
2744
+ },
2745
+ "attributes": {}
2746
+ }
2747
+ },
2748
+ "total_flos": 3.2487544184132076e+18,
2749
+ "train_batch_size": 1,
2750
+ "trial_name": null,
2751
+ "trial_params": null
2752
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0a8e8ca2a2815ffd210e251800d296fa9e65140166a435c6db7993b0df8525a
3
+ size 5649
training_loss.png ADDED