Maxime Turlot commited on
Commit
f8766c7
·
1 Parent(s): 360cda8

Add BlueSecureBERT weights & tokenizer v1.0

Browse files
checkpoint-1500/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "RobertaForSequenceClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "classifier_dropout": null,
8
+ "eos_token_id": 2,
9
+ "gradient_checkpointing": false,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 3072,
15
+ "layer_norm_eps": 1e-05,
16
+ "max_position_embeddings": 514,
17
+ "model_type": "roberta",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "pad_token_id": 1,
21
+ "position_embedding_type": "absolute",
22
+ "torch_dtype": "float32",
23
+ "transformers_version": "4.50.3",
24
+ "type_vocab_size": 1,
25
+ "use_cache": true,
26
+ "vocab_size": 50265
27
+ }
checkpoint-1500/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1500/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2673e42d4637733a09e0553abdaedcc9fa4e6ac22cf009b506a3776cc655d6e4
3
+ size 498612824
checkpoint-1500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c05df94321022303086794957bc01d2e212db00b0536724efb3e4e5dd2a455d
3
+ size 997345530
checkpoint-1500/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:873433ef920a90902d6f896eeee752f73d9ee50665eb5ab1a3d6d232701f69d3
3
+ size 14244
checkpoint-1500/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33e26a663d9f99b76583d7a759a02229c9acb028b2a47d5da0c15de5bd40410e
3
+ size 988
checkpoint-1500/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:008593e472c1fb4d926ab4ddfb67a4d0177133cece362148c52047b8cbe88aa5
3
+ size 1064
checkpoint-1500/special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "<mask>",
25
+ "lstrip": true,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": true,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "<unk>",
46
+ "lstrip": false,
47
+ "normalized": true,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
checkpoint-1500/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1500/tokenizer_config.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "50264": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": true,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ }
44
+ },
45
+ "bos_token": "<s>",
46
+ "clean_up_tokenization_spaces": false,
47
+ "cls_token": "<s>",
48
+ "eos_token": "</s>",
49
+ "errors": "replace",
50
+ "extra_special_tokens": {},
51
+ "mask_token": "<mask>",
52
+ "max_len": 512,
53
+ "model_max_length": 512,
54
+ "pad_token": "<pad>",
55
+ "sep_token": "</s>",
56
+ "tokenizer_class": "RobertaTokenizer",
57
+ "trim_offsets": true,
58
+ "unk_token": "<unk>"
59
+ }
checkpoint-1500/trainer_state.json ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 1500,
3
+ "best_metric": 0.9490445859872612,
4
+ "best_model_checkpoint": "/workspace/AI/Trend_Primus-FineWeb_Filtering-pipeline/securebert_finetuned/defensive_vs_rest/checkpoint-1500",
5
+ "epoch": 2.7272727272727275,
6
+ "eval_steps": 300,
7
+ "global_step": 1500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.18181818181818182,
14
+ "grad_norm": 1.2393633127212524,
15
+ "learning_rate": 5.6571428571428576e-06,
16
+ "loss": 0.1439,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.36363636363636365,
21
+ "grad_norm": 1.2253516912460327,
22
+ "learning_rate": 1.1371428571428573e-05,
23
+ "loss": 0.0855,
24
+ "step": 200
25
+ },
26
+ {
27
+ "epoch": 0.5454545454545454,
28
+ "grad_norm": 0.6038007140159607,
29
+ "learning_rate": 1.708571428571429e-05,
30
+ "loss": 0.0369,
31
+ "step": 300
32
+ },
33
+ {
34
+ "epoch": 0.5454545454545454,
35
+ "eval_f1": 0.8451570393147375,
36
+ "eval_f2": 0.9234018045690152,
37
+ "eval_loss": 0.022321535274386406,
38
+ "eval_precision": 0.7405696689761355,
39
+ "eval_recall": 0.9841432225063939,
40
+ "eval_runtime": 10.9053,
41
+ "eval_samples_per_second": 717.082,
42
+ "eval_steps_per_second": 11.279,
43
+ "step": 300
44
+ },
45
+ {
46
+ "epoch": 0.7272727272727273,
47
+ "grad_norm": 0.47052955627441406,
48
+ "learning_rate": 1.924615384615385e-05,
49
+ "loss": 0.0203,
50
+ "step": 400
51
+ },
52
+ {
53
+ "epoch": 0.9090909090909091,
54
+ "grad_norm": 2.0642011165618896,
55
+ "learning_rate": 1.770769230769231e-05,
56
+ "loss": 0.0176,
57
+ "step": 500
58
+ },
59
+ {
60
+ "epoch": 1.0909090909090908,
61
+ "grad_norm": 0.28900912404060364,
62
+ "learning_rate": 1.6169230769230772e-05,
63
+ "loss": 0.0116,
64
+ "step": 600
65
+ },
66
+ {
67
+ "epoch": 1.0909090909090908,
68
+ "eval_f1": 0.9583852479441815,
69
+ "eval_f2": 0.9733751771613687,
70
+ "eval_loss": 0.013802087865769863,
71
+ "eval_precision": 0.934402332361516,
72
+ "eval_recall": 0.9836317135549872,
73
+ "eval_runtime": 11.4515,
74
+ "eval_samples_per_second": 682.883,
75
+ "eval_steps_per_second": 10.741,
76
+ "step": 600
77
+ },
78
+ {
79
+ "epoch": 1.2727272727272727,
80
+ "grad_norm": 0.4510941803455353,
81
+ "learning_rate": 1.4630769230769233e-05,
82
+ "loss": 0.0083,
83
+ "step": 700
84
+ },
85
+ {
86
+ "epoch": 1.4545454545454546,
87
+ "grad_norm": 0.546210765838623,
88
+ "learning_rate": 1.3092307692307693e-05,
89
+ "loss": 0.0074,
90
+ "step": 800
91
+ },
92
+ {
93
+ "epoch": 1.6363636363636362,
94
+ "grad_norm": 2.2349956035614014,
95
+ "learning_rate": 1.1553846153846156e-05,
96
+ "loss": 0.0084,
97
+ "step": 900
98
+ },
99
+ {
100
+ "epoch": 1.6363636363636362,
101
+ "eval_f1": 0.9500734214390602,
102
+ "eval_f2": 0.9752788664455834,
103
+ "eval_loss": 0.009357056580483913,
104
+ "eval_precision": 0.9108399812294697,
105
+ "eval_recall": 0.992838874680307,
106
+ "eval_runtime": 11.0303,
107
+ "eval_samples_per_second": 708.955,
108
+ "eval_steps_per_second": 11.151,
109
+ "step": 900
110
+ },
111
+ {
112
+ "epoch": 1.8181818181818183,
113
+ "grad_norm": 0.4002629518508911,
114
+ "learning_rate": 1.0015384615384615e-05,
115
+ "loss": 0.0076,
116
+ "step": 1000
117
+ },
118
+ {
119
+ "epoch": 2.0,
120
+ "grad_norm": 0.0686814934015274,
121
+ "learning_rate": 8.476923076923078e-06,
122
+ "loss": 0.0063,
123
+ "step": 1100
124
+ },
125
+ {
126
+ "epoch": 2.1818181818181817,
127
+ "grad_norm": 0.40794429183006287,
128
+ "learning_rate": 6.9384615384615395e-06,
129
+ "loss": 0.0032,
130
+ "step": 1200
131
+ },
132
+ {
133
+ "epoch": 2.1818181818181817,
134
+ "eval_f1": 0.9634601043997018,
135
+ "eval_f2": 0.9799757281553398,
136
+ "eval_loss": 0.009985245764255524,
137
+ "eval_precision": 0.937137330754352,
138
+ "eval_recall": 0.991304347826087,
139
+ "eval_runtime": 10.5643,
140
+ "eval_samples_per_second": 740.231,
141
+ "eval_steps_per_second": 11.643,
142
+ "step": 1200
143
+ },
144
+ {
145
+ "epoch": 2.3636363636363638,
146
+ "grad_norm": 0.26694178581237793,
147
+ "learning_rate": 5.400000000000001e-06,
148
+ "loss": 0.0031,
149
+ "step": 1300
150
+ },
151
+ {
152
+ "epoch": 2.5454545454545454,
153
+ "grad_norm": 0.1839500516653061,
154
+ "learning_rate": 3.861538461538462e-06,
155
+ "loss": 0.0028,
156
+ "step": 1400
157
+ },
158
+ {
159
+ "epoch": 2.7272727272727275,
160
+ "grad_norm": 0.3117663264274597,
161
+ "learning_rate": 2.3230769230769234e-06,
162
+ "loss": 0.0021,
163
+ "step": 1500
164
+ },
165
+ {
166
+ "epoch": 2.7272727272727275,
167
+ "eval_f1": 0.9694694694694694,
168
+ "eval_f2": 0.9821519115708346,
169
+ "eval_loss": 0.010657339356839657,
170
+ "eval_precision": 0.9490445859872612,
171
+ "eval_recall": 0.9907928388746803,
172
+ "eval_runtime": 10.7067,
173
+ "eval_samples_per_second": 730.384,
174
+ "eval_steps_per_second": 11.488,
175
+ "step": 1500
176
+ }
177
+ ],
178
+ "logging_steps": 100,
179
+ "max_steps": 1650,
180
+ "num_input_tokens_seen": 0,
181
+ "num_train_epochs": 3,
182
+ "save_steps": 300,
183
+ "stateful_callbacks": {
184
+ "EarlyStoppingCallback": {
185
+ "args": {
186
+ "early_stopping_patience": 2,
187
+ "early_stopping_threshold": 0.0
188
+ },
189
+ "attributes": {
190
+ "early_stopping_patience_counter": 0
191
+ }
192
+ },
193
+ "TrainerControl": {
194
+ "args": {
195
+ "should_epoch_stop": false,
196
+ "should_evaluate": false,
197
+ "should_log": false,
198
+ "should_save": true,
199
+ "should_training_stop": false
200
+ },
201
+ "attributes": {}
202
+ }
203
+ },
204
+ "total_flos": 5.05067981869056e+16,
205
+ "train_batch_size": 64,
206
+ "trial_name": null,
207
+ "trial_params": null
208
+ }
checkpoint-1500/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0a880bc2113f1c59c5c8811e0cfbdcc1f299ce53865365dda540b7b7d0db8df
3
+ size 5496
checkpoint-1500/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2100/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "RobertaForSequenceClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "classifier_dropout": null,
8
+ "eos_token_id": 2,
9
+ "gradient_checkpointing": false,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 3072,
15
+ "layer_norm_eps": 1e-05,
16
+ "max_position_embeddings": 514,
17
+ "model_type": "roberta",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "pad_token_id": 1,
21
+ "position_embedding_type": "absolute",
22
+ "torch_dtype": "float32",
23
+ "transformers_version": "4.50.3",
24
+ "type_vocab_size": 1,
25
+ "use_cache": true,
26
+ "vocab_size": 50265
27
+ }
checkpoint-2100/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2100/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:517257bc47b3f038a7403dd81da4bfce3de079acfc45697e0b00a8d9200e381f
3
+ size 498612824
checkpoint-2100/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d165a1ee4621c558a9a7fc2a8583b72069a277efcc7ee3076c5274172ad67803
3
+ size 997345530
checkpoint-2100/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:840bbd869826eca2ba6c2837e2dad965324983f11c19e9319c8a75b1a148f48d
3
+ size 14244
checkpoint-2100/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39f5dc0ebad59bb1208d0eb1ca5cd9a005f35ab6729971fcb4a04c903bf3d7a5
3
+ size 988
checkpoint-2100/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4143afc8e9b0613a160a5744369a2b7a815380a150e53caa5289fa29dad912d6
3
+ size 1064
checkpoint-2100/special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "<mask>",
25
+ "lstrip": true,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": true,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "<unk>",
46
+ "lstrip": false,
47
+ "normalized": true,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
checkpoint-2100/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2100/tokenizer_config.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "50264": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": true,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ }
44
+ },
45
+ "bos_token": "<s>",
46
+ "clean_up_tokenization_spaces": false,
47
+ "cls_token": "<s>",
48
+ "eos_token": "</s>",
49
+ "errors": "replace",
50
+ "extra_special_tokens": {},
51
+ "mask_token": "<mask>",
52
+ "max_len": 512,
53
+ "model_max_length": 512,
54
+ "pad_token": "<pad>",
55
+ "sep_token": "</s>",
56
+ "tokenizer_class": "RobertaTokenizer",
57
+ "trim_offsets": true,
58
+ "unk_token": "<unk>"
59
+ }
checkpoint-2100/trainer_state.json ADDED
@@ -0,0 +1,1597 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 1800,
3
+ "best_metric": 0.9762746087834427,
4
+ "best_model_checkpoint": "/workspace/AI/Trend_Primus-FineWeb_Filtering-pipeline/securebert_finetuned/defensive_vs_rest/checkpoint-1800",
5
+ "epoch": 3.8181818181818183,
6
+ "eval_steps": 300,
7
+ "global_step": 2100,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.01818181818181818,
14
+ "grad_norm": 0.4759802222251892,
15
+ "learning_rate": 5.714285714285715e-07,
16
+ "loss": 0.1542,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.03636363636363636,
21
+ "grad_norm": 0.3740444481372833,
22
+ "learning_rate": 1.142857142857143e-06,
23
+ "loss": 0.1623,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.05454545454545454,
28
+ "grad_norm": 0.5404930710792542,
29
+ "learning_rate": 1.7142857142857145e-06,
30
+ "loss": 0.1582,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.07272727272727272,
35
+ "grad_norm": 0.3705693781375885,
36
+ "learning_rate": 2.285714285714286e-06,
37
+ "loss": 0.1422,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.09090909090909091,
42
+ "grad_norm": 0.4376075565814972,
43
+ "learning_rate": 2.8571428571428573e-06,
44
+ "loss": 0.1403,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.10909090909090909,
49
+ "grad_norm": 0.46200892329216003,
50
+ "learning_rate": 3.428571428571429e-06,
51
+ "loss": 0.142,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.12727272727272726,
56
+ "grad_norm": 0.2855919599533081,
57
+ "learning_rate": 4.000000000000001e-06,
58
+ "loss": 0.144,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.14545454545454545,
63
+ "grad_norm": 0.33721357583999634,
64
+ "learning_rate": 4.571428571428572e-06,
65
+ "loss": 0.1401,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.16363636363636364,
70
+ "grad_norm": 0.6751205921173096,
71
+ "learning_rate": 5.142857142857142e-06,
72
+ "loss": 0.132,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.18181818181818182,
77
+ "grad_norm": 1.1521556377410889,
78
+ "learning_rate": 5.7142857142857145e-06,
79
+ "loss": 0.1277,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.2,
84
+ "grad_norm": 0.6443154811859131,
85
+ "learning_rate": 6.285714285714286e-06,
86
+ "loss": 0.1079,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 0.21818181818181817,
91
+ "grad_norm": 0.6698077321052551,
92
+ "learning_rate": 6.857142857142858e-06,
93
+ "loss": 0.1066,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 0.23636363636363636,
98
+ "grad_norm": 0.8653299808502197,
99
+ "learning_rate": 7.428571428571429e-06,
100
+ "loss": 0.0943,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 0.2545454545454545,
105
+ "grad_norm": 1.1476327180862427,
106
+ "learning_rate": 8.000000000000001e-06,
107
+ "loss": 0.0924,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 0.2727272727272727,
112
+ "grad_norm": 0.5096330642700195,
113
+ "learning_rate": 8.571428571428571e-06,
114
+ "loss": 0.0844,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 0.2909090909090909,
119
+ "grad_norm": 1.1907318830490112,
120
+ "learning_rate": 9.142857142857144e-06,
121
+ "loss": 0.0813,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 0.3090909090909091,
126
+ "grad_norm": 1.299401879310608,
127
+ "learning_rate": 9.714285714285715e-06,
128
+ "loss": 0.0749,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 0.32727272727272727,
133
+ "grad_norm": 0.8548530340194702,
134
+ "learning_rate": 1.0285714285714285e-05,
135
+ "loss": 0.0661,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 0.34545454545454546,
140
+ "grad_norm": 0.8947266936302185,
141
+ "learning_rate": 1.0857142857142858e-05,
142
+ "loss": 0.0638,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 0.36363636363636365,
147
+ "grad_norm": 2.172971725463867,
148
+ "learning_rate": 1.1428571428571429e-05,
149
+ "loss": 0.0624,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 0.38181818181818183,
154
+ "grad_norm": 4.725502967834473,
155
+ "learning_rate": 1.2e-05,
156
+ "loss": 0.0504,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 0.4,
161
+ "grad_norm": 1.3992273807525635,
162
+ "learning_rate": 1.2571428571428572e-05,
163
+ "loss": 0.0417,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 0.41818181818181815,
168
+ "grad_norm": 0.9457690119743347,
169
+ "learning_rate": 1.3142857142857145e-05,
170
+ "loss": 0.0377,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 0.43636363636363634,
175
+ "grad_norm": 1.1091430187225342,
176
+ "learning_rate": 1.3714285714285716e-05,
177
+ "loss": 0.0323,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 0.45454545454545453,
182
+ "grad_norm": 0.7338578701019287,
183
+ "learning_rate": 1.4285714285714287e-05,
184
+ "loss": 0.0269,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 0.4727272727272727,
189
+ "grad_norm": 1.0748590230941772,
190
+ "learning_rate": 1.4857142857142858e-05,
191
+ "loss": 0.0275,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 0.4909090909090909,
196
+ "grad_norm": 1.0615975856781006,
197
+ "learning_rate": 1.542857142857143e-05,
198
+ "loss": 0.0214,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 0.509090909090909,
203
+ "grad_norm": 0.8980767130851746,
204
+ "learning_rate": 1.6000000000000003e-05,
205
+ "loss": 0.0224,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 0.5272727272727272,
210
+ "grad_norm": 0.8434118628501892,
211
+ "learning_rate": 1.6571428571428574e-05,
212
+ "loss": 0.0271,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 0.5454545454545454,
217
+ "grad_norm": 0.7067236304283142,
218
+ "learning_rate": 1.7142857142857142e-05,
219
+ "loss": 0.0225,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 0.5454545454545454,
224
+ "eval_f1": 0.886688162137264,
225
+ "eval_f2": 0.9429803076320172,
226
+ "eval_loss": 0.01607716828584671,
227
+ "eval_precision": 0.8064516129032258,
228
+ "eval_recall": 0.9846547314578005,
229
+ "eval_runtime": 10.5831,
230
+ "eval_samples_per_second": 738.914,
231
+ "eval_steps_per_second": 11.622,
232
+ "step": 300
233
+ },
234
+ {
235
+ "epoch": 0.5636363636363636,
236
+ "grad_norm": 1.111824631690979,
237
+ "learning_rate": 1.7714285714285717e-05,
238
+ "loss": 0.0186,
239
+ "step": 310
240
+ },
241
+ {
242
+ "epoch": 0.5818181818181818,
243
+ "grad_norm": 1.0324747562408447,
244
+ "learning_rate": 1.8285714285714288e-05,
245
+ "loss": 0.0218,
246
+ "step": 320
247
+ },
248
+ {
249
+ "epoch": 0.6,
250
+ "grad_norm": 1.2407127618789673,
251
+ "learning_rate": 1.885714285714286e-05,
252
+ "loss": 0.0223,
253
+ "step": 330
254
+ },
255
+ {
256
+ "epoch": 0.6181818181818182,
257
+ "grad_norm": 0.6641681790351868,
258
+ "learning_rate": 1.942857142857143e-05,
259
+ "loss": 0.022,
260
+ "step": 340
261
+ },
262
+ {
263
+ "epoch": 0.6363636363636364,
264
+ "grad_norm": 0.8924217224121094,
265
+ "learning_rate": 2e-05,
266
+ "loss": 0.0178,
267
+ "step": 350
268
+ },
269
+ {
270
+ "epoch": 0.6545454545454545,
271
+ "grad_norm": 0.49054834246635437,
272
+ "learning_rate": 1.991666666666667e-05,
273
+ "loss": 0.0139,
274
+ "step": 360
275
+ },
276
+ {
277
+ "epoch": 0.6727272727272727,
278
+ "grad_norm": 1.4703093767166138,
279
+ "learning_rate": 1.9833333333333335e-05,
280
+ "loss": 0.0211,
281
+ "step": 370
282
+ },
283
+ {
284
+ "epoch": 0.6909090909090909,
285
+ "grad_norm": 2.7741594314575195,
286
+ "learning_rate": 1.9750000000000002e-05,
287
+ "loss": 0.0122,
288
+ "step": 380
289
+ },
290
+ {
291
+ "epoch": 0.7090909090909091,
292
+ "grad_norm": 1.8091500997543335,
293
+ "learning_rate": 1.9666666666666666e-05,
294
+ "loss": 0.0191,
295
+ "step": 390
296
+ },
297
+ {
298
+ "epoch": 0.7272727272727273,
299
+ "grad_norm": 3.808887243270874,
300
+ "learning_rate": 1.9583333333333333e-05,
301
+ "loss": 0.0168,
302
+ "step": 400
303
+ },
304
+ {
305
+ "epoch": 0.7454545454545455,
306
+ "grad_norm": 0.5149283409118652,
307
+ "learning_rate": 1.95e-05,
308
+ "loss": 0.0145,
309
+ "step": 410
310
+ },
311
+ {
312
+ "epoch": 0.7636363636363637,
313
+ "grad_norm": 0.45617809891700745,
314
+ "learning_rate": 1.9416666666666667e-05,
315
+ "loss": 0.0088,
316
+ "step": 420
317
+ },
318
+ {
319
+ "epoch": 0.7818181818181819,
320
+ "grad_norm": 1.402259111404419,
321
+ "learning_rate": 1.9333333333333333e-05,
322
+ "loss": 0.0126,
323
+ "step": 430
324
+ },
325
+ {
326
+ "epoch": 0.8,
327
+ "grad_norm": 1.5392917394638062,
328
+ "learning_rate": 1.925e-05,
329
+ "loss": 0.0138,
330
+ "step": 440
331
+ },
332
+ {
333
+ "epoch": 0.8181818181818182,
334
+ "grad_norm": 2.108272075653076,
335
+ "learning_rate": 1.916666666666667e-05,
336
+ "loss": 0.0165,
337
+ "step": 450
338
+ },
339
+ {
340
+ "epoch": 0.8363636363636363,
341
+ "grad_norm": 0.6225730776786804,
342
+ "learning_rate": 1.9083333333333338e-05,
343
+ "loss": 0.012,
344
+ "step": 460
345
+ },
346
+ {
347
+ "epoch": 0.8545454545454545,
348
+ "grad_norm": 1.8889803886413574,
349
+ "learning_rate": 1.9e-05,
350
+ "loss": 0.0097,
351
+ "step": 470
352
+ },
353
+ {
354
+ "epoch": 0.8727272727272727,
355
+ "grad_norm": 0.1781783550977707,
356
+ "learning_rate": 1.8916666666666668e-05,
357
+ "loss": 0.012,
358
+ "step": 480
359
+ },
360
+ {
361
+ "epoch": 0.8909090909090909,
362
+ "grad_norm": 0.7276476621627808,
363
+ "learning_rate": 1.8833333333333335e-05,
364
+ "loss": 0.0151,
365
+ "step": 490
366
+ },
367
+ {
368
+ "epoch": 0.9090909090909091,
369
+ "grad_norm": 1.302780032157898,
370
+ "learning_rate": 1.8750000000000002e-05,
371
+ "loss": 0.0124,
372
+ "step": 500
373
+ },
374
+ {
375
+ "epoch": 0.9272727272727272,
376
+ "grad_norm": 0.8764067888259888,
377
+ "learning_rate": 1.866666666666667e-05,
378
+ "loss": 0.0127,
379
+ "step": 510
380
+ },
381
+ {
382
+ "epoch": 0.9454545454545454,
383
+ "grad_norm": 0.3675801157951355,
384
+ "learning_rate": 1.8583333333333336e-05,
385
+ "loss": 0.011,
386
+ "step": 520
387
+ },
388
+ {
389
+ "epoch": 0.9636363636363636,
390
+ "grad_norm": 0.413601279258728,
391
+ "learning_rate": 1.8500000000000002e-05,
392
+ "loss": 0.0119,
393
+ "step": 530
394
+ },
395
+ {
396
+ "epoch": 0.9818181818181818,
397
+ "grad_norm": 0.3606299161911011,
398
+ "learning_rate": 1.8416666666666666e-05,
399
+ "loss": 0.0096,
400
+ "step": 540
401
+ },
402
+ {
403
+ "epoch": 1.0,
404
+ "grad_norm": 0.6626041531562805,
405
+ "learning_rate": 1.8333333333333333e-05,
406
+ "loss": 0.0091,
407
+ "step": 550
408
+ },
409
+ {
410
+ "epoch": 1.018181818181818,
411
+ "grad_norm": 0.34816664457321167,
412
+ "learning_rate": 1.825e-05,
413
+ "loss": 0.0064,
414
+ "step": 560
415
+ },
416
+ {
417
+ "epoch": 1.0363636363636364,
418
+ "grad_norm": 0.7120109796524048,
419
+ "learning_rate": 1.8166666666666667e-05,
420
+ "loss": 0.007,
421
+ "step": 570
422
+ },
423
+ {
424
+ "epoch": 1.0545454545454545,
425
+ "grad_norm": 0.34991776943206787,
426
+ "learning_rate": 1.8083333333333334e-05,
427
+ "loss": 0.0156,
428
+ "step": 580
429
+ },
430
+ {
431
+ "epoch": 1.0727272727272728,
432
+ "grad_norm": 0.4325370788574219,
433
+ "learning_rate": 1.8e-05,
434
+ "loss": 0.0116,
435
+ "step": 590
436
+ },
437
+ {
438
+ "epoch": 1.0909090909090908,
439
+ "grad_norm": 1.3302485942840576,
440
+ "learning_rate": 1.7916666666666667e-05,
441
+ "loss": 0.007,
442
+ "step": 600
443
+ },
444
+ {
445
+ "epoch": 1.0909090909090908,
446
+ "eval_f1": 0.9619118745332338,
447
+ "eval_f2": 0.9775349119611415,
448
+ "eval_loss": 0.009453566744923592,
449
+ "eval_precision": 0.9369544131910766,
450
+ "eval_recall": 0.9882352941176471,
451
+ "eval_runtime": 10.7346,
452
+ "eval_samples_per_second": 728.488,
453
+ "eval_steps_per_second": 11.458,
454
+ "step": 600
455
+ },
456
+ {
457
+ "epoch": 1.1090909090909091,
458
+ "grad_norm": 0.5350901484489441,
459
+ "learning_rate": 1.7833333333333334e-05,
460
+ "loss": 0.0038,
461
+ "step": 610
462
+ },
463
+ {
464
+ "epoch": 1.1272727272727272,
465
+ "grad_norm": 0.19123613834381104,
466
+ "learning_rate": 1.775e-05,
467
+ "loss": 0.0075,
468
+ "step": 620
469
+ },
470
+ {
471
+ "epoch": 1.1454545454545455,
472
+ "grad_norm": 0.2627851963043213,
473
+ "learning_rate": 1.7666666666666668e-05,
474
+ "loss": 0.0069,
475
+ "step": 630
476
+ },
477
+ {
478
+ "epoch": 1.1636363636363636,
479
+ "grad_norm": 0.49250972270965576,
480
+ "learning_rate": 1.7583333333333335e-05,
481
+ "loss": 0.009,
482
+ "step": 640
483
+ },
484
+ {
485
+ "epoch": 1.1818181818181819,
486
+ "grad_norm": 1.2556400299072266,
487
+ "learning_rate": 1.7500000000000002e-05,
488
+ "loss": 0.0106,
489
+ "step": 650
490
+ },
491
+ {
492
+ "epoch": 1.2,
493
+ "grad_norm": 0.23302438855171204,
494
+ "learning_rate": 1.741666666666667e-05,
495
+ "loss": 0.006,
496
+ "step": 660
497
+ },
498
+ {
499
+ "epoch": 1.2181818181818183,
500
+ "grad_norm": 0.22926795482635498,
501
+ "learning_rate": 1.7333333333333336e-05,
502
+ "loss": 0.0053,
503
+ "step": 670
504
+ },
505
+ {
506
+ "epoch": 1.2363636363636363,
507
+ "grad_norm": 0.41634848713874817,
508
+ "learning_rate": 1.7250000000000003e-05,
509
+ "loss": 0.0096,
510
+ "step": 680
511
+ },
512
+ {
513
+ "epoch": 1.2545454545454544,
514
+ "grad_norm": 0.7806673049926758,
515
+ "learning_rate": 1.7166666666666666e-05,
516
+ "loss": 0.0077,
517
+ "step": 690
518
+ },
519
+ {
520
+ "epoch": 1.2727272727272727,
521
+ "grad_norm": 0.6627803444862366,
522
+ "learning_rate": 1.7083333333333333e-05,
523
+ "loss": 0.008,
524
+ "step": 700
525
+ },
526
+ {
527
+ "epoch": 1.290909090909091,
528
+ "grad_norm": 0.33546727895736694,
529
+ "learning_rate": 1.7e-05,
530
+ "loss": 0.0074,
531
+ "step": 710
532
+ },
533
+ {
534
+ "epoch": 1.309090909090909,
535
+ "grad_norm": 1.327726125717163,
536
+ "learning_rate": 1.6916666666666667e-05,
537
+ "loss": 0.0044,
538
+ "step": 720
539
+ },
540
+ {
541
+ "epoch": 1.3272727272727272,
542
+ "grad_norm": 0.4449763894081116,
543
+ "learning_rate": 1.6833333333333334e-05,
544
+ "loss": 0.0046,
545
+ "step": 730
546
+ },
547
+ {
548
+ "epoch": 1.3454545454545455,
549
+ "grad_norm": 0.2766354978084564,
550
+ "learning_rate": 1.675e-05,
551
+ "loss": 0.0034,
552
+ "step": 740
553
+ },
554
+ {
555
+ "epoch": 1.3636363636363638,
556
+ "grad_norm": 0.830558180809021,
557
+ "learning_rate": 1.6666666666666667e-05,
558
+ "loss": 0.0074,
559
+ "step": 750
560
+ },
561
+ {
562
+ "epoch": 1.3818181818181818,
563
+ "grad_norm": 1.0488086938858032,
564
+ "learning_rate": 1.659166666666667e-05,
565
+ "loss": 0.0065,
566
+ "step": 760
567
+ },
568
+ {
569
+ "epoch": 1.4,
570
+ "grad_norm": 0.5093031525611877,
571
+ "learning_rate": 1.6508333333333336e-05,
572
+ "loss": 0.0053,
573
+ "step": 770
574
+ },
575
+ {
576
+ "epoch": 1.4181818181818182,
577
+ "grad_norm": 0.3070843517780304,
578
+ "learning_rate": 1.6425000000000003e-05,
579
+ "loss": 0.0045,
580
+ "step": 780
581
+ },
582
+ {
583
+ "epoch": 1.4363636363636363,
584
+ "grad_norm": 1.093131422996521,
585
+ "learning_rate": 1.634166666666667e-05,
586
+ "loss": 0.0058,
587
+ "step": 790
588
+ },
589
+ {
590
+ "epoch": 1.4545454545454546,
591
+ "grad_norm": 2.2531373500823975,
592
+ "learning_rate": 1.6258333333333333e-05,
593
+ "loss": 0.0065,
594
+ "step": 800
595
+ },
596
+ {
597
+ "epoch": 1.4727272727272727,
598
+ "grad_norm": 0.27250564098358154,
599
+ "learning_rate": 1.6175e-05,
600
+ "loss": 0.005,
601
+ "step": 810
602
+ },
603
+ {
604
+ "epoch": 1.490909090909091,
605
+ "grad_norm": 2.3462181091308594,
606
+ "learning_rate": 1.6091666666666667e-05,
607
+ "loss": 0.0077,
608
+ "step": 820
609
+ },
610
+ {
611
+ "epoch": 1.509090909090909,
612
+ "grad_norm": 0.5783445835113525,
613
+ "learning_rate": 1.6008333333333334e-05,
614
+ "loss": 0.0065,
615
+ "step": 830
616
+ },
617
+ {
618
+ "epoch": 1.5272727272727273,
619
+ "grad_norm": 2.6000328063964844,
620
+ "learning_rate": 1.5925e-05,
621
+ "loss": 0.0073,
622
+ "step": 840
623
+ },
624
+ {
625
+ "epoch": 1.5454545454545454,
626
+ "grad_norm": 0.27279505133628845,
627
+ "learning_rate": 1.5841666666666668e-05,
628
+ "loss": 0.0054,
629
+ "step": 850
630
+ },
631
+ {
632
+ "epoch": 1.5636363636363635,
633
+ "grad_norm": 0.5557974576950073,
634
+ "learning_rate": 1.5758333333333335e-05,
635
+ "loss": 0.0054,
636
+ "step": 860
637
+ },
638
+ {
639
+ "epoch": 1.5818181818181818,
640
+ "grad_norm": 0.4363366365432739,
641
+ "learning_rate": 1.5675e-05,
642
+ "loss": 0.0064,
643
+ "step": 870
644
+ },
645
+ {
646
+ "epoch": 1.6,
647
+ "grad_norm": 0.055520545691251755,
648
+ "learning_rate": 1.559166666666667e-05,
649
+ "loss": 0.0036,
650
+ "step": 880
651
+ },
652
+ {
653
+ "epoch": 1.6181818181818182,
654
+ "grad_norm": 0.31246721744537354,
655
+ "learning_rate": 1.5508333333333335e-05,
656
+ "loss": 0.0035,
657
+ "step": 890
658
+ },
659
+ {
660
+ "epoch": 1.6363636363636362,
661
+ "grad_norm": 0.3083712160587311,
662
+ "learning_rate": 1.5425000000000002e-05,
663
+ "loss": 0.0043,
664
+ "step": 900
665
+ },
666
+ {
667
+ "epoch": 1.6363636363636362,
668
+ "eval_f1": 0.9619188921859545,
669
+ "eval_f2": 0.9814310223029569,
670
+ "eval_loss": 0.007550612557679415,
671
+ "eval_precision": 0.9310674964097654,
672
+ "eval_recall": 0.9948849104859335,
673
+ "eval_runtime": 11.1497,
674
+ "eval_samples_per_second": 701.366,
675
+ "eval_steps_per_second": 11.032,
676
+ "step": 900
677
+ },
678
+ {
679
+ "epoch": 1.6545454545454545,
680
+ "grad_norm": 1.0350213050842285,
681
+ "learning_rate": 1.534166666666667e-05,
682
+ "loss": 0.0062,
683
+ "step": 910
684
+ },
685
+ {
686
+ "epoch": 1.6727272727272728,
687
+ "grad_norm": 1.7585707902908325,
688
+ "learning_rate": 1.5258333333333334e-05,
689
+ "loss": 0.006,
690
+ "step": 920
691
+ },
692
+ {
693
+ "epoch": 1.690909090909091,
694
+ "grad_norm": 0.35897427797317505,
695
+ "learning_rate": 1.5175000000000001e-05,
696
+ "loss": 0.0041,
697
+ "step": 930
698
+ },
699
+ {
700
+ "epoch": 1.709090909090909,
701
+ "grad_norm": 0.2970544993877411,
702
+ "learning_rate": 1.5091666666666668e-05,
703
+ "loss": 0.0058,
704
+ "step": 940
705
+ },
706
+ {
707
+ "epoch": 1.7272727272727273,
708
+ "grad_norm": 0.10311456769704819,
709
+ "learning_rate": 1.5008333333333333e-05,
710
+ "loss": 0.0057,
711
+ "step": 950
712
+ },
713
+ {
714
+ "epoch": 1.7454545454545456,
715
+ "grad_norm": 1.624154806137085,
716
+ "learning_rate": 1.4925e-05,
717
+ "loss": 0.0088,
718
+ "step": 960
719
+ },
720
+ {
721
+ "epoch": 1.7636363636363637,
722
+ "grad_norm": 0.22448480129241943,
723
+ "learning_rate": 1.4841666666666667e-05,
724
+ "loss": 0.0038,
725
+ "step": 970
726
+ },
727
+ {
728
+ "epoch": 1.7818181818181817,
729
+ "grad_norm": 0.9474364519119263,
730
+ "learning_rate": 1.4758333333333334e-05,
731
+ "loss": 0.0044,
732
+ "step": 980
733
+ },
734
+ {
735
+ "epoch": 1.8,
736
+ "grad_norm": 0.05209196358919144,
737
+ "learning_rate": 1.4675000000000001e-05,
738
+ "loss": 0.0064,
739
+ "step": 990
740
+ },
741
+ {
742
+ "epoch": 1.8181818181818183,
743
+ "grad_norm": 0.3231663405895233,
744
+ "learning_rate": 1.4591666666666668e-05,
745
+ "loss": 0.005,
746
+ "step": 1000
747
+ },
748
+ {
749
+ "epoch": 1.8363636363636364,
750
+ "grad_norm": 0.4507773220539093,
751
+ "learning_rate": 1.4508333333333335e-05,
752
+ "loss": 0.0047,
753
+ "step": 1010
754
+ },
755
+ {
756
+ "epoch": 1.8545454545454545,
757
+ "grad_norm": 0.28643473982810974,
758
+ "learning_rate": 1.4425e-05,
759
+ "loss": 0.006,
760
+ "step": 1020
761
+ },
762
+ {
763
+ "epoch": 1.8727272727272726,
764
+ "grad_norm": 0.30528539419174194,
765
+ "learning_rate": 1.4341666666666667e-05,
766
+ "loss": 0.0035,
767
+ "step": 1030
768
+ },
769
+ {
770
+ "epoch": 1.8909090909090909,
771
+ "grad_norm": 0.7955114245414734,
772
+ "learning_rate": 1.4258333333333334e-05,
773
+ "loss": 0.0049,
774
+ "step": 1040
775
+ },
776
+ {
777
+ "epoch": 1.9090909090909092,
778
+ "grad_norm": 0.6773081421852112,
779
+ "learning_rate": 1.4175e-05,
780
+ "loss": 0.0058,
781
+ "step": 1050
782
+ },
783
+ {
784
+ "epoch": 1.9272727272727272,
785
+ "grad_norm": 0.5149025917053223,
786
+ "learning_rate": 1.4091666666666668e-05,
787
+ "loss": 0.0036,
788
+ "step": 1060
789
+ },
790
+ {
791
+ "epoch": 1.9454545454545453,
792
+ "grad_norm": 0.5025485754013062,
793
+ "learning_rate": 1.4008333333333334e-05,
794
+ "loss": 0.0073,
795
+ "step": 1070
796
+ },
797
+ {
798
+ "epoch": 1.9636363636363636,
799
+ "grad_norm": 0.6183115839958191,
800
+ "learning_rate": 1.3925000000000001e-05,
801
+ "loss": 0.0038,
802
+ "step": 1080
803
+ },
804
+ {
805
+ "epoch": 1.981818181818182,
806
+ "grad_norm": 0.33286458253860474,
807
+ "learning_rate": 1.3841666666666668e-05,
808
+ "loss": 0.0047,
809
+ "step": 1090
810
+ },
811
+ {
812
+ "epoch": 2.0,
813
+ "grad_norm": 0.766334056854248,
814
+ "learning_rate": 1.3758333333333333e-05,
815
+ "loss": 0.0083,
816
+ "step": 1100
817
+ },
818
+ {
819
+ "epoch": 2.018181818181818,
820
+ "grad_norm": 0.23794743418693542,
821
+ "learning_rate": 1.3675e-05,
822
+ "loss": 0.0027,
823
+ "step": 1110
824
+ },
825
+ {
826
+ "epoch": 2.036363636363636,
827
+ "grad_norm": 0.32537227869033813,
828
+ "learning_rate": 1.3591666666666667e-05,
829
+ "loss": 0.0019,
830
+ "step": 1120
831
+ },
832
+ {
833
+ "epoch": 2.0545454545454547,
834
+ "grad_norm": 0.3572939932346344,
835
+ "learning_rate": 1.3508333333333334e-05,
836
+ "loss": 0.0023,
837
+ "step": 1130
838
+ },
839
+ {
840
+ "epoch": 2.0727272727272728,
841
+ "grad_norm": 0.2717416286468506,
842
+ "learning_rate": 1.3425000000000001e-05,
843
+ "loss": 0.001,
844
+ "step": 1140
845
+ },
846
+ {
847
+ "epoch": 2.090909090909091,
848
+ "grad_norm": 0.20052389800548553,
849
+ "learning_rate": 1.3341666666666668e-05,
850
+ "loss": 0.0017,
851
+ "step": 1150
852
+ },
853
+ {
854
+ "epoch": 2.109090909090909,
855
+ "grad_norm": 0.1603120118379593,
856
+ "learning_rate": 1.3258333333333335e-05,
857
+ "loss": 0.0031,
858
+ "step": 1160
859
+ },
860
+ {
861
+ "epoch": 2.1272727272727274,
862
+ "grad_norm": 0.15461675822734833,
863
+ "learning_rate": 1.3175e-05,
864
+ "loss": 0.0021,
865
+ "step": 1170
866
+ },
867
+ {
868
+ "epoch": 2.1454545454545455,
869
+ "grad_norm": 0.7481666803359985,
870
+ "learning_rate": 1.3091666666666667e-05,
871
+ "loss": 0.0033,
872
+ "step": 1180
873
+ },
874
+ {
875
+ "epoch": 2.1636363636363636,
876
+ "grad_norm": 0.3837297558784485,
877
+ "learning_rate": 1.3008333333333334e-05,
878
+ "loss": 0.0018,
879
+ "step": 1190
880
+ },
881
+ {
882
+ "epoch": 2.1818181818181817,
883
+ "grad_norm": 0.09467964619398117,
884
+ "learning_rate": 1.2925e-05,
885
+ "loss": 0.0017,
886
+ "step": 1200
887
+ },
888
+ {
889
+ "epoch": 2.1818181818181817,
890
+ "eval_f1": 0.9751068644707066,
891
+ "eval_f2": 0.9850640113798008,
892
+ "eval_loss": 0.009348779916763306,
893
+ "eval_precision": 0.9589515331355094,
894
+ "eval_recall": 0.9918158567774936,
895
+ "eval_runtime": 10.6749,
896
+ "eval_samples_per_second": 732.56,
897
+ "eval_steps_per_second": 11.522,
898
+ "step": 1200
899
+ },
900
+ {
901
+ "epoch": 2.2,
902
+ "grad_norm": 0.2888661324977875,
903
+ "learning_rate": 1.2841666666666668e-05,
904
+ "loss": 0.0017,
905
+ "step": 1210
906
+ },
907
+ {
908
+ "epoch": 2.2181818181818183,
909
+ "grad_norm": 1.4225064516067505,
910
+ "learning_rate": 1.2758333333333335e-05,
911
+ "loss": 0.0017,
912
+ "step": 1220
913
+ },
914
+ {
915
+ "epoch": 2.2363636363636363,
916
+ "grad_norm": 0.5475151538848877,
917
+ "learning_rate": 1.2675000000000001e-05,
918
+ "loss": 0.0022,
919
+ "step": 1230
920
+ },
921
+ {
922
+ "epoch": 2.2545454545454544,
923
+ "grad_norm": 0.2563498914241791,
924
+ "learning_rate": 1.2591666666666668e-05,
925
+ "loss": 0.0026,
926
+ "step": 1240
927
+ },
928
+ {
929
+ "epoch": 2.2727272727272725,
930
+ "grad_norm": 0.09335105866193771,
931
+ "learning_rate": 1.2508333333333334e-05,
932
+ "loss": 0.0013,
933
+ "step": 1250
934
+ },
935
+ {
936
+ "epoch": 2.290909090909091,
937
+ "grad_norm": 0.08890422433614731,
938
+ "learning_rate": 1.2425e-05,
939
+ "loss": 0.0015,
940
+ "step": 1260
941
+ },
942
+ {
943
+ "epoch": 2.309090909090909,
944
+ "grad_norm": 0.0670776441693306,
945
+ "learning_rate": 1.2341666666666667e-05,
946
+ "loss": 0.0015,
947
+ "step": 1270
948
+ },
949
+ {
950
+ "epoch": 2.327272727272727,
951
+ "grad_norm": 0.09385448694229126,
952
+ "learning_rate": 1.2258333333333334e-05,
953
+ "loss": 0.0022,
954
+ "step": 1280
955
+ },
956
+ {
957
+ "epoch": 2.3454545454545457,
958
+ "grad_norm": 0.31550052762031555,
959
+ "learning_rate": 1.2175000000000001e-05,
960
+ "loss": 0.0082,
961
+ "step": 1290
962
+ },
963
+ {
964
+ "epoch": 2.3636363636363638,
965
+ "grad_norm": 0.14805318415164948,
966
+ "learning_rate": 1.2091666666666668e-05,
967
+ "loss": 0.003,
968
+ "step": 1300
969
+ },
970
+ {
971
+ "epoch": 2.381818181818182,
972
+ "grad_norm": 0.5259885787963867,
973
+ "learning_rate": 1.2008333333333335e-05,
974
+ "loss": 0.0038,
975
+ "step": 1310
976
+ },
977
+ {
978
+ "epoch": 2.4,
979
+ "grad_norm": 0.37176281213760376,
980
+ "learning_rate": 1.1925e-05,
981
+ "loss": 0.0017,
982
+ "step": 1320
983
+ },
984
+ {
985
+ "epoch": 2.418181818181818,
986
+ "grad_norm": 0.33867013454437256,
987
+ "learning_rate": 1.1841666666666667e-05,
988
+ "loss": 0.0026,
989
+ "step": 1330
990
+ },
991
+ {
992
+ "epoch": 2.4363636363636365,
993
+ "grad_norm": 0.04352513328194618,
994
+ "learning_rate": 1.1758333333333334e-05,
995
+ "loss": 0.002,
996
+ "step": 1340
997
+ },
998
+ {
999
+ "epoch": 2.4545454545454546,
1000
+ "grad_norm": 0.35843801498413086,
1001
+ "learning_rate": 1.1675000000000001e-05,
1002
+ "loss": 0.0028,
1003
+ "step": 1350
1004
+ },
1005
+ {
1006
+ "epoch": 2.4727272727272727,
1007
+ "grad_norm": 0.5411182045936584,
1008
+ "learning_rate": 1.1591666666666668e-05,
1009
+ "loss": 0.0019,
1010
+ "step": 1360
1011
+ },
1012
+ {
1013
+ "epoch": 2.4909090909090907,
1014
+ "grad_norm": 0.08427491784095764,
1015
+ "learning_rate": 1.1508333333333335e-05,
1016
+ "loss": 0.0017,
1017
+ "step": 1370
1018
+ },
1019
+ {
1020
+ "epoch": 2.509090909090909,
1021
+ "grad_norm": 0.27736711502075195,
1022
+ "learning_rate": 1.1425000000000002e-05,
1023
+ "loss": 0.0023,
1024
+ "step": 1380
1025
+ },
1026
+ {
1027
+ "epoch": 2.5272727272727273,
1028
+ "grad_norm": 0.06100330501794815,
1029
+ "learning_rate": 1.1341666666666668e-05,
1030
+ "loss": 0.0017,
1031
+ "step": 1390
1032
+ },
1033
+ {
1034
+ "epoch": 2.5454545454545454,
1035
+ "grad_norm": 0.07736339420080185,
1036
+ "learning_rate": 1.1258333333333334e-05,
1037
+ "loss": 0.0038,
1038
+ "step": 1400
1039
+ },
1040
+ {
1041
+ "epoch": 2.5636363636363635,
1042
+ "grad_norm": 0.15126390755176544,
1043
+ "learning_rate": 1.1175e-05,
1044
+ "loss": 0.0009,
1045
+ "step": 1410
1046
+ },
1047
+ {
1048
+ "epoch": 2.581818181818182,
1049
+ "grad_norm": 0.0258785467594862,
1050
+ "learning_rate": 1.1091666666666667e-05,
1051
+ "loss": 0.001,
1052
+ "step": 1420
1053
+ },
1054
+ {
1055
+ "epoch": 2.6,
1056
+ "grad_norm": 0.8827760815620422,
1057
+ "learning_rate": 1.1008333333333334e-05,
1058
+ "loss": 0.003,
1059
+ "step": 1430
1060
+ },
1061
+ {
1062
+ "epoch": 2.618181818181818,
1063
+ "grad_norm": 0.14176161587238312,
1064
+ "learning_rate": 1.0925000000000001e-05,
1065
+ "loss": 0.0023,
1066
+ "step": 1440
1067
+ },
1068
+ {
1069
+ "epoch": 2.6363636363636362,
1070
+ "grad_norm": 0.17485152184963226,
1071
+ "learning_rate": 1.0841666666666668e-05,
1072
+ "loss": 0.0023,
1073
+ "step": 1450
1074
+ },
1075
+ {
1076
+ "epoch": 2.6545454545454543,
1077
+ "grad_norm": 0.1624346673488617,
1078
+ "learning_rate": 1.0758333333333335e-05,
1079
+ "loss": 0.0016,
1080
+ "step": 1460
1081
+ },
1082
+ {
1083
+ "epoch": 2.672727272727273,
1084
+ "grad_norm": 0.32750600576400757,
1085
+ "learning_rate": 1.0675e-05,
1086
+ "loss": 0.0018,
1087
+ "step": 1470
1088
+ },
1089
+ {
1090
+ "epoch": 2.690909090909091,
1091
+ "grad_norm": 0.3369393050670624,
1092
+ "learning_rate": 1.0591666666666667e-05,
1093
+ "loss": 0.0047,
1094
+ "step": 1480
1095
+ },
1096
+ {
1097
+ "epoch": 2.709090909090909,
1098
+ "grad_norm": 0.19913850724697113,
1099
+ "learning_rate": 1.0508333333333334e-05,
1100
+ "loss": 0.0029,
1101
+ "step": 1490
1102
+ },
1103
+ {
1104
+ "epoch": 2.7272727272727275,
1105
+ "grad_norm": 0.6039556264877319,
1106
+ "learning_rate": 1.0425000000000001e-05,
1107
+ "loss": 0.0029,
1108
+ "step": 1500
1109
+ },
1110
+ {
1111
+ "epoch": 2.7272727272727275,
1112
+ "eval_f1": 0.9707426856714179,
1113
+ "eval_f2": 0.9838807785888077,
1114
+ "eval_loss": 0.008097349666059017,
1115
+ "eval_precision": 0.9496086105675147,
1116
+ "eval_recall": 0.992838874680307,
1117
+ "eval_runtime": 10.7985,
1118
+ "eval_samples_per_second": 724.178,
1119
+ "eval_steps_per_second": 11.391,
1120
+ "step": 1500
1121
+ },
1122
+ {
1123
+ "epoch": 2.7454545454545456,
1124
+ "grad_norm": 0.15625803172588348,
1125
+ "learning_rate": 1.0341666666666668e-05,
1126
+ "loss": 0.001,
1127
+ "step": 1510
1128
+ },
1129
+ {
1130
+ "epoch": 2.7636363636363637,
1131
+ "grad_norm": 0.5355175733566284,
1132
+ "learning_rate": 1.0258333333333335e-05,
1133
+ "loss": 0.0015,
1134
+ "step": 1520
1135
+ },
1136
+ {
1137
+ "epoch": 2.7818181818181817,
1138
+ "grad_norm": 0.054884154349565506,
1139
+ "learning_rate": 1.0175000000000002e-05,
1140
+ "loss": 0.0043,
1141
+ "step": 1530
1142
+ },
1143
+ {
1144
+ "epoch": 2.8,
1145
+ "grad_norm": 0.14347773790359497,
1146
+ "learning_rate": 1.0091666666666669e-05,
1147
+ "loss": 0.0012,
1148
+ "step": 1540
1149
+ },
1150
+ {
1151
+ "epoch": 2.8181818181818183,
1152
+ "grad_norm": 0.09393730759620667,
1153
+ "learning_rate": 1.0008333333333334e-05,
1154
+ "loss": 0.0026,
1155
+ "step": 1550
1156
+ },
1157
+ {
1158
+ "epoch": 2.8363636363636364,
1159
+ "grad_norm": 0.2671602964401245,
1160
+ "learning_rate": 9.925e-06,
1161
+ "loss": 0.0006,
1162
+ "step": 1560
1163
+ },
1164
+ {
1165
+ "epoch": 2.8545454545454545,
1166
+ "grad_norm": 0.04782993346452713,
1167
+ "learning_rate": 9.841666666666668e-06,
1168
+ "loss": 0.0011,
1169
+ "step": 1570
1170
+ },
1171
+ {
1172
+ "epoch": 2.8727272727272726,
1173
+ "grad_norm": 0.5545538067817688,
1174
+ "learning_rate": 9.758333333333334e-06,
1175
+ "loss": 0.0034,
1176
+ "step": 1580
1177
+ },
1178
+ {
1179
+ "epoch": 2.8909090909090907,
1180
+ "grad_norm": 0.18771076202392578,
1181
+ "learning_rate": 9.675000000000001e-06,
1182
+ "loss": 0.0014,
1183
+ "step": 1590
1184
+ },
1185
+ {
1186
+ "epoch": 2.909090909090909,
1187
+ "grad_norm": 0.022369615733623505,
1188
+ "learning_rate": 9.591666666666667e-06,
1189
+ "loss": 0.0007,
1190
+ "step": 1600
1191
+ },
1192
+ {
1193
+ "epoch": 2.9272727272727272,
1194
+ "grad_norm": 0.569296658039093,
1195
+ "learning_rate": 9.508333333333333e-06,
1196
+ "loss": 0.0016,
1197
+ "step": 1610
1198
+ },
1199
+ {
1200
+ "epoch": 2.9454545454545453,
1201
+ "grad_norm": 0.07517626136541367,
1202
+ "learning_rate": 9.425e-06,
1203
+ "loss": 0.0012,
1204
+ "step": 1620
1205
+ },
1206
+ {
1207
+ "epoch": 2.963636363636364,
1208
+ "grad_norm": 0.4265158772468567,
1209
+ "learning_rate": 9.341666666666667e-06,
1210
+ "loss": 0.0013,
1211
+ "step": 1630
1212
+ },
1213
+ {
1214
+ "epoch": 2.981818181818182,
1215
+ "grad_norm": 0.31167715787887573,
1216
+ "learning_rate": 9.258333333333334e-06,
1217
+ "loss": 0.0014,
1218
+ "step": 1640
1219
+ },
1220
+ {
1221
+ "epoch": 3.0,
1222
+ "grad_norm": 0.6325229406356812,
1223
+ "learning_rate": 9.175000000000001e-06,
1224
+ "loss": 0.0007,
1225
+ "step": 1650
1226
+ },
1227
+ {
1228
+ "epoch": 3.018181818181818,
1229
+ "grad_norm": 0.04929906874895096,
1230
+ "learning_rate": 9.091666666666668e-06,
1231
+ "loss": 0.0006,
1232
+ "step": 1660
1233
+ },
1234
+ {
1235
+ "epoch": 3.036363636363636,
1236
+ "grad_norm": 0.22075557708740234,
1237
+ "learning_rate": 9.008333333333335e-06,
1238
+ "loss": 0.0006,
1239
+ "step": 1670
1240
+ },
1241
+ {
1242
+ "epoch": 3.0545454545454547,
1243
+ "grad_norm": 0.2008703649044037,
1244
+ "learning_rate": 8.925e-06,
1245
+ "loss": 0.0018,
1246
+ "step": 1680
1247
+ },
1248
+ {
1249
+ "epoch": 3.0727272727272728,
1250
+ "grad_norm": 0.15318256616592407,
1251
+ "learning_rate": 8.841666666666667e-06,
1252
+ "loss": 0.0011,
1253
+ "step": 1690
1254
+ },
1255
+ {
1256
+ "epoch": 3.090909090909091,
1257
+ "grad_norm": 0.19851188361644745,
1258
+ "learning_rate": 8.758333333333334e-06,
1259
+ "loss": 0.0005,
1260
+ "step": 1700
1261
+ },
1262
+ {
1263
+ "epoch": 3.109090909090909,
1264
+ "grad_norm": 0.02609218843281269,
1265
+ "learning_rate": 8.675e-06,
1266
+ "loss": 0.0014,
1267
+ "step": 1710
1268
+ },
1269
+ {
1270
+ "epoch": 3.1272727272727274,
1271
+ "grad_norm": 0.02781720645725727,
1272
+ "learning_rate": 8.591666666666668e-06,
1273
+ "loss": 0.0004,
1274
+ "step": 1720
1275
+ },
1276
+ {
1277
+ "epoch": 3.1454545454545455,
1278
+ "grad_norm": 0.17195935547351837,
1279
+ "learning_rate": 8.508333333333335e-06,
1280
+ "loss": 0.0011,
1281
+ "step": 1730
1282
+ },
1283
+ {
1284
+ "epoch": 3.1636363636363636,
1285
+ "grad_norm": 0.04604584723711014,
1286
+ "learning_rate": 8.425000000000001e-06,
1287
+ "loss": 0.0017,
1288
+ "step": 1740
1289
+ },
1290
+ {
1291
+ "epoch": 3.1818181818181817,
1292
+ "grad_norm": 0.01334014069288969,
1293
+ "learning_rate": 8.341666666666667e-06,
1294
+ "loss": 0.0005,
1295
+ "step": 1750
1296
+ },
1297
+ {
1298
+ "epoch": 3.2,
1299
+ "grad_norm": 0.10181070119142532,
1300
+ "learning_rate": 8.258333333333334e-06,
1301
+ "loss": 0.0003,
1302
+ "step": 1760
1303
+ },
1304
+ {
1305
+ "epoch": 3.2181818181818183,
1306
+ "grad_norm": 0.029040852561593056,
1307
+ "learning_rate": 8.175e-06,
1308
+ "loss": 0.0002,
1309
+ "step": 1770
1310
+ },
1311
+ {
1312
+ "epoch": 3.2363636363636363,
1313
+ "grad_norm": 1.0948010683059692,
1314
+ "learning_rate": 8.091666666666667e-06,
1315
+ "loss": 0.0006,
1316
+ "step": 1780
1317
+ },
1318
+ {
1319
+ "epoch": 3.2545454545454544,
1320
+ "grad_norm": 0.19002945721149445,
1321
+ "learning_rate": 8.008333333333334e-06,
1322
+ "loss": 0.0008,
1323
+ "step": 1790
1324
+ },
1325
+ {
1326
+ "epoch": 3.2727272727272725,
1327
+ "grad_norm": 0.02836296707391739,
1328
+ "learning_rate": 7.925000000000001e-06,
1329
+ "loss": 0.0006,
1330
+ "step": 1800
1331
+ },
1332
+ {
1333
+ "epoch": 3.2727272727272725,
1334
+ "eval_f1": 0.9827235772357723,
1335
+ "eval_f2": 0.9866340169370472,
1336
+ "eval_loss": 0.0109314676374197,
1337
+ "eval_precision": 0.9762746087834427,
1338
+ "eval_recall": 0.9892583120204603,
1339
+ "eval_runtime": 10.4766,
1340
+ "eval_samples_per_second": 746.427,
1341
+ "eval_steps_per_second": 11.74,
1342
+ "step": 1800
1343
+ },
1344
+ {
1345
+ "epoch": 3.290909090909091,
1346
+ "grad_norm": 0.018972614780068398,
1347
+ "learning_rate": 7.841666666666668e-06,
1348
+ "loss": 0.001,
1349
+ "step": 1810
1350
+ },
1351
+ {
1352
+ "epoch": 3.309090909090909,
1353
+ "grad_norm": 0.003141665132716298,
1354
+ "learning_rate": 7.758333333333335e-06,
1355
+ "loss": 0.001,
1356
+ "step": 1820
1357
+ },
1358
+ {
1359
+ "epoch": 3.327272727272727,
1360
+ "grad_norm": 0.029703687876462936,
1361
+ "learning_rate": 7.675e-06,
1362
+ "loss": 0.0007,
1363
+ "step": 1830
1364
+ },
1365
+ {
1366
+ "epoch": 3.3454545454545457,
1367
+ "grad_norm": 0.18382185697555542,
1368
+ "learning_rate": 7.591666666666667e-06,
1369
+ "loss": 0.0004,
1370
+ "step": 1840
1371
+ },
1372
+ {
1373
+ "epoch": 3.3636363636363638,
1374
+ "grad_norm": 0.05236556753516197,
1375
+ "learning_rate": 7.508333333333334e-06,
1376
+ "loss": 0.002,
1377
+ "step": 1850
1378
+ },
1379
+ {
1380
+ "epoch": 3.381818181818182,
1381
+ "grad_norm": 0.17387185990810394,
1382
+ "learning_rate": 7.425000000000001e-06,
1383
+ "loss": 0.0009,
1384
+ "step": 1860
1385
+ },
1386
+ {
1387
+ "epoch": 3.4,
1388
+ "grad_norm": 0.008212663233280182,
1389
+ "learning_rate": 7.341666666666667e-06,
1390
+ "loss": 0.0007,
1391
+ "step": 1870
1392
+ },
1393
+ {
1394
+ "epoch": 3.418181818181818,
1395
+ "grad_norm": 0.22597701847553253,
1396
+ "learning_rate": 7.258333333333334e-06,
1397
+ "loss": 0.001,
1398
+ "step": 1880
1399
+ },
1400
+ {
1401
+ "epoch": 3.4363636363636365,
1402
+ "grad_norm": 0.07276669144630432,
1403
+ "learning_rate": 7.175000000000001e-06,
1404
+ "loss": 0.0017,
1405
+ "step": 1890
1406
+ },
1407
+ {
1408
+ "epoch": 3.4545454545454546,
1409
+ "grad_norm": 0.29078298807144165,
1410
+ "learning_rate": 7.091666666666667e-06,
1411
+ "loss": 0.0004,
1412
+ "step": 1900
1413
+ },
1414
+ {
1415
+ "epoch": 3.4727272727272727,
1416
+ "grad_norm": 0.11019200086593628,
1417
+ "learning_rate": 7.008333333333334e-06,
1418
+ "loss": 0.0005,
1419
+ "step": 1910
1420
+ },
1421
+ {
1422
+ "epoch": 3.4909090909090907,
1423
+ "grad_norm": 0.017450423911213875,
1424
+ "learning_rate": 6.925000000000001e-06,
1425
+ "loss": 0.0003,
1426
+ "step": 1920
1427
+ },
1428
+ {
1429
+ "epoch": 3.509090909090909,
1430
+ "grad_norm": 0.023930951952934265,
1431
+ "learning_rate": 6.8416666666666675e-06,
1432
+ "loss": 0.0013,
1433
+ "step": 1930
1434
+ },
1435
+ {
1436
+ "epoch": 3.5272727272727273,
1437
+ "grad_norm": 0.1692740023136139,
1438
+ "learning_rate": 6.7583333333333336e-06,
1439
+ "loss": 0.0033,
1440
+ "step": 1940
1441
+ },
1442
+ {
1443
+ "epoch": 3.5454545454545454,
1444
+ "grad_norm": 0.031825270503759384,
1445
+ "learning_rate": 6.6750000000000005e-06,
1446
+ "loss": 0.0008,
1447
+ "step": 1950
1448
+ },
1449
+ {
1450
+ "epoch": 3.5636363636363635,
1451
+ "grad_norm": 0.004583127796649933,
1452
+ "learning_rate": 6.591666666666667e-06,
1453
+ "loss": 0.0005,
1454
+ "step": 1960
1455
+ },
1456
+ {
1457
+ "epoch": 3.581818181818182,
1458
+ "grad_norm": 0.19434763491153717,
1459
+ "learning_rate": 6.508333333333334e-06,
1460
+ "loss": 0.0003,
1461
+ "step": 1970
1462
+ },
1463
+ {
1464
+ "epoch": 3.6,
1465
+ "grad_norm": 0.007167825475335121,
1466
+ "learning_rate": 6.425e-06,
1467
+ "loss": 0.0002,
1468
+ "step": 1980
1469
+ },
1470
+ {
1471
+ "epoch": 3.618181818181818,
1472
+ "grad_norm": 0.24422968924045563,
1473
+ "learning_rate": 6.341666666666667e-06,
1474
+ "loss": 0.0003,
1475
+ "step": 1990
1476
+ },
1477
+ {
1478
+ "epoch": 3.6363636363636362,
1479
+ "grad_norm": 0.00559116480872035,
1480
+ "learning_rate": 6.258333333333334e-06,
1481
+ "loss": 0.0001,
1482
+ "step": 2000
1483
+ },
1484
+ {
1485
+ "epoch": 3.6545454545454543,
1486
+ "grad_norm": 0.3058757185935974,
1487
+ "learning_rate": 6.175000000000001e-06,
1488
+ "loss": 0.0015,
1489
+ "step": 2010
1490
+ },
1491
+ {
1492
+ "epoch": 3.672727272727273,
1493
+ "grad_norm": 0.008120411075651646,
1494
+ "learning_rate": 6.091666666666667e-06,
1495
+ "loss": 0.0008,
1496
+ "step": 2020
1497
+ },
1498
+ {
1499
+ "epoch": 3.690909090909091,
1500
+ "grad_norm": 0.007178381085395813,
1501
+ "learning_rate": 6.008333333333334e-06,
1502
+ "loss": 0.0003,
1503
+ "step": 2030
1504
+ },
1505
+ {
1506
+ "epoch": 3.709090909090909,
1507
+ "grad_norm": 0.12139607220888138,
1508
+ "learning_rate": 5.925000000000001e-06,
1509
+ "loss": 0.0001,
1510
+ "step": 2040
1511
+ },
1512
+ {
1513
+ "epoch": 3.7272727272727275,
1514
+ "grad_norm": 0.16555677354335785,
1515
+ "learning_rate": 5.841666666666667e-06,
1516
+ "loss": 0.0004,
1517
+ "step": 2050
1518
+ },
1519
+ {
1520
+ "epoch": 3.7454545454545456,
1521
+ "grad_norm": 0.08208701014518738,
1522
+ "learning_rate": 5.758333333333334e-06,
1523
+ "loss": 0.0002,
1524
+ "step": 2060
1525
+ },
1526
+ {
1527
+ "epoch": 3.7636363636363637,
1528
+ "grad_norm": 0.0696110725402832,
1529
+ "learning_rate": 5.675000000000001e-06,
1530
+ "loss": 0.0008,
1531
+ "step": 2070
1532
+ },
1533
+ {
1534
+ "epoch": 3.7818181818181817,
1535
+ "grad_norm": 0.019171856343746185,
1536
+ "learning_rate": 5.591666666666668e-06,
1537
+ "loss": 0.0009,
1538
+ "step": 2080
1539
+ },
1540
+ {
1541
+ "epoch": 3.8,
1542
+ "grad_norm": 0.011577253229916096,
1543
+ "learning_rate": 5.508333333333334e-06,
1544
+ "loss": 0.0007,
1545
+ "step": 2090
1546
+ },
1547
+ {
1548
+ "epoch": 3.8181818181818183,
1549
+ "grad_norm": 0.1947954148054123,
1550
+ "learning_rate": 5.4250000000000006e-06,
1551
+ "loss": 0.0005,
1552
+ "step": 2100
1553
+ },
1554
+ {
1555
+ "epoch": 3.8181818181818183,
1556
+ "eval_f1": 0.9789500380420999,
1557
+ "eval_f2": 0.9838907014681892,
1558
+ "eval_loss": 0.012196212075650692,
1559
+ "eval_precision": 0.9708249496981891,
1560
+ "eval_recall": 0.9872122762148338,
1561
+ "eval_runtime": 10.6686,
1562
+ "eval_samples_per_second": 732.992,
1563
+ "eval_steps_per_second": 11.529,
1564
+ "step": 2100
1565
+ }
1566
+ ],
1567
+ "logging_steps": 10,
1568
+ "max_steps": 2750,
1569
+ "num_input_tokens_seen": 0,
1570
+ "num_train_epochs": 5,
1571
+ "save_steps": 300,
1572
+ "stateful_callbacks": {
1573
+ "EarlyStoppingCallback": {
1574
+ "args": {
1575
+ "early_stopping_patience": 3,
1576
+ "early_stopping_threshold": 0.0
1577
+ },
1578
+ "attributes": {
1579
+ "early_stopping_patience_counter": 1
1580
+ }
1581
+ },
1582
+ "TrainerControl": {
1583
+ "args": {
1584
+ "should_epoch_stop": false,
1585
+ "should_evaluate": false,
1586
+ "should_log": false,
1587
+ "should_save": true,
1588
+ "should_training_stop": false
1589
+ },
1590
+ "attributes": {}
1591
+ }
1592
+ },
1593
+ "total_flos": 7.07084650174464e+16,
1594
+ "train_batch_size": 64,
1595
+ "trial_name": null,
1596
+ "trial_params": null
1597
+ }
checkpoint-2100/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45ac4db2857ed5af2a19875a4640aaf5ce2df14eae952755a36f8b361f5940d7
3
+ size 5496
checkpoint-2100/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "RobertaForSequenceClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "classifier_dropout": null,
8
+ "eos_token_id": 2,
9
+ "gradient_checkpointing": false,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 3072,
15
+ "layer_norm_eps": 1e-05,
16
+ "max_position_embeddings": 514,
17
+ "model_type": "roberta",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "pad_token_id": 1,
21
+ "position_embedding_type": "absolute",
22
+ "torch_dtype": "float32",
23
+ "transformers_version": "4.50.3",
24
+ "type_vocab_size": 1,
25
+ "use_cache": true,
26
+ "vocab_size": 50265
27
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2673e42d4637733a09e0553abdaedcc9fa4e6ac22cf009b506a3776cc655d6e4
3
+ size 498612824
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "<mask>",
25
+ "lstrip": true,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": true,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "<unk>",
46
+ "lstrip": false,
47
+ "normalized": true,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "50264": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": true,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ }
44
+ },
45
+ "bos_token": "<s>",
46
+ "clean_up_tokenization_spaces": false,
47
+ "cls_token": "<s>",
48
+ "eos_token": "</s>",
49
+ "errors": "replace",
50
+ "extra_special_tokens": {},
51
+ "mask_token": "<mask>",
52
+ "max_len": 512,
53
+ "model_max_length": 512,
54
+ "pad_token": "<pad>",
55
+ "sep_token": "</s>",
56
+ "tokenizer_class": "RobertaTokenizer",
57
+ "trim_offsets": true,
58
+ "unk_token": "<unk>"
59
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0a880bc2113f1c59c5c8811e0cfbdcc1f299ce53865365dda540b7b7d0db8df
3
+ size 5496
vocab.json ADDED
The diff for this file is too large to render. See raw diff