Vikaspandey582003 commited on
Commit
c00d096
·
verified ·
1 Parent(s): 7a51852

checkpoint step 50

Browse files
checkpoint-50/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8774212fba617daa40bef4cef915e047bc2e6feee4ceebd5580bf1ee2fb50370
3
  size 80792880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8f6b3da54d6d5e4c4c6546f5e250e766dc426eb113dc1c45ab4ed567ffd48b1
3
  size 80792880
checkpoint-50/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0f91af0fa995043cb9e216dca74e80d09ee819a9d274a9dcf260d0411a6b48ed
3
  size 161816251
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29b4a30cf81e86d5a32d909da97f25a65e4690b495e874547c51260ea02d771e
3
  size 161816251
checkpoint-50/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:42c8957cbd17b37e5391f10035f189ea0492f94bda207d033f16b09cc832dbcf
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:77323eeff9b7a4a4a795c260f92519ee32bd1dec272ef8a510f9120534b72ca2
3
  size 14645
checkpoint-50/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:259b1303d82fa78c2e55eeb7df6096e0b57593f81a8f6658f2d1675d51e39965
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78a83610ba4b367fed3f2c1f69cd0080618139093c0ae8eb9639608a8f1d40eb
3
  size 1465
checkpoint-50/trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.0021551724137931034,
6
  "eval_steps": 500,
7
  "global_step": 50,
8
  "is_hyper_param_search": false,
@@ -15,26 +15,26 @@
15
  "clip_ratio/low_mean": 0.0,
16
  "clip_ratio/low_min": 0.0,
17
  "clip_ratio/region_mean": 0.0,
18
- "completions/clipped_ratio": 0.0,
19
- "completions/max_length": 130.0,
20
- "completions/max_terminated_length": 130.0,
21
- "completions/mean_length": 68.5,
22
- "completions/mean_terminated_length": 68.5,
23
- "completions/min_length": 26.8,
24
- "completions/min_terminated_length": 26.8,
25
- "entropy": 0.25826080311089755,
26
- "epoch": 0.00021551724137931034,
27
- "frac_reward_zero_std": 0.1,
28
- "grad_norm": 0.28125,
29
  "learning_rate": 1.0000000000000002e-06,
30
- "loss": 0.10238287448883057,
31
- "num_tokens": 12064.0,
32
- "reward": 0.3290319949388504,
33
- "reward_std": 0.40028320252895355,
34
- "rewards/reward_fn/mean": 0.3290319949388504,
35
- "rewards/reward_fn/std": 0.400283208489418,
36
  "step": 5,
37
- "step_time": 22.194597821800016
38
  },
39
  {
40
  "clip_ratio/high_max": 0.0,
@@ -42,26 +42,26 @@
42
  "clip_ratio/low_mean": 0.0,
43
  "clip_ratio/low_min": 0.0,
44
  "clip_ratio/region_mean": 0.0,
45
- "completions/clipped_ratio": 0.0,
46
- "completions/max_length": 87.6,
47
- "completions/max_terminated_length": 87.6,
48
- "completions/mean_length": 48.525,
49
- "completions/mean_terminated_length": 48.525,
50
- "completions/min_length": 17.0,
51
- "completions/min_terminated_length": 17.0,
52
- "entropy": 0.2673827801831067,
53
- "epoch": 0.0004310344827586207,
54
- "frac_reward_zero_std": 0.1,
55
- "grad_norm": 0.484375,
56
  "learning_rate": 2.25e-06,
57
- "loss": 0.05849265456199646,
58
- "num_tokens": 23089.0,
59
- "reward": 0.45969198942184447,
60
- "reward_std": 0.2795014828443527,
61
- "rewards/reward_fn/mean": 0.45969198942184447,
62
- "rewards/reward_fn/std": 0.27950150668621065,
63
  "step": 10,
64
- "step_time": 16.331819552399928
65
  },
66
  {
67
  "clip_ratio/high_max": 0.0,
@@ -69,26 +69,26 @@
69
  "clip_ratio/low_mean": 0.0,
70
  "clip_ratio/low_min": 0.0,
71
  "clip_ratio/region_mean": 0.0,
72
- "completions/clipped_ratio": 0.0,
73
- "completions/max_length": 85.6,
74
- "completions/max_terminated_length": 85.6,
75
- "completions/mean_length": 47.35,
76
- "completions/mean_terminated_length": 47.35,
77
- "completions/min_length": 25.0,
78
- "completions/min_terminated_length": 25.0,
79
- "entropy": 0.20174795808270574,
80
- "epoch": 0.000646551724137931,
81
- "frac_reward_zero_std": 0.4,
82
- "grad_norm": 0.66015625,
83
  "learning_rate": 3.5e-06,
84
- "loss": -0.03555725216865539,
85
- "num_tokens": 34239.0,
86
- "reward": 0.5545999944210053,
87
- "reward_std": 0.32832055240869523,
88
- "rewards/reward_fn/mean": 0.5545999944210053,
89
- "rewards/reward_fn/std": 0.3283205583691597,
90
  "step": 15,
91
- "step_time": 16.44816417159991
92
  },
93
  {
94
  "clip_ratio/high_max": 0.0,
@@ -96,26 +96,26 @@
96
  "clip_ratio/low_mean": 0.0,
97
  "clip_ratio/low_min": 0.0,
98
  "clip_ratio/region_mean": 0.0,
99
- "completions/clipped_ratio": 0.0,
100
- "completions/max_length": 98.2,
101
- "completions/max_terminated_length": 98.2,
102
- "completions/mean_length": 53.6,
103
- "completions/mean_terminated_length": 53.6,
104
- "completions/min_length": 25.0,
105
- "completions/min_terminated_length": 25.0,
106
- "entropy": 0.2620095924474299,
107
- "epoch": 0.0008620689655172414,
108
- "frac_reward_zero_std": 0.2,
109
- "grad_norm": 0.5390625,
110
  "learning_rate": 4.75e-06,
111
- "loss": 0.017401468753814698,
112
- "num_tokens": 45399.0,
113
- "reward": 0.4466240078210831,
114
- "reward_std": 0.27573536019772293,
115
- "rewards/reward_fn/mean": 0.4466240078210831,
116
- "rewards/reward_fn/std": 0.27573537137359383,
117
  "step": 20,
118
- "step_time": 17.988385831799953
119
  },
120
  {
121
  "clip_ratio/high_max": 0.0,
@@ -123,26 +123,26 @@
123
  "clip_ratio/low_mean": 0.0,
124
  "clip_ratio/low_min": 0.0,
125
  "clip_ratio/region_mean": 0.0,
126
- "completions/clipped_ratio": 0.0,
127
- "completions/max_length": 123.8,
128
- "completions/max_terminated_length": 123.8,
129
- "completions/mean_length": 59.15,
130
- "completions/mean_terminated_length": 59.15,
131
- "completions/min_length": 21.8,
132
- "completions/min_terminated_length": 21.8,
133
- "entropy": 0.33701689867302775,
134
- "epoch": 0.0010775862068965517,
135
- "frac_reward_zero_std": 0.0,
136
- "grad_norm": 0.74609375,
137
- "learning_rate": 4.965517241379311e-06,
138
- "loss": -0.02674364447593689,
139
- "num_tokens": 56993.0,
140
- "reward": 0.5006439983844757,
141
- "reward_std": 0.2351181447505951,
142
- "rewards/reward_fn/mean": 0.5006439983844757,
143
- "rewards/reward_fn/std": 0.23511814773082734,
144
  "step": 25,
145
- "step_time": 21.11461545459997
146
  },
147
  {
148
  "clip_ratio/high_max": 0.0,
@@ -150,26 +150,26 @@
150
  "clip_ratio/low_mean": 0.0,
151
  "clip_ratio/low_min": 0.0,
152
  "clip_ratio/region_mean": 0.0,
153
- "completions/clipped_ratio": 0.0,
154
- "completions/max_length": 110.2,
155
- "completions/max_terminated_length": 110.2,
156
- "completions/mean_length": 47.95,
157
- "completions/mean_terminated_length": 47.95,
158
- "completions/min_length": 15.8,
159
- "completions/min_terminated_length": 15.8,
160
- "entropy": 0.24504410615190864,
161
- "epoch": 0.001293103448275862,
162
- "frac_reward_zero_std": 0.4,
163
- "grad_norm": 0.92578125,
164
- "learning_rate": 4.922413793103449e-06,
165
- "loss": -0.05023183822631836,
166
- "num_tokens": 67659.0,
167
- "reward": 0.6457239985466003,
168
- "reward_std": 0.11179900387069211,
169
- "rewards/reward_fn/mean": 0.6457239985466003,
170
- "rewards/reward_fn/std": 0.11179901438299567,
171
  "step": 30,
172
- "step_time": 19.134268762199827
173
  },
174
  {
175
  "clip_ratio/high_max": 0.0,
@@ -177,26 +177,26 @@
177
  "clip_ratio/low_mean": 0.0,
178
  "clip_ratio/low_min": 0.0,
179
  "clip_ratio/region_mean": 0.0,
180
- "completions/clipped_ratio": 0.1,
181
- "completions/max_length": 150.8,
182
- "completions/max_terminated_length": 120.2,
183
- "completions/mean_length": 73.375,
184
- "completions/mean_terminated_length": 54.36666717529297,
185
- "completions/min_length": 15.2,
186
- "completions/min_terminated_length": 15.2,
187
- "entropy": 0.23285924410447478,
188
- "epoch": 0.0015086206896551724,
189
- "frac_reward_zero_std": 0.3,
190
- "grad_norm": 0.58203125,
191
- "learning_rate": 4.879310344827586e-06,
192
- "loss": -0.05149807929992676,
193
- "num_tokens": 79734.0,
194
- "reward": 0.46414998471736907,
195
- "reward_std": 0.4071369742392562,
196
- "rewards/reward_fn/mean": 0.46414998471736907,
197
- "rewards/reward_fn/std": 0.40713699695188554,
198
  "step": 35,
199
- "step_time": 24.679699858600042
200
  },
201
  {
202
  "clip_ratio/high_max": 0.0,
@@ -204,26 +204,26 @@
204
  "clip_ratio/low_mean": 0.0,
205
  "clip_ratio/low_min": 0.0,
206
  "clip_ratio/region_mean": 0.0,
207
- "completions/clipped_ratio": 0.0,
208
- "completions/max_length": 125.4,
209
- "completions/max_terminated_length": 125.4,
210
- "completions/mean_length": 68.925,
211
- "completions/mean_terminated_length": 68.925,
212
- "completions/min_length": 26.0,
213
- "completions/min_terminated_length": 26.0,
214
- "entropy": 0.3110098702833056,
215
- "epoch": 0.0017241379310344827,
216
- "frac_reward_zero_std": 0.1,
217
- "grad_norm": 0.419921875,
218
- "learning_rate": 4.836206896551724e-06,
219
- "loss": 0.030162644386291505,
220
- "num_tokens": 91695.0,
221
- "reward": 0.5143539935350419,
222
- "reward_std": 0.2790891878306866,
223
- "rewards/reward_fn/mean": 0.5143539935350419,
224
- "rewards/reward_fn/std": 0.279089218378067,
225
  "step": 40,
226
- "step_time": 21.565513409599998
227
  },
228
  {
229
  "clip_ratio/high_max": 0.0,
@@ -231,26 +231,26 @@
231
  "clip_ratio/low_mean": 0.0,
232
  "clip_ratio/low_min": 0.0,
233
  "clip_ratio/region_mean": 0.0,
234
- "completions/clipped_ratio": 0.0,
235
- "completions/max_length": 100.0,
236
- "completions/max_terminated_length": 100.0,
237
- "completions/mean_length": 53.875,
238
- "completions/mean_terminated_length": 53.875,
239
- "completions/min_length": 16.6,
240
- "completions/min_terminated_length": 16.6,
241
- "entropy": 0.23047098610550165,
242
- "epoch": 0.001939655172413793,
243
- "frac_reward_zero_std": 0.4,
244
- "grad_norm": 1.15625,
245
- "learning_rate": 4.793103448275862e-06,
246
- "loss": -0.04350074529647827,
247
- "num_tokens": 102994.0,
248
- "reward": 0.6398920059204102,
249
- "reward_std": 0.2279826147481799,
250
- "rewards/reward_fn/mean": 0.6398920059204102,
251
- "rewards/reward_fn/std": 0.22798261437565087,
252
  "step": 45,
253
- "step_time": 18.23975218899968
254
  },
255
  {
256
  "clip_ratio/high_max": 0.0,
@@ -258,32 +258,32 @@
258
  "clip_ratio/low_mean": 0.0,
259
  "clip_ratio/low_min": 0.0,
260
  "clip_ratio/region_mean": 0.0,
261
- "completions/clipped_ratio": 0.0,
262
- "completions/max_length": 133.4,
263
- "completions/max_terminated_length": 133.4,
264
- "completions/mean_length": 75.975,
265
- "completions/mean_terminated_length": 75.975,
266
- "completions/min_length": 23.8,
267
- "completions/min_terminated_length": 23.8,
268
- "entropy": 0.34613882582634686,
269
- "epoch": 0.0021551724137931034,
270
- "frac_reward_zero_std": 0.2,
271
- "grad_norm": 0.1318359375,
272
- "learning_rate": 4.75e-06,
273
- "loss": -0.03668657541275024,
274
- "num_tokens": 115289.0,
275
- "reward": 0.33042599707841874,
276
- "reward_std": 0.5918701648712158,
277
- "rewards/reward_fn/mean": 0.33042599707841874,
278
- "rewards/reward_fn/std": 0.5918702006340026,
279
  "step": 50,
280
- "step_time": 22.43372436519985
281
  }
282
  ],
283
  "logging_steps": 5,
284
- "max_steps": 600,
285
- "num_input_tokens_seen": 115289,
286
- "num_train_epochs": 1,
287
  "save_steps": 50,
288
  "stateful_callbacks": {
289
  "TrainerControl": {
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 25.0,
6
  "eval_steps": 500,
7
  "global_step": 50,
8
  "is_hyper_param_search": false,
 
15
  "clip_ratio/low_mean": 0.0,
16
  "clip_ratio/low_min": 0.0,
17
  "clip_ratio/region_mean": 0.0,
18
+ "completions/clipped_ratio": 0.1,
19
+ "completions/max_length": 193.0,
20
+ "completions/max_terminated_length": 163.0,
21
+ "completions/mean_length": 103.975,
22
+ "completions/mean_terminated_length": 90.39881134033203,
23
+ "completions/min_length": 17.6,
24
+ "completions/min_terminated_length": 17.6,
25
+ "entropy": 0.1820149033330381,
26
+ "epoch": 2.5,
27
+ "frac_reward_zero_std": 0.5,
28
+ "grad_norm": 0.25,
29
  "learning_rate": 1.0000000000000002e-06,
30
+ "loss": 0.0794088900089264,
31
+ "num_tokens": 14355.0,
32
+ "reward": 0.15020800828933717,
33
+ "reward_std": 0.6376187483081595,
34
+ "rewards/reward_fn/mean": 0.15020800828933717,
35
+ "rewards/reward_fn/std": 0.6376187764341011,
36
  "step": 5,
37
+ "step_time": 30.522303848797673
38
  },
39
  {
40
  "clip_ratio/high_max": 0.0,
 
42
  "clip_ratio/low_mean": 0.0,
43
  "clip_ratio/low_min": 0.0,
44
  "clip_ratio/region_mean": 0.0,
45
+ "completions/clipped_ratio": 0.025,
46
+ "completions/max_length": 151.2,
47
+ "completions/max_terminated_length": 141.6,
48
+ "completions/mean_length": 80.075,
49
+ "completions/mean_terminated_length": 77.78928833007812,
50
+ "completions/min_length": 17.8,
51
+ "completions/min_terminated_length": 17.8,
52
+ "entropy": 0.16221200795844198,
53
+ "epoch": 5.0,
54
+ "frac_reward_zero_std": 0.5,
55
+ "grad_norm": 0.228515625,
56
  "learning_rate": 2.25e-06,
57
+ "loss": 0.07169516086578369,
58
+ "num_tokens": 27462.0,
59
+ "reward": 0.4012819856405258,
60
+ "reward_std": 0.4128362699819263,
61
+ "rewards/reward_fn/mean": 0.4012819856405258,
62
+ "rewards/reward_fn/std": 0.4128363010211615,
63
  "step": 10,
64
+ "step_time": 25.01976956339822
65
  },
66
  {
67
  "clip_ratio/high_max": 0.0,
 
69
  "clip_ratio/low_mean": 0.0,
70
  "clip_ratio/low_min": 0.0,
71
  "clip_ratio/region_mean": 0.0,
72
+ "completions/clipped_ratio": 0.025,
73
+ "completions/max_length": 185.6,
74
+ "completions/max_terminated_length": 171.6,
75
+ "completions/mean_length": 83.65,
76
+ "completions/mean_terminated_length": 79.14285736083984,
77
+ "completions/min_length": 16.6,
78
+ "completions/min_terminated_length": 16.6,
79
+ "entropy": 0.14225535104051232,
80
+ "epoch": 7.5,
81
+ "frac_reward_zero_std": 0.3,
82
+ "grad_norm": 0.265625,
83
  "learning_rate": 3.5e-06,
84
+ "loss": 0.0717179834842682,
85
+ "num_tokens": 40740.0,
86
+ "reward": 0.10207997858524323,
87
+ "reward_std": 0.7454913818277419,
88
+ "rewards/reward_fn/mean": 0.10207997858524323,
89
+ "rewards/reward_fn/std": 0.7454914333298802,
90
  "step": 15,
91
+ "step_time": 29.626422570000432
92
  },
93
  {
94
  "clip_ratio/high_max": 0.0,
 
96
  "clip_ratio/low_mean": 0.0,
97
  "clip_ratio/low_min": 0.0,
98
  "clip_ratio/region_mean": 0.0,
99
+ "completions/clipped_ratio": 0.1,
100
+ "completions/max_length": 202.0,
101
+ "completions/max_terminated_length": 163.0,
102
+ "completions/mean_length": 98.85,
103
+ "completions/mean_terminated_length": 81.79285888671875,
104
+ "completions/min_length": 19.2,
105
+ "completions/min_terminated_length": 19.2,
106
+ "entropy": 0.1534867493668571,
107
+ "epoch": 10.0,
108
+ "frac_reward_zero_std": 0.4,
109
+ "grad_norm": 0.2431640625,
110
  "learning_rate": 4.75e-06,
111
+ "loss": 0.09726614952087402,
112
+ "num_tokens": 54862.0,
113
+ "reward": 0.25181599259376525,
114
+ "reward_std": 0.6045640033902601,
115
+ "rewards/reward_fn/mean": 0.25181599259376525,
116
+ "rewards/reward_fn/std": 0.6045640454394743,
117
  "step": 20,
118
+ "step_time": 31.625062462999267
119
  },
120
  {
121
  "clip_ratio/high_max": 0.0,
 
123
  "clip_ratio/low_mean": 0.0,
124
  "clip_ratio/low_min": 0.0,
125
  "clip_ratio/region_mean": 0.0,
126
+ "completions/clipped_ratio": 0.125,
127
+ "completions/max_length": 233.6,
128
+ "completions/max_terminated_length": 194.4,
129
+ "completions/mean_length": 99.525,
130
+ "completions/mean_terminated_length": 77.78690643310547,
131
+ "completions/min_length": 18.8,
132
+ "completions/min_terminated_length": 18.8,
133
+ "entropy": 0.1538564210291952,
134
+ "epoch": 12.5,
135
+ "frac_reward_zero_std": 0.6,
136
+ "grad_norm": 0.0,
137
+ "learning_rate": 4.981481481481482e-06,
138
+ "loss": 0.13929661512374877,
139
+ "num_tokens": 69039.0,
140
+ "reward": 0.0006859898567199707,
141
+ "reward_std": 1.0108385920524596,
142
+ "rewards/reward_fn/mean": 0.0006859898567199707,
143
+ "rewards/reward_fn/std": 1.0108386158943177,
144
  "step": 25,
145
+ "step_time": 35.65783783460065
146
  },
147
  {
148
  "clip_ratio/high_max": 0.0,
 
150
  "clip_ratio/low_mean": 0.0,
151
  "clip_ratio/low_min": 0.0,
152
  "clip_ratio/region_mean": 0.0,
153
+ "completions/clipped_ratio": 0.05,
154
+ "completions/max_length": 181.8,
155
+ "completions/max_terminated_length": 153.0,
156
+ "completions/mean_length": 65.05,
157
+ "completions/mean_terminated_length": 56.03928680419922,
158
+ "completions/min_length": 16.6,
159
+ "completions/min_terminated_length": 16.6,
160
+ "entropy": 0.13224927680566906,
161
+ "epoch": 15.0,
162
+ "frac_reward_zero_std": 0.5,
163
+ "grad_norm": 0.1875,
164
+ "learning_rate": 4.958333333333334e-06,
165
+ "loss": -0.003383058309555054,
166
+ "num_tokens": 81545.0,
167
+ "reward": 0.3994179755449295,
168
+ "reward_std": 0.5622525057464373,
169
+ "rewards/reward_fn/mean": 0.3994179755449295,
170
+ "rewards/reward_fn/std": 0.56225254482124,
171
  "step": 30,
172
+ "step_time": 29.112151033400732
173
  },
174
  {
175
  "clip_ratio/high_max": 0.0,
 
177
  "clip_ratio/low_mean": 0.0,
178
  "clip_ratio/low_min": 0.0,
179
  "clip_ratio/region_mean": 0.0,
180
+ "completions/clipped_ratio": 0.05,
181
+ "completions/max_length": 131.0,
182
+ "completions/max_terminated_length": 128.6,
183
+ "completions/mean_length": 61.075,
184
+ "completions/mean_terminated_length": 53.282144165039064,
185
+ "completions/min_length": 17.6,
186
+ "completions/min_terminated_length": 17.6,
187
+ "entropy": 0.11974610288161784,
188
+ "epoch": 17.5,
189
+ "frac_reward_zero_std": 0.7,
190
+ "grad_norm": 0.220703125,
191
+ "learning_rate": 4.935185185185186e-06,
192
+ "loss": 0.004663025587797165,
193
+ "num_tokens": 93892.0,
194
+ "reward": 0.3469119846820831,
195
+ "reward_std": 0.5775202971824911,
196
+ "rewards/reward_fn/mean": 0.3469119846820831,
197
+ "rewards/reward_fn/std": 0.5775203009106917,
198
  "step": 35,
199
+ "step_time": 22.457519923200017
200
  },
201
  {
202
  "clip_ratio/high_max": 0.0,
 
204
  "clip_ratio/low_mean": 0.0,
205
  "clip_ratio/low_min": 0.0,
206
  "clip_ratio/region_mean": 0.0,
207
+ "completions/clipped_ratio": 0.075,
208
+ "completions/max_length": 179.6,
209
+ "completions/max_terminated_length": 131.2,
210
+ "completions/mean_length": 78.025,
211
+ "completions/mean_terminated_length": 65.70357513427734,
212
+ "completions/min_length": 19.0,
213
+ "completions/min_terminated_length": 19.0,
214
+ "entropy": 0.12550681543070824,
215
+ "epoch": 20.0,
216
+ "frac_reward_zero_std": 0.4,
217
+ "grad_norm": 0.4609375,
218
+ "learning_rate": 4.9120370370370375e-06,
219
+ "loss": 0.036792796850204465,
220
+ "num_tokens": 107209.0,
221
+ "reward": 0.3498039901256561,
222
+ "reward_std": 0.7408117946935817,
223
+ "rewards/reward_fn/mean": 0.3498039901256561,
224
+ "rewards/reward_fn/std": 0.7408118456369266,
225
  "step": 40,
226
+ "step_time": 28.692756705999635
227
  },
228
  {
229
  "clip_ratio/high_max": 0.0,
 
231
  "clip_ratio/low_mean": 0.0,
232
  "clip_ratio/low_min": 0.0,
233
  "clip_ratio/region_mean": 0.0,
234
+ "completions/clipped_ratio": 0.025,
235
+ "completions/max_length": 189.0,
236
+ "completions/max_terminated_length": 178.8,
237
+ "completions/mean_length": 92.675,
238
+ "completions/mean_terminated_length": 89.51071472167969,
239
+ "completions/min_length": 17.8,
240
+ "completions/min_terminated_length": 17.8,
241
+ "entropy": 0.15165529411751777,
242
+ "epoch": 22.5,
243
+ "frac_reward_zero_std": 0.5,
244
+ "grad_norm": 0.232421875,
245
+ "learning_rate": 4.888888888888889e-06,
246
+ "loss": -0.004727205634117127,
247
+ "num_tokens": 120868.0,
248
+ "reward": 0.30072798430919645,
249
+ "reward_std": 0.7614144545921591,
250
+ "rewards/reward_fn/mean": 0.30072798430919645,
251
+ "rewards/reward_fn/std": 0.7614145380415721,
252
  "step": 45,
253
+ "step_time": 30.053675796201425
254
  },
255
  {
256
  "clip_ratio/high_max": 0.0,
 
258
  "clip_ratio/low_mean": 0.0,
259
  "clip_ratio/low_min": 0.0,
260
  "clip_ratio/region_mean": 0.0,
261
+ "completions/clipped_ratio": 0.075,
262
+ "completions/max_length": 208.6,
263
+ "completions/max_terminated_length": 189.6,
264
+ "completions/mean_length": 102.625,
265
+ "completions/mean_terminated_length": 90.11666870117188,
266
+ "completions/min_length": 18.8,
267
+ "completions/min_terminated_length": 18.8,
268
+ "entropy": 0.15195838457439095,
269
+ "epoch": 25.0,
270
+ "frac_reward_zero_std": 0.6,
271
+ "grad_norm": 0.208984375,
272
+ "learning_rate": 4.865740740740741e-06,
273
+ "loss": 0.04500017166137695,
274
+ "num_tokens": 135121.0,
275
+ "reward": 0.5009119868278503,
276
+ "reward_std": 0.5213470441231038,
277
+ "rewards/reward_fn/mean": 0.5009119868278503,
278
+ "rewards/reward_fn/std": 0.5213470560469432,
279
  "step": 50,
280
+ "step_time": 32.55890014459801
281
  }
282
  ],
283
  "logging_steps": 5,
284
+ "max_steps": 1100,
285
+ "num_input_tokens_seen": 135121,
286
+ "num_train_epochs": 550,
287
  "save_steps": 50,
288
  "stateful_callbacks": {
289
  "TrainerControl": {
checkpoint-50/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0cf50531ca359d39fbfa05dc0896e03532a83d53b029182b6d7f757efab0c97a
3
  size 7185
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:354b92cb3fc579afc26aea1b098796649abd9be8f2b683beb58a14a6bab0c7d4
3
  size 7185