Timusgeorge commited on
Commit
e63870d
·
verified ·
1 Parent(s): 8bda97a

Upload post_training_eval.json

Browse files
Files changed (1) hide show
  1. outputs/post_training_eval.json +373 -0
outputs/post_training_eval.json ADDED
@@ -0,0 +1,373 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base": {
3
+ "results": [
4
+ {
5
+ "seed": 42,
6
+ "difficulty": "easy",
7
+ "score": 0.1736,
8
+ "correct_flags": 1,
9
+ "false_positives": 0,
10
+ "correct_approvals": 0,
11
+ "missed_errors": 5,
12
+ "total_errors": 6,
13
+ "actions_taken": 4,
14
+ "actions_parsed": 4
15
+ },
16
+ {
17
+ "seed": 137,
18
+ "difficulty": "easy",
19
+ "score": 0.0167,
20
+ "correct_flags": 0,
21
+ "false_positives": 1,
22
+ "correct_approvals": 0,
23
+ "missed_errors": 6,
24
+ "total_errors": 6,
25
+ "actions_taken": 4,
26
+ "actions_parsed": 4
27
+ },
28
+ {
29
+ "seed": 256,
30
+ "difficulty": "easy",
31
+ "score": 0.0389,
32
+ "correct_flags": 0,
33
+ "false_positives": 0,
34
+ "correct_approvals": 0,
35
+ "missed_errors": 6,
36
+ "total_errors": 6,
37
+ "actions_taken": 7,
38
+ "actions_parsed": 7
39
+ },
40
+ {
41
+ "seed": 512,
42
+ "difficulty": "easy",
43
+ "score": 0.0333,
44
+ "correct_flags": 0,
45
+ "false_positives": 0,
46
+ "correct_approvals": 0,
47
+ "missed_errors": 6,
48
+ "total_errors": 6,
49
+ "actions_taken": 6,
50
+ "actions_parsed": 6
51
+ },
52
+ {
53
+ "seed": 1024,
54
+ "difficulty": "easy",
55
+ "score": 0.1736,
56
+ "correct_flags": 1,
57
+ "false_positives": 0,
58
+ "correct_approvals": 0,
59
+ "missed_errors": 5,
60
+ "total_errors": 6,
61
+ "actions_taken": 4,
62
+ "actions_parsed": 4
63
+ },
64
+ {
65
+ "seed": 42,
66
+ "difficulty": "medium",
67
+ "score": 0.0179,
68
+ "correct_flags": 0,
69
+ "false_positives": 0,
70
+ "correct_approvals": 0,
71
+ "missed_errors": 13,
72
+ "total_errors": 13,
73
+ "actions_taken": 7,
74
+ "actions_parsed": 7
75
+ },
76
+ {
77
+ "seed": 137,
78
+ "difficulty": "medium",
79
+ "score": 0.0256,
80
+ "correct_flags": 0,
81
+ "false_positives": 0,
82
+ "correct_approvals": 0,
83
+ "missed_errors": 13,
84
+ "total_errors": 13,
85
+ "actions_taken": 10,
86
+ "actions_parsed": 10
87
+ },
88
+ {
89
+ "seed": 256,
90
+ "difficulty": "medium",
91
+ "score": 0.01,
92
+ "correct_flags": 0,
93
+ "false_positives": 1,
94
+ "correct_approvals": 0,
95
+ "missed_errors": 13,
96
+ "total_errors": 13,
97
+ "actions_taken": 4,
98
+ "actions_parsed": 4
99
+ },
100
+ {
101
+ "seed": 512,
102
+ "difficulty": "medium",
103
+ "score": 0.0256,
104
+ "correct_flags": 0,
105
+ "false_positives": 0,
106
+ "correct_approvals": 0,
107
+ "missed_errors": 13,
108
+ "total_errors": 13,
109
+ "actions_taken": 10,
110
+ "actions_parsed": 10
111
+ },
112
+ {
113
+ "seed": 1024,
114
+ "difficulty": "medium",
115
+ "score": 0.01,
116
+ "correct_flags": 0,
117
+ "false_positives": 1,
118
+ "correct_approvals": 0,
119
+ "missed_errors": 13,
120
+ "total_errors": 13,
121
+ "actions_taken": 4,
122
+ "actions_parsed": 4
123
+ },
124
+ {
125
+ "seed": 42,
126
+ "difficulty": "hard",
127
+ "score": 0.01,
128
+ "correct_flags": 0,
129
+ "false_positives": 1,
130
+ "correct_approvals": 0,
131
+ "missed_errors": 16,
132
+ "total_errors": 16,
133
+ "actions_taken": 4,
134
+ "actions_parsed": 4
135
+ },
136
+ {
137
+ "seed": 137,
138
+ "difficulty": "hard",
139
+ "score": 0.01,
140
+ "correct_flags": 0,
141
+ "false_positives": 1,
142
+ "correct_approvals": 0,
143
+ "missed_errors": 17,
144
+ "total_errors": 17,
145
+ "actions_taken": 4,
146
+ "actions_parsed": 4
147
+ },
148
+ {
149
+ "seed": 256,
150
+ "difficulty": "hard",
151
+ "score": 0.01,
152
+ "correct_flags": 0,
153
+ "false_positives": 1,
154
+ "correct_approvals": 0,
155
+ "missed_errors": 14,
156
+ "total_errors": 14,
157
+ "actions_taken": 4,
158
+ "actions_parsed": 4
159
+ },
160
+ {
161
+ "seed": 512,
162
+ "difficulty": "hard",
163
+ "score": 0.0214,
164
+ "correct_flags": 0,
165
+ "false_positives": 0,
166
+ "correct_approvals": 0,
167
+ "missed_errors": 14,
168
+ "total_errors": 14,
169
+ "actions_taken": 9,
170
+ "actions_parsed": 9
171
+ },
172
+ {
173
+ "seed": 1024,
174
+ "difficulty": "hard",
175
+ "score": 0.0235,
176
+ "correct_flags": 0,
177
+ "false_positives": 0,
178
+ "correct_approvals": 0,
179
+ "missed_errors": 17,
180
+ "total_errors": 17,
181
+ "actions_taken": 12,
182
+ "actions_parsed": 12
183
+ }
184
+ ],
185
+ "overall": 0.04
186
+ },
187
+ "trained": {
188
+ "results": [
189
+ {
190
+ "seed": 42,
191
+ "difficulty": "easy",
192
+ "score": 0.2958,
193
+ "correct_flags": 1,
194
+ "false_positives": 0,
195
+ "correct_approvals": 2,
196
+ "missed_errors": 5,
197
+ "total_errors": 6,
198
+ "actions_taken": 7,
199
+ "actions_parsed": 7
200
+ },
201
+ {
202
+ "seed": 137,
203
+ "difficulty": "easy",
204
+ "score": 0.25,
205
+ "correct_flags": 0,
206
+ "false_positives": 1,
207
+ "correct_approvals": 2,
208
+ "missed_errors": 6,
209
+ "total_errors": 6,
210
+ "actions_taken": 9,
211
+ "actions_parsed": 9
212
+ },
213
+ {
214
+ "seed": 256,
215
+ "difficulty": "easy",
216
+ "score": 0.3402,
217
+ "correct_flags": 1,
218
+ "false_positives": 0,
219
+ "correct_approvals": 4,
220
+ "missed_errors": 5,
221
+ "total_errors": 6,
222
+ "actions_taken": 9,
223
+ "actions_parsed": 9
224
+ },
225
+ {
226
+ "seed": 512,
227
+ "difficulty": "easy",
228
+ "score": 0.2712,
229
+ "correct_flags": 1,
230
+ "false_positives": 2,
231
+ "correct_approvals": 1,
232
+ "missed_errors": 5,
233
+ "total_errors": 6,
234
+ "actions_taken": 12,
235
+ "actions_parsed": 12
236
+ },
237
+ {
238
+ "seed": 1024,
239
+ "difficulty": "easy",
240
+ "score": 0.2791,
241
+ "correct_flags": 1,
242
+ "false_positives": 0,
243
+ "correct_approvals": 1,
244
+ "missed_errors": 5,
245
+ "total_errors": 6,
246
+ "actions_taken": 10,
247
+ "actions_parsed": 10
248
+ },
249
+ {
250
+ "seed": 42,
251
+ "difficulty": "medium",
252
+ "score": 0.1308,
253
+ "correct_flags": 0,
254
+ "false_positives": 1,
255
+ "correct_approvals": 2,
256
+ "missed_errors": 13,
257
+ "total_errors": 13,
258
+ "actions_taken": 12,
259
+ "actions_parsed": 12
260
+ },
261
+ {
262
+ "seed": 137,
263
+ "difficulty": "medium",
264
+ "score": 0.01,
265
+ "correct_flags": 0,
266
+ "false_positives": 0,
267
+ "correct_approvals": 0,
268
+ "missed_errors": 13,
269
+ "total_errors": 13,
270
+ "actions_taken": 0,
271
+ "actions_parsed": 0
272
+ },
273
+ {
274
+ "seed": 256,
275
+ "difficulty": "medium",
276
+ "score": 0.0923,
277
+ "correct_flags": 0,
278
+ "false_positives": 1,
279
+ "correct_approvals": 1,
280
+ "missed_errors": 13,
281
+ "total_errors": 13,
282
+ "actions_taken": 10,
283
+ "actions_parsed": 10
284
+ },
285
+ {
286
+ "seed": 512,
287
+ "difficulty": "medium",
288
+ "score": 0.2393,
289
+ "correct_flags": 1,
290
+ "false_positives": 0,
291
+ "correct_approvals": 6,
292
+ "missed_errors": 12,
293
+ "total_errors": 13,
294
+ "actions_taken": 12,
295
+ "actions_parsed": 12
296
+ },
297
+ {
298
+ "seed": 1024,
299
+ "difficulty": "medium",
300
+ "score": 0.1735,
301
+ "correct_flags": 1,
302
+ "false_positives": 3,
303
+ "correct_approvals": 1,
304
+ "missed_errors": 12,
305
+ "total_errors": 13,
306
+ "actions_taken": 16,
307
+ "actions_parsed": 16
308
+ },
309
+ {
310
+ "seed": 42,
311
+ "difficulty": "hard",
312
+ "score": 0.0271,
313
+ "correct_flags": 0,
314
+ "false_positives": 1,
315
+ "correct_approvals": 0,
316
+ "missed_errors": 16,
317
+ "total_errors": 16,
318
+ "actions_taken": 14,
319
+ "actions_parsed": 14
320
+ },
321
+ {
322
+ "seed": 137,
323
+ "difficulty": "hard",
324
+ "score": 0.0235,
325
+ "correct_flags": 0,
326
+ "false_positives": 1,
327
+ "correct_approvals": 0,
328
+ "missed_errors": 17,
329
+ "total_errors": 17,
330
+ "actions_taken": 13,
331
+ "actions_parsed": 13
332
+ },
333
+ {
334
+ "seed": 256,
335
+ "difficulty": "hard",
336
+ "score": 0.0262,
337
+ "correct_flags": 0,
338
+ "false_positives": 1,
339
+ "correct_approvals": 0,
340
+ "missed_errors": 14,
341
+ "total_errors": 14,
342
+ "actions_taken": 12,
343
+ "actions_parsed": 12
344
+ },
345
+ {
346
+ "seed": 512,
347
+ "difficulty": "hard",
348
+ "score": 0.0771,
349
+ "correct_flags": 1,
350
+ "false_positives": 0,
351
+ "correct_approvals": 0,
352
+ "missed_errors": 13,
353
+ "total_errors": 14,
354
+ "actions_taken": 4,
355
+ "actions_parsed": 4
356
+ },
357
+ {
358
+ "seed": 1024,
359
+ "difficulty": "hard",
360
+ "score": 0.0638,
361
+ "correct_flags": 1,
362
+ "false_positives": 0,
363
+ "correct_approvals": 0,
364
+ "missed_errors": 16,
365
+ "total_errors": 17,
366
+ "actions_taken": 4,
367
+ "actions_parsed": 4
368
+ }
369
+ ],
370
+ "overall": 0.1533
371
+ },
372
+ "improvement": 283.25
373
+ }