cmpatino HF Staff commited on
Commit
59427d7
·
verified ·
1 Parent(s): b06b13f

Upload results/bon_results.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. results/bon_results.json +780 -0
results/bon_results.json ADDED
@@ -0,0 +1,780 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "unique_id": "test/intermediate_algebra/1566.json",
4
+ "level": 3,
5
+ "subject": "Intermediate Algebra",
6
+ "ground_truth": "2k",
7
+ "greedy_answer": "2a + ka + \\frac{k^2a}{4}",
8
+ "greedy_correct": false,
9
+ "weighted_bon_answer": "0",
10
+ "weighted_bon_correct": false,
11
+ "standard_bon_answer": "0",
12
+ "standard_bon_correct": false,
13
+ "majority_vote_answer": "0",
14
+ "majority_vote_correct": false,
15
+ "n_correct_in_16": 0,
16
+ "answer_score_breakdown": {
17
+ "0": 0.8788591101765633,
18
+ "-1": 0.13640767335891724,
19
+ "2kh": 0.08962094038724899,
20
+ "2(a + c - k)": 0.08848164230585098,
21
+ "a(1 - 2h) + 4ak^2 - 4akh + ah^2 + k": 0.0501532219350338,
22
+ "2a + ah^2 + 2k": 0.05872105434536934,
23
+ "a + b + 2c + k - h": 0.045371297746896744,
24
+ "2k^2": 0.1810130625963211,
25
+ "4k": 0.1739559918642044,
26
+ "2(2k)": 0.08108767122030258,
27
+ "4a + b + c + e + f + 2ak": 0.020251676440238953,
28
+ "2d(1 + h) + b(h + c) - 2k + h": 0.03740683197975159
29
+ },
30
+ "prm_scores": [
31
+ 0.38959258794784546,
32
+ 0.13640767335891724,
33
+ 0.08962094038724899,
34
+ 0.08848164230585098,
35
+ 0.0501532219350338,
36
+ 0.05872105434536934,
37
+ 0.045371297746896744,
38
+ 0.10005511343479156,
39
+ 0.16545328497886658,
40
+ 0.1400100141763687,
41
+ 0.08374810963869095,
42
+ 0.1810130625963211,
43
+ 0.1739559918642044,
44
+ 0.08108767122030258,
45
+ 0.020251676440238953,
46
+ 0.03740683197975159
47
+ ]
48
+ },
49
+ {
50
+ "unique_id": "test/algebra/1265.json",
51
+ "level": 2,
52
+ "subject": "Algebra",
53
+ "ground_truth": "\\frac{3}{2}",
54
+ "greedy_answer": "3",
55
+ "greedy_correct": false,
56
+ "weighted_bon_answer": "\\frac{3}{2}",
57
+ "weighted_bon_correct": true,
58
+ "standard_bon_answer": "\\frac{3}{2}",
59
+ "standard_bon_correct": true,
60
+ "majority_vote_answer": "\\frac{3}{2}",
61
+ "majority_vote_correct": true,
62
+ "n_correct_in_16": 12,
63
+ "answer_score_breakdown": {
64
+ "\\frac{3}{2}": 9.525228440761566,
65
+ "2": 0.04569903016090393,
66
+ "1": 0.035691916942596436,
67
+ "4^{\\frac{3}{2}}": 0.7074317932128906,
68
+ "\\frac{1}{2}": 0.02362700365483761
69
+ },
70
+ "prm_scores": [
71
+ 0.9931477904319763,
72
+ 0.08908459544181824,
73
+ 0.9114416241645813,
74
+ 0.04569903016090393,
75
+ 0.8072997331619263,
76
+ 0.9522070288658142,
77
+ 0.035691916942596436,
78
+ 0.3795792758464813,
79
+ 0.9849416017532349,
80
+ 0.7074317932128906,
81
+ 0.02362700365483761,
82
+ 0.982453465461731,
83
+ 0.9895228743553162,
84
+ 0.7293780446052551,
85
+ 0.9169456958770752,
86
+ 0.7892267107963562
87
+ ]
88
+ },
89
+ {
90
+ "unique_id": "test/algebra/2036.json",
91
+ "level": 3,
92
+ "subject": "Algebra",
93
+ "ground_truth": "3\\sqrt{13}",
94
+ "greedy_answer": "3\\sqrt{13}",
95
+ "greedy_correct": true,
96
+ "weighted_bon_answer": "3\\sqrt{13}",
97
+ "weighted_bon_correct": true,
98
+ "standard_bon_answer": "3\\sqrt{13}",
99
+ "standard_bon_correct": true,
100
+ "majority_vote_answer": "3\\sqrt{13}",
101
+ "majority_vote_correct": true,
102
+ "n_correct_in_16": 16,
103
+ "answer_score_breakdown": {
104
+ "3\\sqrt{13}": 15.872192740440369
105
+ },
106
+ "prm_scores": [
107
+ 0.9924262762069702,
108
+ 0.9941288232803345,
109
+ 0.9937927722930908,
110
+ 0.996006190776825,
111
+ 0.9884900450706482,
112
+ 0.9937055706977844,
113
+ 0.9957823753356934,
114
+ 0.9897115230560303,
115
+ 0.9855998754501343,
116
+ 0.9935632944107056,
117
+ 0.9969964027404785,
118
+ 0.9893608689308167,
119
+ 0.994041383266449,
120
+ 0.9888609051704407,
121
+ 0.9883406162261963,
122
+ 0.991385817527771
123
+ ]
124
+ },
125
+ {
126
+ "unique_id": "test/algebra/114.json",
127
+ "level": 1,
128
+ "subject": "Algebra",
129
+ "ground_truth": "4",
130
+ "greedy_answer": "4",
131
+ "greedy_correct": true,
132
+ "weighted_bon_answer": "4",
133
+ "weighted_bon_correct": true,
134
+ "standard_bon_answer": "4",
135
+ "standard_bon_correct": true,
136
+ "majority_vote_answer": "4",
137
+ "majority_vote_correct": true,
138
+ "n_correct_in_16": 16,
139
+ "answer_score_breakdown": {
140
+ "4": 15.427059769630432
141
+ },
142
+ "prm_scores": [
143
+ 0.9632772207260132,
144
+ 0.9753599762916565,
145
+ 0.8430228233337402,
146
+ 0.9805755615234375,
147
+ 0.9829703569412231,
148
+ 0.9770132899284363,
149
+ 0.9656716585159302,
150
+ 0.9811027646064758,
151
+ 0.9701114296913147,
152
+ 0.8994367718696594,
153
+ 0.9830636382102966,
154
+ 0.9811835289001465,
155
+ 0.9836723208427429,
156
+ 0.9814204573631287,
157
+ 0.9797684550285339,
158
+ 0.9794095158576965
159
+ ]
160
+ },
161
+ {
162
+ "unique_id": "test/number_theory/847.json",
163
+ "level": 3,
164
+ "subject": "Number Theory",
165
+ "ground_truth": "4210_{5}",
166
+ "greedy_answer": "4210_5",
167
+ "greedy_correct": false,
168
+ "weighted_bon_answer": "4210_5",
169
+ "weighted_bon_correct": false,
170
+ "standard_bon_answer": "4210",
171
+ "standard_bon_correct": false,
172
+ "majority_vote_answer": "4210_5",
173
+ "majority_vote_correct": false,
174
+ "n_correct_in_16": 0,
175
+ "answer_score_breakdown": {
176
+ "422_5": 0.4499802887439728,
177
+ "4210_5": 5.877732008695602,
178
+ "111_5": 0.026935415342450142,
179
+ "4221_5": 0.2756074368953705,
180
+ "420_5": 0.11346089839935303,
181
+ "4444_5": 0.013120382092893124,
182
+ "4210": 1.7479697465896606
183
+ },
184
+ "prm_scores": [
185
+ 0.4499802887439728,
186
+ 0.583568274974823,
187
+ 0.9537349343299866,
188
+ 0.026935415342450142,
189
+ 0.5232129096984863,
190
+ 0.2756074368953705,
191
+ 0.11346089839935303,
192
+ 0.020330563187599182,
193
+ 0.013120382092893124,
194
+ 0.9368847012519836,
195
+ 0.8960930109024048,
196
+ 0.7835883498191833,
197
+ 0.37749049067497253,
198
+ 0.9643813967704773,
199
+ 0.8129322528839111,
200
+ 0.7938154339790344
201
+ ]
202
+ },
203
+ {
204
+ "unique_id": "test/intermediate_algebra/1898.json",
205
+ "level": 3,
206
+ "subject": "Intermediate Algebra",
207
+ "ground_truth": "-5",
208
+ "greedy_answer": "-5",
209
+ "greedy_correct": true,
210
+ "weighted_bon_answer": "-5",
211
+ "weighted_bon_correct": true,
212
+ "standard_bon_answer": "-5",
213
+ "standard_bon_correct": true,
214
+ "majority_vote_answer": "-5",
215
+ "majority_vote_correct": true,
216
+ "n_correct_in_16": 14,
217
+ "answer_score_breakdown": {
218
+ "-5": 13.510545134544373,
219
+ "-17.5": 0.0118635855615139,
220
+ "7 \\times (-3) = -21": 0.17152179777622223
221
+ },
222
+ "prm_scores": [
223
+ 0.990667998790741,
224
+ 0.9914085268974304,
225
+ 0.8539301753044128,
226
+ 0.991192638874054,
227
+ 0.9093068838119507,
228
+ 0.9817348122596741,
229
+ 0.0118635855615139,
230
+ 0.9950287938117981,
231
+ 0.17152179777622223,
232
+ 0.8527193665504456,
233
+ 0.9926127195358276,
234
+ 0.9962577819824219,
235
+ 0.9950143694877625,
236
+ 0.9685443639755249,
237
+ 0.9975405931472778,
238
+ 0.9945861101150513
239
+ ]
240
+ },
241
+ {
242
+ "unique_id": "test/intermediate_algebra/515.json",
243
+ "level": 2,
244
+ "subject": "Intermediate Algebra",
245
+ "ground_truth": "\\frac43",
246
+ "greedy_answer": "\\frac{4}{3}",
247
+ "greedy_correct": false,
248
+ "weighted_bon_answer": "\\frac{4}{3}",
249
+ "weighted_bon_correct": false,
250
+ "standard_bon_answer": "\\frac{4}{3}",
251
+ "standard_bon_correct": false,
252
+ "majority_vote_answer": "\\frac{4}{3}",
253
+ "majority_vote_correct": false,
254
+ "n_correct_in_16": 0,
255
+ "answer_score_breakdown": {
256
+ "\\frac{4}{3}": 12.993444621562958,
257
+ "y=\\frac{4}{3}": 0.7456207871437073
258
+ },
259
+ "prm_scores": [
260
+ 0.9757433533668518,
261
+ 0.9152928590774536,
262
+ 0.9262116551399231,
263
+ 0.9329642653465271,
264
+ 0.8159711956977844,
265
+ 0.973721981048584,
266
+ 0.8773332834243774,
267
+ 0.9459983110427856,
268
+ 0.7456207871437073,
269
+ 0.9413409233093262,
270
+ 0.9691257476806641,
271
+ 0.8798498511314392,
272
+ 0.9651709794998169,
273
+ 0.9703044295310974,
274
+ 0.9156308770179749,
275
+ 0.953955888748169
276
+ ]
277
+ },
278
+ {
279
+ "unique_id": "test/prealgebra/1247.json",
280
+ "level": 2,
281
+ "subject": "Prealgebra",
282
+ "ground_truth": "5",
283
+ "greedy_answer": "5",
284
+ "greedy_correct": true,
285
+ "weighted_bon_answer": "5",
286
+ "weighted_bon_correct": true,
287
+ "standard_bon_answer": "5",
288
+ "standard_bon_correct": true,
289
+ "majority_vote_answer": "5",
290
+ "majority_vote_correct": true,
291
+ "n_correct_in_16": 14,
292
+ "answer_score_breakdown": {
293
+ "5": 13.84566581249237,
294
+ "6": 0.04288877546787262,
295
+ "7": 0.021614858880639076
296
+ },
297
+ "prm_scores": [
298
+ 0.9874890446662903,
299
+ 0.9890925884246826,
300
+ 0.9906784296035767,
301
+ 0.9951796531677246,
302
+ 0.9833526611328125,
303
+ 0.9943066239356995,
304
+ 0.04288877546787262,
305
+ 0.9912968873977661,
306
+ 0.996322751045227,
307
+ 0.021614858880639076,
308
+ 0.9761733412742615,
309
+ 0.9781678318977356,
310
+ 0.9939509630203247,
311
+ 0.9837079644203186,
312
+ 0.9914649128913879,
313
+ 0.994482159614563
314
+ ]
315
+ },
316
+ {
317
+ "unique_id": "test/algebra/518.json",
318
+ "level": 3,
319
+ "subject": "Algebra",
320
+ "ground_truth": "\\frac 34",
321
+ "greedy_answer": "\\frac{4}{3}",
322
+ "greedy_correct": false,
323
+ "weighted_bon_answer": "\\frac{3}{4}",
324
+ "weighted_bon_correct": false,
325
+ "standard_bon_answer": "\\frac{3}{4}",
326
+ "standard_bon_correct": false,
327
+ "majority_vote_answer": "\\frac{3}{4}",
328
+ "majority_vote_correct": false,
329
+ "n_correct_in_16": 0,
330
+ "answer_score_breakdown": {
331
+ "\\frac{3}{4}": 8.010839819908142,
332
+ "\\frac{4}{3}": 0.8271265625953674,
333
+ "1": 0.0782640278339386,
334
+ "\\frac{16}{9}": 0.08907447010278702
335
+ },
336
+ "prm_scores": [
337
+ 0.9174371957778931,
338
+ 0.8461792469024658,
339
+ 0.9599809050559998,
340
+ 0.277146577835083,
341
+ 0.8468650579452515,
342
+ 0.9560564160346985,
343
+ 0.4508325159549713,
344
+ 0.9334309101104736,
345
+ 0.7827900052070618,
346
+ 0.2229481041431427,
347
+ 0.9262115359306335,
348
+ 0.9293907880783081,
349
+ 0.7586770057678223,
350
+ 0.0782640278339386,
351
+ 0.08907447010278702,
352
+ 0.09914746880531311
353
+ ]
354
+ },
355
+ {
356
+ "unique_id": "test/geometry/178.json",
357
+ "level": 3,
358
+ "subject": "Geometry",
359
+ "ground_truth": "2",
360
+ "greedy_answer": "2",
361
+ "greedy_correct": true,
362
+ "weighted_bon_answer": "2",
363
+ "weighted_bon_correct": true,
364
+ "standard_bon_answer": "2",
365
+ "standard_bon_correct": true,
366
+ "majority_vote_answer": "2",
367
+ "majority_vote_correct": true,
368
+ "n_correct_in_16": 14,
369
+ "answer_score_breakdown": {
370
+ "2": 13.525502800941467,
371
+ "2\\sqrt{5}": 0.02531031146645546,
372
+ "\\sqrt{2}": 0.09983386844396591
373
+ },
374
+ "prm_scores": [
375
+ 0.9753232598304749,
376
+ 0.9023347496986389,
377
+ 0.9684244990348816,
378
+ 0.9912661910057068,
379
+ 0.9640752673149109,
380
+ 0.02531031146645546,
381
+ 0.9858566522598267,
382
+ 0.9645191431045532,
383
+ 0.9965499639511108,
384
+ 0.9715750217437744,
385
+ 0.9929128289222717,
386
+ 0.9601553082466125,
387
+ 0.9364507794380188,
388
+ 0.09983386844396591,
389
+ 0.9597406387329102,
390
+ 0.9563184976577759
391
+ ]
392
+ },
393
+ {
394
+ "unique_id": "test/intermediate_algebra/860.json",
395
+ "level": 2,
396
+ "subject": "Intermediate Algebra",
397
+ "ground_truth": "\\text{ellipse}",
398
+ "greedy_answer": null,
399
+ "greedy_correct": false,
400
+ "weighted_bon_answer": "\\text{ellipse}",
401
+ "weighted_bon_correct": true,
402
+ "standard_bon_answer": "\\text{circle}",
403
+ "standard_bon_correct": false,
404
+ "majority_vote_answer": "\\text{ellipse}",
405
+ "majority_vote_correct": true,
406
+ "n_correct_in_16": 5,
407
+ "answer_score_breakdown": {
408
+ "\\text{ellipse}": 0.4748514872044325,
409
+ "\\text{circle}": 0.19727817177772522
410
+ },
411
+ "prm_scores": [
412
+ 0.15957516431808472,
413
+ 0.12498065829277039,
414
+ 0.13995392620563507,
415
+ 0.11687459796667099,
416
+ 0.17295509576797485,
417
+ 0.025766996666789055,
418
+ 0.12410514056682587,
419
+ 0.09207811206579208,
420
+ 0.1817529797554016,
421
+ 0.14232255518436432,
422
+ 0.19727817177772522,
423
+ 0.12983134388923645,
424
+ 0.1009967178106308,
425
+ 0.06759987026453018,
426
+ 0.4757905900478363,
427
+ 0.23078562319278717
428
+ ]
429
+ },
430
+ {
431
+ "unique_id": "test/number_theory/22.json",
432
+ "level": 3,
433
+ "subject": "Number Theory",
434
+ "ground_truth": "2",
435
+ "greedy_answer": "6",
436
+ "greedy_correct": false,
437
+ "weighted_bon_answer": "2",
438
+ "weighted_bon_correct": true,
439
+ "standard_bon_answer": "2",
440
+ "standard_bon_correct": true,
441
+ "majority_vote_answer": "0",
442
+ "majority_vote_correct": false,
443
+ "n_correct_in_16": 5,
444
+ "answer_score_breakdown": {
445
+ "0": 0.18049156479537487,
446
+ "1": 0.22785289399325848,
447
+ "2": 2.4990325272083282,
448
+ "6": 0.7855422422289848,
449
+ "5": 0.1700340360403061
450
+ },
451
+ "prm_scores": [
452
+ 0.04065684974193573,
453
+ 0.1668420284986496,
454
+ 0.6439529061317444,
455
+ 0.12490694969892502,
456
+ 0.6606352925300598,
457
+ 0.5643002986907959,
458
+ 0.028800513595342636,
459
+ 0.1513165831565857,
460
+ 0.6899784803390503,
461
+ 0.022786086425185204,
462
+ 0.1700340360403061,
463
+ 0.03950735554099083,
464
+ 0.053026698529720306,
465
+ 0.02150350995361805,
466
+ 0.035221416503190994,
467
+ 0.449484258890152
468
+ ]
469
+ },
470
+ {
471
+ "unique_id": "test/algebra/2551.json",
472
+ "level": 1,
473
+ "subject": "Algebra",
474
+ "ground_truth": "10",
475
+ "greedy_answer": "10",
476
+ "greedy_correct": true,
477
+ "weighted_bon_answer": "10",
478
+ "weighted_bon_correct": true,
479
+ "standard_bon_answer": "10",
480
+ "standard_bon_correct": true,
481
+ "majority_vote_answer": "10",
482
+ "majority_vote_correct": true,
483
+ "n_correct_in_16": 15,
484
+ "answer_score_breakdown": {
485
+ "10": 14.916400849819183,
486
+ "3": 0.3374885618686676
487
+ },
488
+ "prm_scores": [
489
+ 0.9954132437705994,
490
+ 0.994917631149292,
491
+ 0.9930395483970642,
492
+ 0.9946964979171753,
493
+ 0.9958835244178772,
494
+ 0.9911690950393677,
495
+ 0.9968792200088501,
496
+ 0.9965872764587402,
497
+ 0.3374885618686676,
498
+ 0.9960399866104126,
499
+ 0.994235098361969,
500
+ 0.9959813356399536,
501
+ 0.9918780326843262,
502
+ 0.9883254766464233,
503
+ 0.9951592087745667,
504
+ 0.9961956739425659
505
+ ]
506
+ },
507
+ {
508
+ "unique_id": "test/algebra/24.json",
509
+ "level": 1,
510
+ "subject": "Algebra",
511
+ "ground_truth": "2000",
512
+ "greedy_answer": "2000",
513
+ "greedy_correct": true,
514
+ "weighted_bon_answer": "2000",
515
+ "weighted_bon_correct": true,
516
+ "standard_bon_answer": "2000",
517
+ "standard_bon_correct": true,
518
+ "majority_vote_answer": "2000",
519
+ "majority_vote_correct": true,
520
+ "n_correct_in_16": 15,
521
+ "answer_score_breakdown": {
522
+ "2000": 14.198634266853333
523
+ },
524
+ "prm_scores": [
525
+ 0.9872841835021973,
526
+ 0.9413870573043823,
527
+ 0.9889976382255554,
528
+ 0.9923239946365356,
529
+ 0.9830780625343323,
530
+ 0.9846431016921997,
531
+ 0.9720523953437805,
532
+ 0.9805113673210144,
533
+ 0.9480443000793457,
534
+ 0.9811661839485168,
535
+ 0.9770664572715759,
536
+ 0.7148209810256958,
537
+ 0.846014678478241,
538
+ 0.7874448895454407,
539
+ 0.9811532497406006,
540
+ 0.9200906157493591
541
+ ]
542
+ },
543
+ {
544
+ "unique_id": "test/intermediate_algebra/1714.json",
545
+ "level": 1,
546
+ "subject": "Intermediate Algebra",
547
+ "ground_truth": "(5,\\infty)",
548
+ "greedy_answer": "(5, \\infty)",
549
+ "greedy_correct": false,
550
+ "weighted_bon_answer": "(5, \\infty)",
551
+ "weighted_bon_correct": false,
552
+ "standard_bon_answer": "(5, \\infty)",
553
+ "standard_bon_correct": false,
554
+ "majority_vote_answer": "(5, \\infty)",
555
+ "majority_vote_correct": false,
556
+ "n_correct_in_16": 0,
557
+ "answer_score_breakdown": {
558
+ "(5, \\infty)": 9.382436901330948,
559
+ "(-\\infty, 5) \\cup (5, \\infty)": 0.05053659901022911,
560
+ "(-\\infty, 5)": 0.29756873846054077,
561
+ "(-\\infty, 5) \\cup (5, +\\infty)": 0.042136698961257935
562
+ },
563
+ "prm_scores": [
564
+ 0.9431021809577942,
565
+ 0.9451705813407898,
566
+ 0.9344722628593445,
567
+ 0.8696272373199463,
568
+ 0.05053659901022911,
569
+ 0.8996963500976562,
570
+ 0.8450660109519958,
571
+ 0.8897750377655029,
572
+ 0.6697665452957153,
573
+ 0.9658312797546387,
574
+ 0.8602502942085266,
575
+ 0.793439507484436,
576
+ 0.9303714036941528,
577
+ 0.2990742623806,
578
+ 0.29756873846054077,
579
+ 0.042136698961257935
580
+ ]
581
+ },
582
+ {
583
+ "unique_id": "test/precalculus/541.json",
584
+ "level": 2,
585
+ "subject": "Precalculus",
586
+ "ground_truth": "\\left( \\frac{3}{2}, -13 \\right)",
587
+ "greedy_answer": "\\left( \\frac{3}{2}, -13 \\right)",
588
+ "greedy_correct": true,
589
+ "weighted_bon_answer": "\\left( \\frac{3}{2}, -13 \\right)",
590
+ "weighted_bon_correct": true,
591
+ "standard_bon_answer": "\\left(\\frac{3}{2}, -13\\right)",
592
+ "standard_bon_correct": false,
593
+ "majority_vote_answer": "\\left( \\frac{3}{2}, -13 \\right)",
594
+ "majority_vote_correct": true,
595
+ "n_correct_in_16": 8,
596
+ "answer_score_breakdown": {
597
+ "\\left(\\frac{3}{2}, -13\\right)": 2.828850492835045,
598
+ "\\left( \\frac{3}{2}, -13 \\right)": 6.08513518422842,
599
+ "(1, -1)": 0.004213116597384214,
600
+ "\\left(m, b\\right) = (0, -13)": 0.039167147129774094,
601
+ "\\left(\\frac{3}{2}, 23\\right)": 0.014369720593094826,
602
+ "(3, -25)": 0.08800287544727325
603
+ },
604
+ "prm_scores": [
605
+ 0.6061789989471436,
606
+ 0.9395573735237122,
607
+ 0.004213116597384214,
608
+ 0.039167147129774094,
609
+ 0.2117328941822052,
610
+ 0.014369720593094826,
611
+ 0.12005805224180222,
612
+ 0.9928120970726013,
613
+ 0.897468626499176,
614
+ 0.9848745465278625,
615
+ 0.9866357445716858,
616
+ 0.9895706176757812,
617
+ 0.08800287544727325,
618
+ 0.9581692814826965,
619
+ 0.24028877913951874,
620
+ 0.9866386651992798
621
+ ]
622
+ },
623
+ {
624
+ "unique_id": "test/geometry/248.json",
625
+ "level": 2,
626
+ "subject": "Geometry",
627
+ "ground_truth": "5",
628
+ "greedy_answer": "5",
629
+ "greedy_correct": true,
630
+ "weighted_bon_answer": "5",
631
+ "weighted_bon_correct": true,
632
+ "standard_bon_answer": "5",
633
+ "standard_bon_correct": true,
634
+ "majority_vote_answer": "5",
635
+ "majority_vote_correct": true,
636
+ "n_correct_in_16": 16,
637
+ "answer_score_breakdown": {
638
+ "5": 15.969408690929413
639
+ },
640
+ "prm_scores": [
641
+ 0.9981712102890015,
642
+ 0.9989147186279297,
643
+ 0.9964959025382996,
644
+ 0.9971887469291687,
645
+ 0.9961889386177063,
646
+ 0.9972904920578003,
647
+ 0.9975918531417847,
648
+ 0.9987127780914307,
649
+ 0.9991645812988281,
650
+ 0.9987006187438965,
651
+ 0.9983342289924622,
652
+ 0.9986819624900818,
653
+ 0.9985302686691284,
654
+ 0.9986613988876343,
655
+ 0.9977825284004211,
656
+ 0.9989984631538391
657
+ ]
658
+ },
659
+ {
660
+ "unique_id": "test/number_theory/1032.json",
661
+ "level": 3,
662
+ "subject": "Number Theory",
663
+ "ground_truth": "2220",
664
+ "greedy_answer": "2",
665
+ "greedy_correct": false,
666
+ "weighted_bon_answer": "222",
667
+ "weighted_bon_correct": false,
668
+ "standard_bon_answer": "222",
669
+ "standard_bon_correct": false,
670
+ "majority_vote_answer": "222",
671
+ "majority_vote_correct": false,
672
+ "n_correct_in_16": 2,
673
+ "answer_score_breakdown": {
674
+ "222": 1.5699958242475986,
675
+ "2220": 1.2304129600524902,
676
+ "2": 0.050813913345336914,
677
+ "232": 0.015271982178092003,
678
+ "60": 0.1477309763431549,
679
+ "22": 0.027951523661613464,
680
+ "\\text{No such number exists}": 0.016663117334246635,
681
+ "225": 0.04364263266324997
682
+ },
683
+ "prm_scores": [
684
+ 0.2567778527736664,
685
+ 0.5941802263259888,
686
+ 0.028183938935399055,
687
+ 0.015271982178092003,
688
+ 0.1477309763431549,
689
+ 0.4586124122142792,
690
+ 0.025898529216647148,
691
+ 0.02262997440993786,
692
+ 0.7697120308876038,
693
+ 0.053086504340171814,
694
+ 0.027951523661613464,
695
+ 0.016663117334246635,
696
+ 0.04364263266324997,
697
+ 0.03180702403187752,
698
+ 0.6362327337265015,
699
+ 0.028957907110452652
700
+ ]
701
+ },
702
+ {
703
+ "unique_id": "test/number_theory/45.json",
704
+ "level": 3,
705
+ "subject": "Number Theory",
706
+ "ground_truth": "23",
707
+ "greedy_answer": "15",
708
+ "greedy_correct": false,
709
+ "weighted_bon_answer": "23",
710
+ "weighted_bon_correct": true,
711
+ "standard_bon_answer": "23",
712
+ "standard_bon_correct": true,
713
+ "majority_vote_answer": "23",
714
+ "majority_vote_correct": true,
715
+ "n_correct_in_16": 8,
716
+ "answer_score_breakdown": {
717
+ "143": 0.16297440230846405,
718
+ "23": 5.357615023851395,
719
+ "55": 0.10273180902004242,
720
+ "683": 0.008670372888445854
721
+ },
722
+ "prm_scores": [
723
+ 0.015562820248305798,
724
+ 0.6751413345336914,
725
+ 0.8827845454216003,
726
+ 0.011336199007928371,
727
+ 0.03494024649262428,
728
+ 0.25825634598731995,
729
+ 0.9719118475914001,
730
+ 0.10273180902004242,
731
+ 0.5386189818382263,
732
+ 0.07233847677707672,
733
+ 0.012693990021944046,
734
+ 0.8937028646469116,
735
+ 0.3831983804702759,
736
+ 0.008670372888445854,
737
+ 0.754000723361969,
738
+ 0.01610266976058483
739
+ ]
740
+ },
741
+ {
742
+ "unique_id": "test/intermediate_algebra/431.json",
743
+ "level": 1,
744
+ "subject": "Intermediate Algebra",
745
+ "ground_truth": "(a+5)(b+2)",
746
+ "greedy_answer": "(a + 5)(b + 2)",
747
+ "greedy_correct": false,
748
+ "weighted_bon_answer": "(a + 5)(b + 2)",
749
+ "weighted_bon_correct": false,
750
+ "standard_bon_answer": "(a + 5)(b + 2)",
751
+ "standard_bon_correct": false,
752
+ "majority_vote_answer": "(a + 5)(b + 2)",
753
+ "majority_vote_correct": false,
754
+ "n_correct_in_16": 1,
755
+ "answer_score_breakdown": {
756
+ "(a + 5)(b + 2)": 9.67332112789154,
757
+ "(b + 2)(a + 5)": 3.951826512813568,
758
+ "(a+5)(b+2)": 0.9961338043212891,
759
+ "b(ab + 5) + 2(a + 5)": 0.2790054976940155
760
+ },
761
+ "prm_scores": [
762
+ 0.9980248212814331,
763
+ 0.998023509979248,
764
+ 0.9921653270721436,
765
+ 0.998106837272644,
766
+ 0.9981840252876282,
767
+ 0.7056999802589417,
768
+ 0.9969874024391174,
769
+ 0.9956356883049011,
770
+ 0.9968495965003967,
771
+ 0.9970625042915344,
772
+ 0.9912362098693848,
773
+ 0.9936162233352661,
774
+ 0.9961338043212891,
775
+ 0.9657171964645386,
776
+ 0.9978383183479309,
777
+ 0.2790054976940155
778
+ ]
779
+ }
780
+ ]