Ill-Ness commited on
Commit
5bfb4e3
·
verified ·
1 Parent(s): a8c2293

Delete benchmark_report.json

Browse files
Files changed (1) hide show
  1. benchmark_report.json +0 -3246
benchmark_report.json DELETED
@@ -1,3246 +0,0 @@
1
- {
2
- "model_source": "latest",
3
- "resolved_model": "/artifacts/20260514-174711/full-safetensors",
4
- "attention_impl": "sdpa",
5
- "note": "Local Modal benchmark run. GPQA examples are not included in this report.",
6
- "benchmarks": {
7
- "gpqa_diamond": {
8
- "dataset": "Idavidrein/gpqa",
9
- "config": "gpqa_diamond",
10
- "split": "train",
11
- "metric": "multiple_choice_accuracy",
12
- "value": 0.2265625,
13
- "correct": 29,
14
- "total": 128,
15
- "samples": [
16
- {
17
- "row": 0,
18
- "target": "D",
19
- "prediction": "C",
20
- "raw_prediction": "<think>\n\n</think>\n\nC. nucleosome",
21
- "correct": false
22
- },
23
- {
24
- "row": 1,
25
- "target": "D",
26
- "prediction": "D",
27
- "raw_prediction": "<think>\n\n</think>\n\nD. cytos",
28
- "correct": true
29
- },
30
- {
31
- "row": 2,
32
- "target": "B",
33
- "prediction": "C",
34
- "raw_prediction": "<think>\n\n</think>\n\nC",
35
- "correct": false
36
- },
37
- {
38
- "row": 3,
39
- "target": "A",
40
- "prediction": "A",
41
- "raw_prediction": "<think>\n\n</think>\n\nA",
42
- "correct": true
43
- },
44
- {
45
- "row": 4,
46
- "target": "D",
47
- "prediction": "B",
48
- "raw_prediction": "<think>\n\n</think>\n\nB",
49
- "correct": false
50
- },
51
- {
52
- "row": 5,
53
- "target": "C",
54
- "prediction": "A",
55
- "raw_prediction": "<think>\n\n</think>\n\nA",
56
- "correct": false
57
- },
58
- {
59
- "row": 6,
60
- "target": "C",
61
- "prediction": "A",
62
- "raw_prediction": "<think>\n\n</think>\n\nA. 3",
63
- "correct": false
64
- },
65
- {
66
- "row": 7,
67
- "target": "D",
68
- "prediction": "B",
69
- "raw_prediction": "<think>\n\n</think>\n\nB. the receptor",
70
- "correct": false
71
- },
72
- {
73
- "row": 8,
74
- "target": "A",
75
- "prediction": "B",
76
- "raw_prediction": "<think>\n\n</think>\n\nB",
77
- "correct": false
78
- },
79
- {
80
- "row": 9,
81
- "target": "B",
82
- "prediction": "C",
83
- "raw_prediction": "<think>\n</think>\n\nTo calculate the imaginary",
84
- "correct": false
85
- },
86
- {
87
- "row": 10,
88
- "target": "C",
89
- "prediction": "A",
90
- "raw_prediction": "<think>\n\n</think>\n\nA",
91
- "correct": false
92
- },
93
- {
94
- "row": 11,
95
- "target": "A",
96
- "prediction": "A",
97
- "raw_prediction": "<think>\n\n</think>\n\nThe mass dimension of",
98
- "correct": true
99
- },
100
- {
101
- "row": 12,
102
- "target": "A",
103
- "prediction": "C",
104
- "raw_prediction": "<think>\n\n</think>\n\nC",
105
- "correct": false
106
- },
107
- {
108
- "row": 13,
109
- "target": "D",
110
- "prediction": "B",
111
- "raw_prediction": "<think>\n\n</think>\n\nB",
112
- "correct": false
113
- },
114
- {
115
- "row": 14,
116
- "target": "D",
117
- "prediction": "A",
118
- "raw_prediction": "<think>\n</think>\n\nThe mean decay distance",
119
- "correct": false
120
- },
121
- {
122
- "row": 15,
123
- "target": "D",
124
- "prediction": "B",
125
- "raw_prediction": "<think>\n\n</think>\n\nB",
126
- "correct": false
127
- },
128
- {
129
- "row": 16,
130
- "target": "C",
131
- "prediction": "B",
132
- "raw_prediction": "<think>\n\n</think>\n\nB",
133
- "correct": false
134
- },
135
- {
136
- "row": 17,
137
- "target": "C",
138
- "prediction": "C",
139
- "raw_prediction": "<think>\n\n</think>\n\nC",
140
- "correct": true
141
- },
142
- {
143
- "row": 18,
144
- "target": "B",
145
- "prediction": "C",
146
- "raw_prediction": "<think>\n</think>\n\nTo calculate the non",
147
- "correct": false
148
- },
149
- {
150
- "row": 19,
151
- "target": "C",
152
- "prediction": "C",
153
- "raw_prediction": "<think>\n</think>\n\nThe correct answer is",
154
- "correct": true
155
- },
156
- {
157
- "row": 20,
158
- "target": "C",
159
- "prediction": "B",
160
- "raw_prediction": "<think>\n\n</think>\n\nB",
161
- "correct": false
162
- },
163
- {
164
- "row": 21,
165
- "target": "A",
166
- "prediction": "B",
167
- "raw_prediction": "<think>\n\n</think>\n\nB. Magnon",
168
- "correct": false
169
- },
170
- {
171
- "row": 22,
172
- "target": "C",
173
- "prediction": "A",
174
- "raw_prediction": "<think>\n\n</think>\n\nA",
175
- "correct": false
176
- },
177
- {
178
- "row": 23,
179
- "target": "A",
180
- "prediction": "A",
181
- "raw_prediction": "<think>\nOkay, let's try to",
182
- "correct": true
183
- },
184
- {
185
- "row": 24,
186
- "target": "D",
187
- "prediction": "A",
188
- "raw_prediction": "<think>\n\n</think>\n\nA",
189
- "correct": false
190
- },
191
- {
192
- "row": 25,
193
- "target": "D",
194
- "prediction": "B",
195
- "raw_prediction": "<think>\n\n</think>\n\nB. ~0",
196
- "correct": false
197
- },
198
- {
199
- "row": 26,
200
- "target": "B",
201
- "prediction": "B",
202
- "raw_prediction": "<think>\n\n</think>\n\nB",
203
- "correct": true
204
- },
205
- {
206
- "row": 27,
207
- "target": "D",
208
- "prediction": "B",
209
- "raw_prediction": "<think>\n\n</think>\n\nB. ~",
210
- "correct": false
211
- },
212
- {
213
- "row": 28,
214
- "target": "D",
215
- "prediction": "B",
216
- "raw_prediction": "<think>\n\n</think>\n\nB",
217
- "correct": false
218
- },
219
- {
220
- "row": 29,
221
- "target": "B",
222
- "prediction": "A",
223
- "raw_prediction": "<think>\n\n</think>\n\nA",
224
- "correct": false
225
- },
226
- {
227
- "row": 30,
228
- "target": "D",
229
- "prediction": "A",
230
- "raw_prediction": "<think>\n\n</think>\n\nA",
231
- "correct": false
232
- },
233
- {
234
- "row": 31,
235
- "target": "C",
236
- "prediction": "A",
237
- "raw_prediction": "<think>\n\n</think>\n\nA",
238
- "correct": false
239
- },
240
- {
241
- "row": 32,
242
- "target": "D",
243
- "prediction": "B",
244
- "raw_prediction": "<think>\n\n</think>\n\nThe orbital period of",
245
- "correct": false
246
- },
247
- {
248
- "row": 33,
249
- "target": "A",
250
- "prediction": "A",
251
- "raw_prediction": "<think>\n\n</think>\n\nThe factor by which",
252
- "correct": true
253
- },
254
- {
255
- "row": 34,
256
- "target": "A",
257
- "prediction": "D",
258
- "raw_prediction": "<think>\n\n</think>\n\nD",
259
- "correct": false
260
- },
261
- {
262
- "row": 35,
263
- "target": "B",
264
- "prediction": "B",
265
- "raw_prediction": "<think>\n\n</think>\n\nB",
266
- "correct": true
267
- },
268
- {
269
- "row": 36,
270
- "target": "C",
271
- "prediction": "B",
272
- "raw_prediction": "<think>\n\n</think>\n\nB",
273
- "correct": false
274
- },
275
- {
276
- "row": 37,
277
- "target": "A",
278
- "prediction": "C",
279
- "raw_prediction": "<think>\n\n</think>\n\nThe correct answer is",
280
- "correct": false
281
- },
282
- {
283
- "row": 38,
284
- "target": "A",
285
- "prediction": "C",
286
- "raw_prediction": "<think>\n</think>\n\nNone of the choices",
287
- "correct": false
288
- },
289
- {
290
- "row": 39,
291
- "target": "A",
292
- "prediction": "B",
293
- "raw_prediction": "<think>\n\n</think>\n\nB",
294
- "correct": false
295
- },
296
- {
297
- "row": 40,
298
- "target": "A",
299
- "prediction": "C",
300
- "raw_prediction": "<think>\n\n</think>\n\nThe correct answer is",
301
- "correct": false
302
- },
303
- {
304
- "row": 41,
305
- "target": "B",
306
- "prediction": "B",
307
- "raw_prediction": "<think>\n\n</think>\n\nB",
308
- "correct": true
309
- },
310
- {
311
- "row": 42,
312
- "target": "C",
313
- "prediction": "B",
314
- "raw_prediction": "<think>\n\n</think>\n\nB",
315
- "correct": false
316
- },
317
- {
318
- "row": 43,
319
- "target": "B",
320
- "prediction": "A",
321
- "raw_prediction": "<think>\n\n</think>\n\nA. Violet",
322
- "correct": false
323
- },
324
- {
325
- "row": 44,
326
- "target": "D",
327
- "prediction": "A",
328
- "raw_prediction": "<think>\nOkay, let's try to",
329
- "correct": false
330
- },
331
- {
332
- "row": 45,
333
- "target": "B",
334
- "prediction": "A",
335
- "raw_prediction": "<think>\nOkay, let's try to",
336
- "correct": false
337
- },
338
- {
339
- "row": 46,
340
- "target": "C",
341
- "prediction": "D",
342
- "raw_prediction": "<think>\n\n</think>\n\nThe answer is D",
343
- "correct": false
344
- },
345
- {
346
- "row": 47,
347
- "target": "D",
348
- "prediction": "B",
349
- "raw_prediction": "<think>\n\n</think>\n\nB",
350
- "correct": false
351
- },
352
- {
353
- "row": 48,
354
- "target": "B",
355
- "prediction": "D",
356
- "raw_prediction": "<think>\n\n</think>\n\nD",
357
- "correct": false
358
- },
359
- {
360
- "row": 49,
361
- "target": "D",
362
- "prediction": "A",
363
- "raw_prediction": "<think>\nOkay, let's try to",
364
- "correct": false
365
- },
366
- {
367
- "row": 50,
368
- "target": "A",
369
- "prediction": "A",
370
- "raw_prediction": "<think>\n\n</think>\n\nA",
371
- "correct": true
372
- },
373
- {
374
- "row": 51,
375
- "target": "A",
376
- "prediction": "A",
377
- "raw_prediction": "<think>\nOkay, let's try to",
378
- "correct": true
379
- },
380
- {
381
- "row": 52,
382
- "target": "C",
383
- "prediction": "B",
384
- "raw_prediction": "<think>\n\n</think>\n\nB. 4",
385
- "correct": false
386
- },
387
- {
388
- "row": 53,
389
- "target": "B",
390
- "prediction": "D",
391
- "raw_prediction": "<think>\n</think>\n\nTo determine the structure",
392
- "correct": false
393
- },
394
- {
395
- "row": 54,
396
- "target": "A",
397
- "prediction": "C",
398
- "raw_prediction": "<think>\n\n</think>\n\nThe correct answer is",
399
- "correct": false
400
- },
401
- {
402
- "row": 55,
403
- "target": "A",
404
- "prediction": "C",
405
- "raw_prediction": "<think>\n\n</think>\n\nC",
406
- "correct": false
407
- },
408
- {
409
- "row": 56,
410
- "target": "B",
411
- "prediction": "C",
412
- "raw_prediction": "<think>\n\n</think>\n\nThe correct answer is",
413
- "correct": false
414
- },
415
- {
416
- "row": 57,
417
- "target": "B",
418
- "prediction": "C",
419
- "raw_prediction": "<think>\n\n</think>\n\nThe correct answer is",
420
- "correct": false
421
- },
422
- {
423
- "row": 58,
424
- "target": "A",
425
- "prediction": "C",
426
- "raw_prediction": "<think>\n\n</think>\n\nThe correct answer is",
427
- "correct": false
428
- },
429
- {
430
- "row": 59,
431
- "target": "A",
432
- "prediction": "A",
433
- "raw_prediction": "<think>\n\n</think>\n\nA",
434
- "correct": true
435
- },
436
- {
437
- "row": 60,
438
- "target": "A",
439
- "prediction": "A",
440
- "raw_prediction": "<think>\n\n</think>\n\nA",
441
- "correct": true
442
- },
443
- {
444
- "row": 61,
445
- "target": "B",
446
- "prediction": "D",
447
- "raw_prediction": "<think>\n</think>\n\nTo find the normalized",
448
- "correct": false
449
- },
450
- {
451
- "row": 62,
452
- "target": "D",
453
- "prediction": "A",
454
- "raw_prediction": "<think>\nOkay, let's try to",
455
- "correct": false
456
- },
457
- {
458
- "row": 63,
459
- "target": "B",
460
- "prediction": "C",
461
- "raw_prediction": "<think>\n\n</think>\n\nThe correct answer is",
462
- "correct": false
463
- },
464
- {
465
- "row": 64,
466
- "target": "C",
467
- "prediction": "D",
468
- "raw_prediction": "<think>\n\n</think>\n\nD",
469
- "correct": false
470
- },
471
- {
472
- "row": 65,
473
- "target": "A",
474
- "prediction": "D",
475
- "raw_prediction": "<think>\n\n</think>\n\nD",
476
- "correct": false
477
- },
478
- {
479
- "row": 66,
480
- "target": "B",
481
- "prediction": "C",
482
- "raw_prediction": "<think>\n</think>\n\nThe correct answer is",
483
- "correct": false
484
- },
485
- {
486
- "row": 67,
487
- "target": "C",
488
- "prediction": "C",
489
- "raw_prediction": "<think>\n</think>\n\nThe correct answer is",
490
- "correct": true
491
- },
492
- {
493
- "row": 68,
494
- "target": "B",
495
- "prediction": "A",
496
- "raw_prediction": "<think>\n\n</think>\n\nA",
497
- "correct": false
498
- },
499
- {
500
- "row": 69,
501
- "target": "A",
502
- "prediction": "A",
503
- "raw_prediction": "<think>\n\n</think>\n\nA. Mutant",
504
- "correct": true
505
- },
506
- {
507
- "row": 70,
508
- "target": "A",
509
- "prediction": "A",
510
- "raw_prediction": "<think>\nOkay, let's try to",
511
- "correct": true
512
- },
513
- {
514
- "row": 71,
515
- "target": "D",
516
- "prediction": "B",
517
- "raw_prediction": "<think>\n\n</think>\n\nB",
518
- "correct": false
519
- },
520
- {
521
- "row": 72,
522
- "target": "B",
523
- "prediction": "D",
524
- "raw_prediction": "<think>\n\n</think>\n\nD",
525
- "correct": false
526
- },
527
- {
528
- "row": 73,
529
- "target": "C",
530
- "prediction": "D",
531
- "raw_prediction": "<think>\n</think>\n\nTo find the concentration",
532
- "correct": false
533
- },
534
- {
535
- "row": 74,
536
- "target": "C",
537
- "prediction": "C",
538
- "raw_prediction": "<think>\n\n</think>\n\nC",
539
- "correct": true
540
- },
541
- {
542
- "row": 75,
543
- "target": "C",
544
- "prediction": "A",
545
- "raw_prediction": "<think>\nOkay, let's try to",
546
- "correct": false
547
- },
548
- {
549
- "row": 76,
550
- "target": "B",
551
- "prediction": "B",
552
- "raw_prediction": "<think>\n\n</think>\n\nB",
553
- "correct": true
554
- },
555
- {
556
- "row": 77,
557
- "target": "C",
558
- "prediction": "A",
559
- "raw_prediction": "<think>\nOkay, let's try to",
560
- "correct": false
561
- },
562
- {
563
- "row": 78,
564
- "target": "C",
565
- "prediction": "D",
566
- "raw_prediction": "<think>\n\n</think>\n\nD",
567
- "correct": false
568
- },
569
- {
570
- "row": 79,
571
- "target": "D",
572
- "prediction": "D",
573
- "raw_prediction": "<think>\n\n</think>\n\nD",
574
- "correct": true
575
- },
576
- {
577
- "row": 80,
578
- "target": "A",
579
- "prediction": "C",
580
- "raw_prediction": "<think>\n\n</think>\n\nC",
581
- "correct": false
582
- },
583
- {
584
- "row": 81,
585
- "target": "D",
586
- "prediction": "B",
587
- "raw_prediction": "<think>\n</think>\n\nB. (9",
588
- "correct": false
589
- },
590
- {
591
- "row": 82,
592
- "target": "B",
593
- "prediction": "B",
594
- "raw_prediction": "<think>\n\n</think>\n\nB",
595
- "correct": true
596
- },
597
- {
598
- "row": 83,
599
- "target": "C",
600
- "prediction": "A",
601
- "raw_prediction": "<think>\n\n</think>\n\nA",
602
- "correct": false
603
- },
604
- {
605
- "row": 84,
606
- "target": "D",
607
- "prediction": "C",
608
- "raw_prediction": "<think>\n\n</think>\n\nC. At repeats",
609
- "correct": false
610
- },
611
- {
612
- "row": 85,
613
- "target": "D",
614
- "prediction": "A",
615
- "raw_prediction": "<think>\n\n</think>\n\nA",
616
- "correct": false
617
- },
618
- {
619
- "row": 86,
620
- "target": "A",
621
- "prediction": "B",
622
- "raw_prediction": "<think>\n\n</think>\n\nB",
623
- "correct": false
624
- },
625
- {
626
- "row": 87,
627
- "target": "C",
628
- "prediction": "A",
629
- "raw_prediction": "<think>\n\n</think>\n\nA",
630
- "correct": false
631
- },
632
- {
633
- "row": 88,
634
- "target": "A",
635
- "prediction": "B",
636
- "raw_prediction": "<think>\n\n</think>\n\nB",
637
- "correct": false
638
- },
639
- {
640
- "row": 89,
641
- "target": "B",
642
- "prediction": "C",
643
- "raw_prediction": "<think>\n\n</think>\n\nC",
644
- "correct": false
645
- },
646
- {
647
- "row": 90,
648
- "target": "C",
649
- "prediction": "A",
650
- "raw_prediction": "<think>\n\n</think>\n\nA",
651
- "correct": false
652
- },
653
- {
654
- "row": 91,
655
- "target": "A",
656
- "prediction": "C",
657
- "raw_prediction": "<think>\n\n</think>\n\nThe correct answer is",
658
- "correct": false
659
- },
660
- {
661
- "row": 92,
662
- "target": "B",
663
- "prediction": "B",
664
- "raw_prediction": "<think>\n\n</think>\n\nB",
665
- "correct": true
666
- },
667
- {
668
- "row": 93,
669
- "target": "A",
670
- "prediction": "C",
671
- "raw_prediction": "<think>\n\n</think>\n\nThe correct answer is",
672
- "correct": false
673
- },
674
- {
675
- "row": 94,
676
- "target": "B",
677
- "prediction": "A",
678
- "raw_prediction": "<think>\n\n</think>\n\nA. 0",
679
- "correct": false
680
- },
681
- {
682
- "row": 95,
683
- "target": "D",
684
- "prediction": "A",
685
- "raw_prediction": "<think>\n\n</think>\n\nA",
686
- "correct": false
687
- },
688
- {
689
- "row": 96,
690
- "target": "A",
691
- "prediction": "B",
692
- "raw_prediction": "<think>\n\n</think>\n\nB",
693
- "correct": false
694
- },
695
- {
696
- "row": 97,
697
- "target": "D",
698
- "prediction": "C",
699
- "raw_prediction": "<think>\n\n</think>\n\nC",
700
- "correct": false
701
- },
702
- {
703
- "row": 98,
704
- "target": "D",
705
- "prediction": "B",
706
- "raw_prediction": "<think>\n\n</think>\n\nB. 1",
707
- "correct": false
708
- },
709
- {
710
- "row": 99,
711
- "target": "D",
712
- "prediction": "B",
713
- "raw_prediction": "<think>\n\n</think>\n\nB",
714
- "correct": false
715
- },
716
- {
717
- "row": 100,
718
- "target": "C",
719
- "prediction": "C",
720
- "raw_prediction": "<think>\n\n</think>\n\nC",
721
- "correct": true
722
- },
723
- {
724
- "row": 101,
725
- "target": "C",
726
- "prediction": "B",
727
- "raw_prediction": "<think>\n\n</think>\n\nB. 4",
728
- "correct": false
729
- },
730
- {
731
- "row": 102,
732
- "target": "C",
733
- "prediction": "B",
734
- "raw_prediction": "<think>\n\n</think>\n\nB",
735
- "correct": false
736
- },
737
- {
738
- "row": 103,
739
- "target": "B",
740
- "prediction": "A",
741
- "raw_prediction": "<think>\n\n</think>\n\nA",
742
- "correct": false
743
- },
744
- {
745
- "row": 104,
746
- "target": "C",
747
- "prediction": "C",
748
- "raw_prediction": "<think>\n\n</think>\n\nC",
749
- "correct": true
750
- },
751
- {
752
- "row": 105,
753
- "target": "A",
754
- "prediction": "A",
755
- "raw_prediction": "<think>\n\n</think>\n\nA",
756
- "correct": true
757
- },
758
- {
759
- "row": 106,
760
- "target": "B",
761
- "prediction": "B",
762
- "raw_prediction": "<think>\n\n</think>\n\nB",
763
- "correct": true
764
- },
765
- {
766
- "row": 107,
767
- "target": "D",
768
- "prediction": "B",
769
- "raw_prediction": "<think>\n\n</think>\n\nB. 5",
770
- "correct": false
771
- },
772
- {
773
- "row": 108,
774
- "target": "B",
775
- "prediction": "A",
776
- "raw_prediction": "<think>\n\n</think>\n\nA",
777
- "correct": false
778
- },
779
- {
780
- "row": 109,
781
- "target": "B",
782
- "prediction": "A",
783
- "raw_prediction": "<think>\n\n</think>\n\nA",
784
- "correct": false
785
- },
786
- {
787
- "row": 110,
788
- "target": "B",
789
- "prediction": "A",
790
- "raw_prediction": "<think>\n\n</think>\n\nA",
791
- "correct": false
792
- },
793
- {
794
- "row": 111,
795
- "target": "C",
796
- "prediction": "B",
797
- "raw_prediction": "<think>\n\n</think>\n\nB. ~ r",
798
- "correct": false
799
- },
800
- {
801
- "row": 112,
802
- "target": "A",
803
- "prediction": "B",
804
- "raw_prediction": "<think>\n</think>\n\nTo solve this problem",
805
- "correct": false
806
- },
807
- {
808
- "row": 113,
809
- "target": "A",
810
- "prediction": "B",
811
- "raw_prediction": "<think>\n\n</think>\n\nB",
812
- "correct": false
813
- },
814
- {
815
- "row": 114,
816
- "target": "B",
817
- "prediction": "B",
818
- "raw_prediction": "<think>\n\n</think>\n\nB",
819
- "correct": true
820
- },
821
- {
822
- "row": 115,
823
- "target": "C",
824
- "prediction": "A",
825
- "raw_prediction": "<think>\n\n</think>\n\nA. 4",
826
- "correct": false
827
- },
828
- {
829
- "row": 116,
830
- "target": "C",
831
- "prediction": "B",
832
- "raw_prediction": "<think>\n\n</think>\n\nB",
833
- "correct": false
834
- },
835
- {
836
- "row": 117,
837
- "target": "C",
838
- "prediction": "B",
839
- "raw_prediction": "<think>\n\n</think>\n\nB. stronger \u2013",
840
- "correct": false
841
- },
842
- {
843
- "row": 118,
844
- "target": "D",
845
- "prediction": "B",
846
- "raw_prediction": "<think>\n</think>\n\nB. 2",
847
- "correct": false
848
- },
849
- {
850
- "row": 119,
851
- "target": "B",
852
- "prediction": "A",
853
- "raw_prediction": "<think>\n\n</think>\n\nA",
854
- "correct": false
855
- },
856
- {
857
- "row": 120,
858
- "target": "D",
859
- "prediction": "A",
860
- "raw_prediction": "<think>\n\n</think>\n\nA. 1",
861
- "correct": false
862
- },
863
- {
864
- "row": 121,
865
- "target": "B",
866
- "prediction": "A",
867
- "raw_prediction": "<think>\n\n</think>\n\nA",
868
- "correct": false
869
- },
870
- {
871
- "row": 122,
872
- "target": "D",
873
- "prediction": "A",
874
- "raw_prediction": "<think>\n\n</think>\n\nA",
875
- "correct": false
876
- },
877
- {
878
- "row": 123,
879
- "target": "D",
880
- "prediction": "B",
881
- "raw_prediction": "<think>\n\n</think>\n\nB. 7",
882
- "correct": false
883
- },
884
- {
885
- "row": 124,
886
- "target": "A",
887
- "prediction": "A",
888
- "raw_prediction": "<think>\n\n</think>\n\nA",
889
- "correct": true
890
- },
891
- {
892
- "row": 125,
893
- "target": "C",
894
- "prediction": "A",
895
- "raw_prediction": "<think>\nOkay, let's try to",
896
- "correct": false
897
- },
898
- {
899
- "row": 126,
900
- "target": "D",
901
- "prediction": "A",
902
- "raw_prediction": "<think>\n\n</think>\n\nA",
903
- "correct": false
904
- },
905
- {
906
- "row": 127,
907
- "target": "B",
908
- "prediction": "B",
909
- "raw_prediction": "<think>\n\n</think>\n\nB",
910
- "correct": true
911
- }
912
- ]
913
- },
914
- "mmlu_pro": {
915
- "dataset": "TIGER-Lab/MMLU-Pro",
916
- "split": "test",
917
- "metric": "multiple_choice_accuracy",
918
- "value": 0.3046875,
919
- "correct": 78,
920
- "total": 256,
921
- "category_accuracy": {
922
- "biology": 0.47058823529411764,
923
- "business": 0.17647058823529413,
924
- "chemistry": 0.27586206896551724,
925
- "computer science": 0.4166666666666667,
926
- "economics": 0.45,
927
- "engineering": 0.13333333333333333,
928
- "health": 0.29411764705882354,
929
- "history": 0.36363636363636365,
930
- "law": 0.21052631578947367,
931
- "math": 0.13636363636363635,
932
- "other": 0.23809523809523808,
933
- "philosophy": 0.375,
934
- "physics": 0.26666666666666666,
935
- "psychology": 0.6111111111111112
936
- },
937
- "samples": [
938
- {
939
- "row": 0,
940
- "question_id": 9993,
941
- "category": "physics",
942
- "target": "A",
943
- "prediction": "E",
944
- "raw_prediction": "<think>\n\n</think>\n\nE",
945
- "correct": false
946
- },
947
- {
948
- "row": 1,
949
- "question_id": 6273,
950
- "category": "health",
951
- "target": "E",
952
- "prediction": "H",
953
- "raw_prediction": "<think>\n\n</think>\n\nH",
954
- "correct": false
955
- },
956
- {
957
- "row": 2,
958
- "question_id": 8064,
959
- "category": "math",
960
- "target": "B",
961
- "prediction": "H",
962
- "raw_prediction": "<think>\n</think>\n\nTo solve the system",
963
- "correct": false
964
- },
965
- {
966
- "row": 3,
967
- "question_id": 4918,
968
- "category": "history",
969
- "target": "I",
970
- "prediction": "I",
971
- "raw_prediction": "<think>\n\n</think>\n\nI. Railroad construction",
972
- "correct": true
973
- },
974
- {
975
- "row": 4,
976
- "question_id": 6963,
977
- "category": "economics",
978
- "target": "E",
979
- "prediction": "C",
980
- "raw_prediction": "<think>\n\n</think>\n\nC",
981
- "correct": false
982
- },
983
- {
984
- "row": 5,
985
- "question_id": 7871,
986
- "category": "math",
987
- "target": "H",
988
- "prediction": "I",
989
- "raw_prediction": "<think>\n\n</think>\n\nI. 0",
990
- "correct": false
991
- },
992
- {
993
- "row": 6,
994
- "question_id": 11518,
995
- "category": "engineering",
996
- "target": "D",
997
- "prediction": "H",
998
- "raw_prediction": "<think>\n\n</think>\n\nThe correct answer is",
999
- "correct": false
1000
- },
1001
- {
1002
- "row": 7,
1003
- "question_id": 12253,
1004
- "category": "engineering",
1005
- "target": "H",
1006
- "prediction": "B",
1007
- "raw_prediction": "<think>\n\n</think>\n\nB. (a",
1008
- "correct": false
1009
- },
1010
- {
1011
- "row": 8,
1012
- "question_id": 358,
1013
- "category": "business",
1014
- "target": "F",
1015
- "prediction": "B",
1016
- "raw_prediction": "<think>\n\n</think>\n\nB",
1017
- "correct": false
1018
- },
1019
- {
1020
- "row": 9,
1021
- "question_id": 6909,
1022
- "category": "economics",
1023
- "target": "C",
1024
- "prediction": "I",
1025
- "raw_prediction": "<think>\n\n</think>\n\nI. (ii",
1026
- "correct": false
1027
- },
1028
- {
1029
- "row": 10,
1030
- "question_id": 7050,
1031
- "category": "economics",
1032
- "target": "B",
1033
- "prediction": "B",
1034
- "raw_prediction": "<think>\n\n</think>\n\nB",
1035
- "correct": true
1036
- },
1037
- {
1038
- "row": 11,
1039
- "question_id": 10176,
1040
- "category": "physics",
1041
- "target": "D",
1042
- "prediction": "B",
1043
- "raw_prediction": "<think>\n\n</think>\n\nB. 0",
1044
- "correct": false
1045
- },
1046
- {
1047
- "row": 12,
1048
- "question_id": 495,
1049
- "category": "business",
1050
- "target": "B",
1051
- "prediction": "B",
1052
- "raw_prediction": "<think>\n\n</think>\n\nB",
1053
- "correct": true
1054
- },
1055
- {
1056
- "row": 13,
1057
- "question_id": 2165,
1058
- "category": "psychology",
1059
- "target": "G",
1060
- "prediction": "C",
1061
- "raw_prediction": "<think>\n\n</think>\n\nC",
1062
- "correct": false
1063
- },
1064
- {
1065
- "row": 14,
1066
- "question_id": 4808,
1067
- "category": "history",
1068
- "target": "C",
1069
- "prediction": "F",
1070
- "raw_prediction": "<think>\n\n</think>\n\nF",
1071
- "correct": false
1072
- },
1073
- {
1074
- "row": 15,
1075
- "question_id": 11336,
1076
- "category": "engineering",
1077
- "target": "D",
1078
- "prediction": "B",
1079
- "raw_prediction": "<think>\n\n</think>\n\nB. (a",
1080
- "correct": false
1081
- },
1082
- {
1083
- "row": 16,
1084
- "question_id": 5842,
1085
- "category": "other",
1086
- "target": "D",
1087
- "prediction": "B",
1088
- "raw_prediction": "<think>\n\n</think>\n\nB",
1089
- "correct": false
1090
- },
1091
- {
1092
- "row": 17,
1093
- "question_id": 6420,
1094
- "category": "health",
1095
- "target": "H",
1096
- "prediction": "E",
1097
- "raw_prediction": "<think>\n\n</think>\n\nE",
1098
- "correct": false
1099
- },
1100
- {
1101
- "row": 18,
1102
- "question_id": 11178,
1103
- "category": "philosophy",
1104
- "target": "I",
1105
- "prediction": "I",
1106
- "raw_prediction": "<think>\n\n</think>\n\nI",
1107
- "correct": true
1108
- },
1109
- {
1110
- "row": 19,
1111
- "question_id": 5084,
1112
- "category": "other",
1113
- "target": "D",
1114
- "prediction": "B",
1115
- "raw_prediction": "<think>\n\n</think>\n\nB",
1116
- "correct": false
1117
- },
1118
- {
1119
- "row": 20,
1120
- "question_id": 10651,
1121
- "category": "computer science",
1122
- "target": "H",
1123
- "prediction": "B",
1124
- "raw_prediction": "<think>\n\n</think>\n\nB. Petabyte",
1125
- "correct": false
1126
- },
1127
- {
1128
- "row": 21,
1129
- "question_id": 7342,
1130
- "category": "economics",
1131
- "target": "F",
1132
- "prediction": "D",
1133
- "raw_prediction": "<think>\n\n</think>\n\nD",
1134
- "correct": false
1135
- },
1136
- {
1137
- "row": 22,
1138
- "question_id": 522,
1139
- "category": "business",
1140
- "target": "I",
1141
- "prediction": "B",
1142
- "raw_prediction": "<think>\n\n</think>\n\nB",
1143
- "correct": false
1144
- },
1145
- {
1146
- "row": 23,
1147
- "question_id": 10635,
1148
- "category": "computer science",
1149
- "target": "A",
1150
- "prediction": "A",
1151
- "raw_prediction": "<think>\n\n</think>\n\nA",
1152
- "correct": true
1153
- },
1154
- {
1155
- "row": 24,
1156
- "question_id": 11393,
1157
- "category": "engineering",
1158
- "target": "E",
1159
- "prediction": "H",
1160
- "raw_prediction": "<think>\nOkay, let's try to",
1161
- "correct": false
1162
- },
1163
- {
1164
- "row": 25,
1165
- "question_id": 3238,
1166
- "category": "biology",
1167
- "target": "C",
1168
- "prediction": "C",
1169
- "raw_prediction": "<think>\n\n</think>\n\nC",
1170
- "correct": true
1171
- },
1172
- {
1173
- "row": 26,
1174
- "question_id": 10755,
1175
- "category": "computer science",
1176
- "target": "E",
1177
- "prediction": "G",
1178
- "raw_prediction": "<think>\n\n</think>\n\nG",
1179
- "correct": false
1180
- },
1181
- {
1182
- "row": 27,
1183
- "question_id": 6403,
1184
- "category": "health",
1185
- "target": "H",
1186
- "prediction": "H",
1187
- "raw_prediction": "<think>\n\n</think>\n\nH",
1188
- "correct": true
1189
- },
1190
- {
1191
- "row": 28,
1192
- "question_id": 1604,
1193
- "category": "law",
1194
- "target": "C",
1195
- "prediction": "C",
1196
- "raw_prediction": "<think>\n\n</think>\n\nC",
1197
- "correct": true
1198
- },
1199
- {
1200
- "row": 29,
1201
- "question_id": 1748,
1202
- "category": "law",
1203
- "target": "D",
1204
- "prediction": "J",
1205
- "raw_prediction": "<think>\n\n</think>\n\nJ",
1206
- "correct": false
1207
- },
1208
- {
1209
- "row": 30,
1210
- "question_id": 2061,
1211
- "category": "psychology",
1212
- "target": "G",
1213
- "prediction": "G",
1214
- "raw_prediction": "<think>\n\n</think>\n\nG",
1215
- "correct": true
1216
- },
1217
- {
1218
- "row": 31,
1219
- "question_id": 1729,
1220
- "category": "law",
1221
- "target": "A",
1222
- "prediction": "C",
1223
- "raw_prediction": "<think>\n\n</think>\n\nC",
1224
- "correct": false
1225
- },
1226
- {
1227
- "row": 32,
1228
- "question_id": 5415,
1229
- "category": "other",
1230
- "target": "I",
1231
- "prediction": "B",
1232
- "raw_prediction": "<think>\n\n</think>\n\nB",
1233
- "correct": false
1234
- },
1235
- {
1236
- "row": 33,
1237
- "question_id": 11066,
1238
- "category": "philosophy",
1239
- "target": "A",
1240
- "prediction": "B",
1241
- "raw_prediction": "<think>\n\n</think>\n\nB",
1242
- "correct": false
1243
- },
1244
- {
1245
- "row": 34,
1246
- "question_id": 8748,
1247
- "category": "math",
1248
- "target": "H",
1249
- "prediction": "I",
1250
- "raw_prediction": "<think>\n\n</think>\n\nI. 1",
1251
- "correct": false
1252
- },
1253
- {
1254
- "row": 35,
1255
- "question_id": 3558,
1256
- "category": "chemistry",
1257
- "target": "J",
1258
- "prediction": "H",
1259
- "raw_prediction": "<think>\n\n</think>\n\nThe reaction is ex",
1260
- "correct": false
1261
- },
1262
- {
1263
- "row": 36,
1264
- "question_id": 7634,
1265
- "category": "economics",
1266
- "target": "H",
1267
- "prediction": "H",
1268
- "raw_prediction": "<think>\nOkay, let's try to",
1269
- "correct": true
1270
- },
1271
- {
1272
- "row": 37,
1273
- "question_id": 9620,
1274
- "category": "physics",
1275
- "target": "D",
1276
- "prediction": "H",
1277
- "raw_prediction": "<think>\n\n</think>\n\nThe correct answer is",
1278
- "correct": false
1279
- },
1280
- {
1281
- "row": 38,
1282
- "question_id": 5757,
1283
- "category": "other",
1284
- "target": "F",
1285
- "prediction": "I",
1286
- "raw_prediction": "<think>\n\n</think>\n\nI. 2",
1287
- "correct": false
1288
- },
1289
- {
1290
- "row": 39,
1291
- "question_id": 4804,
1292
- "category": "history",
1293
- "target": "G",
1294
- "prediction": "E",
1295
- "raw_prediction": "<think>\n\n</think>\n\nE",
1296
- "correct": false
1297
- },
1298
- {
1299
- "row": 40,
1300
- "question_id": 497,
1301
- "category": "business",
1302
- "target": "C",
1303
- "prediction": "I",
1304
- "raw_prediction": "<think>\n\n</think>\n\nI. Supply loops",
1305
- "correct": false
1306
- },
1307
- {
1308
- "row": 41,
1309
- "question_id": 3006,
1310
- "category": "biology",
1311
- "target": "E",
1312
- "prediction": "E",
1313
- "raw_prediction": "<think>\n\n</think>\n\nE",
1314
- "correct": true
1315
- },
1316
- {
1317
- "row": 42,
1318
- "question_id": 12229,
1319
- "category": "engineering",
1320
- "target": "C",
1321
- "prediction": "B",
1322
- "raw_prediction": "<think>\n\n</think>\n\nB. 8",
1323
- "correct": false
1324
- },
1325
- {
1326
- "row": 43,
1327
- "question_id": 2357,
1328
- "category": "psychology",
1329
- "target": "I",
1330
- "prediction": "I",
1331
- "raw_prediction": "<think>\n\n</think>\n\nI",
1332
- "correct": true
1333
- },
1334
- {
1335
- "row": 44,
1336
- "question_id": 651,
1337
- "category": "business",
1338
- "target": "I",
1339
- "prediction": "H",
1340
- "raw_prediction": "<think>\n\n</think>\n\nThe correct answer is",
1341
- "correct": false
1342
- },
1343
- {
1344
- "row": 45,
1345
- "question_id": 9149,
1346
- "category": "physics",
1347
- "target": "I",
1348
- "prediction": "H",
1349
- "raw_prediction": "<think>\nOkay, let's try to",
1350
- "correct": false
1351
- },
1352
- {
1353
- "row": 46,
1354
- "question_id": 3139,
1355
- "category": "biology",
1356
- "target": "I",
1357
- "prediction": "I",
1358
- "raw_prediction": "<think>\n\n</think>\n\nI",
1359
- "correct": true
1360
- },
1361
- {
1362
- "row": 47,
1363
- "question_id": 6953,
1364
- "category": "economics",
1365
- "target": "I",
1366
- "prediction": "I",
1367
- "raw_prediction": "<think>\n\n</think>\n\nI",
1368
- "correct": true
1369
- },
1370
- {
1371
- "row": 48,
1372
- "question_id": 11278,
1373
- "category": "philosophy",
1374
- "target": "G",
1375
- "prediction": "G",
1376
- "raw_prediction": "<think>\n\n</think>\n\nG",
1377
- "correct": true
1378
- },
1379
- {
1380
- "row": 49,
1381
- "question_id": 1457,
1382
- "category": "law",
1383
- "target": "E",
1384
- "prediction": "B",
1385
- "raw_prediction": "<think>\n\n</think>\n\nB. admissible",
1386
- "correct": false
1387
- },
1388
- {
1389
- "row": 50,
1390
- "question_id": 3847,
1391
- "category": "chemistry",
1392
- "target": "F",
1393
- "prediction": "B",
1394
- "raw_prediction": "<think>\n\n</think>\n\nB. 2",
1395
- "correct": false
1396
- },
1397
- {
1398
- "row": 51,
1399
- "question_id": 6117,
1400
- "category": "health",
1401
- "target": "F",
1402
- "prediction": "C",
1403
- "raw_prediction": "<think>\n\n</think>\n\nC. Thorac",
1404
- "correct": false
1405
- },
1406
- {
1407
- "row": 52,
1408
- "question_id": 4862,
1409
- "category": "history",
1410
- "target": "J",
1411
- "prediction": "A",
1412
- "raw_prediction": "<think>\n\n</think>\n\nA",
1413
- "correct": false
1414
- },
1415
- {
1416
- "row": 53,
1417
- "question_id": 9107,
1418
- "category": "physics",
1419
- "target": "A",
1420
- "prediction": "A",
1421
- "raw_prediction": "<think>\n\n</think>\n\nA",
1422
- "correct": true
1423
- },
1424
- {
1425
- "row": 54,
1426
- "question_id": 9323,
1427
- "category": "physics",
1428
- "target": "A",
1429
- "prediction": "A",
1430
- "raw_prediction": "<think>\n\n</think>\n\nA",
1431
- "correct": true
1432
- },
1433
- {
1434
- "row": 55,
1435
- "question_id": 7353,
1436
- "category": "economics",
1437
- "target": "G",
1438
- "prediction": "B",
1439
- "raw_prediction": "<think>\n</think>\n\nB. Long-run",
1440
- "correct": false
1441
- },
1442
- {
1443
- "row": 56,
1444
- "question_id": 7824,
1445
- "category": "math",
1446
- "target": "H",
1447
- "prediction": "B",
1448
- "raw_prediction": "<think>\n\n</think>\n\nB. 1",
1449
- "correct": false
1450
- },
1451
- {
1452
- "row": 57,
1453
- "question_id": 713,
1454
- "category": "business",
1455
- "target": "F",
1456
- "prediction": "B",
1457
- "raw_prediction": "<think>\n\n</think>\n\nB",
1458
- "correct": false
1459
- },
1460
- {
1461
- "row": 58,
1462
- "question_id": 4716,
1463
- "category": "history",
1464
- "target": "B",
1465
- "prediction": "B",
1466
- "raw_prediction": "<think>\n\n</think>\n\nB. Containment",
1467
- "correct": true
1468
- },
1469
- {
1470
- "row": 59,
1471
- "question_id": 8275,
1472
- "category": "math",
1473
- "target": "A",
1474
- "prediction": "C",
1475
- "raw_prediction": "<think>\n\n</think>\n\nC",
1476
- "correct": false
1477
- },
1478
- {
1479
- "row": 60,
1480
- "question_id": 2988,
1481
- "category": "biology",
1482
- "target": "C",
1483
- "prediction": "C",
1484
- "raw_prediction": "<think>\n\n</think>\n\nC",
1485
- "correct": true
1486
- },
1487
- {
1488
- "row": 61,
1489
- "question_id": 8229,
1490
- "category": "math",
1491
- "target": "G",
1492
- "prediction": "G",
1493
- "raw_prediction": "<think>\n\n</think>\n\nG",
1494
- "correct": true
1495
- },
1496
- {
1497
- "row": 62,
1498
- "question_id": 4677,
1499
- "category": "history",
1500
- "target": "H",
1501
- "prediction": "A",
1502
- "raw_prediction": "<think>\n\n</think>\n\nA",
1503
- "correct": false
1504
- },
1505
- {
1506
- "row": 63,
1507
- "question_id": 261,
1508
- "category": "business",
1509
- "target": "J",
1510
- "prediction": "B",
1511
- "raw_prediction": "<think>\n\n</think>\n\nB",
1512
- "correct": false
1513
- },
1514
- {
1515
- "row": 64,
1516
- "question_id": 5456,
1517
- "category": "other",
1518
- "target": "H",
1519
- "prediction": "I",
1520
- "raw_prediction": "<think>\n\n</think>\n\nI",
1521
- "correct": false
1522
- },
1523
- {
1524
- "row": 65,
1525
- "question_id": 2222,
1526
- "category": "psychology",
1527
- "target": "B",
1528
- "prediction": "B",
1529
- "raw_prediction": "<think>\n\n</think>\n\nB. continuous",
1530
- "correct": true
1531
- },
1532
- {
1533
- "row": 66,
1534
- "question_id": 6524,
1535
- "category": "health",
1536
- "target": "F",
1537
- "prediction": "A",
1538
- "raw_prediction": "<think>\n\n</think>\n\nA",
1539
- "correct": false
1540
- },
1541
- {
1542
- "row": 67,
1543
- "question_id": 1280,
1544
- "category": "law",
1545
- "target": "D",
1546
- "prediction": "D",
1547
- "raw_prediction": "<think>\n\n</think>\n\nD",
1548
- "correct": true
1549
- },
1550
- {
1551
- "row": 68,
1552
- "question_id": 2943,
1553
- "category": "biology",
1554
- "target": "C",
1555
- "prediction": "A",
1556
- "raw_prediction": "<think>\n\n</think>\n\nA",
1557
- "correct": false
1558
- },
1559
- {
1560
- "row": 69,
1561
- "question_id": 782,
1562
- "category": "business",
1563
- "target": "G",
1564
- "prediction": "B",
1565
- "raw_prediction": "<think>\n\n</think>\n\nB",
1566
- "correct": false
1567
- },
1568
- {
1569
- "row": 70,
1570
- "question_id": 9336,
1571
- "category": "physics",
1572
- "target": "A",
1573
- "prediction": "H",
1574
- "raw_prediction": "<think>\nOkay, let's try to",
1575
- "correct": false
1576
- },
1577
- {
1578
- "row": 71,
1579
- "question_id": 3786,
1580
- "category": "chemistry",
1581
- "target": "B",
1582
- "prediction": "C",
1583
- "raw_prediction": "<think>\n\n</think>\n\nC",
1584
- "correct": false
1585
- },
1586
- {
1587
- "row": 72,
1588
- "question_id": 1934,
1589
- "category": "law",
1590
- "target": "B",
1591
- "prediction": "C",
1592
- "raw_prediction": "<think>\n\n</think>\n\nC",
1593
- "correct": false
1594
- },
1595
- {
1596
- "row": 73,
1597
- "question_id": 1355,
1598
- "category": "law",
1599
- "target": "G",
1600
- "prediction": "C",
1601
- "raw_prediction": "<think>\n\n</think>\n\nC",
1602
- "correct": false
1603
- },
1604
- {
1605
- "row": 74,
1606
- "question_id": 10290,
1607
- "category": "physics",
1608
- "target": "I",
1609
- "prediction": "H",
1610
- "raw_prediction": "<think>\n\n</think>\n\nThe correct answer is",
1611
- "correct": false
1612
- },
1613
- {
1614
- "row": 75,
1615
- "question_id": 10710,
1616
- "category": "computer science",
1617
- "target": "J",
1618
- "prediction": "B",
1619
- "raw_prediction": "<think>\n\n</think>\n\nB",
1620
- "correct": false
1621
- },
1622
- {
1623
- "row": 76,
1624
- "question_id": 9376,
1625
- "category": "physics",
1626
- "target": "H",
1627
- "prediction": "H",
1628
- "raw_prediction": "<think>\nOkay, let's try to",
1629
- "correct": true
1630
- },
1631
- {
1632
- "row": 77,
1633
- "question_id": 9590,
1634
- "category": "physics",
1635
- "target": "C",
1636
- "prediction": "A",
1637
- "raw_prediction": "<think>\n\n</think>\n\nA.",
1638
- "correct": false
1639
- },
1640
- {
1641
- "row": 78,
1642
- "question_id": 10731,
1643
- "category": "computer science",
1644
- "target": "E",
1645
- "prediction": "E",
1646
- "raw_prediction": "<think>\n\n</think>\n\nE",
1647
- "correct": true
1648
- },
1649
- {
1650
- "row": 79,
1651
- "question_id": 5985,
1652
- "category": "other",
1653
- "target": "B",
1654
- "prediction": "B",
1655
- "raw_prediction": "<think>\n\n</think>\n\nB. Deprec",
1656
- "correct": true
1657
- },
1658
- {
1659
- "row": 80,
1660
- "question_id": 10832,
1661
- "category": "philosophy",
1662
- "target": "I",
1663
- "prediction": "J",
1664
- "raw_prediction": "<think>\n\n</think>\n\nJ",
1665
- "correct": false
1666
- },
1667
- {
1668
- "row": 81,
1669
- "question_id": 4516,
1670
- "category": "chemistry",
1671
- "target": "F",
1672
- "prediction": "F",
1673
- "raw_prediction": "<think>\n\n</think>\n\nF",
1674
- "correct": true
1675
- },
1676
- {
1677
- "row": 82,
1678
- "question_id": 2819,
1679
- "category": "biology",
1680
- "target": "C",
1681
- "prediction": "C",
1682
- "raw_prediction": "<think>\n\n</think>\n\nC",
1683
- "correct": true
1684
- },
1685
- {
1686
- "row": 83,
1687
- "question_id": 9104,
1688
- "category": "physics",
1689
- "target": "J",
1690
- "prediction": "C",
1691
- "raw_prediction": "<think>\n\n</think>\n\nC",
1692
- "correct": false
1693
- },
1694
- {
1695
- "row": 84,
1696
- "question_id": 6202,
1697
- "category": "health",
1698
- "target": "F",
1699
- "prediction": "C",
1700
- "raw_prediction": "<think>\n\n</think>\n\nC",
1701
- "correct": false
1702
- },
1703
- {
1704
- "row": 85,
1705
- "question_id": 9391,
1706
- "category": "physics",
1707
- "target": "E",
1708
- "prediction": "A",
1709
- "raw_prediction": "<think>\n\n</think>\n\nA. 2",
1710
- "correct": false
1711
- },
1712
- {
1713
- "row": 86,
1714
- "question_id": 2057,
1715
- "category": "psychology",
1716
- "target": "G",
1717
- "prediction": "E",
1718
- "raw_prediction": "<think>\n\n</think>\n\nE",
1719
- "correct": false
1720
- },
1721
- {
1722
- "row": 87,
1723
- "question_id": 10269,
1724
- "category": "physics",
1725
- "target": "G",
1726
- "prediction": "G",
1727
- "raw_prediction": "<think>\n\n</think>\n\nG",
1728
- "correct": true
1729
- },
1730
- {
1731
- "row": 88,
1732
- "question_id": 8251,
1733
- "category": "math",
1734
- "target": "J",
1735
- "prediction": "A",
1736
- "raw_prediction": "<think>\n\n</think>\n\nA. 5",
1737
- "correct": false
1738
- },
1739
- {
1740
- "row": 89,
1741
- "question_id": 9896,
1742
- "category": "physics",
1743
- "target": "D",
1744
- "prediction": "H",
1745
- "raw_prediction": "<think>\nOkay, let's tackle this",
1746
- "correct": false
1747
- },
1748
- {
1749
- "row": 90,
1750
- "question_id": 2187,
1751
- "category": "psychology",
1752
- "target": "D",
1753
- "prediction": "D",
1754
- "raw_prediction": "<think>\n\n</think>\n\nD",
1755
- "correct": true
1756
- },
1757
- {
1758
- "row": 91,
1759
- "question_id": 7411,
1760
- "category": "economics",
1761
- "target": "D",
1762
- "prediction": "D",
1763
- "raw_prediction": "<think>\n\n</think>\n\nD",
1764
- "correct": true
1765
- },
1766
- {
1767
- "row": 92,
1768
- "question_id": 5446,
1769
- "category": "other",
1770
- "target": "B",
1771
- "prediction": "H",
1772
- "raw_prediction": "<think>\n</think>\n\nTo find the cumulative",
1773
- "correct": false
1774
- },
1775
- {
1776
- "row": 93,
1777
- "question_id": 9167,
1778
- "category": "physics",
1779
- "target": "C",
1780
- "prediction": "C",
1781
- "raw_prediction": "<think>\n\n</think>\n\nC",
1782
- "correct": true
1783
- },
1784
- {
1785
- "row": 94,
1786
- "question_id": 9034,
1787
- "category": "math",
1788
- "target": "F",
1789
- "prediction": "H",
1790
- "raw_prediction": "<think>\nOkay, let's see.",
1791
- "correct": false
1792
- },
1793
- {
1794
- "row": 95,
1795
- "question_id": 4604,
1796
- "category": "chemistry",
1797
- "target": "J",
1798
- "prediction": "A",
1799
- "raw_prediction": "<think>\n\n</think>\n\nA",
1800
- "correct": false
1801
- },
1802
- {
1803
- "row": 96,
1804
- "question_id": 647,
1805
- "category": "business",
1806
- "target": "B",
1807
- "prediction": "B",
1808
- "raw_prediction": "<think>\n\n</think>\n\nB. $4",
1809
- "correct": true
1810
- },
1811
- {
1812
- "row": 97,
1813
- "question_id": 6577,
1814
- "category": "health",
1815
- "target": "A",
1816
- "prediction": "A",
1817
- "raw_prediction": "<think>\n\n</think>\n\nA",
1818
- "correct": true
1819
- },
1820
- {
1821
- "row": 98,
1822
- "question_id": 4563,
1823
- "category": "chemistry",
1824
- "target": "E",
1825
- "prediction": "H",
1826
- "raw_prediction": "<think>\n\n</think>\n\nThe energy of the",
1827
- "correct": false
1828
- },
1829
- {
1830
- "row": 99,
1831
- "question_id": 357,
1832
- "category": "business",
1833
- "target": "H",
1834
- "prediction": "A",
1835
- "raw_prediction": "<think>\n\n</think>\n\nA",
1836
- "correct": false
1837
- },
1838
- {
1839
- "row": 100,
1840
- "question_id": 1354,
1841
- "category": "law",
1842
- "target": "D",
1843
- "prediction": "H",
1844
- "raw_prediction": "<think>\nOkay, let's try to",
1845
- "correct": false
1846
- },
1847
- {
1848
- "row": 101,
1849
- "question_id": 8710,
1850
- "category": "math",
1851
- "target": "C",
1852
- "prediction": "H",
1853
- "raw_prediction": "<think>\nOkay, let's try to",
1854
- "correct": false
1855
- },
1856
- {
1857
- "row": 102,
1858
- "question_id": 6866,
1859
- "category": "economics",
1860
- "target": "G",
1861
- "prediction": "B",
1862
- "raw_prediction": "<think>\n\n</think>\n\nB. Increase of",
1863
- "correct": false
1864
- },
1865
- {
1866
- "row": 103,
1867
- "question_id": 7269,
1868
- "category": "economics",
1869
- "target": "C",
1870
- "prediction": "C",
1871
- "raw_prediction": "<think>\n\n</think>\n\nC",
1872
- "correct": true
1873
- },
1874
- {
1875
- "row": 104,
1876
- "question_id": 7474,
1877
- "category": "economics",
1878
- "target": "A",
1879
- "prediction": "D",
1880
- "raw_prediction": "<think>\n\n</think>\n\nD",
1881
- "correct": false
1882
- },
1883
- {
1884
- "row": 105,
1885
- "question_id": 9241,
1886
- "category": "physics",
1887
- "target": "C",
1888
- "prediction": "A",
1889
- "raw_prediction": "<think>\n\n</think>\n\nA. Concave",
1890
- "correct": false
1891
- },
1892
- {
1893
- "row": 106,
1894
- "question_id": 1687,
1895
- "category": "law",
1896
- "target": "B",
1897
- "prediction": "I",
1898
- "raw_prediction": "<think>\n\n</think>\n\nI",
1899
- "correct": false
1900
- },
1901
- {
1902
- "row": 107,
1903
- "question_id": 4245,
1904
- "category": "chemistry",
1905
- "target": "B",
1906
- "prediction": "B",
1907
- "raw_prediction": "<think>\n\n</think>\n\nB",
1908
- "correct": true
1909
- },
1910
- {
1911
- "row": 108,
1912
- "question_id": 6485,
1913
- "category": "health",
1914
- "target": "E",
1915
- "prediction": "H",
1916
- "raw_prediction": "<think>\n\n</think>\n\nH",
1917
- "correct": false
1918
- },
1919
- {
1920
- "row": 109,
1921
- "question_id": 1988,
1922
- "category": "psychology",
1923
- "target": "E",
1924
- "prediction": "C",
1925
- "raw_prediction": "<think>\n\n</think>\n\nC",
1926
- "correct": false
1927
- },
1928
- {
1929
- "row": 110,
1930
- "question_id": 6687,
1931
- "category": "health",
1932
- "target": "B",
1933
- "prediction": "B",
1934
- "raw_prediction": "<think>\n\n</think>\n\nB. It is",
1935
- "correct": true
1936
- },
1937
- {
1938
- "row": 111,
1939
- "question_id": 2087,
1940
- "category": "psychology",
1941
- "target": "G",
1942
- "prediction": "G",
1943
- "raw_prediction": "<think>\n\n</think>\n\nG",
1944
- "correct": true
1945
- },
1946
- {
1947
- "row": 112,
1948
- "question_id": 5821,
1949
- "category": "other",
1950
- "target": "I",
1951
- "prediction": "A",
1952
- "raw_prediction": "<think>\n\n</think>\n\nA",
1953
- "correct": false
1954
- },
1955
- {
1956
- "row": 113,
1957
- "question_id": 10895,
1958
- "category": "philosophy",
1959
- "target": "H",
1960
- "prediction": "E",
1961
- "raw_prediction": "<think>\n\n</think>\n\nE",
1962
- "correct": false
1963
- },
1964
- {
1965
- "row": 114,
1966
- "question_id": 10303,
1967
- "category": "physics",
1968
- "target": "H",
1969
- "prediction": "H",
1970
- "raw_prediction": "<think>\n\n</think>\n\nThe vibrational contribution",
1971
- "correct": true
1972
- },
1973
- {
1974
- "row": 115,
1975
- "question_id": 880,
1976
- "category": "law",
1977
- "target": "D",
1978
- "prediction": "A",
1979
- "raw_prediction": "<think>\n\n</think>\n\nA",
1980
- "correct": false
1981
- },
1982
- {
1983
- "row": 116,
1984
- "question_id": 6385,
1985
- "category": "health",
1986
- "target": "D",
1987
- "prediction": "H",
1988
- "raw_prediction": "<think>\n\n</think>\n\nH",
1989
- "correct": false
1990
- },
1991
- {
1992
- "row": 117,
1993
- "question_id": 9746,
1994
- "category": "physics",
1995
- "target": "B",
1996
- "prediction": "H",
1997
- "raw_prediction": "<think>\nOkay, let's try to",
1998
- "correct": false
1999
- },
2000
- {
2001
- "row": 118,
2002
- "question_id": 2810,
2003
- "category": "biology",
2004
- "target": "F",
2005
- "prediction": "G",
2006
- "raw_prediction": "<think>\n\n</think>\n\nG",
2007
- "correct": false
2008
- },
2009
- {
2010
- "row": 119,
2011
- "question_id": 8633,
2012
- "category": "math",
2013
- "target": "F",
2014
- "prediction": "A",
2015
- "raw_prediction": "<think>\n\n</think>\n\nA.",
2016
- "correct": false
2017
- },
2018
- {
2019
- "row": 120,
2020
- "question_id": 494,
2021
- "category": "business",
2022
- "target": "H",
2023
- "prediction": "B",
2024
- "raw_prediction": "<think>\n\n</think>\n\nB",
2025
- "correct": false
2026
- },
2027
- {
2028
- "row": 121,
2029
- "question_id": 9143,
2030
- "category": "physics",
2031
- "target": "H",
2032
- "prediction": "B",
2033
- "raw_prediction": "<think>\n\n</think>\n\nB. 1",
2034
- "correct": false
2035
- },
2036
- {
2037
- "row": 122,
2038
- "question_id": 7666,
2039
- "category": "economics",
2040
- "target": "A",
2041
- "prediction": "A",
2042
- "raw_prediction": "<think>\n\n</think>\n\nA",
2043
- "correct": true
2044
- },
2045
- {
2046
- "row": 123,
2047
- "question_id": 229,
2048
- "category": "business",
2049
- "target": "F",
2050
- "prediction": "A",
2051
- "raw_prediction": "<think>\n\n</think>\n\nA",
2052
- "correct": false
2053
- },
2054
- {
2055
- "row": 124,
2056
- "question_id": 10673,
2057
- "category": "computer science",
2058
- "target": "G",
2059
- "prediction": "H",
2060
- "raw_prediction": "<think>\n\n</think>\n\nThe minimum vertex cover",
2061
- "correct": false
2062
- },
2063
- {
2064
- "row": 125,
2065
- "question_id": 3692,
2066
- "category": "chemistry",
2067
- "target": "A",
2068
- "prediction": "C",
2069
- "raw_prediction": "<think>\n\n</think>\n\nC. 3",
2070
- "correct": false
2071
- },
2072
- {
2073
- "row": 126,
2074
- "question_id": 5329,
2075
- "category": "other",
2076
- "target": "F",
2077
- "prediction": "D",
2078
- "raw_prediction": "<think>\n\n</think>\n\nD",
2079
- "correct": false
2080
- },
2081
- {
2082
- "row": 127,
2083
- "question_id": 4066,
2084
- "category": "chemistry",
2085
- "target": "A",
2086
- "prediction": "A",
2087
- "raw_prediction": "<think>\n\n</think>\n\nA. -",
2088
- "correct": true
2089
- },
2090
- {
2091
- "row": 128,
2092
- "question_id": 5520,
2093
- "category": "other",
2094
- "target": "J",
2095
- "prediction": "J",
2096
- "raw_prediction": "<think>\n\n</think>\n\nJ",
2097
- "correct": true
2098
- },
2099
- {
2100
- "row": 129,
2101
- "question_id": 10255,
2102
- "category": "physics",
2103
- "target": "J",
2104
- "prediction": "B",
2105
- "raw_prediction": "<think>\n\n</think>\n\nB",
2106
- "correct": false
2107
- },
2108
- {
2109
- "row": 130,
2110
- "question_id": 9005,
2111
- "category": "math",
2112
- "target": "I",
2113
- "prediction": "G",
2114
- "raw_prediction": "<think>\n\n</think>\n\nG",
2115
- "correct": false
2116
- },
2117
- {
2118
- "row": 131,
2119
- "question_id": 5727,
2120
- "category": "other",
2121
- "target": "A",
2122
- "prediction": "A",
2123
- "raw_prediction": "<think>\n\n</think>\n\nA",
2124
- "correct": true
2125
- },
2126
- {
2127
- "row": 132,
2128
- "question_id": 219,
2129
- "category": "business",
2130
- "target": "I",
2131
- "prediction": "A",
2132
- "raw_prediction": "<think>\n\n</think>\n\nA",
2133
- "correct": false
2134
- },
2135
- {
2136
- "row": 133,
2137
- "question_id": 5471,
2138
- "category": "other",
2139
- "target": "H",
2140
- "prediction": "G",
2141
- "raw_prediction": "<think>\n\n</think>\n\nG. Predatory",
2142
- "correct": false
2143
- },
2144
- {
2145
- "row": 134,
2146
- "question_id": 11236,
2147
- "category": "philosophy",
2148
- "target": "E",
2149
- "prediction": "C",
2150
- "raw_prediction": "<think>\n\n</think>\n\nC",
2151
- "correct": false
2152
- },
2153
- {
2154
- "row": 135,
2155
- "question_id": 8467,
2156
- "category": "math",
2157
- "target": "C",
2158
- "prediction": "A",
2159
- "raw_prediction": "<think>\n\n</think>\n\nA",
2160
- "correct": false
2161
- },
2162
- {
2163
- "row": 136,
2164
- "question_id": 3969,
2165
- "category": "chemistry",
2166
- "target": "B",
2167
- "prediction": "A",
2168
- "raw_prediction": "<think>\nOkay, let's try to",
2169
- "correct": false
2170
- },
2171
- {
2172
- "row": 137,
2173
- "question_id": 7585,
2174
- "category": "economics",
2175
- "target": "A",
2176
- "prediction": "I",
2177
- "raw_prediction": "<think>\n\n</think>\n\nI. The production",
2178
- "correct": false
2179
- },
2180
- {
2181
- "row": 138,
2182
- "question_id": 7897,
2183
- "category": "math",
2184
- "target": "H",
2185
- "prediction": "H",
2186
- "raw_prediction": "<think>\n\n</think>\n\nThe correct answer is",
2187
- "correct": true
2188
- },
2189
- {
2190
- "row": 139,
2191
- "question_id": 3898,
2192
- "category": "chemistry",
2193
- "target": "H",
2194
- "prediction": "H",
2195
- "raw_prediction": "<think>\nOkay, let's try to",
2196
- "correct": true
2197
- },
2198
- {
2199
- "row": 140,
2200
- "question_id": 4233,
2201
- "category": "chemistry",
2202
- "target": "H",
2203
- "prediction": "B",
2204
- "raw_prediction": "<think>\n\n</think>\n\nB",
2205
- "correct": false
2206
- },
2207
- {
2208
- "row": 141,
2209
- "question_id": 438,
2210
- "category": "business",
2211
- "target": "D",
2212
- "prediction": "D",
2213
- "raw_prediction": "<think>\n\n</think>\n\nD. Market equivalence",
2214
- "correct": true
2215
- },
2216
- {
2217
- "row": 142,
2218
- "question_id": 9497,
2219
- "category": "physics",
2220
- "target": "I",
2221
- "prediction": "I",
2222
- "raw_prediction": "<think>\n\n</think>\n\nI.",
2223
- "correct": true
2224
- },
2225
- {
2226
- "row": 143,
2227
- "question_id": 4434,
2228
- "category": "chemistry",
2229
- "target": "J",
2230
- "prediction": "H",
2231
- "raw_prediction": "<think>\n\n</think>\n\nH",
2232
- "correct": false
2233
- },
2234
- {
2235
- "row": 144,
2236
- "question_id": 8239,
2237
- "category": "math",
2238
- "target": "D",
2239
- "prediction": "A",
2240
- "raw_prediction": "<think>\n\n</think>\n\nA",
2241
- "correct": false
2242
- },
2243
- {
2244
- "row": 145,
2245
- "question_id": 3479,
2246
- "category": "biology",
2247
- "target": "E",
2248
- "prediction": "B",
2249
- "raw_prediction": "<think>\n\n</think>\n\nB",
2250
- "correct": false
2251
- },
2252
- {
2253
- "row": 146,
2254
- "question_id": 9331,
2255
- "category": "physics",
2256
- "target": "E",
2257
- "prediction": "J",
2258
- "raw_prediction": "<think>\n\n</think>\n\nJ",
2259
- "correct": false
2260
- },
2261
- {
2262
- "row": 147,
2263
- "question_id": 8485,
2264
- "category": "math",
2265
- "target": "D",
2266
- "prediction": "B",
2267
- "raw_prediction": "<think>\n\n</think>\n\nB",
2268
- "correct": false
2269
- },
2270
- {
2271
- "row": 148,
2272
- "question_id": 9440,
2273
- "category": "physics",
2274
- "target": "I",
2275
- "prediction": "C",
2276
- "raw_prediction": "<think>\n\n</think>\n\nC",
2277
- "correct": false
2278
- },
2279
- {
2280
- "row": 149,
2281
- "question_id": 10811,
2282
- "category": "philosophy",
2283
- "target": "B",
2284
- "prediction": "B",
2285
- "raw_prediction": "<think>\n\n</think>\n\nB",
2286
- "correct": true
2287
- },
2288
- {
2289
- "row": 150,
2290
- "question_id": 7890,
2291
- "category": "math",
2292
- "target": "J",
2293
- "prediction": "G",
2294
- "raw_prediction": "<think>\n\n</think>\n\nG",
2295
- "correct": false
2296
- },
2297
- {
2298
- "row": 151,
2299
- "question_id": 4144,
2300
- "category": "chemistry",
2301
- "target": "A",
2302
- "prediction": "B",
2303
- "raw_prediction": "<think>\n\n</think>\n\nB. 7",
2304
- "correct": false
2305
- },
2306
- {
2307
- "row": 152,
2308
- "question_id": 12016,
2309
- "category": "engineering",
2310
- "target": "F",
2311
- "prediction": "H",
2312
- "raw_prediction": "<think>\n\n</think>\n\nThe correct answer is",
2313
- "correct": false
2314
- },
2315
- {
2316
- "row": 153,
2317
- "question_id": 4583,
2318
- "category": "chemistry",
2319
- "target": "D",
2320
- "prediction": "D",
2321
- "raw_prediction": "<think>\n\n</think>\n\nD",
2322
- "correct": true
2323
- },
2324
- {
2325
- "row": 154,
2326
- "question_id": 11640,
2327
- "category": "engineering",
2328
- "target": "F",
2329
- "prediction": "B",
2330
- "raw_prediction": "<think>\n\n</think>\n\nB. (1",
2331
- "correct": false
2332
- },
2333
- {
2334
- "row": 155,
2335
- "question_id": 4228,
2336
- "category": "chemistry",
2337
- "target": "I",
2338
- "prediction": "A",
2339
- "raw_prediction": "<think>\n\n</think>\n\nA",
2340
- "correct": false
2341
- },
2342
- {
2343
- "row": 156,
2344
- "question_id": 115,
2345
- "category": "business",
2346
- "target": "J",
2347
- "prediction": "C",
2348
- "raw_prediction": "<think>\n\n</think>\n\nC",
2349
- "correct": false
2350
- },
2351
- {
2352
- "row": 157,
2353
- "question_id": 6438,
2354
- "category": "health",
2355
- "target": "D",
2356
- "prediction": "D",
2357
- "raw_prediction": "<think>\n\n</think>\n\nD",
2358
- "correct": true
2359
- },
2360
- {
2361
- "row": 158,
2362
- "question_id": 1556,
2363
- "category": "law",
2364
- "target": "B",
2365
- "prediction": "B",
2366
- "raw_prediction": "<think>\n\n</think>\n\nB. Yes,",
2367
- "correct": true
2368
- },
2369
- {
2370
- "row": 159,
2371
- "question_id": 3122,
2372
- "category": "biology",
2373
- "target": "H",
2374
- "prediction": "E",
2375
- "raw_prediction": "<think>\n\n</think>\n\nE",
2376
- "correct": false
2377
- },
2378
- {
2379
- "row": 160,
2380
- "question_id": 4108,
2381
- "category": "chemistry",
2382
- "target": "C",
2383
- "prediction": "H",
2384
- "raw_prediction": "<think>\n\n</think>\n\nThe van der Wa",
2385
- "correct": false
2386
- },
2387
- {
2388
- "row": 161,
2389
- "question_id": 6229,
2390
- "category": "health",
2391
- "target": "E",
2392
- "prediction": "A",
2393
- "raw_prediction": "<think>\n\n</think>\n\nA",
2394
- "correct": false
2395
- },
2396
- {
2397
- "row": 162,
2398
- "question_id": 10591,
2399
- "category": "computer science",
2400
- "target": "A",
2401
- "prediction": "H",
2402
- "raw_prediction": "<think>\nOkay, let's try to",
2403
- "correct": false
2404
- },
2405
- {
2406
- "row": 163,
2407
- "question_id": 11457,
2408
- "category": "engineering",
2409
- "target": "F",
2410
- "prediction": "B",
2411
- "raw_prediction": "<think>\n\n</think>\n\nB. 4",
2412
- "correct": false
2413
- },
2414
- {
2415
- "row": 164,
2416
- "question_id": 4050,
2417
- "category": "chemistry",
2418
- "target": "C",
2419
- "prediction": "A",
2420
- "raw_prediction": "<think>\n\n</think>\n\nA",
2421
- "correct": false
2422
- },
2423
- {
2424
- "row": 165,
2425
- "question_id": 5886,
2426
- "category": "other",
2427
- "target": "I",
2428
- "prediction": "A",
2429
- "raw_prediction": "<think>\n\n</think>\n\nA",
2430
- "correct": false
2431
- },
2432
- {
2433
- "row": 166,
2434
- "question_id": 5119,
2435
- "category": "other",
2436
- "target": "F",
2437
- "prediction": "B",
2438
- "raw_prediction": "<think>\n\n</think>\n\nB. $1",
2439
- "correct": false
2440
- },
2441
- {
2442
- "row": 167,
2443
- "question_id": 11685,
2444
- "category": "engineering",
2445
- "target": "A",
2446
- "prediction": "B",
2447
- "raw_prediction": "<think>\n\n</think>\n\nB. 3",
2448
- "correct": false
2449
- },
2450
- {
2451
- "row": 168,
2452
- "question_id": 2132,
2453
- "category": "psychology",
2454
- "target": "C",
2455
- "prediction": "C",
2456
- "raw_prediction": "<think>\n\n</think>\n\nC",
2457
- "correct": true
2458
- },
2459
- {
2460
- "row": 169,
2461
- "question_id": 3977,
2462
- "category": "chemistry",
2463
- "target": "D",
2464
- "prediction": "D",
2465
- "raw_prediction": "<think>\n\n</think>\n\nD",
2466
- "correct": true
2467
- },
2468
- {
2469
- "row": 170,
2470
- "question_id": 6761,
2471
- "category": "health",
2472
- "target": "G",
2473
- "prediction": "G",
2474
- "raw_prediction": "<think>\n\n</think>\n\nG. Nyst",
2475
- "correct": true
2476
- },
2477
- {
2478
- "row": 171,
2479
- "question_id": 11787,
2480
- "category": "engineering",
2481
- "target": "C",
2482
- "prediction": "H",
2483
- "raw_prediction": "<think>\nOkay, let's try to",
2484
- "correct": false
2485
- },
2486
- {
2487
- "row": 172,
2488
- "question_id": 5745,
2489
- "category": "other",
2490
- "target": "A",
2491
- "prediction": "A",
2492
- "raw_prediction": "<think>\n\n</think>\n\nA",
2493
- "correct": true
2494
- },
2495
- {
2496
- "row": 173,
2497
- "question_id": 3278,
2498
- "category": "biology",
2499
- "target": "F",
2500
- "prediction": "H",
2501
- "raw_prediction": "<think>\n\n</think>\n\nThe correct answer is",
2502
- "correct": false
2503
- },
2504
- {
2505
- "row": 174,
2506
- "question_id": 677,
2507
- "category": "business",
2508
- "target": "H",
2509
- "prediction": "C",
2510
- "raw_prediction": "<think>\n\n</think>\n\nC. $3",
2511
- "correct": false
2512
- },
2513
- {
2514
- "row": 175,
2515
- "question_id": 2841,
2516
- "category": "biology",
2517
- "target": "H",
2518
- "prediction": "I",
2519
- "raw_prediction": "<think>\n\n</think>\n\nI. Bones provide",
2520
- "correct": false
2521
- },
2522
- {
2523
- "row": 176,
2524
- "question_id": 12129,
2525
- "category": "engineering",
2526
- "target": "I",
2527
- "prediction": "B",
2528
- "raw_prediction": "<think>\n\n</think>\n\nB",
2529
- "correct": false
2530
- },
2531
- {
2532
- "row": 177,
2533
- "question_id": 4471,
2534
- "category": "chemistry",
2535
- "target": "D",
2536
- "prediction": "B",
2537
- "raw_prediction": "<think>\n\n</think>\n\nB. 1",
2538
- "correct": false
2539
- },
2540
- {
2541
- "row": 178,
2542
- "question_id": 4782,
2543
- "category": "history",
2544
- "target": "I",
2545
- "prediction": "I",
2546
- "raw_prediction": "<think>\n\n</think>\n\nI",
2547
- "correct": true
2548
- },
2549
- {
2550
- "row": 179,
2551
- "question_id": 7377,
2552
- "category": "economics",
2553
- "target": "F",
2554
- "prediction": "H",
2555
- "raw_prediction": "<think>\n</think>\n\nThe correct answer is",
2556
- "correct": false
2557
- },
2558
- {
2559
- "row": 180,
2560
- "question_id": 6772,
2561
- "category": "health",
2562
- "target": "G",
2563
- "prediction": "A",
2564
- "raw_prediction": "<think>\n\n</think>\n\nA",
2565
- "correct": false
2566
- },
2567
- {
2568
- "row": 181,
2569
- "question_id": 10589,
2570
- "category": "computer science",
2571
- "target": "A",
2572
- "prediction": "A",
2573
- "raw_prediction": "<think>\n\n</think>\n\nA",
2574
- "correct": true
2575
- },
2576
- {
2577
- "row": 182,
2578
- "question_id": 6079,
2579
- "category": "health",
2580
- "target": "I",
2581
- "prediction": "G",
2582
- "raw_prediction": "<think>\n\n</think>\n\nG",
2583
- "correct": false
2584
- },
2585
- {
2586
- "row": 183,
2587
- "question_id": 791,
2588
- "category": "business",
2589
- "target": "J",
2590
- "prediction": "B",
2591
- "raw_prediction": "<think>\n\n</think>\n\nB",
2592
- "correct": false
2593
- },
2594
- {
2595
- "row": 184,
2596
- "question_id": 2221,
2597
- "category": "psychology",
2598
- "target": "C",
2599
- "prediction": "C",
2600
- "raw_prediction": "<think>\n\n</think>\n\nC",
2601
- "correct": true
2602
- },
2603
- {
2604
- "row": 185,
2605
- "question_id": 10652,
2606
- "category": "computer science",
2607
- "target": "C",
2608
- "prediction": "C",
2609
- "raw_prediction": "<think>\n\n</think>\n\nThe correct answer is",
2610
- "correct": true
2611
- },
2612
- {
2613
- "row": 186,
2614
- "question_id": 4109,
2615
- "category": "chemistry",
2616
- "target": "H",
2617
- "prediction": "B",
2618
- "raw_prediction": "<think>\n\n</think>\n\nB. 6",
2619
- "correct": false
2620
- },
2621
- {
2622
- "row": 187,
2623
- "question_id": 5303,
2624
- "category": "other",
2625
- "target": "H",
2626
- "prediction": "E",
2627
- "raw_prediction": "<think>\n\n</think>\n\nE",
2628
- "correct": false
2629
- },
2630
- {
2631
- "row": 188,
2632
- "question_id": 6150,
2633
- "category": "health",
2634
- "target": "J",
2635
- "prediction": "B",
2636
- "raw_prediction": "<think>\n\n</think>\n\nB",
2637
- "correct": false
2638
- },
2639
- {
2640
- "row": 189,
2641
- "question_id": 3533,
2642
- "category": "chemistry",
2643
- "target": "D",
2644
- "prediction": "I",
2645
- "raw_prediction": "<think>\n\n</think>\n\nI",
2646
- "correct": false
2647
- },
2648
- {
2649
- "row": 190,
2650
- "question_id": 1635,
2651
- "category": "law",
2652
- "target": "C",
2653
- "prediction": "E",
2654
- "raw_prediction": "<think>\n\n</think>\n\nE",
2655
- "correct": false
2656
- },
2657
- {
2658
- "row": 191,
2659
- "question_id": 7643,
2660
- "category": "economics",
2661
- "target": "F",
2662
- "prediction": "F",
2663
- "raw_prediction": "<think>\n\n</think>\n\nF",
2664
- "correct": true
2665
- },
2666
- {
2667
- "row": 192,
2668
- "question_id": 4998,
2669
- "category": "history",
2670
- "target": "B",
2671
- "prediction": "B",
2672
- "raw_prediction": "<think>\n\n</think>\n\nB",
2673
- "correct": true
2674
- },
2675
- {
2676
- "row": 193,
2677
- "question_id": 2296,
2678
- "category": "psychology",
2679
- "target": "A",
2680
- "prediction": "B",
2681
- "raw_prediction": "<think>\n\n</think>\n\nB",
2682
- "correct": false
2683
- },
2684
- {
2685
- "row": 194,
2686
- "question_id": 7929,
2687
- "category": "math",
2688
- "target": "I",
2689
- "prediction": "A",
2690
- "raw_prediction": "<think>\n\n</think>\n\nA",
2691
- "correct": false
2692
- },
2693
- {
2694
- "row": 195,
2695
- "question_id": 7419,
2696
- "category": "economics",
2697
- "target": "J",
2698
- "prediction": "J",
2699
- "raw_prediction": "<think>\n\n</think>\n\nJ",
2700
- "correct": true
2701
- },
2702
- {
2703
- "row": 196,
2704
- "question_id": 2879,
2705
- "category": "biology",
2706
- "target": "F",
2707
- "prediction": "D",
2708
- "raw_prediction": "<think>\n\n</think>\n\nD",
2709
- "correct": false
2710
- },
2711
- {
2712
- "row": 197,
2713
- "question_id": 4891,
2714
- "category": "history",
2715
- "target": "F",
2716
- "prediction": "C",
2717
- "raw_prediction": "<think>\n\n</think>\n\nC",
2718
- "correct": false
2719
- },
2720
- {
2721
- "row": 198,
2722
- "question_id": 7648,
2723
- "category": "economics",
2724
- "target": "I",
2725
- "prediction": "I",
2726
- "raw_prediction": "<think>\n\n</think>\n\nI. This is",
2727
- "correct": true
2728
- },
2729
- {
2730
- "row": 199,
2731
- "question_id": 6881,
2732
- "category": "economics",
2733
- "target": "G",
2734
- "prediction": "C",
2735
- "raw_prediction": "<think>\n\n</think>\n\nC",
2736
- "correct": false
2737
- },
2738
- {
2739
- "row": 200,
2740
- "question_id": 11771,
2741
- "category": "engineering",
2742
- "target": "F",
2743
- "prediction": "B",
2744
- "raw_prediction": "<think>\n\n</think>\n\nB",
2745
- "correct": false
2746
- },
2747
- {
2748
- "row": 201,
2749
- "question_id": 10076,
2750
- "category": "physics",
2751
- "target": "A",
2752
- "prediction": "A",
2753
- "raw_prediction": "<think>\n\n</think>\n\nA. 6",
2754
- "correct": true
2755
- },
2756
- {
2757
- "row": 202,
2758
- "question_id": 4101,
2759
- "category": "chemistry",
2760
- "target": "D",
2761
- "prediction": "I",
2762
- "raw_prediction": "<think>\n\n</think>\n\nI. 1",
2763
- "correct": false
2764
- },
2765
- {
2766
- "row": 203,
2767
- "question_id": 11036,
2768
- "category": "philosophy",
2769
- "target": "B",
2770
- "prediction": "A",
2771
- "raw_prediction": "<think>\n\n</think>\n\nA",
2772
- "correct": false
2773
- },
2774
- {
2775
- "row": 204,
2776
- "question_id": 8640,
2777
- "category": "math",
2778
- "target": "A",
2779
- "prediction": "I",
2780
- "raw_prediction": "<think>\n\n</think>\n\nI. -0",
2781
- "correct": false
2782
- },
2783
- {
2784
- "row": 205,
2785
- "question_id": 9893,
2786
- "category": "physics",
2787
- "target": "G",
2788
- "prediction": "A",
2789
- "raw_prediction": "<think>\n\n</think>\n\nA. Acceleration",
2790
- "correct": false
2791
- },
2792
- {
2793
- "row": 206,
2794
- "question_id": 10650,
2795
- "category": "computer science",
2796
- "target": "D",
2797
- "prediction": "A",
2798
- "raw_prediction": "<think>\n\n</think>\n\nA",
2799
- "correct": false
2800
- },
2801
- {
2802
- "row": 207,
2803
- "question_id": 4401,
2804
- "category": "chemistry",
2805
- "target": "I",
2806
- "prediction": "H",
2807
- "raw_prediction": "<think>\nOkay, let's try to",
2808
- "correct": false
2809
- },
2810
- {
2811
- "row": 208,
2812
- "question_id": 3026,
2813
- "category": "biology",
2814
- "target": "G",
2815
- "prediction": "H",
2816
- "raw_prediction": "<think>\n\n</think>\n\nH. 2",
2817
- "correct": false
2818
- },
2819
- {
2820
- "row": 209,
2821
- "question_id": 5910,
2822
- "category": "other",
2823
- "target": "H",
2824
- "prediction": "B",
2825
- "raw_prediction": "<think>\n\n</think>\n\nB. April",
2826
- "correct": false
2827
- },
2828
- {
2829
- "row": 210,
2830
- "question_id": 1217,
2831
- "category": "law",
2832
- "target": "E",
2833
- "prediction": "A",
2834
- "raw_prediction": "<think>\n\n</think>\n\nA",
2835
- "correct": false
2836
- },
2837
- {
2838
- "row": 211,
2839
- "question_id": 1283,
2840
- "category": "law",
2841
- "target": "I",
2842
- "prediction": "I",
2843
- "raw_prediction": "<think>\n\n</think>\n\nI.",
2844
- "correct": true
2845
- },
2846
- {
2847
- "row": 212,
2848
- "question_id": 10268,
2849
- "category": "physics",
2850
- "target": "A",
2851
- "prediction": "H",
2852
- "raw_prediction": "<think>\nOkay, let's try to",
2853
- "correct": false
2854
- },
2855
- {
2856
- "row": 213,
2857
- "question_id": 5127,
2858
- "category": "other",
2859
- "target": "C",
2860
- "prediction": "C",
2861
- "raw_prediction": "<think>\n\n</think>\n\nC",
2862
- "correct": true
2863
- },
2864
- {
2865
- "row": 214,
2866
- "question_id": 8502,
2867
- "category": "math",
2868
- "target": "B",
2869
- "prediction": "H",
2870
- "raw_prediction": "<think>\nOkay, let's try to",
2871
- "correct": false
2872
- },
2873
- {
2874
- "row": 215,
2875
- "question_id": 2239,
2876
- "category": "psychology",
2877
- "target": "I",
2878
- "prediction": "I",
2879
- "raw_prediction": "<think>\n\n</think>\n\nI",
2880
- "correct": true
2881
- },
2882
- {
2883
- "row": 216,
2884
- "question_id": 2332,
2885
- "category": "psychology",
2886
- "target": "B",
2887
- "prediction": "C",
2888
- "raw_prediction": "<think>\n\n</think>\n\nC. person-centered",
2889
- "correct": false
2890
- },
2891
- {
2892
- "row": 217,
2893
- "question_id": 2168,
2894
- "category": "psychology",
2895
- "target": "G",
2896
- "prediction": "G",
2897
- "raw_prediction": "<think>\n\n</think>\n\nG. prefront",
2898
- "correct": true
2899
- },
2900
- {
2901
- "row": 218,
2902
- "question_id": 7354,
2903
- "category": "economics",
2904
- "target": "B",
2905
- "prediction": "A",
2906
- "raw_prediction": "<think>\n\n</think>\n\nA",
2907
- "correct": false
2908
- },
2909
- {
2910
- "row": 219,
2911
- "question_id": 2482,
2912
- "category": "psychology",
2913
- "target": "I",
2914
- "prediction": "I",
2915
- "raw_prediction": "<think>\n\n</think>\n\nI",
2916
- "correct": true
2917
- },
2918
- {
2919
- "row": 220,
2920
- "question_id": 7909,
2921
- "category": "math",
2922
- "target": "A",
2923
- "prediction": "H",
2924
- "raw_prediction": "<think>\n\n</think>\n\nThe correct answer is",
2925
- "correct": false
2926
- },
2927
- {
2928
- "row": 221,
2929
- "question_id": 7510,
2930
- "category": "economics",
2931
- "target": "I",
2932
- "prediction": "B",
2933
- "raw_prediction": "<think>\n\n</think>\n\nB. (iii",
2934
- "correct": false
2935
- },
2936
- {
2937
- "row": 222,
2938
- "question_id": 11420,
2939
- "category": "engineering",
2940
- "target": "D",
2941
- "prediction": "H",
2942
- "raw_prediction": "<think>\nOkay, let's tackle this",
2943
- "correct": false
2944
- },
2945
- {
2946
- "row": 223,
2947
- "question_id": 4795,
2948
- "category": "history",
2949
- "target": "D",
2950
- "prediction": "A",
2951
- "raw_prediction": "<think>\n\n</think>\n\nA",
2952
- "correct": false
2953
- },
2954
- {
2955
- "row": 224,
2956
- "question_id": 4158,
2957
- "category": "chemistry",
2958
- "target": "D",
2959
- "prediction": "D",
2960
- "raw_prediction": "<think>\n\n</think>\n\nD",
2961
- "correct": true
2962
- },
2963
- {
2964
- "row": 225,
2965
- "question_id": 11899,
2966
- "category": "engineering",
2967
- "target": "B",
2968
- "prediction": "B",
2969
- "raw_prediction": "<think>\n\n</think>\n\nB",
2970
- "correct": true
2971
- },
2972
- {
2973
- "row": 226,
2974
- "question_id": 5269,
2975
- "category": "other",
2976
- "target": "G",
2977
- "prediction": "I",
2978
- "raw_prediction": "<think>\n\n</think>\n\nI",
2979
- "correct": false
2980
- },
2981
- {
2982
- "row": 227,
2983
- "question_id": 7982,
2984
- "category": "math",
2985
- "target": "I",
2986
- "prediction": "I",
2987
- "raw_prediction": "<think>\n\n</think>\n\nI think I need",
2988
- "correct": true
2989
- },
2990
- {
2991
- "row": 228,
2992
- "question_id": 9452,
2993
- "category": "physics",
2994
- "target": "A",
2995
- "prediction": "H",
2996
- "raw_prediction": "<think>\n\n</think>\n\nThe moment of inertia",
2997
- "correct": false
2998
- },
2999
- {
3000
- "row": 229,
3001
- "question_id": 2609,
3002
- "category": "psychology",
3003
- "target": "B",
3004
- "prediction": "B",
3005
- "raw_prediction": "<think>\n\n</think>\n\nB",
3006
- "correct": true
3007
- },
3008
- {
3009
- "row": 230,
3010
- "question_id": 2016,
3011
- "category": "psychology",
3012
- "target": "C",
3013
- "prediction": "B",
3014
- "raw_prediction": "<think>\n\n</think>\n\nB",
3015
- "correct": false
3016
- },
3017
- {
3018
- "row": 231,
3019
- "question_id": 2659,
3020
- "category": "psychology",
3021
- "target": "I",
3022
- "prediction": "C",
3023
- "raw_prediction": "<think>\n\n</think>\n\nC. cognitive therapy",
3024
- "correct": false
3025
- },
3026
- {
3027
- "row": 232,
3028
- "question_id": 1240,
3029
- "category": "law",
3030
- "target": "I",
3031
- "prediction": "B",
3032
- "raw_prediction": "<think>\n\n</think>\n\nB",
3033
- "correct": false
3034
- },
3035
- {
3036
- "row": 233,
3037
- "question_id": 9146,
3038
- "category": "physics",
3039
- "target": "D",
3040
- "prediction": "H",
3041
- "raw_prediction": "<think>\n\n</think>\n\nLet's break down",
3042
- "correct": false
3043
- },
3044
- {
3045
- "row": 234,
3046
- "question_id": 1825,
3047
- "category": "law",
3048
- "target": "B",
3049
- "prediction": "F",
3050
- "raw_prediction": "<think>\n\n</think>\n\nF. admissible",
3051
- "correct": false
3052
- },
3053
- {
3054
- "row": 235,
3055
- "question_id": 10217,
3056
- "category": "physics",
3057
- "target": "I",
3058
- "prediction": "C",
3059
- "raw_prediction": "<think>\n\n</think>\n\nC. 6",
3060
- "correct": false
3061
- },
3062
- {
3063
- "row": 236,
3064
- "question_id": 4369,
3065
- "category": "chemistry",
3066
- "target": "I",
3067
- "prediction": "H",
3068
- "raw_prediction": "<think>\n\n</think>\n\nThe correct answer is",
3069
- "correct": false
3070
- },
3071
- {
3072
- "row": 237,
3073
- "question_id": 2806,
3074
- "category": "biology",
3075
- "target": "C",
3076
- "prediction": "D",
3077
- "raw_prediction": "<think>\n\n</think>\n\nD",
3078
- "correct": false
3079
- },
3080
- {
3081
- "row": 238,
3082
- "question_id": 1678,
3083
- "category": "law",
3084
- "target": "C",
3085
- "prediction": "A",
3086
- "raw_prediction": "<think>\n\n</think>\n\nA",
3087
- "correct": false
3088
- },
3089
- {
3090
- "row": 239,
3091
- "question_id": 1418,
3092
- "category": "law",
3093
- "target": "B",
3094
- "prediction": "A",
3095
- "raw_prediction": "<think>\n\n</think>\n\nA",
3096
- "correct": false
3097
- },
3098
- {
3099
- "row": 240,
3100
- "question_id": 3129,
3101
- "category": "biology",
3102
- "target": "F",
3103
- "prediction": "F",
3104
- "raw_prediction": "<think>\n\n</think>\n\nF",
3105
- "correct": true
3106
- },
3107
- {
3108
- "row": 241,
3109
- "question_id": 4366,
3110
- "category": "chemistry",
3111
- "target": "H",
3112
- "prediction": "H",
3113
- "raw_prediction": "<think>\n\n</think>\n\nH",
3114
- "correct": true
3115
- },
3116
- {
3117
- "row": 242,
3118
- "question_id": 10504,
3119
- "category": "computer science",
3120
- "target": "F",
3121
- "prediction": "F",
3122
- "raw_prediction": "<think>\n\n</think>\n\nF",
3123
- "correct": true
3124
- },
3125
- {
3126
- "row": 243,
3127
- "question_id": 10740,
3128
- "category": "computer science",
3129
- "target": "E",
3130
- "prediction": "H",
3131
- "raw_prediction": "<think>\n\n</think>\n\nThe SNR is",
3132
- "correct": false
3133
- },
3134
- {
3135
- "row": 244,
3136
- "question_id": 5458,
3137
- "category": "other",
3138
- "target": "G",
3139
- "prediction": "C",
3140
- "raw_prediction": "<think>\n\n</think>\n\nC",
3141
- "correct": false
3142
- },
3143
- {
3144
- "row": 245,
3145
- "question_id": 4889,
3146
- "category": "history",
3147
- "target": "A",
3148
- "prediction": "I",
3149
- "raw_prediction": "<think>\n\n</think>\n\nI",
3150
- "correct": false
3151
- },
3152
- {
3153
- "row": 246,
3154
- "question_id": 6375,
3155
- "category": "health",
3156
- "target": "B",
3157
- "prediction": "H",
3158
- "raw_prediction": "<think>\n\n</think>\n\nH",
3159
- "correct": false
3160
- },
3161
- {
3162
- "row": 247,
3163
- "question_id": 905,
3164
- "category": "law",
3165
- "target": "A",
3166
- "prediction": "H",
3167
- "raw_prediction": "<think>\nOkay, let's try to",
3168
- "correct": false
3169
- },
3170
- {
3171
- "row": 248,
3172
- "question_id": 3475,
3173
- "category": "biology",
3174
- "target": "B",
3175
- "prediction": "B",
3176
- "raw_prediction": "<think>\n\n</think>\n\nB",
3177
- "correct": true
3178
- },
3179
- {
3180
- "row": 249,
3181
- "question_id": 7903,
3182
- "category": "math",
3183
- "target": "G",
3184
- "prediction": "A",
3185
- "raw_prediction": "<think>\n\n</think>\n\nA",
3186
- "correct": false
3187
- },
3188
- {
3189
- "row": 250,
3190
- "question_id": 5088,
3191
- "category": "other",
3192
- "target": "E",
3193
- "prediction": "C",
3194
- "raw_prediction": "<think>\n\n</think>\n\nC",
3195
- "correct": false
3196
- },
3197
- {
3198
- "row": 251,
3199
- "question_id": 3586,
3200
- "category": "chemistry",
3201
- "target": "A",
3202
- "prediction": "H",
3203
- "raw_prediction": "<think>\nOkay, let's see.",
3204
- "correct": false
3205
- },
3206
- {
3207
- "row": 252,
3208
- "question_id": 2927,
3209
- "category": "biology",
3210
- "target": "C",
3211
- "prediction": "C",
3212
- "raw_prediction": "<think>\n\n</think>\n\nC",
3213
- "correct": true
3214
- },
3215
- {
3216
- "row": 253,
3217
- "question_id": 11343,
3218
- "category": "engineering",
3219
- "target": "I",
3220
- "prediction": "I",
3221
- "raw_prediction": "<think>\n\n</think>\n\nI. k^",
3222
- "correct": true
3223
- },
3224
- {
3225
- "row": 254,
3226
- "question_id": 10056,
3227
- "category": "physics",
3228
- "target": "G",
3229
- "prediction": "H",
3230
- "raw_prediction": "<think>\n\n</think>\n\nThe correct answer is",
3231
- "correct": false
3232
- },
3233
- {
3234
- "row": 255,
3235
- "question_id": 4488,
3236
- "category": "chemistry",
3237
- "target": "F",
3238
- "prediction": "A",
3239
- "raw_prediction": "<think>\n\n</think>\n\nA",
3240
- "correct": false
3241
- }
3242
- ]
3243
- }
3244
- },
3245
- "elapsed_seconds": 71.423494185
3246
- }