Ill-Ness committed on
Commit
6fde20b
·
verified ·
1 Parent(s): 93ca54b

Delete evals/mmlu_pro_test_1000.json

Browse files
Files changed (1) hide show
  1. evals/mmlu_pro_test_1000.json +0 -267
evals/mmlu_pro_test_1000.json DELETED
@@ -1,267 +0,0 @@
1
- {
2
- "benchmark": "MMLU-Pro",
3
- "dataset": "TIGER-Lab/MMLU-Pro",
4
- "split": "test",
5
- "checkpoint": "/artifacts/20260518-164838/full-safetensors",
6
- "samples_evaluated": 1000,
7
- "batch_size": 8,
8
- "accuracy": 0.385,
9
- "correct": 385,
10
- "total": 1000,
11
- "category_scores": {
12
- "business": {
13
- "accuracy": 0.37515842839036756,
14
- "correct": 296,
15
- "total": 789
16
- },
17
- "law": {
18
- "accuracy": 0.4218009478672986,
19
- "correct": 89,
20
- "total": 211
21
- }
22
- },
23
- "samples": [
24
- {
25
- "question_id": 70,
26
- "category": "business",
27
- "gold": "I",
28
- "prediction": "I",
29
- "raw_generation": "I",
30
- "correct": true
31
- },
32
- {
33
- "question_id": 71,
34
- "category": "business",
35
- "gold": "F",
36
- "prediction": "F",
37
- "raw_generation": "F",
38
- "correct": true
39
- },
40
- {
41
- "question_id": 72,
42
- "category": "business",
43
- "gold": "J",
44
- "prediction": "J",
45
- "raw_generation": "J",
46
- "correct": true
47
- },
48
- {
49
- "question_id": 73,
50
- "category": "business",
51
- "gold": "C",
52
- "prediction": "H",
53
- "raw_generation": "H",
54
- "correct": false
55
- },
56
- {
57
- "question_id": 74,
58
- "category": "business",
59
- "gold": "G",
60
- "prediction": "G",
61
- "raw_generation": "G",
62
- "correct": true
63
- },
64
- {
65
- "question_id": 75,
66
- "category": "business",
67
- "gold": "A",
68
- "prediction": "A",
69
- "raw_generation": "A",
70
- "correct": true
71
- },
72
- {
73
- "question_id": 76,
74
- "category": "business",
75
- "gold": "D",
76
- "prediction": "D",
77
- "raw_generation": "D",
78
- "correct": true
79
- },
80
- {
81
- "question_id": 77,
82
- "category": "business",
83
- "gold": "J",
84
- "prediction": "J",
85
- "raw_generation": "J",
86
- "correct": true
87
- },
88
- {
89
- "question_id": 78,
90
- "category": "business",
91
- "gold": "E",
92
- "prediction": "E",
93
- "raw_generation": "E",
94
- "correct": true
95
- },
96
- {
97
- "question_id": 79,
98
- "category": "business",
99
- "gold": "F",
100
- "prediction": "F",
101
- "raw_generation": "F",
102
- "correct": true
103
- },
104
- {
105
- "question_id": 80,
106
- "category": "business",
107
- "gold": "E",
108
- "prediction": "E",
109
- "raw_generation": "E",
110
- "correct": true
111
- },
112
- {
113
- "question_id": 81,
114
- "category": "business",
115
- "gold": "G",
116
- "prediction": "C",
117
- "raw_generation": "C",
118
- "correct": false
119
- },
120
- {
121
- "question_id": 82,
122
- "category": "business",
123
- "gold": "J",
124
- "prediction": "J",
125
- "raw_generation": "J",
126
- "correct": true
127
- },
128
- {
129
- "question_id": 83,
130
- "category": "business",
131
- "gold": "D",
132
- "prediction": "B",
133
- "raw_generation": "B",
134
- "correct": false
135
- },
136
- {
137
- "question_id": 84,
138
- "category": "business",
139
- "gold": "F",
140
- "prediction": "F",
141
- "raw_generation": "F",
142
- "correct": true
143
- },
144
- {
145
- "question_id": 85,
146
- "category": "business",
147
- "gold": "J",
148
- "prediction": "J",
149
- "raw_generation": "J",
150
- "correct": true
151
- },
152
- {
153
- "question_id": 86,
154
- "category": "business",
155
- "gold": "F",
156
- "prediction": "I",
157
- "raw_generation": "I",
158
- "correct": false
159
- },
160
- {
161
- "question_id": 87,
162
- "category": "business",
163
- "gold": "A",
164
- "prediction": "D",
165
- "raw_generation": "D",
166
- "correct": false
167
- },
168
- {
169
- "question_id": 88,
170
- "category": "business",
171
- "gold": "B",
172
- "prediction": "B",
173
- "raw_generation": "B",
174
- "correct": true
175
- },
176
- {
177
- "question_id": 89,
178
- "category": "business",
179
- "gold": "D",
180
- "prediction": "D",
181
- "raw_generation": "D",
182
- "correct": true
183
- },
184
- {
185
- "question_id": 90,
186
- "category": "business",
187
- "gold": "J",
188
- "prediction": "C",
189
- "raw_generation": "C",
190
- "correct": false
191
- },
192
- {
193
- "question_id": 91,
194
- "category": "business",
195
- "gold": "I",
196
- "prediction": "B",
197
- "raw_generation": "B",
198
- "correct": false
199
- },
200
- {
201
- "question_id": 92,
202
- "category": "business",
203
- "gold": "E",
204
- "prediction": "A",
205
- "raw_generation": "A",
206
- "correct": false
207
- },
208
- {
209
- "question_id": 93,
210
- "category": "business",
211
- "gold": "H",
212
- "prediction": "C",
213
- "raw_generation": "C",
214
- "correct": false
215
- },
216
- {
217
- "question_id": 94,
218
- "category": "business",
219
- "gold": "J",
220
- "prediction": "I",
221
- "raw_generation": "I",
222
- "correct": false
223
- },
224
- {
225
- "question_id": 95,
226
- "category": "business",
227
- "gold": "J",
228
- "prediction": "J",
229
- "raw_generation": "J",
230
- "correct": true
231
- },
232
- {
233
- "question_id": 96,
234
- "category": "business",
235
- "gold": "H",
236
- "prediction": "C",
237
- "raw_generation": "C",
238
- "correct": false
239
- },
240
- {
241
- "question_id": 97,
242
- "category": "business",
243
- "gold": "F",
244
- "prediction": "F",
245
- "raw_generation": "F",
246
- "correct": true
247
- },
248
- {
249
- "question_id": 98,
250
- "category": "business",
251
- "gold": "H",
252
- "prediction": "D",
253
- "raw_generation": "D",
254
- "correct": false
255
- },
256
- {
257
- "question_id": 99,
258
- "category": "business",
259
- "gold": "C",
260
- "prediction": "C",
261
- "raw_generation": "C",
262
- "correct": true
263
- }
264
- ],
265
- "attention_impl": "sdpa",
266
- "elapsed_seconds": 91.607788196
267
- }