Ill-Ness committed on
Commit
93ca54b
·
verified ·
1 Parent(s): 21c8b35

Delete evals/mmlu_pro_test_20.json

Browse files
Files changed (1) hide show
  1. evals/mmlu_pro_test_20.json +0 -181
evals/mmlu_pro_test_20.json DELETED
@@ -1,181 +0,0 @@
1
- {
2
- "benchmark": "MMLU-Pro",
3
- "dataset": "TIGER-Lab/MMLU-Pro",
4
- "split": "test",
5
- "checkpoint": "/artifacts/20260518-164838/full-safetensors",
6
- "samples_evaluated": 20,
7
- "accuracy": 0.75,
8
- "correct": 15,
9
- "total": 20,
10
- "category_scores": {
11
- "business": {
12
- "accuracy": 0.75,
13
- "correct": 15,
14
- "total": 20
15
- }
16
- },
17
- "samples": [
18
- {
19
- "question_id": 70,
20
- "category": "business",
21
- "gold": "I",
22
- "prediction": "I",
23
- "raw_generation": "I",
24
- "correct": true
25
- },
26
- {
27
- "question_id": 71,
28
- "category": "business",
29
- "gold": "F",
30
- "prediction": "F",
31
- "raw_generation": "F",
32
- "correct": true
33
- },
34
- {
35
- "question_id": 72,
36
- "category": "business",
37
- "gold": "J",
38
- "prediction": "J",
39
- "raw_generation": "J",
40
- "correct": true
41
- },
42
- {
43
- "question_id": 73,
44
- "category": "business",
45
- "gold": "C",
46
- "prediction": "H",
47
- "raw_generation": "H",
48
- "correct": false
49
- },
50
- {
51
- "question_id": 74,
52
- "category": "business",
53
- "gold": "G",
54
- "prediction": "G",
55
- "raw_generation": "G",
56
- "correct": true
57
- },
58
- {
59
- "question_id": 75,
60
- "category": "business",
61
- "gold": "A",
62
- "prediction": "A",
63
- "raw_generation": "A",
64
- "correct": true
65
- },
66
- {
67
- "question_id": 76,
68
- "category": "business",
69
- "gold": "D",
70
- "prediction": "D",
71
- "raw_generation": "D",
72
- "correct": true
73
- },
74
- {
75
- "question_id": 77,
76
- "category": "business",
77
- "gold": "J",
78
- "prediction": "J",
79
- "raw_generation": "J",
80
- "correct": true
81
- },
82
- {
83
- "question_id": 78,
84
- "category": "business",
85
- "gold": "E",
86
- "prediction": "E",
87
- "raw_generation": "E",
88
- "correct": true
89
- },
90
- {
91
- "question_id": 79,
92
- "category": "business",
93
- "gold": "F",
94
- "prediction": "F",
95
- "raw_generation": "F",
96
- "correct": true
97
- },
98
- {
99
- "question_id": 80,
100
- "category": "business",
101
- "gold": "E",
102
- "prediction": "E",
103
- "raw_generation": "E",
104
- "correct": true
105
- },
106
- {
107
- "question_id": 81,
108
- "category": "business",
109
- "gold": "G",
110
- "prediction": "C",
111
- "raw_generation": "C",
112
- "correct": false
113
- },
114
- {
115
- "question_id": 82,
116
- "category": "business",
117
- "gold": "J",
118
- "prediction": "J",
119
- "raw_generation": "J",
120
- "correct": true
121
- },
122
- {
123
- "question_id": 83,
124
- "category": "business",
125
- "gold": "D",
126
- "prediction": "B",
127
- "raw_generation": "B",
128
- "correct": false
129
- },
130
- {
131
- "question_id": 84,
132
- "category": "business",
133
- "gold": "F",
134
- "prediction": "F",
135
- "raw_generation": "F",
136
- "correct": true
137
- },
138
- {
139
- "question_id": 85,
140
- "category": "business",
141
- "gold": "J",
142
- "prediction": "J",
143
- "raw_generation": "J",
144
- "correct": true
145
- },
146
- {
147
- "question_id": 86,
148
- "category": "business",
149
- "gold": "F",
150
- "prediction": "I",
151
- "raw_generation": "I",
152
- "correct": false
153
- },
154
- {
155
- "question_id": 87,
156
- "category": "business",
157
- "gold": "A",
158
- "prediction": "D",
159
- "raw_generation": "D",
160
- "correct": false
161
- },
162
- {
163
- "question_id": 88,
164
- "category": "business",
165
- "gold": "B",
166
- "prediction": "B",
167
- "raw_generation": "B",
168
- "correct": true
169
- },
170
- {
171
- "question_id": 89,
172
- "category": "business",
173
- "gold": "D",
174
- "prediction": "D",
175
- "raw_generation": "D",
176
- "correct": true
177
- }
178
- ],
179
- "attention_impl": "sdpa",
180
- "elapsed_seconds": 51.712759704999996
181
- }