adil89aminx commited on
Commit
ea91482
·
verified ·
1 Parent(s): 675b024

Upload data/frontier_final_consolidated.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. data/frontier_final_consolidated.json +269 -0
data/frontier_final_consolidated.json ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_models": 31,
3
+ "r_all": 0.7294142090425955,
4
+ "p_all": 3.2388070812895497e-06,
5
+ "slope": 0.5192906843819869,
6
+ "intercept": 45.661201796897174,
7
+ "gpt54_compute_delta_h": 7.799999999999997,
8
+ "per_lab_h_mean": {
9
+ "Anthropic": -9.053494934207958,
10
+ "Google": 2.213727064476174,
11
+ "OpenAI": 3.7187403056387387,
12
+ "DeepSeek": 4.631327282323513,
13
+ "Moonshot": 2.05727364256623,
14
+ "Alibaba": 3.0649899163190355,
15
+ "MiniMax": -2.1083146843325125,
16
+ "Meta": 2.632663091049139
17
+ },
18
+ "n_labs": 8,
19
+ "models": [
20
+ {
21
+ "name": "Claude 3.5 Sonnet",
22
+ "swe": 49.0,
23
+ "gpqa": 59.4,
24
+ "lab": "Anthropic",
25
+ "date": "2024-06",
26
+ "h": -11.706445331614539
27
+ },
28
+ {
29
+ "name": "Claude 3.7 Sonnet",
30
+ "swe": 62.3,
31
+ "gpqa": 68.0,
32
+ "lab": "Anthropic",
33
+ "date": "2025-02",
34
+ "h": -10.013011433894945
35
+ },
36
+ {
37
+ "name": "Claude Haiku 4.5",
38
+ "swe": 73.3,
39
+ "gpqa": 71.0,
40
+ "lab": "Anthropic",
41
+ "date": "2025",
42
+ "h": -12.725208962096815
43
+ },
44
+ {
45
+ "name": "Claude Sonnet 4.5",
46
+ "swe": 77.2,
47
+ "gpqa": 83.4,
48
+ "lab": "Anthropic",
49
+ "date": "2025",
50
+ "h": -2.350442631186553
51
+ },
52
+ {
53
+ "name": "Claude Opus 4.5",
54
+ "swe": 80.9,
55
+ "gpqa": 87.0,
56
+ "lab": "Anthropic",
57
+ "date": "2025",
58
+ "h": -0.6718181633999194
59
+ },
60
+ {
61
+ "name": "Claude Sonnet 4.6",
62
+ "swe": 79.6,
63
+ "gpqa": 74.1,
64
+ "lab": "Anthropic",
65
+ "date": "2026-02",
66
+ "h": -12.89674027370333
67
+ },
68
+ {
69
+ "name": "Claude Opus 4.6",
70
+ "swe": 80.8,
71
+ "gpqa": 91.3,
72
+ "lab": "Anthropic",
73
+ "date": "2026-02",
74
+ "h": 3.6801109050382905
75
+ },
76
+ {
77
+ "name": "Gemini 2.0 Flash",
78
+ "swe": 60.4,
79
+ "gpqa": 65.2,
80
+ "lab": "Google",
81
+ "date": "2024-12",
82
+ "h": -11.82635913356917
83
+ },
84
+ {
85
+ "name": "Gemini 2.5 Pro",
86
+ "swe": 63.8,
87
+ "gpqa": 84.0,
88
+ "lab": "Google",
89
+ "date": "2025",
90
+ "h": 5.208052539532062
91
+ },
92
+ {
93
+ "name": "Gemini 3 Flash",
94
+ "swe": 78.0,
95
+ "gpqa": 90.4,
96
+ "lab": "Google",
97
+ "date": "2025",
98
+ "h": 4.234124821307859
99
+ },
100
+ {
101
+ "name": "Gemini 3 Pro",
102
+ "swe": 76.2,
103
+ "gpqa": 91.9,
104
+ "lab": "Google",
105
+ "date": "2025-12",
106
+ "h": 6.668848053195433
107
+ },
108
+ {
109
+ "name": "Gemini 3.1 Pro",
110
+ "swe": 80.6,
111
+ "gpqa": 94.3,
112
+ "lab": "Google",
113
+ "date": "2026-02",
114
+ "h": 6.783969041914688
115
+ },
116
+ {
117
+ "name": "GPT-4o",
118
+ "swe": 33.2,
119
+ "gpqa": 53.6,
120
+ "lab": "OpenAI",
121
+ "date": "2024-05",
122
+ "h": -9.301652518379136
123
+ },
124
+ {
125
+ "name": "GPT-5",
126
+ "swe": 74.9,
127
+ "gpqa": 85.7,
128
+ "lab": "OpenAI",
129
+ "date": "2025-08",
130
+ "h": 1.143925942892011
131
+ },
132
+ {
133
+ "name": "GPT-5.1",
134
+ "swe": 76.3,
135
+ "gpqa": 88.1,
136
+ "lab": "OpenAI",
137
+ "date": "2025",
138
+ "h": 2.8169189847572227
139
+ },
140
+ {
141
+ "name": "GPT-5.2 Pro",
142
+ "swe": 80.0,
143
+ "gpqa": 93.2,
144
+ "lab": "OpenAI",
145
+ "date": "2025-12",
146
+ "h": 5.995543452543885
147
+ },
148
+ {
149
+ "name": "GPT-5.4 std",
150
+ "swe": 77.2,
151
+ "gpqa": 84.2,
152
+ "lab": "OpenAI",
153
+ "date": "2026-03",
154
+ "h": -1.5504426311865558
155
+ },
156
+ {
157
+ "name": "GPT-5.4 xhigh",
158
+ "swe": 77.2,
159
+ "gpqa": 92.0,
160
+ "lab": "OpenAI",
161
+ "date": "2026-03",
162
+ "h": 6.249557368813441
163
+ },
164
+ {
165
+ "name": "DeepSeek V3.2",
166
+ "swe": 74.4,
167
+ "gpqa": 79.9,
168
+ "lab": "DeepSeek",
169
+ "date": "2025-12",
170
+ "h": -4.396428714916993
171
+ },
172
+ {
173
+ "name": "Kimi K2.5",
174
+ "swe": 76.8,
175
+ "gpqa": 87.6,
176
+ "lab": "Moonshot",
177
+ "date": "2026-01",
178
+ "h": 2.05727364256623
179
+ },
180
+ {
181
+ "name": "Qwen3.5-397B",
182
+ "swe": 76.4,
183
+ "gpqa": 88.4,
184
+ "lab": "Alibaba",
185
+ "date": "2026-02",
186
+ "h": 3.0649899163190355
187
+ },
188
+ {
189
+ "name": "MiniMax M2.5",
190
+ "swe": 80.2,
191
+ "gpqa": 85.2,
192
+ "lab": "MiniMax",
193
+ "date": "2026-02",
194
+ "h": -2.1083146843325125
195
+ },
196
+ {
197
+ "name": "Llama 4 Maverick",
198
+ "swe": 70.3,
199
+ "gpqa": 84.8,
200
+ "lab": "Meta",
201
+ "date": "2025-04",
202
+ "h": 2.632663091049139
203
+ },
204
+ {
205
+ "name": "o1-preview",
206
+ "swe": 41.3,
207
+ "gpqa": 73.3,
208
+ "lab": "OpenAI",
209
+ "date": "2024-09",
210
+ "h": 6.192092938126763
211
+ },
212
+ {
213
+ "name": "o1",
214
+ "swe": 41.0,
215
+ "gpqa": 78.0,
216
+ "lab": "OpenAI",
217
+ "date": "2024-12",
218
+ "h": 11.047880143441361
219
+ },
220
+ {
221
+ "name": "o3-mini",
222
+ "swe": 49.3,
223
+ "gpqa": 79.7,
224
+ "lab": "OpenAI",
225
+ "date": "2025-01",
226
+ "h": 8.43776746307087
227
+ },
228
+ {
229
+ "name": "o3",
230
+ "swe": 69.1,
231
+ "gpqa": 87.7,
232
+ "lab": "OpenAI",
233
+ "date": "2025-04",
234
+ "h": 6.155811912307527
235
+ },
236
+ {
237
+ "name": "Claude 3.5 Haiku",
238
+ "swe": 40.6,
239
+ "gpqa": 41.0,
240
+ "lab": "Anthropic",
241
+ "date": "2024-10",
242
+ "h": -25.744403582805845
243
+ },
244
+ {
245
+ "name": "DeepSeek-V2.5",
246
+ "swe": 16.8,
247
+ "gpqa": 66.2,
248
+ "lab": "DeepSeek",
249
+ "date": "2024-09",
250
+ "h": 11.81471470548545
251
+ },
252
+ {
253
+ "name": "DeepSeek-V3",
254
+ "swe": 42.0,
255
+ "gpqa": 75.9,
256
+ "lab": "DeepSeek",
257
+ "date": "2024-12",
258
+ "h": 8.428589459059381
259
+ },
260
+ {
261
+ "name": "DeepSeek-R1",
262
+ "swe": 44.6,
263
+ "gpqa": 71.5,
264
+ "lab": "DeepSeek",
265
+ "date": "2025-01",
266
+ "h": 2.678433679666213
267
+ }
268
+ ]
269
+ }