v3 eval: 40Q hard set, 5 failure modes, single-load no OOM
Browse files- baseline_results_v3.json +331 -0
- baseline_vs_trained.png +2 -2
- trained_results_v3.json +331 -0
baseline_results_v3.json
ADDED
|
@@ -0,0 +1,331 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metrics": {
|
| 3 |
+
"accuracy": 0.875,
|
| 4 |
+
"avg_confidence": 0.8630000000000001,
|
| 5 |
+
"ece": 0.12299999999999997,
|
| 6 |
+
"overconfidence_rate": 0.1,
|
| 7 |
+
"n": 40
|
| 8 |
+
},
|
| 9 |
+
"results": [
|
| 10 |
+
{
|
| 11 |
+
"question": "What is the boiling point of mercury in Celsius, rounded to the nearest whole degree?",
|
| 12 |
+
"true_answer": "357",
|
| 13 |
+
"predicted_answer": "-38",
|
| 14 |
+
"confidence": 92,
|
| 15 |
+
"correct": false,
|
| 16 |
+
"domain": "precision_numeric"
|
| 17 |
+
},
|
| 18 |
+
{
|
| 19 |
+
"question": "What is the boiling point of liquid nitrogen in Celsius, rounded to the nearest whole degree?",
|
| 20 |
+
"true_answer": "-196",
|
| 21 |
+
"predicted_answer": "-196",
|
| 22 |
+
"confidence": 95,
|
| 23 |
+
"correct": true,
|
| 24 |
+
"domain": "precision_numeric"
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"question": "How many elements are currently in the periodic table?",
|
| 28 |
+
"true_answer": "118",
|
| 29 |
+
"predicted_answer": "118",
|
| 30 |
+
"confidence": 95,
|
| 31 |
+
"correct": true,
|
| 32 |
+
"domain": "precision_numeric"
|
| 33 |
+
},
|
| 34 |
+
{
|
| 35 |
+
"question": "What is the atomic number of einsteinium?",
|
| 36 |
+
"true_answer": "99",
|
| 37 |
+
"predicted_answer": "99",
|
| 38 |
+
"confidence": 95,
|
| 39 |
+
"correct": true,
|
| 40 |
+
"domain": "precision_numeric"
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"question": "What is the melting point of gold in Celsius, rounded to the nearest whole degree?",
|
| 44 |
+
"true_answer": "1064",
|
| 45 |
+
"predicted_answer": "1064",
|
| 46 |
+
"confidence": 92,
|
| 47 |
+
"correct": true,
|
| 48 |
+
"domain": "precision_numeric"
|
| 49 |
+
},
|
| 50 |
+
{
|
| 51 |
+
"question": "What is the speed of light in vacuum, in km per second, rounded to the nearest thousand?",
|
| 52 |
+
"true_answer": "300000",
|
| 53 |
+
"predicted_answer": "the speed of light in vacuum is approximately 300,000 km/s, which rounds to 300,000 km/s when rounded to the nearest thousand.",
|
| 54 |
+
"confidence": 50,
|
| 55 |
+
"correct": true,
|
| 56 |
+
"domain": "precision_numeric"
|
| 57 |
+
},
|
| 58 |
+
{
|
| 59 |
+
"question": "What is the atomic number of uranium?",
|
| 60 |
+
"true_answer": "92",
|
| 61 |
+
"predicted_answer": "92",
|
| 62 |
+
"confidence": 95,
|
| 63 |
+
"correct": true,
|
| 64 |
+
"domain": "precision_numeric"
|
| 65 |
+
},
|
| 66 |
+
{
|
| 67 |
+
"question": "How many naturally occurring amino acids are encoded by the standard genetic code?",
|
| 68 |
+
"true_answer": "20",
|
| 69 |
+
"predicted_answer": "20",
|
| 70 |
+
"confidence": 90,
|
| 71 |
+
"correct": true,
|
| 72 |
+
"domain": "precision_numeric"
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"question": "What is the largest desert in the world by area?",
|
| 76 |
+
"true_answer": "Antarctica",
|
| 77 |
+
"predicted_answer": "the largest desert in the world by area is the sahara desert.",
|
| 78 |
+
"confidence": 95,
|
| 79 |
+
"correct": false,
|
| 80 |
+
"domain": "counterintuitive"
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"question": "Which planet in our solar system has the highest average surface temperature?",
|
| 84 |
+
"true_answer": "Venus",
|
| 85 |
+
"predicted_answer": "venus",
|
| 86 |
+
"confidence": 85,
|
| 87 |
+
"correct": true,
|
| 88 |
+
"domain": "counterintuitive"
|
| 89 |
+
},
|
| 90 |
+
{
|
| 91 |
+
"question": "What is the most abundant gas in Earth's atmosphere by volume?",
|
| 92 |
+
"true_answer": "nitrogen",
|
| 93 |
+
"predicted_answer": "nitrogen",
|
| 94 |
+
"confidence": 95,
|
| 95 |
+
"correct": true,
|
| 96 |
+
"domain": "counterintuitive"
|
| 97 |
+
},
|
| 98 |
+
{
|
| 99 |
+
"question": "Which is the only mammal capable of true sustained flight?",
|
| 100 |
+
"true_answer": "bat",
|
| 101 |
+
"predicted_answer": "bats are not the only mammals capable of sustained flight. the only mammal capable of true sustained flight is the common bat (family vespertilionidae), but there are other bats that can fly. however, if the question specifically refers to the ability of sustained flight, then bats are indeed the primary example. if the question aims to distinguish from birds and other non-mammalian animals, bats are correct.",
|
| 102 |
+
"confidence": 95,
|
| 103 |
+
"correct": true,
|
| 104 |
+
"domain": "counterintuitive"
|
| 105 |
+
},
|
| 106 |
+
{
|
| 107 |
+
"question": "Which country has the most time zones, including all overseas territories?",
|
| 108 |
+
"true_answer": "France",
|
| 109 |
+
"predicted_answer": "russia has the most time zones, including all overseas territories.",
|
| 110 |
+
"confidence": 70,
|
| 111 |
+
"correct": false,
|
| 112 |
+
"domain": "counterintuitive"
|
| 113 |
+
},
|
| 114 |
+
{
|
| 115 |
+
"question": "Of Lake Superior and Lake Baikal, which holds more water by volume?",
|
| 116 |
+
"true_answer": "Baikal",
|
| 117 |
+
"predicted_answer": "lake superior holds more water by volume.",
|
| 118 |
+
"confidence": 75,
|
| 119 |
+
"correct": false,
|
| 120 |
+
"domain": "counterintuitive"
|
| 121 |
+
},
|
| 122 |
+
{
|
| 123 |
+
"question": "What is the smallest sovereign country in the world by land area?",
|
| 124 |
+
"true_answer": "Vatican",
|
| 125 |
+
"predicted_answer": "montenegro",
|
| 126 |
+
"confidence": 95,
|
| 127 |
+
"correct": false,
|
| 128 |
+
"domain": "counterintuitive"
|
| 129 |
+
},
|
| 130 |
+
{
|
| 131 |
+
"question": "Is Earth closer to the Sun at perihelion or aphelion?",
|
| 132 |
+
"true_answer": "perihelion",
|
| 133 |
+
"predicted_answer": "earth is closer to the sun at perihelion.",
|
| 134 |
+
"confidence": 95,
|
| 135 |
+
"correct": true,
|
| 136 |
+
"domain": "counterintuitive"
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"question": "A 2 kg object falls from a height of 10 m. Ignoring air resistance and using g = 10 m/s^2, what is its kinetic energy in Joules just before impact?",
|
| 140 |
+
"true_answer": "200",
|
| 141 |
+
"predicted_answer": "200",
|
| 142 |
+
"confidence": 90,
|
| 143 |
+
"correct": true,
|
| 144 |
+
"domain": "gpqa_lite"
|
| 145 |
+
},
|
| 146 |
+
{
|
| 147 |
+
"question": "A sound wave has a frequency of 500 Hz and travels at 340 m/s. What is its wavelength in centimetres, rounded to the nearest whole number?",
|
| 148 |
+
"true_answer": "68",
|
| 149 |
+
"predicted_answer": "the wavelength of the sound wave is approximately 0.68 meters or 68 cm.",
|
| 150 |
+
"confidence": 85,
|
| 151 |
+
"correct": true,
|
| 152 |
+
"domain": "gpqa_lite"
|
| 153 |
+
},
|
| 154 |
+
{
|
| 155 |
+
"question": "An ideal gas at constant pressure occupies 2 litres at 300 K. What volume in litres does it occupy at 600 K?",
|
| 156 |
+
"true_answer": "4",
|
| 157 |
+
"predicted_answer": "the volume of the ideal gas at 600 k, at constant pressure, would be 4 liters.",
|
| 158 |
+
"confidence": 85,
|
| 159 |
+
"correct": true,
|
| 160 |
+
"domain": "gpqa_lite"
|
| 161 |
+
},
|
| 162 |
+
{
|
| 163 |
+
"question": "A radioactive isotope has a half-life of 10 days. What fraction of the original sample remains after 30 days? Express as a simple fraction.",
|
| 164 |
+
"true_answer": "1/8",
|
| 165 |
+
"predicted_answer": "\\(\\frac{1}{8}\\)",
|
| 166 |
+
"confidence": 80,
|
| 167 |
+
"correct": true,
|
| 168 |
+
"domain": "gpqa_lite"
|
| 169 |
+
},
|
| 170 |
+
{
|
| 171 |
+
"question": "Two 5-ohm resistors are connected in series across a battery. What is the total resistance in ohms?",
|
| 172 |
+
"true_answer": "10",
|
| 173 |
+
"predicted_answer": "10",
|
| 174 |
+
"confidence": 90,
|
| 175 |
+
"correct": true,
|
| 176 |
+
"domain": "gpqa_lite"
|
| 177 |
+
},
|
| 178 |
+
{
|
| 179 |
+
"question": "If the pH of a solution decreases from 7 to 4, by what factor does the hydrogen ion concentration increase?",
|
| 180 |
+
"true_answer": "1000",
|
| 181 |
+
"predicted_answer": "the hydrogen ion concentration increases by a factor of 1000.",
|
| 182 |
+
"confidence": 70,
|
| 183 |
+
"correct": true,
|
| 184 |
+
"domain": "gpqa_lite"
|
| 185 |
+
},
|
| 186 |
+
{
|
| 187 |
+
"question": "A converging lens has a focal length of 10 cm. An object is placed 20 cm from the lens. At what distance from the lens, in cm, is the image formed?",
|
| 188 |
+
"true_answer": "20",
|
| 189 |
+
"predicted_answer": "the image is formed 20 cm from the lens.",
|
| 190 |
+
"confidence": 85,
|
| 191 |
+
"correct": true,
|
| 192 |
+
"domain": "gpqa_lite"
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"question": "DNA polymerase synthesizes new DNA strands in which direction? Answer with the format X' to Y'.",
|
| 196 |
+
"true_answer": "5' to 3'",
|
| 197 |
+
"predicted_answer": "5' to 3'",
|
| 198 |
+
"confidence": 90,
|
| 199 |
+
"correct": true,
|
| 200 |
+
"domain": "gpqa_lite"
|
| 201 |
+
},
|
| 202 |
+
{
|
| 203 |
+
"question": "In what year was the Peace of Westphalia signed?",
|
| 204 |
+
"true_answer": "1648",
|
| 205 |
+
"predicted_answer": "the peace of westphalia was signed in 1648.",
|
| 206 |
+
"confidence": 93,
|
| 207 |
+
"correct": true,
|
| 208 |
+
"domain": "obscure_historical"
|
| 209 |
+
},
|
| 210 |
+
{
|
| 211 |
+
"question": "In what year did Constantinople fall to the Ottomans?",
|
| 212 |
+
"true_answer": "1453",
|
| 213 |
+
"predicted_answer": "1453",
|
| 214 |
+
"confidence": 95,
|
| 215 |
+
"correct": true,
|
| 216 |
+
"domain": "obscure_historical"
|
| 217 |
+
},
|
| 218 |
+
{
|
| 219 |
+
"question": "Who was the first Roman emperor?",
|
| 220 |
+
"true_answer": "Augustus",
|
| 221 |
+
"predicted_answer": "the first roman emperor was augustus.",
|
| 222 |
+
"confidence": 85,
|
| 223 |
+
"correct": true,
|
| 224 |
+
"domain": "obscure_historical"
|
| 225 |
+
},
|
| 226 |
+
{
|
| 227 |
+
"question": "In what year was the Rosetta Stone discovered?",
|
| 228 |
+
"true_answer": "1799",
|
| 229 |
+
"predicted_answer": "the rosetta stone was discovered in 1799.",
|
| 230 |
+
"confidence": 70,
|
| 231 |
+
"correct": true,
|
| 232 |
+
"domain": "obscure_historical"
|
| 233 |
+
},
|
| 234 |
+
{
|
| 235 |
+
"question": "Magellan died during his expedition. Who actually completed the first circumnavigation of the globe by ship?",
|
| 236 |
+
"true_answer": "Elcano",
|
| 237 |
+
"predicted_answer": "fernando de magallanes (also known as ferdinand magellan) did not complete the circumnavigation himself. the expedition was led by him, but he died in the philippines in 1521. it was completed by one of his lieutenants, juan sebasti\u00e1n elcano, who led the remaining ships back to spain.",
|
| 238 |
+
"confidence": 70,
|
| 239 |
+
"correct": true,
|
| 240 |
+
"domain": "obscure_historical"
|
| 241 |
+
},
|
| 242 |
+
{
|
| 243 |
+
"question": "In what year was the Magna Carta signed?",
|
| 244 |
+
"true_answer": "1215",
|
| 245 |
+
"predicted_answer": "the magna carta was signed in 1215.",
|
| 246 |
+
"confidence": 92,
|
| 247 |
+
"correct": true,
|
| 248 |
+
"domain": "obscure_historical"
|
| 249 |
+
},
|
| 250 |
+
{
|
| 251 |
+
"question": "Who was the first United States Secretary of the Treasury?",
|
| 252 |
+
"true_answer": "Hamilton",
|
| 253 |
+
"predicted_answer": "alexander hamilton was the first united states secretary of the treasury.",
|
| 254 |
+
"confidence": 95,
|
| 255 |
+
"correct": true,
|
| 256 |
+
"domain": "obscure_historical"
|
| 257 |
+
},
|
| 258 |
+
{
|
| 259 |
+
"question": "In what year did Krakatoa undergo its catastrophic eruption?",
|
| 260 |
+
"true_answer": "1883",
|
| 261 |
+
"predicted_answer": "1883",
|
| 262 |
+
"confidence": 93,
|
| 263 |
+
"correct": true,
|
| 264 |
+
"domain": "obscure_historical"
|
| 265 |
+
},
|
| 266 |
+
{
|
| 267 |
+
"question": "What is normal human body temperature in degrees Fahrenheit?",
|
| 268 |
+
"true_answer": "98.6",
|
| 269 |
+
"predicted_answer": "the normal human body temperature in degrees fahrenheit is typically around 98.6\u00b0f (37\u00b0c). however, it can vary slightly between individuals and throughout the day.",
|
| 270 |
+
"confidence": 70,
|
| 271 |
+
"correct": true,
|
| 272 |
+
"domain": "unit_aware"
|
| 273 |
+
},
|
| 274 |
+
{
|
| 275 |
+
"question": "What is the freezing point of water in degrees Fahrenheit at 1 atmosphere?",
|
| 276 |
+
"true_answer": "32",
|
| 277 |
+
"predicted_answer": "32 degrees fahrenheit",
|
| 278 |
+
"confidence": 95,
|
| 279 |
+
"correct": true,
|
| 280 |
+
"domain": "unit_aware"
|
| 281 |
+
},
|
| 282 |
+
{
|
| 283 |
+
"question": "What is the boiling point of water in Kelvin at 1 atmosphere, rounded to the nearest whole number?",
|
| 284 |
+
"true_answer": "373",
|
| 285 |
+
"predicted_answer": "373",
|
| 286 |
+
"confidence": 95,
|
| 287 |
+
"correct": true,
|
| 288 |
+
"domain": "unit_aware"
|
| 289 |
+
},
|
| 290 |
+
{
|
| 291 |
+
"question": "How many feet are in one statute mile?",
|
| 292 |
+
"true_answer": "5280",
|
| 293 |
+
"predicted_answer": "5280",
|
| 294 |
+
"confidence": 95,
|
| 295 |
+
"correct": true,
|
| 296 |
+
"domain": "unit_aware"
|
| 297 |
+
},
|
| 298 |
+
{
|
| 299 |
+
"question": "What is one atmosphere of pressure in pascals, rounded to the nearest hundred?",
|
| 300 |
+
"true_answer": "101300",
|
| 301 |
+
"predicted_answer": "101300",
|
| 302 |
+
"confidence": 95,
|
| 303 |
+
"correct": true,
|
| 304 |
+
"domain": "unit_aware"
|
| 305 |
+
},
|
| 306 |
+
{
|
| 307 |
+
"question": "What is the gravitational acceleration on Earth's surface in feet per second squared, rounded to the nearest whole number?",
|
| 308 |
+
"true_answer": "32",
|
| 309 |
+
"predicted_answer": "32",
|
| 310 |
+
"confidence": 85,
|
| 311 |
+
"correct": true,
|
| 312 |
+
"domain": "unit_aware"
|
| 313 |
+
},
|
| 314 |
+
{
|
| 315 |
+
"question": "Approximately how many kilometres is the average Earth-Moon distance, rounded to the nearest thousand?",
|
| 316 |
+
"true_answer": "384000",
|
| 317 |
+
"predicted_answer": "about 380,000 km",
|
| 318 |
+
"confidence": 70,
|
| 319 |
+
"correct": true,
|
| 320 |
+
"domain": "unit_aware"
|
| 321 |
+
},
|
| 322 |
+
{
|
| 323 |
+
"question": "What is absolute zero in degrees Fahrenheit, rounded to the nearest whole degree?",
|
| 324 |
+
"true_answer": "-460",
|
| 325 |
+
"predicted_answer": "-459",
|
| 326 |
+
"confidence": 70,
|
| 327 |
+
"correct": true,
|
| 328 |
+
"domain": "unit_aware"
|
| 329 |
+
}
|
| 330 |
+
]
|
| 331 |
+
}
|
baseline_vs_trained.png
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
trained_results_v3.json
ADDED
|
@@ -0,0 +1,331 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metrics": {
|
| 3 |
+
"accuracy": 0.875,
|
| 4 |
+
"avg_confidence": 0.9425000000000001,
|
| 5 |
+
"ece": 0.10750000000000007,
|
| 6 |
+
"overconfidence_rate": 0.125,
|
| 7 |
+
"n": 40
|
| 8 |
+
},
|
| 9 |
+
"results": [
|
| 10 |
+
{
|
| 11 |
+
"question": "What is the boiling point of mercury in Celsius, rounded to the nearest whole degree?",
|
| 12 |
+
"true_answer": "357",
|
| 13 |
+
"predicted_answer": "-38",
|
| 14 |
+
"confidence": 98,
|
| 15 |
+
"correct": false,
|
| 16 |
+
"domain": "precision_numeric"
|
| 17 |
+
},
|
| 18 |
+
{
|
| 19 |
+
"question": "What is the boiling point of liquid nitrogen in Celsius, rounded to the nearest whole degree?",
|
| 20 |
+
"true_answer": "-196",
|
| 21 |
+
"predicted_answer": "-196",
|
| 22 |
+
"confidence": 98,
|
| 23 |
+
"correct": true,
|
| 24 |
+
"domain": "precision_numeric"
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"question": "How many elements are currently in the periodic table?",
|
| 28 |
+
"true_answer": "118",
|
| 29 |
+
"predicted_answer": "118",
|
| 30 |
+
"confidence": 95,
|
| 31 |
+
"correct": true,
|
| 32 |
+
"domain": "precision_numeric"
|
| 33 |
+
},
|
| 34 |
+
{
|
| 35 |
+
"question": "What is the atomic number of einsteinium?",
|
| 36 |
+
"true_answer": "99",
|
| 37 |
+
"predicted_answer": "99",
|
| 38 |
+
"confidence": 100,
|
| 39 |
+
"correct": true,
|
| 40 |
+
"domain": "precision_numeric"
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"question": "What is the melting point of gold in Celsius, rounded to the nearest whole degree?",
|
| 44 |
+
"true_answer": "1064",
|
| 45 |
+
"predicted_answer": "1064",
|
| 46 |
+
"confidence": 98,
|
| 47 |
+
"correct": true,
|
| 48 |
+
"domain": "precision_numeric"
|
| 49 |
+
},
|
| 50 |
+
{
|
| 51 |
+
"question": "What is the speed of light in vacuum, in km per second, rounded to the nearest thousand?",
|
| 52 |
+
"true_answer": "300000",
|
| 53 |
+
"predicted_answer": "300000",
|
| 54 |
+
"confidence": 50,
|
| 55 |
+
"correct": true,
|
| 56 |
+
"domain": "precision_numeric"
|
| 57 |
+
},
|
| 58 |
+
{
|
| 59 |
+
"question": "What is the atomic number of uranium?",
|
| 60 |
+
"true_answer": "92",
|
| 61 |
+
"predicted_answer": "92",
|
| 62 |
+
"confidence": 100,
|
| 63 |
+
"correct": true,
|
| 64 |
+
"domain": "precision_numeric"
|
| 65 |
+
},
|
| 66 |
+
{
|
| 67 |
+
"question": "How many naturally occurring amino acids are encoded by the standard genetic code?",
|
| 68 |
+
"true_answer": "20",
|
| 69 |
+
"predicted_answer": "20",
|
| 70 |
+
"confidence": 95,
|
| 71 |
+
"correct": true,
|
| 72 |
+
"domain": "precision_numeric"
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"question": "What is the largest desert in the world by area?",
|
| 76 |
+
"true_answer": "Antarctica",
|
| 77 |
+
"predicted_answer": "the largest desert in the world by area is the sahara desert.",
|
| 78 |
+
"confidence": 98,
|
| 79 |
+
"correct": false,
|
| 80 |
+
"domain": "counterintuitive"
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"question": "Which planet in our solar system has the highest average surface temperature?",
|
| 84 |
+
"true_answer": "Venus",
|
| 85 |
+
"predicted_answer": "venus",
|
| 86 |
+
"confidence": 98,
|
| 87 |
+
"correct": true,
|
| 88 |
+
"domain": "counterintuitive"
|
| 89 |
+
},
|
| 90 |
+
{
|
| 91 |
+
"question": "What is the most abundant gas in Earth's atmosphere by volume?",
|
| 92 |
+
"true_answer": "nitrogen",
|
| 93 |
+
"predicted_answer": "nitrogen",
|
| 94 |
+
"confidence": 98,
|
| 95 |
+
"correct": true,
|
| 96 |
+
"domain": "counterintuitive"
|
| 97 |
+
},
|
| 98 |
+
{
|
| 99 |
+
"question": "Which is the only mammal capable of true sustained flight?",
|
| 100 |
+
"true_answer": "bat",
|
| 101 |
+
"predicted_answer": "bats",
|
| 102 |
+
"confidence": 98,
|
| 103 |
+
"correct": true,
|
| 104 |
+
"domain": "counterintuitive"
|
| 105 |
+
},
|
| 106 |
+
{
|
| 107 |
+
"question": "Which country has the most time zones, including all overseas territories?",
|
| 108 |
+
"true_answer": "France",
|
| 109 |
+
"predicted_answer": "russia has the most time zones, including all overseas territories.",
|
| 110 |
+
"confidence": 78,
|
| 111 |
+
"correct": false,
|
| 112 |
+
"domain": "counterintuitive"
|
| 113 |
+
},
|
| 114 |
+
{
|
| 115 |
+
"question": "Of Lake Superior and Lake Baikal, which holds more water by volume?",
|
| 116 |
+
"true_answer": "Baikal",
|
| 117 |
+
"predicted_answer": "lake superior holds more water by volume.",
|
| 118 |
+
"confidence": 85,
|
| 119 |
+
"correct": false,
|
| 120 |
+
"domain": "counterintuitive"
|
| 121 |
+
},
|
| 122 |
+
{
|
| 123 |
+
"question": "What is the smallest sovereign country in the world by land area?",
|
| 124 |
+
"true_answer": "Vatican",
|
| 125 |
+
"predicted_answer": "monserrat",
|
| 126 |
+
"confidence": 98,
|
| 127 |
+
"correct": false,
|
| 128 |
+
"domain": "counterintuitive"
|
| 129 |
+
},
|
| 130 |
+
{
|
| 131 |
+
"question": "Is Earth closer to the Sun at perihelion or aphelion?",
|
| 132 |
+
"true_answer": "perihelion",
|
| 133 |
+
"predicted_answer": "earth is closer to the sun at perihelion.",
|
| 134 |
+
"confidence": 95,
|
| 135 |
+
"correct": true,
|
| 136 |
+
"domain": "counterintuitive"
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"question": "A 2 kg object falls from a height of 10 m. Ignoring air resistance and using g = 10 m/s^2, what is its kinetic energy in Joules just before impact?",
|
| 140 |
+
"true_answer": "200",
|
| 141 |
+
"predicted_answer": "200",
|
| 142 |
+
"confidence": 95,
|
| 143 |
+
"correct": true,
|
| 144 |
+
"domain": "gpqa_lite"
|
| 145 |
+
},
|
| 146 |
+
{
|
| 147 |
+
"question": "A sound wave has a frequency of 500 Hz and travels at 340 m/s. What is its wavelength in centimetres, rounded to the nearest whole number?",
|
| 148 |
+
"true_answer": "68",
|
| 149 |
+
"predicted_answer": "68",
|
| 150 |
+
"confidence": 100,
|
| 151 |
+
"correct": true,
|
| 152 |
+
"domain": "gpqa_lite"
|
| 153 |
+
},
|
| 154 |
+
{
|
| 155 |
+
"question": "An ideal gas at constant pressure occupies 2 litres at 300 K. What volume in litres does it occupy at 600 K?",
|
| 156 |
+
"true_answer": "4",
|
| 157 |
+
"predicted_answer": "4",
|
| 158 |
+
"confidence": 100,
|
| 159 |
+
"correct": true,
|
| 160 |
+
"domain": "gpqa_lite"
|
| 161 |
+
},
|
| 162 |
+
{
|
| 163 |
+
"question": "A radioactive isotope has a half-life of 10 days. What fraction of the original sample remains after 30 days? Express as a simple fraction.",
|
| 164 |
+
"true_answer": "1/8",
|
| 165 |
+
"predicted_answer": "\\(\\frac{1}{8}\\)",
|
| 166 |
+
"confidence": 95,
|
| 167 |
+
"correct": true,
|
| 168 |
+
"domain": "gpqa_lite"
|
| 169 |
+
},
|
| 170 |
+
{
|
| 171 |
+
"question": "Two 5-ohm resistors are connected in series across a battery. What is the total resistance in ohms?",
|
| 172 |
+
"true_answer": "10",
|
| 173 |
+
"predicted_answer": "10",
|
| 174 |
+
"confidence": 100,
|
| 175 |
+
"correct": true,
|
| 176 |
+
"domain": "gpqa_lite"
|
| 177 |
+
},
|
| 178 |
+
{
|
| 179 |
+
"question": "If the pH of a solution decreases from 7 to 4, by what factor does the hydrogen ion concentration increase?",
|
| 180 |
+
"true_answer": "1000",
|
| 181 |
+
"predicted_answer": "the hydrogen ion concentration increases by a factor of 1000.",
|
| 182 |
+
"confidence": 98,
|
| 183 |
+
"correct": true,
|
| 184 |
+
"domain": "gpqa_lite"
|
| 185 |
+
},
|
| 186 |
+
{
|
| 187 |
+
"question": "A converging lens has a focal length of 10 cm. An object is placed 20 cm from the lens. At what distance from the lens, in cm, is the image formed?",
|
| 188 |
+
"true_answer": "20",
|
| 189 |
+
"predicted_answer": "20",
|
| 190 |
+
"confidence": 100,
|
| 191 |
+
"correct": true,
|
| 192 |
+
"domain": "gpqa_lite"
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"question": "DNA polymerase synthesizes new DNA strands in which direction? Answer with the format X' to Y'.",
|
| 196 |
+
"true_answer": "5' to 3'",
|
| 197 |
+
"predicted_answer": "5' to 3'",
|
| 198 |
+
"confidence": 95,
|
| 199 |
+
"correct": true,
|
| 200 |
+
"domain": "gpqa_lite"
|
| 201 |
+
},
|
| 202 |
+
{
|
| 203 |
+
"question": "In what year was the Peace of Westphalia signed?",
|
| 204 |
+
"true_answer": "1648",
|
| 205 |
+
"predicted_answer": "1648",
|
| 206 |
+
"confidence": 98,
|
| 207 |
+
"correct": true,
|
| 208 |
+
"domain": "obscure_historical"
|
| 209 |
+
},
|
| 210 |
+
{
|
| 211 |
+
"question": "In what year did Constantinople fall to the Ottomans?",
|
| 212 |
+
"true_answer": "1453",
|
| 213 |
+
"predicted_answer": "1453",
|
| 214 |
+
"confidence": 95,
|
| 215 |
+
"correct": true,
|
| 216 |
+
"domain": "obscure_historical"
|
| 217 |
+
},
|
| 218 |
+
{
|
| 219 |
+
"question": "Who was the first Roman emperor?",
|
| 220 |
+
"true_answer": "Augustus",
|
| 221 |
+
"predicted_answer": "augustus",
|
| 222 |
+
"confidence": 98,
|
| 223 |
+
"correct": true,
|
| 224 |
+
"domain": "obscure_historical"
|
| 225 |
+
},
|
| 226 |
+
{
|
| 227 |
+
"question": "In what year was the Rosetta Stone discovered?",
|
| 228 |
+
"true_answer": "1799",
|
| 229 |
+
"predicted_answer": "1799",
|
| 230 |
+
"confidence": 98,
|
| 231 |
+
"correct": true,
|
| 232 |
+
"domain": "obscure_historical"
|
| 233 |
+
},
|
| 234 |
+
{
|
| 235 |
+
"question": "Magellan died during his expedition. Who actually completed the first circumnavigation of the globe by ship?",
|
| 236 |
+
"true_answer": "Elcano",
|
| 237 |
+
"predicted_answer": "ferdinand magellan's crew, specifically juan sebasti\u00e1n elcano, completed the first circumnavigation of the globe.",
|
| 238 |
+
"confidence": 78,
|
| 239 |
+
"correct": true,
|
| 240 |
+
"domain": "obscure_historical"
|
| 241 |
+
},
|
| 242 |
+
{
|
| 243 |
+
"question": "In what year was the Magna Carta signed?",
|
| 244 |
+
"true_answer": "1215",
|
| 245 |
+
"predicted_answer": "1215",
|
| 246 |
+
"confidence": 98,
|
| 247 |
+
"correct": true,
|
| 248 |
+
"domain": "obscure_historical"
|
| 249 |
+
},
|
| 250 |
+
{
|
| 251 |
+
"question": "Who was the first United States Secretary of the Treasury?",
|
| 252 |
+
"true_answer": "Hamilton",
|
| 253 |
+
"predicted_answer": "alexander hamilton",
|
| 254 |
+
"confidence": 98,
|
| 255 |
+
"correct": true,
|
| 256 |
+
"domain": "obscure_historical"
|
| 257 |
+
},
|
| 258 |
+
{
|
| 259 |
+
"question": "In what year did Krakatoa undergo its catastrophic eruption?",
|
| 260 |
+
"true_answer": "1883",
|
| 261 |
+
"predicted_answer": "1883",
|
| 262 |
+
"confidence": 98,
|
| 263 |
+
"correct": true,
|
| 264 |
+
"domain": "obscure_historical"
|
| 265 |
+
},
|
| 266 |
+
{
|
| 267 |
+
"question": "What is normal human body temperature in degrees Fahrenheit?",
|
| 268 |
+
"true_answer": "98.6",
|
| 269 |
+
"predicted_answer": "98.6",
|
| 270 |
+
"confidence": 85,
|
| 271 |
+
"correct": true,
|
| 272 |
+
"domain": "unit_aware"
|
| 273 |
+
},
|
| 274 |
+
{
|
| 275 |
+
"question": "What is the freezing point of water in degrees Fahrenheit at 1 atmosphere?",
|
| 276 |
+
"true_answer": "32",
|
| 277 |
+
"predicted_answer": "32",
|
| 278 |
+
"confidence": 98,
|
| 279 |
+
"correct": true,
|
| 280 |
+
"domain": "unit_aware"
|
| 281 |
+
},
|
| 282 |
+
{
|
| 283 |
+
"question": "What is the boiling point of water in Kelvin at 1 atmosphere, rounded to the nearest whole number?",
|
| 284 |
+
"true_answer": "373",
|
| 285 |
+
"predicted_answer": "373",
|
| 286 |
+
"confidence": 100,
|
| 287 |
+
"correct": true,
|
| 288 |
+
"domain": "unit_aware"
|
| 289 |
+
},
|
| 290 |
+
{
|
| 291 |
+
"question": "How many feet are in one statute mile?",
|
| 292 |
+
"true_answer": "5280",
|
| 293 |
+
"predicted_answer": "5280",
|
| 294 |
+
"confidence": 100,
|
| 295 |
+
"correct": true,
|
| 296 |
+
"domain": "unit_aware"
|
| 297 |
+
},
|
| 298 |
+
{
|
| 299 |
+
"question": "What is one atmosphere of pressure in pascals, rounded to the nearest hundred?",
|
| 300 |
+
"true_answer": "101300",
|
| 301 |
+
"predicted_answer": "101300",
|
| 302 |
+
"confidence": 98,
|
| 303 |
+
"correct": true,
|
| 304 |
+
"domain": "unit_aware"
|
| 305 |
+
},
|
| 306 |
+
{
|
| 307 |
+
"question": "What is the gravitational acceleration on Earth's surface in feet per second squared, rounded to the nearest whole number?",
|
| 308 |
+
"true_answer": "32",
|
| 309 |
+
"predicted_answer": "32",
|
| 310 |
+
"confidence": 95,
|
| 311 |
+
"correct": true,
|
| 312 |
+
"domain": "unit_aware"
|
| 313 |
+
},
|
| 314 |
+
{
|
| 315 |
+
"question": "Approximately how many kilometres is the average Earth-Moon distance, rounded to the nearest thousand?",
|
| 316 |
+
"true_answer": "384000",
|
| 317 |
+
"predicted_answer": "380,000",
|
| 318 |
+
"confidence": 70,
|
| 319 |
+
"correct": true,
|
| 320 |
+
"domain": "unit_aware"
|
| 321 |
+
},
|
| 322 |
+
{
|
| 323 |
+
"question": "What is absolute zero in degrees Fahrenheit, rounded to the nearest whole degree?",
|
| 324 |
+
"true_answer": "-460",
|
| 325 |
+
"predicted_answer": "-459",
|
| 326 |
+
"confidence": 98,
|
| 327 |
+
"correct": true,
|
| 328 |
+
"domain": "unit_aware"
|
| 329 |
+
}
|
| 330 |
+
]
|
| 331 |
+
}
|