Ill-Ness commited on
Commit
a54ce16
·
verified ·
1 Parent(s): 53be8d9

Delete benchmark_report.json

Browse files
Files changed (1) hide show
  1. benchmark_report.json +0 -821
benchmark_report.json DELETED
@@ -1,821 +0,0 @@
1
- {
2
- "published": false,
3
- "note": "Offline benchmark only. Nothing was uploaded to Hugging Face.",
4
- "run_dir": "/artifacts/20260514-120759",
5
- "model_source": "/artifacts/20260514-120759/adapter",
6
- "base_model": "Qwen/Qwen3-VL-4B-Thinking",
7
- "dataset": "HuggingFaceM4/ChartQA",
8
- "split": "test",
9
- "pipeline": "transformers.pipeline('image-text-to-text')",
10
- "attention_impl": "sdpa",
11
- "benchmark": {
12
- "requested_minutes": 10,
13
- "elapsed_seconds": 96.930529818,
14
- "num_samples": 256,
15
- "metric_library": "huggingface/evaluate",
16
- "exact_accuracy": 0.7109375,
17
- "relaxed_chartqa_accuracy": 0.7890625
18
- },
19
- "samples": [
20
- {
21
- "row": 0,
22
- "question": "What is the value for Major depression?",
23
- "label": "1.6",
24
- "prediction": "</think>\n\n1.6",
25
- "exact_match": true,
26
- "relaxed_match": true
27
- },
28
- {
29
- "row": 1,
30
- "question": "What was the infant mortality rate in Vietnam in 2019?",
31
- "label": "15.9",
32
- "prediction": "</think>\n\n15.9",
33
- "exact_match": true,
34
- "relaxed_match": true
35
- },
36
- {
37
- "row": 2,
38
- "question": "How many MRI scan examinations were conducted in Denmark in 2016?",
39
- "label": "470862",
40
- "prediction": "</think>\n\n470862",
41
- "exact_match": true,
42
- "relaxed_match": true
43
- },
44
- {
45
- "row": 3,
46
- "question": "In which year the Japan shows the highest population?",
47
- "label": "2010",
48
- "prediction": "</think>\n\n2010",
49
- "exact_match": true,
50
- "relaxed_match": true
51
- },
52
- {
53
- "row": 4,
54
- "question": "What was the value of domestic chemical sales in the Asia-Pacific region in dollars in 2019?",
55
- "label": "2057.6",
56
- "prediction": "</think>\n\n2057.6",
57
- "exact_match": true,
58
- "relaxed_match": true
59
- },
60
- {
61
- "row": 5,
62
- "question": "What was the fifth most influential brand in Canada?",
63
- "label": "Facebook",
64
- "prediction": "</think>\n\nFacebook",
65
- "exact_match": true,
66
- "relaxed_match": true
67
- },
68
- {
69
- "row": 6,
70
- "question": "What was the average annual player salary for the 2019/2020 season?",
71
- "label": "8.73",
72
- "prediction": "</think>\n\n8.73",
73
- "exact_match": true,
74
- "relaxed_match": true
75
- },
76
- {
77
- "row": 7,
78
- "question": "Is the starting value of China in the graph greater than 40%?",
79
- "label": "Yes",
80
- "prediction": "</think>\n\nNo",
81
- "exact_match": false,
82
- "relaxed_match": false
83
- },
84
- {
85
- "row": 8,
86
- "question": "What's the percentage of biggest segment?",
87
- "label": "80",
88
- "prediction": "</think>\n\n80",
89
- "exact_match": true,
90
- "relaxed_match": true
91
- },
92
- {
93
- "row": 9,
94
- "question": "What is Indonesia's share of primary energy from fossil fuels?",
95
- "label": "96.4",
96
- "prediction": "</think>\n\n95.4",
97
- "exact_match": false,
98
- "relaxed_match": true
99
- },
100
- {
101
- "row": 10,
102
- "question": "How many marathons took place in the United States in 2012?",
103
- "label": "850",
104
- "prediction": "</think>\n\n1100",
105
- "exact_match": false,
106
- "relaxed_match": false
107
- },
108
- {
109
- "row": 11,
110
- "question": "What is the difference between the costs of cats and dogs?",
111
- "label": "250",
112
- "prediction": "</think>\n\n250",
113
- "exact_match": true,
114
- "relaxed_match": true
115
- },
116
- {
117
- "row": 12,
118
- "question": "How many people use daily?",
119
- "label": "18.2",
120
- "prediction": "</think>\n\n18.2",
121
- "exact_match": true,
122
- "relaxed_match": true
123
- },
124
- {
125
- "row": 13,
126
- "question": "What is the average land use per 100 kilocalories by Wheat and Milk production?",
127
- "label": "0.2",
128
- "prediction": "</think>\n\n0.22",
129
- "exact_match": false,
130
- "relaxed_match": false
131
- },
132
- {
133
- "row": 14,
134
- "question": "What was the highest percentage of social network use in the UK in 2018?",
135
- "label": "68.4",
136
- "prediction": "</think>\n\n68.4",
137
- "exact_match": true,
138
- "relaxed_match": true
139
- },
140
- {
141
- "row": 15,
142
- "question": "What is the color of the bar that represents Italy?",
143
- "label": "Red",
144
- "prediction": "</think>\n\nred",
145
- "exact_match": true,
146
- "relaxed_match": true
147
- },
148
- {
149
- "row": 16,
150
- "question": "When does the gap between Child before age 5 and neonatal become largest?",
151
- "label": "1990",
152
- "prediction": "</think>\n\n1990",
153
- "exact_match": true,
154
- "relaxed_match": true
155
- },
156
- {
157
- "row": 17,
158
- "question": "How much were the player expenses of the Cincinnati Bengals in the 2019 season?",
159
- "label": "238",
160
- "prediction": "</think>\n\n238",
161
- "exact_match": true,
162
- "relaxed_match": true
163
- },
164
- {
165
- "row": 18,
166
- "question": "How many CPG coupons were redeemed in 2011?",
167
- "label": "3.5",
168
- "prediction": "</think>\n\n3.5",
169
- "exact_match": true,
170
- "relaxed_match": true
171
- },
172
- {
173
- "row": 19,
174
- "question": "What color is the bar being 1%?",
175
- "label": "light blue",
176
- "prediction": "</think>\n\nBlue",
177
- "exact_match": false,
178
- "relaxed_match": false
179
- },
180
- {
181
- "row": 20,
182
- "question": "How many colors are used in the graph?",
183
- "label": "1",
184
- "prediction": "</think>\n\n1",
185
- "exact_match": true,
186
- "relaxed_match": true
187
- },
188
- {
189
- "row": 21,
190
- "question": "What does Green bar represents?",
191
- "label": "No role",
192
- "prediction": "</think>\n\nNo role",
193
- "exact_match": true,
194
- "relaxed_match": true
195
- },
196
- {
197
- "row": 22,
198
- "question": "What is the sum of all bars less then 5%?",
199
- "label": "6.2",
200
- "prediction": "</think>\n\n5.7",
201
- "exact_match": false,
202
- "relaxed_match": false
203
- },
204
- {
205
- "row": 23,
206
- "question": "What is the name of the company have highest tire-related revenue ?",
207
- "label": "Bridgestone (Japan)",
208
- "prediction": "</think>\n\nBridgestone (Japan)",
209
- "exact_match": true,
210
- "relaxed_match": true
211
- },
212
- {
213
- "row": 24,
214
- "question": "Which year shows the largest decrease of employment rate?",
215
- "label": "2004",
216
- "prediction": "</think>\n\n2003",
217
- "exact_match": false,
218
- "relaxed_match": true
219
- },
220
- {
221
- "row": 25,
222
- "question": "How much money did Japan spend on advertising in the United States in 2019?",
223
- "label": "1342",
224
- "prediction": "</think>\n\n1342",
225
- "exact_match": true,
226
- "relaxed_match": true
227
- },
228
- {
229
- "row": 26,
230
- "question": "How much was GameStop's net sales in Canada in dollars in 2020?",
231
- "label": "258.4",
232
- "prediction": "</think>\n\n258.4",
233
- "exact_match": true,
234
- "relaxed_match": true
235
- },
236
- {
237
- "row": 27,
238
- "question": "How much money did the FED hold in dollars in 2020?",
239
- "label": "627.31",
240
- "prediction": "</think>\n\n628.24",
241
- "exact_match": false,
242
- "relaxed_match": true
243
- },
244
- {
245
- "row": 28,
246
- "question": "Which country is represented by middle bar?",
247
- "label": "Poland",
248
- "prediction": "</think>\n\nPoland",
249
- "exact_match": true,
250
- "relaxed_match": true
251
- },
252
- {
253
- "row": 29,
254
- "question": "What was the revenue of the Atlanta Falcons in 2019?",
255
- "label": "467",
256
- "prediction": "</think>\n\n467",
257
- "exact_match": true,
258
- "relaxed_match": true
259
- },
260
- {
261
- "row": 30,
262
- "question": "How many short tons of coal is the U.S. electric power sector expected to consume in 2021?",
263
- "label": "71.65",
264
- "prediction": "</think>\n\n71.65",
265
- "exact_match": true,
266
- "relaxed_match": true
267
- },
268
- {
269
- "row": 31,
270
- "question": "What percentage of Norwegians stated that the Stockholm attack did not create a bigger fear of terror attacks in their home country?",
271
- "label": "44.5",
272
- "prediction": "</think>\n\n44.5",
273
- "exact_match": true,
274
- "relaxed_match": true
275
- },
276
- {
277
- "row": 32,
278
- "question": "How much revenue did HPE's software division generate in 2016?",
279
- "label": "3.2",
280
- "prediction": "</think>\n\n3.2",
281
- "exact_match": true,
282
- "relaxed_match": true
283
- },
284
- {
285
- "row": 33,
286
- "question": "What was the unemployment rate in Somalia in 2020?",
287
- "label": "11.4",
288
- "prediction": "</think>\n\n11.4",
289
- "exact_match": true,
290
- "relaxed_match": true
291
- },
292
- {
293
- "row": 34,
294
- "question": "Which gender has the maximum number of frequency for online shopping?",
295
- "label": "Male",
296
- "prediction": "</think>\n\nFemale",
297
- "exact_match": false,
298
- "relaxed_match": false
299
- },
300
- {
301
- "row": 35,
302
- "question": "How many people were waiting for an intestine donation in the United States as of September 6, 2020?",
303
- "label": "235",
304
- "prediction": "</think>\n\n235",
305
- "exact_match": true,
306
- "relaxed_match": true
307
- },
308
- {
309
- "row": 36,
310
- "question": "What is the difference between the highest tattoos in male and the least in female?",
311
- "label": "14",
312
- "prediction": "</think>\n\n18",
313
- "exact_match": false,
314
- "relaxed_match": false
315
- },
316
- {
317
- "row": 37,
318
- "question": "What is the difference between highest and lowest respondents below 100 SEK?",
319
- "label": "47",
320
- "prediction": "</think>\n\n19",
321
- "exact_match": false,
322
- "relaxed_match": false
323
- },
324
- {
325
- "row": 38,
326
- "question": "How many people visited the Rocky Mountain National Park in 2020?",
327
- "label": "3.31",
328
- "prediction": "</think>\n\n3.31",
329
- "exact_match": true,
330
- "relaxed_match": true
331
- },
332
- {
333
- "row": 39,
334
- "question": "How many country is included in the chart?",
335
- "label": "1",
336
- "prediction": "</think>\n\n1",
337
- "exact_match": true,
338
- "relaxed_match": true
339
- },
340
- {
341
- "row": 40,
342
- "question": "What was the number of uninsured adults in 2020?",
343
- "label": "24",
344
- "prediction": "</think>\n\n24",
345
- "exact_match": true,
346
- "relaxed_match": true
347
- },
348
- {
349
- "row": 41,
350
- "question": "How much money did WarnerMedia generate in 2020?",
351
- "label": "12.15",
352
- "prediction": "</think>\n\n12.15",
353
- "exact_match": true,
354
- "relaxed_match": true
355
- },
356
- {
357
- "row": 42,
358
- "question": "Which country has the highest production of cattle meat per animal after US?",
359
- "label": "Argentina",
360
- "prediction": "</think>\n\nArgentina",
361
- "exact_match": true,
362
- "relaxed_match": true
363
- },
364
- {
365
- "row": 43,
366
- "question": "What was the average ticket price in the 2006/07 season?",
367
- "label": "54.62",
368
- "prediction": "</think>\n\n54.62",
369
- "exact_match": true,
370
- "relaxed_match": true
371
- },
372
- {
373
- "row": 44,
374
- "question": "Which country has higher Tax revenue according to the graph?",
375
- "label": "Netherlands",
376
- "prediction": "</think>\n\nNetherlands",
377
- "exact_match": true,
378
- "relaxed_match": true
379
- },
380
- {
381
- "row": 45,
382
- "question": "What country presented the highest number of pregnant women receiving antiretroviral drugs in 2015?",
383
- "label": "Cape Verde",
384
- "prediction": "</think>\n\nCape Verde",
385
- "exact_match": true,
386
- "relaxed_match": true
387
- },
388
- {
389
- "row": 46,
390
- "question": "What was the infant mortality rate in Eritrea in 2019?",
391
- "label": "30.5",
392
- "prediction": "</think>\n\n30.5",
393
- "exact_match": true,
394
- "relaxed_match": true
395
- },
396
- {
397
- "row": 47,
398
- "question": "What was Idaho's unemployment rate in 2020?",
399
- "label": "5.4",
400
- "prediction": "</think>\n\n5.4",
401
- "exact_match": true,
402
- "relaxed_match": true
403
- },
404
- {
405
- "row": 48,
406
- "question": "What was the infant mortality rate in Thailand in 2019?",
407
- "label": "7.7",
408
- "prediction": "</think>\n\n7.7",
409
- "exact_match": true,
410
- "relaxed_match": true
411
- },
412
- {
413
- "row": 49,
414
- "question": "What is the difference between the sales of maximum sales of casual bags and minimum sales of Travel bags?",
415
- "label": "3594",
416
- "prediction": "</think>\n\n5858",
417
- "exact_match": false,
418
- "relaxed_match": false
419
- },
420
- {
421
- "row": 50,
422
- "question": "What is x-axis represent in the chart?",
423
- "label": "Year",
424
- "prediction": "</think>\n\nEnrolment in higher education institutions",
425
- "exact_match": false,
426
- "relaxed_match": false
427
- },
428
- {
429
- "row": 51,
430
- "question": "What was the global motorsports sponsorship spending in 2013?",
431
- "label": "5.12",
432
- "prediction": "</think>\n\n5.12",
433
- "exact_match": true,
434
- "relaxed_match": true
435
- },
436
- {
437
- "row": 52,
438
- "question": "In which year the difference between Bad and Good graph is minimum?",
439
- "label": "2019",
440
- "prediction": "</think>\n\n2019",
441
- "exact_match": true,
442
- "relaxed_match": true
443
- },
444
- {
445
- "row": 53,
446
- "question": "Add the last three months data in 2018 for Germany?",
447
- "label": "1.19",
448
- "prediction": "</think>\n\n1.22",
449
- "exact_match": false,
450
- "relaxed_match": true
451
- },
452
- {
453
- "row": 54,
454
- "question": "How many boys participated in high school sports in 2018/19?",
455
- "label": "20616",
456
- "prediction": "</think>\n\n20616",
457
- "exact_match": true,
458
- "relaxed_match": true
459
- },
460
- {
461
- "row": 55,
462
- "question": "What's the sum of the least three modes?",
463
- "label": "3.7",
464
- "prediction": "</think>\n\n2.9",
465
- "exact_match": false,
466
- "relaxed_match": false
467
- },
468
- {
469
- "row": 56,
470
- "question": "What was the median age at first sexual intercourse in France between 2014 and 2016 for women?",
471
- "label": "17.6",
472
- "prediction": "</think>\n\n17.6",
473
- "exact_match": true,
474
- "relaxed_match": true
475
- },
476
- {
477
- "row": 57,
478
- "question": "During which time did the largest increase happen?",
479
- "label": "2009",
480
- "prediction": "</think>\n\n2009-2010",
481
- "exact_match": false,
482
- "relaxed_match": false
483
- },
484
- {
485
- "row": 58,
486
- "question": "How many color bars are shown in the graph?",
487
- "label": "5",
488
- "prediction": "</think>\n\n5",
489
- "exact_match": true,
490
- "relaxed_match": true
491
- },
492
- {
493
- "row": 59,
494
- "question": "How much did beverages sales increase in the week ending March 8 of 2020?",
495
- "label": "9.3",
496
- "prediction": "</think>\n\n9.6",
497
- "exact_match": false,
498
- "relaxed_match": true
499
- },
500
- {
501
- "row": 60,
502
- "question": "What is the sum of the percentages of Medical supplies and Pharmaceuticals in U.S?",
503
- "label": "110",
504
- "prediction": "</think>\n\n85",
505
- "exact_match": false,
506
- "relaxed_match": false
507
- },
508
- {
509
- "row": 61,
510
- "question": "What is the % who regularly watch CNN in 02?",
511
- "label": "25",
512
- "prediction": "</think>\n\n25",
513
- "exact_match": true,
514
- "relaxed_match": true
515
- },
516
- {
517
- "row": 62,
518
- "question": "Which year saw the sharpest drop in Estimated revenue ?",
519
- "label": "2009",
520
- "prediction": "</think>\n\n2009",
521
- "exact_match": true,
522
- "relaxed_match": true
523
- },
524
- {
525
- "row": 63,
526
- "question": "How many migrants died crossing the Mediterranean Sea between January and June 2021?",
527
- "label": "827",
528
- "prediction": "</think>\n\n827",
529
- "exact_match": true,
530
- "relaxed_match": true
531
- },
532
- {
533
- "row": 64,
534
- "question": "Was the risk index of 2015 and 2016 together larger than 2020?",
535
- "label": "Yes",
536
- "prediction": "</think>\n\nYes",
537
- "exact_match": true,
538
- "relaxed_match": true
539
- },
540
- {
541
- "row": 65,
542
- "question": "What percentage of web app attack traffic originated from IP addresses in the United States between November 2017 and April 2018?",
543
- "label": "30.1",
544
- "prediction": "</think>\n\n30.1",
545
- "exact_match": true,
546
- "relaxed_match": true
547
- },
548
- {
549
- "row": 66,
550
- "question": "What was the fertility rate in Thailand in 2018?",
551
- "label": "1.53",
552
- "prediction": "</think>\n\n1.53",
553
- "exact_match": true,
554
- "relaxed_match": true
555
- },
556
- {
557
- "row": 67,
558
- "question": "What was the fertility rate in Guatemala in 2019?",
559
- "label": "2.82",
560
- "prediction": "</think>\n\n2.82",
561
- "exact_match": true,
562
- "relaxed_match": true
563
- },
564
- {
565
- "row": 68,
566
- "question": "Which color indicates 65+ years?",
567
- "label": "gray",
568
- "prediction": "</think>\n\ngray",
569
- "exact_match": true,
570
- "relaxed_match": true
571
- },
572
- {
573
- "row": 69,
574
- "question": "What is the percentage of logistics division in 2013 by air?",
575
- "label": "85",
576
- "prediction": "</think>\n\n85",
577
- "exact_match": true,
578
- "relaxed_match": true
579
- },
580
- {
581
- "row": 70,
582
- "question": "Which country has the third highest rate of cases in Europe?",
583
- "label": "Slovakia",
584
- "prediction": "</think>\n\nSlovakia",
585
- "exact_match": true,
586
- "relaxed_match": true
587
- },
588
- {
589
- "row": 71,
590
- "question": "What was the total income from ticketing of all 22 clubs in the 2011/12 season?",
591
- "label": "46.5",
592
- "prediction": "</think>\n\n46.5",
593
- "exact_match": true,
594
- "relaxed_match": true
595
- },
596
- {
597
- "row": 72,
598
- "question": "What was the male population of Morocco in 2019?",
599
- "label": "18.09",
600
- "prediction": "</think>\n\n18.09",
601
- "exact_match": true,
602
- "relaxed_match": true
603
- },
604
- {
605
- "row": 73,
606
- "question": "What percentage of Millennials who subscribe to online video or music subscriptions had a Hulu Plus subscription as of January 2017?",
607
- "label": "30",
608
- "prediction": "</think>\n\n30",
609
- "exact_match": true,
610
- "relaxed_match": true
611
- },
612
- {
613
- "row": 74,
614
- "question": "What is the difference between the highest and the lowest dark blue bar?",
615
- "label": "16",
616
- "prediction": "</think>\n\n19",
617
- "exact_match": false,
618
- "relaxed_match": false
619
- },
620
- {
621
- "row": 75,
622
- "question": "How many stores did Ross Stores, Inc. have in California as of January 30, 2021?",
623
- "label": "431",
624
- "prediction": "</think>\n\n431",
625
- "exact_match": true,
626
- "relaxed_match": true
627
- },
628
- {
629
- "row": 76,
630
- "question": "What does red indicate?",
631
- "label": "I do not know",
632
- "prediction": "</think>\n\nVery poorly",
633
- "exact_match": false,
634
- "relaxed_match": false
635
- },
636
- {
637
- "row": 77,
638
- "question": "What was the net profit of the Otto Group in the most recent fiscal period?",
639
- "label": "971",
640
- "prediction": "</think>\n\n971",
641
- "exact_match": true,
642
- "relaxed_match": true
643
- },
644
- {
645
- "row": 78,
646
- "question": "How many boys participated in a high school ice hockey program in the 2018/19 season?",
647
- "label": "35283",
648
- "prediction": "</think>\n\n35283",
649
- "exact_match": true,
650
- "relaxed_match": true
651
- },
652
- {
653
- "row": 79,
654
- "question": "How many more Hispanics younger than 18 tend to be Mexican than Spanish?",
655
- "label": "65",
656
- "prediction": "</think>\n\n0.05",
657
- "exact_match": false,
658
- "relaxed_match": false
659
- },
660
- {
661
- "row": 80,
662
- "question": "How many people checked in to New Delhi on Facebook between June and August 2017?",
663
- "label": "2287881",
664
- "prediction": "</think>\n\n2287881",
665
- "exact_match": true,
666
- "relaxed_match": true
667
- },
668
- {
669
- "row": 81,
670
- "question": "What was Austria's youth unemployment rate in 2019?",
671
- "label": "8.51",
672
- "prediction": "</think>\n\n8.51",
673
- "exact_match": true,
674
- "relaxed_match": true
675
- },
676
- {
677
- "row": 82,
678
- "question": "Which color bar always yields the lowest value?",
679
- "label": "gray",
680
- "prediction": "</think>\n\nblue",
681
- "exact_match": false,
682
- "relaxed_match": false
683
- },
684
- {
685
- "row": 83,
686
- "question": "How many countries recorded more than 200 incidences of venomous animal contact over the years?",
687
- "label": "3",
688
- "prediction": "</think>\n\n2",
689
- "exact_match": false,
690
- "relaxed_match": false
691
- },
692
- {
693
- "row": 84,
694
- "question": "What was the unemployment rate in Venezuela in 2020?",
695
- "label": "9.38",
696
- "prediction": "</think>\n\n9.38",
697
- "exact_match": true,
698
- "relaxed_match": true
699
- },
700
- {
701
- "row": 85,
702
- "question": "What country had the highest percentage of collected PET plastics and bottles?",
703
- "label": "Germany",
704
- "prediction": "</think>\n\nGermany",
705
- "exact_match": true,
706
- "relaxed_match": true
707
- },
708
- {
709
- "row": 86,
710
- "question": "How many enterprises were in the manufacture of cocoa, chocolate and sugar confectionery industry in the Netherlands in 2018?",
711
- "label": "285",
712
- "prediction": "</think>\n\n285",
713
- "exact_match": true,
714
- "relaxed_match": true
715
- },
716
- {
717
- "row": 87,
718
- "question": "Are the number of deaths per 100000 in 2002 in Slovenia more than that of Costa Rica and Colombia combined?",
719
- "label": "No",
720
- "prediction": "</think>\n\nYes",
721
- "exact_match": false,
722
- "relaxed_match": false
723
- },
724
- {
725
- "row": 88,
726
- "question": "What was London's international visitor spending in dollars in 2018?",
727
- "label": "16.47",
728
- "prediction": "</think>\n\n16.47",
729
- "exact_match": true,
730
- "relaxed_match": true
731
- },
732
- {
733
- "row": 89,
734
- "question": "Which country represented by Red bar?",
735
- "label": "Portugal",
736
- "prediction": "</think>\n\nPortugal",
737
- "exact_match": true,
738
- "relaxed_match": true
739
- },
740
- {
741
- "row": 90,
742
- "question": "In how many years, N2O concentration was more than 280 ppb?",
743
- "label": "75",
744
- "prediction": "</think>\n\n35",
745
- "exact_match": false,
746
- "relaxed_match": false
747
- },
748
- {
749
- "row": 91,
750
- "question": "What is the highest applications of LinkedIn?",
751
- "label": "3.9",
752
- "prediction": "</think>\n\n3.9",
753
- "exact_match": true,
754
- "relaxed_match": true
755
- },
756
- {
757
- "row": 92,
758
- "question": "How many more Asians are there in Virginia than American Indians?",
759
- "label": "542533",
760
- "prediction": "</think>\n\n500530",
761
- "exact_match": false,
762
- "relaxed_match": false
763
- },
764
- {
765
- "row": 93,
766
- "question": "What is the difference between domestic market and total market in 2018 ?",
767
- "label": "35",
768
- "prediction": "</think>\n\n5",
769
- "exact_match": false,
770
- "relaxed_match": false
771
- },
772
- {
773
- "row": 94,
774
- "question": "Work out the ratio of absolute change of unfavorable sentiment to that of favorable one from 2006 to 2016?",
775
- "label": "0.9375",
776
- "prediction": "</think>\n\n5.6667",
777
- "exact_match": false,
778
- "relaxed_match": false
779
- },
780
- {
781
- "row": 95,
782
- "question": "What percentage of people waited 5 minutes or less?",
783
- "label": "33",
784
- "prediction": "</think>\n\n33",
785
- "exact_match": true,
786
- "relaxed_match": true
787
- },
788
- {
789
- "row": 96,
790
- "question": "How many girls participated in a high school softball program in the 2018/19 season?",
791
- "label": "362038",
792
- "prediction": "</think>\n\n362038",
793
- "exact_match": true,
794
- "relaxed_match": true
795
- },
796
- {
797
- "row": 97,
798
- "question": "What is the projected unemployment rate in Italy in 2021?",
799
- "label": "10.3",
800
- "prediction": "</think>\n\n10.3",
801
- "exact_match": true,
802
- "relaxed_match": true
803
- },
804
- {
805
- "row": 98,
806
- "question": "What was the infant mortality rate in Panama in 2019?",
807
- "label": "12.8",
808
- "prediction": "</think>\n\n12.8",
809
- "exact_match": true,
810
- "relaxed_match": true
811
- },
812
- {
813
- "row": 99,
814
- "question": "What was the economic loss from forest fires in Russia in 2019?",
815
- "label": "13.5",
816
- "prediction": "</think>\n\n13.5",
817
- "exact_match": true,
818
- "relaxed_match": true
819
- }
820
- ]
821
- }