amirali1985 commited on
Commit
40759fa
·
verified ·
1 Parent(s): aa8328e

Upload add_sub_baseline_50K_2L1H128d/metrics.json with huggingface_hub

Browse files
add_sub_baseline_50K_2L1H128d/metrics.json CHANGED
@@ -1330,502 +1330,567 @@
1330
  "K": null,
1331
  "mode": "sft",
1332
  "n_digits": 6,
1333
- "n_per_split": 100
1334
  },
1335
  "splits": {
1336
  "add_S0": {
1337
- "full_accuracy": 0.45,
1338
- "n_examples": 100,
 
1339
  "per_subtask": {
1340
  "SA": {
1341
- "accuracy": 0.8925619834710744,
1342
- "count": 605
1343
  },
1344
  "SS": {
1345
- "accuracy": 0.9894736842105263,
1346
- "count": 95
1347
  }
1348
  }
1349
  },
1350
  "add_S1": {
1351
- "full_accuracy": 0.24,
1352
- "n_examples": 100,
 
1353
  "per_subtask": {
1354
  "SA": {
1355
- "accuracy": 0.9166666666666666,
1356
- "count": 204
1357
  },
1358
  "SC": {
1359
- "accuracy": 0.9349112426035503,
1360
- "count": 169
1361
  },
1362
  "SS": {
1363
- "accuracy": 0.967741935483871,
1364
- "count": 31
1365
  },
1366
  "UC": {
1367
- "accuracy": 0.7297297297297297,
1368
- "count": 296
1369
  }
1370
  }
1371
  },
1372
  "add_S2": {
1373
- "full_accuracy": 0.23,
1374
- "n_examples": 100,
 
1375
  "per_subtask": {
1376
  "SA": {
1377
- "accuracy": 0.9079754601226994,
1378
- "count": 163
1379
  },
1380
  "SC": {
1381
- "accuracy": 0.9076923076923077,
1382
- "count": 130
1383
  },
1384
  "SS": {
1385
- "accuracy": 0.8735632183908046,
1386
- "count": 87
1387
  },
1388
  "UC": {
1389
- "accuracy": 0.6502463054187192,
1390
- "count": 203
1391
  },
1392
  "US": {
1393
- "accuracy": 0.9743589743589743,
1394
- "count": 117
1395
  }
1396
  }
1397
  },
1398
  "add_S3": {
1399
- "full_accuracy": 0.16,
1400
- "n_examples": 100,
 
1401
  "per_subtask": {
1402
  "SA": {
1403
- "accuracy": 0.9669421487603306,
1404
- "count": 121
1405
  },
1406
  "SC": {
1407
- "accuracy": 0.8925619834710744,
1408
- "count": 121
1409
  },
1410
  "SS": {
1411
- "accuracy": 0.9591836734693877,
1412
- "count": 49
1413
  },
1414
  "UC": {
1415
- "accuracy": 0.6290322580645161,
1416
- "count": 186
1417
  },
1418
  "US": {
1419
- "accuracy": 0.7399103139013453,
1420
- "count": 223
1421
  }
1422
  }
1423
  },
1424
  "add_S4": {
1425
- "full_accuracy": 0.29,
1426
- "n_examples": 100,
 
1427
  "per_subtask": {
1428
  "SA": {
1429
- "accuracy": 0.9711538461538461,
1430
- "count": 104
1431
  },
1432
  "SC": {
1433
- "accuracy": 0.9433962264150944,
1434
- "count": 106
1435
  },
1436
  "SS": {
1437
- "accuracy": 0.9130434782608695,
1438
- "count": 23
1439
  },
1440
  "UC": {
1441
- "accuracy": 0.66875,
1442
- "count": 160
1443
  },
1444
  "US": {
1445
- "accuracy": 0.5928338762214984,
1446
- "count": 307
1447
  }
1448
  }
1449
  },
1450
  "add_S5": {
1451
- "full_accuracy": 0.12,
1452
- "n_examples": 100,
 
1453
  "per_subtask": {
1454
  "SA": {
1455
- "accuracy": 0.99,
1456
- "count": 100
1457
  },
1458
  "SC": {
1459
- "accuracy": 0.99,
1460
- "count": 100
1461
  },
1462
  "UC": {
1463
- "accuracy": 0.31,
1464
- "count": 100
1465
  },
1466
  "US": {
1467
- "accuracy": 0.3675,
1468
- "count": 400
1469
  }
1470
  }
1471
  },
1472
  "add_S6": {
1473
- "full_accuracy": 0.29,
1474
- "n_examples": 100,
 
1475
  "per_subtask": {
1476
  "SC": {
1477
- "accuracy": 0.99,
1478
- "count": 100
1479
  },
1480
  "UC": {
1481
- "accuracy": 0.49,
1482
- "count": 100
1483
  },
1484
  "US": {
1485
- "accuracy": 0.492,
1486
- "count": 500
1487
  }
1488
  }
1489
  },
1490
  "add_random": {
1491
- "full_accuracy": 0.235,
 
1492
  "n_examples": 200,
1493
  "per_subtask": {
1494
  "SA": {
1495
- "accuracy": 0.8926174496644296,
1496
- "count": 447
1497
  },
1498
  "SC": {
1499
- "accuracy": 0.946875,
1500
- "count": 320
1501
  },
1502
  "SS": {
1503
  "accuracy": 1.0,
1504
- "count": 56
1505
  },
1506
  "UC": {
1507
- "accuracy": 0.7334593572778828,
1508
- "count": 529
1509
  },
1510
  "US": {
1511
- "accuracy": 0.8958333333333334,
1512
- "count": 48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1513
  }
1514
  }
1515
  },
1516
  "add_C3": {
1517
- "full_accuracy": 0.22,
1518
- "n_examples": 100,
 
1519
  "per_subtask": {
1520
  "SA": {
1521
- "accuracy": 0.93,
1522
- "count": 300
1523
  },
1524
  "SC": {
1525
- "accuracy": 0.99,
1526
- "count": 100
1527
  },
1528
  "UC": {
1529
- "accuracy": 0.538860103626943,
1530
- "count": 193
1531
  },
1532
  "US": {
1533
- "accuracy": 0.7850467289719626,
1534
- "count": 107
1535
  }
1536
  }
1537
  },
1538
  "add_C4": {
1539
- "full_accuracy": 0.21,
1540
- "n_examples": 100,
 
1541
  "per_subtask": {
1542
  "SA": {
1543
- "accuracy": 0.96,
1544
- "count": 200
1545
  },
1546
  "SC": {
1547
- "accuracy": 0.99,
1548
- "count": 100
1549
  },
1550
  "UC": {
1551
- "accuracy": 0.62109375,
1552
- "count": 256
1553
  },
1554
  "US": {
1555
- "accuracy": 0.7847222222222222,
1556
- "count": 144
1557
  }
1558
  }
1559
  },
1560
  "add_C5": {
1561
- "full_accuracy": 0.18,
1562
- "n_examples": 100,
 
1563
  "per_subtask": {
1564
  "SA": {
1565
- "accuracy": 1.0,
1566
- "count": 100
1567
  },
1568
  "SC": {
1569
- "accuracy": 0.97,
1570
- "count": 100
1571
  },
1572
  "UC": {
1573
- "accuracy": 0.6666666666666666,
1574
- "count": 306
1575
  },
1576
  "US": {
1577
- "accuracy": 0.7525773195876289,
1578
- "count": 194
1579
  }
1580
  }
1581
  },
1582
  "add_C6": {
1583
- "full_accuracy": 0.07,
1584
- "n_examples": 100,
 
1585
  "per_subtask": {
1586
  "SC": {
1587
- "accuracy": 0.99,
1588
- "count": 100
1589
  },
1590
  "UC": {
1591
- "accuracy": 0.6721311475409836,
1592
- "count": 366
1593
  },
1594
  "US": {
1595
- "accuracy": 0.8333333333333334,
1596
- "count": 234
1597
  }
1598
  }
1599
  },
1600
  "sub_M0": {
1601
- "full_accuracy": 0.45,
1602
- "n_examples": 100,
 
1603
  "per_subtask": {
1604
  "MD": {
1605
- "accuracy": 0.8885191347753744,
1606
- "count": 601
1607
  },
1608
  "ME": {
1609
  "accuracy": 1.0,
1610
- "count": 99
1611
  }
1612
  }
1613
  },
1614
  "sub_M1": {
1615
- "full_accuracy": 0.21,
1616
- "n_examples": 100,
 
1617
  "per_subtask": {
1618
  "MD": {
1619
- "accuracy": 0.9175627240143369,
1620
- "count": 279
1621
  },
1622
  "MB": {
1623
- "accuracy": 0.9379310344827586,
1624
- "count": 145
1625
  },
1626
  "ME": {
1627
- "accuracy": 0.9583333333333334,
1628
- "count": 24
1629
  },
1630
  "UB": {
1631
- "accuracy": 0.6031746031746031,
1632
- "count": 252
1633
  }
1634
  }
1635
  },
1636
  "sub_M2": {
1637
- "full_accuracy": 0.21,
1638
- "n_examples": 100,
 
1639
  "per_subtask": {
1640
  "MD": {
1641
- "accuracy": 0.9342723004694836,
1642
- "count": 213
1643
  },
1644
  "MB": {
1645
- "accuracy": 0.9557522123893806,
1646
- "count": 113
1647
  },
1648
  "ME": {
1649
- "accuracy": 0.9882352941176471,
1650
- "count": 85
1651
  },
1652
  "UB": {
1653
- "accuracy": 0.5027624309392266,
1654
- "count": 181
1655
  },
1656
  "UD": {
1657
- "accuracy": 0.9907407407407407,
1658
- "count": 108
1659
  }
1660
  }
1661
  },
1662
  "sub_M3": {
1663
- "full_accuracy": 0.04,
1664
- "n_examples": 100,
 
1665
  "per_subtask": {
1666
  "MD": {
1667
- "accuracy": 0.9888268156424581,
1668
- "count": 179
1669
  },
1670
  "MB": {
1671
- "accuracy": 0.9223300970873787,
1672
- "count": 103
1673
  },
1674
  "ME": {
1675
- "accuracy": 0.9464285714285714,
1676
- "count": 56
1677
  },
1678
  "UB": {
1679
- "accuracy": 0.436241610738255,
1680
- "count": 149
1681
  },
1682
  "UD": {
1683
- "accuracy": 0.5539906103286385,
1684
- "count": 213
1685
  }
1686
  }
1687
  },
1688
  "sub_M4": {
1689
- "full_accuracy": 0.06,
1690
- "n_examples": 100,
 
1691
  "per_subtask": {
1692
  "MD": {
1693
- "accuracy": 0.97,
1694
- "count": 200
1695
- },
1696
- "MB": {
1697
  "accuracy": 0.96,
1698
  "count": 100
1699
  },
 
 
 
 
1700
  "UB": {
1701
- "accuracy": 0.54,
1702
- "count": 100
1703
  },
1704
  "UD": {
1705
- "accuracy": 0.33,
1706
- "count": 300
1707
  }
1708
  }
1709
  },
1710
  "sub_M5": {
1711
  "full_accuracy": 0.04,
1712
- "n_examples": 100,
 
1713
  "per_subtask": {
1714
  "MD": {
1715
  "accuracy": 1.0,
1716
- "count": 100
1717
  },
1718
  "MB": {
1719
- "accuracy": 0.98,
1720
- "count": 100
1721
  },
1722
  "UB": {
1723
- "accuracy": 0.51,
1724
- "count": 100
1725
  },
1726
  "UD": {
1727
- "accuracy": 0.24,
1728
- "count": 400
1729
  }
1730
  }
1731
  },
1732
  "sub_random": {
1733
- "full_accuracy": 0.27,
 
1734
  "n_examples": 200,
1735
  "per_subtask": {
1736
  "MD": {
1737
- "accuracy": 0.9383333333333334,
1738
- "count": 600
1739
  },
1740
  "MB": {
1741
- "accuracy": 0.9363295880149812,
1742
- "count": 267
1743
  },
1744
  "ME": {
1745
- "accuracy": 1.0,
1746
  "count": 53
1747
  },
1748
  "UB": {
1749
- "accuracy": 0.6104783599088838,
1750
- "count": 439
1751
  },
1752
  "UD": {
1753
- "accuracy": 0.9512195121951219,
1754
- "count": 41
1755
  }
1756
  }
1757
  },
1758
  "sub_B3": {
1759
- "full_accuracy": 0.1,
1760
- "n_examples": 100,
 
1761
  "per_subtask": {
1762
  "MD": {
1763
- "accuracy": 0.9433333333333334,
1764
- "count": 300
1765
  },
1766
  "MB": {
1767
- "accuracy": 0.97,
1768
- "count": 100
1769
  },
1770
  "UB": {
1771
- "accuracy": 0.5126903553299492,
1772
- "count": 197
1773
  },
1774
  "UD": {
1775
- "accuracy": 0.7184466019417476,
1776
- "count": 103
1777
  }
1778
  }
1779
  },
1780
  "sub_B4": {
1781
- "full_accuracy": 0.03,
1782
- "n_examples": 100,
 
1783
  "per_subtask": {
1784
  "MD": {
1785
- "accuracy": 0.97,
1786
- "count": 200
1787
  },
1788
  "MB": {
1789
- "accuracy": 0.96,
1790
- "count": 100
1791
  },
1792
  "UB": {
1793
- "accuracy": 0.43724696356275305,
1794
- "count": 247
1795
  },
1796
  "UD": {
1797
- "accuracy": 0.6339869281045751,
1798
- "count": 153
1799
  }
1800
  }
1801
  },
1802
  "sub_B5": {
1803
- "full_accuracy": 0.03,
1804
- "n_examples": 100,
 
1805
  "per_subtask": {
1806
  "MD": {
1807
  "accuracy": 1.0,
1808
- "count": 100
1809
  },
1810
  "MB": {
1811
  "accuracy": 1.0,
1812
- "count": 100
1813
  },
1814
  "UB": {
1815
- "accuracy": 0.4395973154362416,
1816
- "count": 298
1817
  },
1818
  "UD": {
1819
- "accuracy": 0.6881188118811881,
1820
- "count": 202
1821
  }
1822
  }
1823
  }
1824
  },
1825
  "summary": {
1826
- "overall_accuracy": 0.1925,
1827
- "total_examples": 2400,
1828
- "n_splits": 22
 
1829
  }
1830
  }
1831
  }
 
1330
  "K": null,
1331
  "mode": "sft",
1332
  "n_digits": 6,
1333
+ "n_per_split": 50
1334
  },
1335
  "splits": {
1336
  "add_S0": {
1337
+ "full_accuracy": 0.48,
1338
+ "digit_accuracy": 0.9114285714285715,
1339
+ "n_examples": 50,
1340
  "per_subtask": {
1341
  "SA": {
1342
+ "accuracy": 0.8983050847457628,
1343
+ "count": 295
1344
  },
1345
  "SS": {
1346
+ "accuracy": 0.9818181818181818,
1347
+ "count": 55
1348
  }
1349
  }
1350
  },
1351
  "add_S1": {
1352
+ "full_accuracy": 0.3,
1353
+ "digit_accuracy": 0.8371428571428572,
1354
+ "n_examples": 50,
1355
  "per_subtask": {
1356
  "SA": {
1357
+ "accuracy": 0.9047619047619048,
1358
+ "count": 126
1359
  },
1360
  "SC": {
1361
+ "accuracy": 0.8987341772151899,
1362
+ "count": 79
1363
  },
1364
  "SS": {
1365
+ "accuracy": 0.9523809523809523,
1366
+ "count": 21
1367
  },
1368
  "UC": {
1369
+ "accuracy": 0.7096774193548387,
1370
+ "count": 124
1371
  }
1372
  }
1373
  },
1374
  "add_S2": {
1375
+ "full_accuracy": 0.2,
1376
+ "digit_accuracy": 0.8114285714285714,
1377
+ "n_examples": 50,
1378
  "per_subtask": {
1379
  "SA": {
1380
+ "accuracy": 0.8266666666666667,
1381
+ "count": 75
1382
  },
1383
  "SC": {
1384
+ "accuracy": 0.8870967741935484,
1385
+ "count": 62
1386
  },
1387
  "SS": {
1388
+ "accuracy": 0.8461538461538461,
1389
+ "count": 39
1390
  },
1391
  "UC": {
1392
+ "accuracy": 0.6396396396396397,
1393
+ "count": 111
1394
  },
1395
  "US": {
1396
+ "accuracy": 1.0,
1397
+ "count": 63
1398
  }
1399
  }
1400
  },
1401
  "add_S3": {
1402
+ "full_accuracy": 0.18,
1403
+ "digit_accuracy": 0.8228571428571428,
1404
+ "n_examples": 50,
1405
  "per_subtask": {
1406
  "SA": {
1407
+ "accuracy": 0.9666666666666667,
1408
+ "count": 60
1409
  },
1410
  "SC": {
1411
+ "accuracy": 0.8947368421052632,
1412
+ "count": 57
1413
  },
1414
  "SS": {
1415
+ "accuracy": 0.8421052631578947,
1416
+ "count": 19
1417
  },
1418
  "UC": {
1419
+ "accuracy": 0.7115384615384616,
1420
+ "count": 104
1421
  },
1422
  "US": {
1423
+ "accuracy": 0.8090909090909091,
1424
+ "count": 110
1425
  }
1426
  }
1427
  },
1428
  "add_S4": {
1429
+ "full_accuracy": 0.24,
1430
+ "digit_accuracy": 0.6914285714285714,
1431
+ "n_examples": 50,
1432
  "per_subtask": {
1433
  "SA": {
1434
+ "accuracy": 0.9375,
1435
+ "count": 48
1436
  },
1437
  "SC": {
1438
+ "accuracy": 0.9423076923076923,
1439
+ "count": 52
1440
  },
1441
  "SS": {
1442
+ "accuracy": 1.0,
1443
+ "count": 7
1444
  },
1445
  "UC": {
1446
+ "accuracy": 0.6067415730337079,
1447
+ "count": 89
1448
  },
1449
  "US": {
1450
+ "accuracy": 0.564935064935065,
1451
+ "count": 154
1452
  }
1453
  }
1454
  },
1455
  "add_S5": {
1456
+ "full_accuracy": 0.1,
1457
+ "digit_accuracy": 0.5257142857142857,
1458
+ "n_examples": 50,
1459
  "per_subtask": {
1460
  "SA": {
1461
+ "accuracy": 1.0,
1462
+ "count": 50
1463
  },
1464
  "SC": {
1465
+ "accuracy": 1.0,
1466
+ "count": 50
1467
  },
1468
  "UC": {
1469
+ "accuracy": 0.42,
1470
+ "count": 50
1471
  },
1472
  "US": {
1473
+ "accuracy": 0.315,
1474
+ "count": 200
1475
  }
1476
  }
1477
  },
1478
  "add_S6": {
1479
+ "full_accuracy": 0.34,
1480
+ "digit_accuracy": 0.5742857142857143,
1481
+ "n_examples": 50,
1482
  "per_subtask": {
1483
  "SC": {
1484
+ "accuracy": 1.0,
1485
+ "count": 50
1486
  },
1487
  "UC": {
1488
+ "accuracy": 0.46,
1489
+ "count": 50
1490
  },
1491
  "US": {
1492
+ "accuracy": 0.512,
1493
+ "count": 250
1494
  }
1495
  }
1496
  },
1497
  "add_random": {
1498
+ "full_accuracy": 0.25,
1499
+ "digit_accuracy": 0.8514285714285714,
1500
  "n_examples": 200,
1501
  "per_subtask": {
1502
  "SA": {
1503
+ "accuracy": 0.91415313225058,
1504
+ "count": 431
1505
  },
1506
  "SC": {
1507
+ "accuracy": 0.9493670886075949,
1508
+ "count": 316
1509
  },
1510
  "SS": {
1511
  "accuracy": 1.0,
1512
+ "count": 39
1513
  },
1514
  "UC": {
1515
+ "accuracy": 0.7303571428571428,
1516
+ "count": 560
1517
  },
1518
  "US": {
1519
+ "accuracy": 0.9259259259259259,
1520
+ "count": 54
1521
+ }
1522
+ }
1523
+ },
1524
+ "add_C1": {
1525
+ "full_accuracy": 0.26,
1526
+ "digit_accuracy": 0.8657142857142858,
1527
+ "n_examples": 50,
1528
+ "per_subtask": {
1529
+ "SA": {
1530
+ "accuracy": 0.928,
1531
+ "count": 250
1532
+ },
1533
+ "SC": {
1534
+ "accuracy": 0.96,
1535
+ "count": 50
1536
+ },
1537
+ "UC": {
1538
+ "accuracy": 0.46,
1539
+ "count": 50
1540
+ }
1541
+ }
1542
+ },
1543
+ "add_C2": {
1544
+ "full_accuracy": 0.24,
1545
+ "digit_accuracy": 0.8371428571428572,
1546
+ "n_examples": 50,
1547
+ "per_subtask": {
1548
+ "SA": {
1549
+ "accuracy": 0.945,
1550
+ "count": 200
1551
+ },
1552
+ "SC": {
1553
+ "accuracy": 0.94,
1554
+ "count": 50
1555
+ },
1556
+ "UC": {
1557
+ "accuracy": 0.5180722891566265,
1558
+ "count": 83
1559
+ },
1560
+ "US": {
1561
+ "accuracy": 0.8235294117647058,
1562
+ "count": 17
1563
  }
1564
  }
1565
  },
1566
  "add_C3": {
1567
+ "full_accuracy": 0.14,
1568
+ "digit_accuracy": 0.8057142857142857,
1569
+ "n_examples": 50,
1570
  "per_subtask": {
1571
  "SA": {
1572
+ "accuracy": 0.9533333333333334,
1573
+ "count": 150
1574
  },
1575
  "SC": {
1576
+ "accuracy": 1.0,
1577
+ "count": 50
1578
  },
1579
  "UC": {
1580
+ "accuracy": 0.5,
1581
+ "count": 100
1582
  },
1583
  "US": {
1584
+ "accuracy": 0.78,
1585
+ "count": 50
1586
  }
1587
  }
1588
  },
1589
  "add_C4": {
1590
+ "full_accuracy": 0.18,
1591
+ "digit_accuracy": 0.82,
1592
+ "n_examples": 50,
1593
  "per_subtask": {
1594
  "SA": {
1595
+ "accuracy": 0.98,
1596
+ "count": 100
1597
  },
1598
  "SC": {
1599
+ "accuracy": 0.98,
1600
+ "count": 50
1601
  },
1602
  "UC": {
1603
+ "accuracy": 0.6060606060606061,
1604
+ "count": 132
1605
  },
1606
  "US": {
1607
+ "accuracy": 0.8823529411764706,
1608
+ "count": 68
1609
  }
1610
  }
1611
  },
1612
  "add_C5": {
1613
+ "full_accuracy": 0.16,
1614
+ "digit_accuracy": 0.7942857142857143,
1615
+ "n_examples": 50,
1616
  "per_subtask": {
1617
  "SA": {
1618
+ "accuracy": 0.98,
1619
+ "count": 50
1620
  },
1621
  "SC": {
1622
+ "accuracy": 1.0,
1623
+ "count": 50
1624
  },
1625
  "UC": {
1626
+ "accuracy": 0.6438356164383562,
1627
+ "count": 146
1628
  },
1629
  "US": {
1630
+ "accuracy": 0.8173076923076923,
1631
+ "count": 104
1632
  }
1633
  }
1634
  },
1635
  "add_C6": {
1636
+ "full_accuracy": 0.14,
1637
+ "digit_accuracy": 0.7628571428571429,
1638
+ "n_examples": 50,
1639
  "per_subtask": {
1640
  "SC": {
1641
+ "accuracy": 0.98,
1642
+ "count": 50
1643
  },
1644
  "UC": {
1645
+ "accuracy": 0.6931216931216931,
1646
+ "count": 189
1647
  },
1648
  "US": {
1649
+ "accuracy": 0.7837837837837838,
1650
+ "count": 111
1651
  }
1652
  }
1653
  },
1654
  "sub_M0": {
1655
+ "full_accuracy": 0.48,
1656
+ "digit_accuracy": 0.8971428571428571,
1657
+ "n_examples": 50,
1658
  "per_subtask": {
1659
  "MD": {
1660
+ "accuracy": 0.8811881188118812,
1661
+ "count": 303
1662
  },
1663
  "ME": {
1664
  "accuracy": 1.0,
1665
+ "count": 47
1666
  }
1667
  }
1668
  },
1669
  "sub_M1": {
1670
+ "full_accuracy": 0.22,
1671
+ "digit_accuracy": 0.8314285714285714,
1672
+ "n_examples": 50,
1673
  "per_subtask": {
1674
  "MD": {
1675
+ "accuracy": 0.9361702127659575,
1676
+ "count": 141
1677
  },
1678
  "MB": {
1679
+ "accuracy": 0.9583333333333334,
1680
+ "count": 72
1681
  },
1682
  "ME": {
1683
+ "accuracy": 0.8888888888888888,
1684
+ "count": 18
1685
  },
1686
  "UB": {
1687
+ "accuracy": 0.6218487394957983,
1688
+ "count": 119
1689
  }
1690
  }
1691
  },
1692
  "sub_M2": {
1693
+ "full_accuracy": 0.14,
1694
+ "digit_accuracy": 0.8171428571428572,
1695
+ "n_examples": 50,
1696
  "per_subtask": {
1697
  "MD": {
1698
+ "accuracy": 0.9464285714285714,
1699
+ "count": 112
1700
  },
1701
  "MB": {
1702
+ "accuracy": 0.9245283018867925,
1703
+ "count": 53
1704
  },
1705
  "ME": {
1706
+ "accuracy": 0.9148936170212766,
1707
+ "count": 47
1708
  },
1709
  "UB": {
1710
+ "accuracy": 0.4588235294117647,
1711
+ "count": 85
1712
  },
1713
  "UD": {
1714
+ "accuracy": 0.9245283018867925,
1715
+ "count": 53
1716
  }
1717
  }
1718
  },
1719
  "sub_M3": {
1720
+ "full_accuracy": 0.02,
1721
+ "digit_accuracy": 0.7142857142857143,
1722
+ "n_examples": 50,
1723
  "per_subtask": {
1724
  "MD": {
1725
+ "accuracy": 0.9278350515463918,
1726
+ "count": 97
1727
  },
1728
  "MB": {
1729
+ "accuracy": 0.9607843137254902,
1730
+ "count": 51
1731
  },
1732
  "ME": {
1733
+ "accuracy": 0.9629629629629629,
1734
+ "count": 27
1735
  },
1736
  "UB": {
1737
+ "accuracy": 0.4594594594594595,
1738
+ "count": 74
1739
  },
1740
  "UD": {
1741
+ "accuracy": 0.504950495049505,
1742
+ "count": 101
1743
  }
1744
  }
1745
  },
1746
  "sub_M4": {
1747
+ "full_accuracy": 0.08,
1748
+ "digit_accuracy": 0.6542857142857142,
1749
+ "n_examples": 50,
1750
  "per_subtask": {
1751
  "MD": {
 
 
 
 
1752
  "accuracy": 0.96,
1753
  "count": 100
1754
  },
1755
+ "MB": {
1756
+ "accuracy": 0.94,
1757
+ "count": 50
1758
+ },
1759
  "UB": {
1760
+ "accuracy": 0.56,
1761
+ "count": 50
1762
  },
1763
  "UD": {
1764
+ "accuracy": 0.38666666666666666,
1765
+ "count": 150
1766
  }
1767
  }
1768
  },
1769
  "sub_M5": {
1770
  "full_accuracy": 0.04,
1771
+ "digit_accuracy": 0.5314285714285715,
1772
+ "n_examples": 50,
1773
  "per_subtask": {
1774
  "MD": {
1775
  "accuracy": 1.0,
1776
+ "count": 50
1777
  },
1778
  "MB": {
1779
+ "accuracy": 1.0,
1780
+ "count": 50
1781
  },
1782
  "UB": {
1783
+ "accuracy": 0.56,
1784
+ "count": 50
1785
  },
1786
  "UD": {
1787
+ "accuracy": 0.29,
1788
+ "count": 200
1789
  }
1790
  }
1791
  },
1792
  "sub_random": {
1793
+ "full_accuracy": 0.31,
1794
+ "digit_accuracy": 0.8514285714285714,
1795
  "n_examples": 200,
1796
  "per_subtask": {
1797
  "MD": {
1798
+ "accuracy": 0.9526315789473684,
1799
+ "count": 570
1800
  },
1801
  "MB": {
1802
+ "accuracy": 0.9494584837545126,
1803
+ "count": 277
1804
  },
1805
  "ME": {
1806
+ "accuracy": 0.9811320754716981,
1807
  "count": 53
1808
  },
1809
  "UB": {
1810
+ "accuracy": 0.6475583864118896,
1811
+ "count": 471
1812
  },
1813
  "UD": {
1814
+ "accuracy": 1.0,
1815
+ "count": 29
1816
  }
1817
  }
1818
  },
1819
  "sub_B3": {
1820
+ "full_accuracy": 0.06,
1821
+ "digit_accuracy": 0.7942857142857143,
1822
+ "n_examples": 50,
1823
  "per_subtask": {
1824
  "MD": {
1825
+ "accuracy": 0.9666666666666667,
1826
+ "count": 150
1827
  },
1828
  "MB": {
1829
+ "accuracy": 0.98,
1830
+ "count": 50
1831
  },
1832
  "UB": {
1833
+ "accuracy": 0.48514851485148514,
1834
+ "count": 101
1835
  },
1836
  "UD": {
1837
+ "accuracy": 0.7142857142857143,
1838
+ "count": 49
1839
  }
1840
  }
1841
  },
1842
  "sub_B4": {
1843
+ "full_accuracy": 0.08,
1844
+ "digit_accuracy": 0.7171428571428572,
1845
+ "n_examples": 50,
1846
  "per_subtask": {
1847
  "MD": {
1848
+ "accuracy": 1.0,
1849
+ "count": 100
1850
  },
1851
  "MB": {
1852
+ "accuracy": 0.98,
1853
+ "count": 50
1854
  },
1855
  "UB": {
1856
+ "accuracy": 0.45454545454545453,
1857
+ "count": 121
1858
  },
1859
  "UD": {
1860
+ "accuracy": 0.5949367088607594,
1861
+ "count": 79
1862
  }
1863
  }
1864
  },
1865
  "sub_B5": {
1866
+ "full_accuracy": 0.06,
1867
+ "digit_accuracy": 0.7057142857142857,
1868
+ "n_examples": 50,
1869
  "per_subtask": {
1870
  "MD": {
1871
  "accuracy": 1.0,
1872
+ "count": 50
1873
  },
1874
  "MB": {
1875
  "accuracy": 1.0,
1876
+ "count": 50
1877
  },
1878
  "UB": {
1879
+ "accuracy": 0.4934210526315789,
1880
+ "count": 152
1881
  },
1882
  "UD": {
1883
+ "accuracy": 0.7346938775510204,
1884
+ "count": 98
1885
  }
1886
  }
1887
  }
1888
  },
1889
  "summary": {
1890
+ "overall_accuracy": 0.21266666666666667,
1891
+ "digit_accuracy": 0.7844761904761904,
1892
+ "total_examples": 1500,
1893
+ "n_splits": 24
1894
  }
1895
  }
1896
  }