amirali1985 commited on
Commit
cf2ddb5
·
verified ·
1 Parent(s): 0cbc409

Upload add_sub_baseline_50K_1L3H510d/metrics.json with huggingface_hub

Browse files
add_sub_baseline_50K_1L3H510d/metrics.json CHANGED
@@ -1330,502 +1330,567 @@
1330
  "K": null,
1331
  "mode": "sft",
1332
  "n_digits": 6,
1333
- "n_per_split": 100
1334
  },
1335
  "splits": {
1336
  "add_S0": {
1337
- "full_accuracy": 0.94,
1338
- "n_examples": 100,
 
1339
  "per_subtask": {
1340
  "SA": {
1341
- "accuracy": 0.9900826446280991,
1342
- "count": 605
1343
  },
1344
  "SS": {
1345
  "accuracy": 1.0,
1346
- "count": 95
1347
  }
1348
  }
1349
  },
1350
  "add_S1": {
1351
- "full_accuracy": 0.94,
1352
- "n_examples": 100,
 
1353
  "per_subtask": {
1354
  "SA": {
1355
- "accuracy": 0.9901960784313726,
1356
- "count": 204
1357
  },
1358
  "SC": {
1359
- "accuracy": 0.9940828402366864,
1360
- "count": 169
1361
  },
1362
  "SS": {
1363
- "accuracy": 1.0,
1364
- "count": 31
1365
  },
1366
  "UC": {
1367
- "accuracy": 0.9898648648648649,
1368
- "count": 296
1369
  }
1370
  }
1371
  },
1372
  "add_S2": {
1373
  "full_accuracy": 0.64,
1374
- "n_examples": 100,
 
1375
  "per_subtask": {
1376
  "SA": {
1377
- "accuracy": 0.9877300613496932,
1378
- "count": 163
1379
  },
1380
  "SC": {
1381
- "accuracy": 0.9384615384615385,
1382
- "count": 130
1383
  },
1384
  "SS": {
1385
- "accuracy": 0.9540229885057471,
1386
- "count": 87
1387
  },
1388
  "UC": {
1389
- "accuracy": 0.8669950738916257,
1390
- "count": 203
1391
  },
1392
  "US": {
1393
  "accuracy": 1.0,
1394
- "count": 117
1395
  }
1396
  }
1397
  },
1398
  "add_S3": {
1399
- "full_accuracy": 0.3,
1400
- "n_examples": 100,
 
1401
  "per_subtask": {
1402
  "SA": {
1403
- "accuracy": 0.9917355371900827,
1404
- "count": 121
1405
  },
1406
  "SC": {
1407
- "accuracy": 0.9421487603305785,
1408
- "count": 121
1409
  },
1410
  "SS": {
1411
- "accuracy": 0.9795918367346939,
1412
- "count": 49
1413
  },
1414
  "UC": {
1415
- "accuracy": 0.6935483870967742,
1416
- "count": 186
1417
  },
1418
  "US": {
1419
- "accuracy": 0.820627802690583,
1420
- "count": 223
1421
  }
1422
  }
1423
  },
1424
  "add_S4": {
1425
- "full_accuracy": 0.31,
1426
- "n_examples": 100,
 
1427
  "per_subtask": {
1428
  "SA": {
1429
  "accuracy": 1.0,
1430
- "count": 104
1431
  },
1432
  "SC": {
1433
- "accuracy": 1.0,
1434
- "count": 106
1435
  },
1436
  "SS": {
1437
  "accuracy": 1.0,
1438
- "count": 23
1439
  },
1440
  "UC": {
1441
- "accuracy": 0.65625,
1442
- "count": 160
1443
  },
1444
  "US": {
1445
- "accuracy": 0.6351791530944625,
1446
- "count": 307
1447
  }
1448
  }
1449
  },
1450
  "add_S5": {
1451
- "full_accuracy": 0.26,
1452
- "n_examples": 100,
 
1453
  "per_subtask": {
1454
  "SA": {
1455
  "accuracy": 1.0,
1456
- "count": 100
1457
  },
1458
  "SC": {
1459
  "accuracy": 1.0,
1460
- "count": 100
1461
  },
1462
  "UC": {
1463
  "accuracy": 0.42,
1464
- "count": 100
1465
  },
1466
  "US": {
1467
- "accuracy": 0.525,
1468
- "count": 400
1469
  }
1470
  }
1471
  },
1472
  "add_S6": {
1473
- "full_accuracy": 0.45,
1474
- "n_examples": 100,
 
1475
  "per_subtask": {
1476
  "SC": {
1477
  "accuracy": 1.0,
1478
- "count": 100
1479
  },
1480
  "UC": {
1481
- "accuracy": 0.58,
1482
- "count": 100
1483
  },
1484
  "US": {
1485
- "accuracy": 0.594,
1486
- "count": 500
1487
  }
1488
  }
1489
  },
1490
  "add_random": {
1491
- "full_accuracy": 0.88,
 
1492
  "n_examples": 200,
1493
  "per_subtask": {
1494
  "SA": {
1495
- "accuracy": 0.9910514541387024,
1496
- "count": 447
1497
  },
1498
  "SC": {
1499
- "accuracy": 0.990625,
1500
- "count": 320
1501
  },
1502
  "SS": {
1503
- "accuracy": 0.9464285714285714,
1504
- "count": 56
1505
  },
1506
  "UC": {
1507
- "accuracy": 0.9716446124763705,
1508
- "count": 529
1509
  },
1510
  "US": {
1511
- "accuracy": 0.9791666666666666,
1512
- "count": 48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1513
  }
1514
  }
1515
  },
1516
  "add_C3": {
1517
- "full_accuracy": 0.53,
1518
- "n_examples": 100,
 
1519
  "per_subtask": {
1520
  "SA": {
1521
- "accuracy": 0.9866666666666667,
1522
- "count": 300
1523
  },
1524
  "SC": {
1525
  "accuracy": 1.0,
1526
- "count": 100
1527
  },
1528
  "UC": {
1529
- "accuracy": 0.7927461139896373,
1530
- "count": 193
1531
  },
1532
  "US": {
1533
- "accuracy": 0.8037383177570093,
1534
- "count": 107
1535
  }
1536
  }
1537
  },
1538
  "add_C4": {
1539
- "full_accuracy": 0.5,
1540
- "n_examples": 100,
 
1541
  "per_subtask": {
1542
  "SA": {
1543
- "accuracy": 1.0,
1544
- "count": 200
1545
  },
1546
  "SC": {
1547
  "accuracy": 1.0,
1548
- "count": 100
1549
  },
1550
  "UC": {
1551
- "accuracy": 0.8046875,
1552
- "count": 256
1553
  },
1554
  "US": {
1555
- "accuracy": 0.8055555555555556,
1556
- "count": 144
1557
  }
1558
  }
1559
  },
1560
  "add_C5": {
1561
- "full_accuracy": 0.51,
1562
- "n_examples": 100,
 
1563
  "per_subtask": {
1564
  "SA": {
1565
  "accuracy": 1.0,
1566
- "count": 100
1567
  },
1568
  "SC": {
1569
  "accuracy": 1.0,
1570
- "count": 100
1571
  },
1572
  "UC": {
1573
- "accuracy": 0.8496732026143791,
1574
- "count": 306
1575
  },
1576
  "US": {
1577
- "accuracy": 0.8402061855670103,
1578
- "count": 194
1579
  }
1580
  }
1581
  },
1582
  "add_C6": {
1583
- "full_accuracy": 0.44,
1584
- "n_examples": 100,
 
1585
  "per_subtask": {
1586
  "SC": {
1587
  "accuracy": 1.0,
1588
- "count": 100
1589
  },
1590
  "UC": {
1591
- "accuracy": 0.8579234972677595,
1592
- "count": 366
1593
  },
1594
  "US": {
1595
- "accuracy": 0.8717948717948718,
1596
- "count": 234
1597
  }
1598
  }
1599
  },
1600
  "sub_M0": {
1601
- "full_accuracy": 0.88,
1602
- "n_examples": 100,
 
1603
  "per_subtask": {
1604
  "MD": {
1605
- "accuracy": 0.9800332778702163,
1606
- "count": 601
1607
  },
1608
  "ME": {
1609
  "accuracy": 1.0,
1610
- "count": 99
1611
  }
1612
  }
1613
  },
1614
  "sub_M1": {
1615
- "full_accuracy": 0.92,
1616
- "n_examples": 100,
 
1617
  "per_subtask": {
1618
  "MD": {
1619
- "accuracy": 0.985663082437276,
1620
- "count": 279
1621
  },
1622
  "MB": {
1623
- "accuracy": 0.9862068965517241,
1624
- "count": 145
1625
  },
1626
  "ME": {
1627
  "accuracy": 1.0,
1628
- "count": 24
1629
  },
1630
  "UB": {
1631
- "accuracy": 0.9920634920634921,
1632
- "count": 252
1633
  }
1634
  }
1635
  },
1636
  "sub_M2": {
1637
- "full_accuracy": 0.54,
1638
- "n_examples": 100,
 
1639
  "per_subtask": {
1640
  "MD": {
1641
- "accuracy": 0.9906103286384976,
1642
- "count": 213
1643
  },
1644
  "MB": {
1645
- "accuracy": 0.9823008849557522,
1646
- "count": 113
1647
  },
1648
  "ME": {
1649
  "accuracy": 1.0,
1650
- "count": 85
1651
  },
1652
  "UB": {
1653
- "accuracy": 0.7624309392265194,
1654
- "count": 181
1655
  },
1656
  "UD": {
1657
- "accuracy": 1.0,
1658
- "count": 108
1659
  }
1660
  }
1661
  },
1662
  "sub_M3": {
1663
  "full_accuracy": 0.22,
1664
- "n_examples": 100,
 
1665
  "per_subtask": {
1666
  "MD": {
1667
- "accuracy": 1.0,
1668
- "count": 179
1669
  },
1670
  "MB": {
1671
- "accuracy": 1.0,
1672
- "count": 103
1673
  },
1674
  "ME": {
1675
  "accuracy": 1.0,
1676
- "count": 56
1677
  },
1678
  "UB": {
1679
- "accuracy": 0.5570469798657718,
1680
- "count": 149
1681
  },
1682
  "UD": {
1683
- "accuracy": 0.8450704225352113,
1684
- "count": 213
1685
  }
1686
  }
1687
  },
1688
  "sub_M4": {
1689
- "full_accuracy": 0.02,
1690
- "n_examples": 100,
 
1691
  "per_subtask": {
1692
  "MD": {
1693
  "accuracy": 1.0,
1694
- "count": 200
1695
  },
1696
  "MB": {
1697
  "accuracy": 1.0,
1698
- "count": 100
1699
  },
1700
  "UB": {
1701
- "accuracy": 0.43,
1702
- "count": 100
1703
  },
1704
  "UD": {
1705
- "accuracy": 0.38333333333333336,
1706
- "count": 300
1707
  }
1708
  }
1709
  },
1710
  "sub_M5": {
1711
- "full_accuracy": 0.08,
1712
- "n_examples": 100,
 
1713
  "per_subtask": {
1714
  "MD": {
1715
  "accuracy": 1.0,
1716
- "count": 100
1717
  },
1718
  "MB": {
1719
  "accuracy": 1.0,
1720
- "count": 100
1721
  },
1722
  "UB": {
1723
- "accuracy": 0.51,
1724
- "count": 100
1725
  },
1726
  "UD": {
1727
- "accuracy": 0.4075,
1728
- "count": 400
1729
  }
1730
  }
1731
  },
1732
  "sub_random": {
1733
- "full_accuracy": 0.86,
 
1734
  "n_examples": 200,
1735
  "per_subtask": {
1736
  "MD": {
1737
- "accuracy": 0.985,
1738
- "count": 600
1739
  },
1740
  "MB": {
1741
- "accuracy": 0.9925093632958801,
1742
- "count": 267
1743
  },
1744
  "ME": {
1745
  "accuracy": 1.0,
1746
  "count": 53
1747
  },
1748
  "UB": {
1749
- "accuracy": 0.9567198177676538,
1750
- "count": 439
1751
  },
1752
  "UD": {
1753
- "accuracy": 1.0,
1754
- "count": 41
1755
  }
1756
  }
1757
  },
1758
  "sub_B3": {
1759
- "full_accuracy": 0.46,
1760
- "n_examples": 100,
 
1761
  "per_subtask": {
1762
  "MD": {
1763
  "accuracy": 1.0,
1764
- "count": 300
1765
  },
1766
  "MB": {
1767
  "accuracy": 1.0,
1768
- "count": 100
1769
  },
1770
  "UB": {
1771
- "accuracy": 0.7411167512690355,
1772
- "count": 197
1773
  },
1774
  "UD": {
1775
- "accuracy": 0.7669902912621359,
1776
- "count": 103
1777
  }
1778
  }
1779
  },
1780
  "sub_B4": {
1781
- "full_accuracy": 0.3,
1782
- "n_examples": 100,
 
1783
  "per_subtask": {
1784
  "MD": {
1785
  "accuracy": 1.0,
1786
- "count": 200
1787
  },
1788
  "MB": {
1789
  "accuracy": 1.0,
1790
- "count": 100
1791
  },
1792
  "UB": {
1793
- "accuracy": 0.7327935222672065,
1794
- "count": 247
1795
  },
1796
  "UD": {
1797
- "accuracy": 0.7450980392156863,
1798
- "count": 153
1799
  }
1800
  }
1801
  },
1802
  "sub_B5": {
1803
- "full_accuracy": 0.27,
1804
- "n_examples": 100,
 
1805
  "per_subtask": {
1806
  "MD": {
1807
  "accuracy": 1.0,
1808
- "count": 100
1809
  },
1810
  "MB": {
1811
  "accuracy": 1.0,
1812
- "count": 100
1813
  },
1814
  "UB": {
1815
- "accuracy": 0.7449664429530202,
1816
- "count": 298
1817
  },
1818
  "UD": {
1819
- "accuracy": 0.698019801980198,
1820
- "count": 202
1821
  }
1822
  }
1823
  }
1824
  },
1825
  "summary": {
1826
- "overall_accuracy": 0.54125,
1827
- "total_examples": 2400,
1828
- "n_splits": 22
 
1829
  }
1830
  }
1831
  }
 
1330
  "K": null,
1331
  "mode": "sft",
1332
  "n_digits": 6,
1333
+ "n_per_split": 50
1334
  },
1335
  "splits": {
1336
  "add_S0": {
1337
+ "full_accuracy": 0.92,
1338
+ "digit_accuracy": 0.9885714285714285,
1339
+ "n_examples": 50,
1340
  "per_subtask": {
1341
  "SA": {
1342
+ "accuracy": 0.9864406779661017,
1343
+ "count": 295
1344
  },
1345
  "SS": {
1346
  "accuracy": 1.0,
1347
+ "count": 55
1348
  }
1349
  }
1350
  },
1351
  "add_S1": {
1352
+ "full_accuracy": 0.88,
1353
+ "digit_accuracy": 0.9828571428571429,
1354
+ "n_examples": 50,
1355
  "per_subtask": {
1356
  "SA": {
1357
+ "accuracy": 1.0,
1358
+ "count": 126
1359
  },
1360
  "SC": {
1361
+ "accuracy": 0.9746835443037974,
1362
+ "count": 79
1363
  },
1364
  "SS": {
1365
+ "accuracy": 0.9523809523809523,
1366
+ "count": 21
1367
  },
1368
  "UC": {
1369
+ "accuracy": 0.9758064516129032,
1370
+ "count": 124
1371
  }
1372
  }
1373
  },
1374
  "add_S2": {
1375
  "full_accuracy": 0.64,
1376
+ "digit_accuracy": 0.9428571428571428,
1377
+ "n_examples": 50,
1378
  "per_subtask": {
1379
  "SA": {
1380
+ "accuracy": 0.9866666666666667,
1381
+ "count": 75
1382
  },
1383
  "SC": {
1384
+ "accuracy": 0.9516129032258065,
1385
+ "count": 62
1386
  },
1387
  "SS": {
1388
+ "accuracy": 0.9743589743589743,
1389
+ "count": 39
1390
  },
1391
  "UC": {
1392
+ "accuracy": 0.8648648648648649,
1393
+ "count": 111
1394
  },
1395
  "US": {
1396
  "accuracy": 1.0,
1397
+ "count": 63
1398
  }
1399
  }
1400
  },
1401
  "add_S3": {
1402
+ "full_accuracy": 0.44,
1403
+ "digit_accuracy": 0.8771428571428571,
1404
+ "n_examples": 50,
1405
  "per_subtask": {
1406
  "SA": {
1407
+ "accuracy": 1.0,
1408
+ "count": 60
1409
  },
1410
  "SC": {
1411
+ "accuracy": 1.0,
1412
+ "count": 57
1413
  },
1414
  "SS": {
1415
+ "accuracy": 1.0,
1416
+ "count": 19
1417
  },
1418
  "UC": {
1419
+ "accuracy": 0.7692307692307693,
1420
+ "count": 104
1421
  },
1422
  "US": {
1423
+ "accuracy": 0.8272727272727273,
1424
+ "count": 110
1425
  }
1426
  }
1427
  },
1428
  "add_S4": {
1429
+ "full_accuracy": 0.22,
1430
+ "digit_accuracy": 0.74,
1431
+ "n_examples": 50,
1432
  "per_subtask": {
1433
  "SA": {
1434
  "accuracy": 1.0,
1435
+ "count": 48
1436
  },
1437
  "SC": {
1438
+ "accuracy": 0.9807692307692307,
1439
+ "count": 52
1440
  },
1441
  "SS": {
1442
  "accuracy": 1.0,
1443
+ "count": 7
1444
  },
1445
  "UC": {
1446
+ "accuracy": 0.6404494382022472,
1447
+ "count": 89
1448
  },
1449
  "US": {
1450
+ "accuracy": 0.6233766233766234,
1451
+ "count": 154
1452
  }
1453
  }
1454
  },
1455
  "add_S5": {
1456
+ "full_accuracy": 0.18,
1457
+ "digit_accuracy": 0.5971428571428572,
1458
+ "n_examples": 50,
1459
  "per_subtask": {
1460
  "SA": {
1461
  "accuracy": 1.0,
1462
+ "count": 50
1463
  },
1464
  "SC": {
1465
  "accuracy": 1.0,
1466
+ "count": 50
1467
  },
1468
  "UC": {
1469
  "accuracy": 0.42,
1470
+ "count": 50
1471
  },
1472
  "US": {
1473
+ "accuracy": 0.44,
1474
+ "count": 200
1475
  }
1476
  }
1477
  },
1478
  "add_S6": {
1479
+ "full_accuracy": 0.5,
1480
+ "digit_accuracy": 0.6485714285714286,
1481
+ "n_examples": 50,
1482
  "per_subtask": {
1483
  "SC": {
1484
  "accuracy": 1.0,
1485
+ "count": 50
1486
  },
1487
  "UC": {
1488
+ "accuracy": 0.54,
1489
+ "count": 50
1490
  },
1491
  "US": {
1492
+ "accuracy": 0.6,
1493
+ "count": 250
1494
  }
1495
  }
1496
  },
1497
  "add_random": {
1498
+ "full_accuracy": 0.89,
1499
+ "digit_accuracy": 0.9814285714285714,
1500
  "n_examples": 200,
1501
  "per_subtask": {
1502
  "SA": {
1503
+ "accuracy": 1.0,
1504
+ "count": 431
1505
  },
1506
  "SC": {
1507
+ "accuracy": 1.0,
1508
+ "count": 316
1509
  },
1510
  "SS": {
1511
+ "accuracy": 1.0,
1512
+ "count": 39
1513
  },
1514
  "UC": {
1515
+ "accuracy": 0.9607142857142857,
1516
+ "count": 560
1517
  },
1518
  "US": {
1519
+ "accuracy": 0.9259259259259259,
1520
+ "count": 54
1521
+ }
1522
+ }
1523
+ },
1524
+ "add_C1": {
1525
+ "full_accuracy": 0.94,
1526
+ "digit_accuracy": 0.9914285714285714,
1527
+ "n_examples": 50,
1528
+ "per_subtask": {
1529
+ "SA": {
1530
+ "accuracy": 0.996,
1531
+ "count": 250
1532
+ },
1533
+ "SC": {
1534
+ "accuracy": 1.0,
1535
+ "count": 50
1536
+ },
1537
+ "UC": {
1538
+ "accuracy": 0.96,
1539
+ "count": 50
1540
+ }
1541
+ }
1542
+ },
1543
+ "add_C2": {
1544
+ "full_accuracy": 0.84,
1545
+ "digit_accuracy": 0.9742857142857143,
1546
+ "n_examples": 50,
1547
+ "per_subtask": {
1548
+ "SA": {
1549
+ "accuracy": 1.0,
1550
+ "count": 200
1551
+ },
1552
+ "SC": {
1553
+ "accuracy": 1.0,
1554
+ "count": 50
1555
+ },
1556
+ "UC": {
1557
+ "accuracy": 0.9036144578313253,
1558
+ "count": 83
1559
+ },
1560
+ "US": {
1561
+ "accuracy": 0.9411764705882353,
1562
+ "count": 17
1563
  }
1564
  }
1565
  },
1566
  "add_C3": {
1567
+ "full_accuracy": 0.62,
1568
+ "digit_accuracy": 0.9228571428571428,
1569
+ "n_examples": 50,
1570
  "per_subtask": {
1571
  "SA": {
1572
+ "accuracy": 0.9933333333333333,
1573
+ "count": 150
1574
  },
1575
  "SC": {
1576
  "accuracy": 1.0,
1577
+ "count": 50
1578
  },
1579
  "UC": {
1580
+ "accuracy": 0.85,
1581
+ "count": 100
1582
  },
1583
  "US": {
1584
+ "accuracy": 0.78,
1585
+ "count": 50
1586
  }
1587
  }
1588
  },
1589
  "add_C4": {
1590
+ "full_accuracy": 0.66,
1591
+ "digit_accuracy": 0.9257142857142857,
1592
+ "n_examples": 50,
1593
  "per_subtask": {
1594
  "SA": {
1595
+ "accuracy": 0.99,
1596
+ "count": 100
1597
  },
1598
  "SC": {
1599
  "accuracy": 1.0,
1600
+ "count": 50
1601
  },
1602
  "UC": {
1603
+ "accuracy": 0.8787878787878788,
1604
+ "count": 132
1605
  },
1606
  "US": {
1607
+ "accuracy": 0.8676470588235294,
1608
+ "count": 68
1609
  }
1610
  }
1611
  },
1612
  "add_C5": {
1613
+ "full_accuracy": 0.54,
1614
+ "digit_accuracy": 0.8742857142857143,
1615
+ "n_examples": 50,
1616
  "per_subtask": {
1617
  "SA": {
1618
  "accuracy": 1.0,
1619
+ "count": 50
1620
  },
1621
  "SC": {
1622
  "accuracy": 1.0,
1623
+ "count": 50
1624
  },
1625
  "UC": {
1626
+ "accuracy": 0.8493150684931506,
1627
+ "count": 146
1628
  },
1629
  "US": {
1630
+ "accuracy": 0.7884615384615384,
1631
+ "count": 104
1632
  }
1633
  }
1634
  },
1635
  "add_C6": {
1636
+ "full_accuracy": 0.5,
1637
+ "digit_accuracy": 0.8771428571428571,
1638
+ "n_examples": 50,
1639
  "per_subtask": {
1640
  "SC": {
1641
  "accuracy": 1.0,
1642
+ "count": 50
1643
  },
1644
  "UC": {
1645
+ "accuracy": 0.8571428571428571,
1646
+ "count": 189
1647
  },
1648
  "US": {
1649
+ "accuracy": 0.8558558558558559,
1650
+ "count": 111
1651
  }
1652
  }
1653
  },
1654
  "sub_M0": {
1655
+ "full_accuracy": 0.9,
1656
+ "digit_accuracy": 0.9857142857142858,
1657
+ "n_examples": 50,
1658
  "per_subtask": {
1659
  "MD": {
1660
+ "accuracy": 0.9834983498349835,
1661
+ "count": 303
1662
  },
1663
  "ME": {
1664
  "accuracy": 1.0,
1665
+ "count": 47
1666
  }
1667
  }
1668
  },
1669
  "sub_M1": {
1670
+ "full_accuracy": 0.86,
1671
+ "digit_accuracy": 0.98,
1672
+ "n_examples": 50,
1673
  "per_subtask": {
1674
  "MD": {
1675
+ "accuracy": 0.9858156028368794,
1676
+ "count": 141
1677
  },
1678
  "MB": {
1679
+ "accuracy": 0.9861111111111112,
1680
+ "count": 72
1681
  },
1682
  "ME": {
1683
  "accuracy": 1.0,
1684
+ "count": 18
1685
  },
1686
  "UB": {
1687
+ "accuracy": 0.9663865546218487,
1688
+ "count": 119
1689
  }
1690
  }
1691
  },
1692
  "sub_M2": {
1693
+ "full_accuracy": 0.56,
1694
+ "digit_accuracy": 0.9257142857142857,
1695
+ "n_examples": 50,
1696
  "per_subtask": {
1697
  "MD": {
1698
+ "accuracy": 0.9732142857142857,
1699
+ "count": 112
1700
  },
1701
  "MB": {
1702
+ "accuracy": 0.9622641509433962,
1703
+ "count": 53
1704
  },
1705
  "ME": {
1706
  "accuracy": 1.0,
1707
+ "count": 47
1708
  },
1709
  "UB": {
1710
+ "accuracy": 0.7647058823529411,
1711
+ "count": 85
1712
  },
1713
  "UD": {
1714
+ "accuracy": 0.9811320754716981,
1715
+ "count": 53
1716
  }
1717
  }
1718
  },
1719
  "sub_M3": {
1720
  "full_accuracy": 0.22,
1721
+ "digit_accuracy": 0.8457142857142858,
1722
+ "n_examples": 50,
1723
  "per_subtask": {
1724
  "MD": {
1725
+ "accuracy": 0.979381443298969,
1726
+ "count": 97
1727
  },
1728
  "MB": {
1729
+ "accuracy": 0.9803921568627451,
1730
+ "count": 51
1731
  },
1732
  "ME": {
1733
  "accuracy": 1.0,
1734
+ "count": 27
1735
  },
1736
  "UB": {
1737
+ "accuracy": 0.5945945945945946,
1738
+ "count": 74
1739
  },
1740
  "UD": {
1741
+ "accuracy": 0.7920792079207921,
1742
+ "count": 101
1743
  }
1744
  }
1745
  },
1746
  "sub_M4": {
1747
+ "full_accuracy": 0.04,
1748
+ "digit_accuracy": 0.6771428571428572,
1749
+ "n_examples": 50,
1750
  "per_subtask": {
1751
  "MD": {
1752
  "accuracy": 1.0,
1753
+ "count": 100
1754
  },
1755
  "MB": {
1756
  "accuracy": 1.0,
1757
+ "count": 50
1758
  },
1759
  "UB": {
1760
+ "accuracy": 0.4,
1761
+ "count": 50
1762
  },
1763
  "UD": {
1764
+ "accuracy": 0.44666666666666666,
1765
+ "count": 150
1766
  }
1767
  }
1768
  },
1769
  "sub_M5": {
1770
+ "full_accuracy": 0.1,
1771
+ "digit_accuracy": 0.5771428571428572,
1772
+ "n_examples": 50,
1773
  "per_subtask": {
1774
  "MD": {
1775
  "accuracy": 1.0,
1776
+ "count": 50
1777
  },
1778
  "MB": {
1779
  "accuracy": 1.0,
1780
+ "count": 50
1781
  },
1782
  "UB": {
1783
+ "accuracy": 0.58,
1784
+ "count": 50
1785
  },
1786
  "UD": {
1787
+ "accuracy": 0.365,
1788
+ "count": 200
1789
  }
1790
  }
1791
  },
1792
  "sub_random": {
1793
+ "full_accuracy": 0.87,
1794
+ "digit_accuracy": 0.98,
1795
  "n_examples": 200,
1796
  "per_subtask": {
1797
  "MD": {
1798
+ "accuracy": 0.9982456140350877,
1799
+ "count": 570
1800
  },
1801
  "MB": {
1802
+ "accuracy": 0.9819494584837545,
1803
+ "count": 277
1804
  },
1805
  "ME": {
1806
  "accuracy": 1.0,
1807
  "count": 53
1808
  },
1809
  "UB": {
1810
+ "accuracy": 0.9554140127388535,
1811
+ "count": 471
1812
  },
1813
  "UD": {
1814
+ "accuracy": 0.9655172413793104,
1815
+ "count": 29
1816
  }
1817
  }
1818
  },
1819
  "sub_B3": {
1820
+ "full_accuracy": 0.52,
1821
+ "digit_accuracy": 0.9142857142857143,
1822
+ "n_examples": 50,
1823
  "per_subtask": {
1824
  "MD": {
1825
  "accuracy": 1.0,
1826
+ "count": 150
1827
  },
1828
  "MB": {
1829
  "accuracy": 1.0,
1830
+ "count": 50
1831
  },
1832
  "UB": {
1833
+ "accuracy": 0.801980198019802,
1834
+ "count": 101
1835
  },
1836
  "UD": {
1837
+ "accuracy": 0.7959183673469388,
1838
+ "count": 49
1839
  }
1840
  }
1841
  },
1842
  "sub_B4": {
1843
+ "full_accuracy": 0.34,
1844
+ "digit_accuracy": 0.8485714285714285,
1845
+ "n_examples": 50,
1846
  "per_subtask": {
1847
  "MD": {
1848
  "accuracy": 1.0,
1849
+ "count": 100
1850
  },
1851
  "MB": {
1852
  "accuracy": 1.0,
1853
+ "count": 50
1854
  },
1855
  "UB": {
1856
+ "accuracy": 0.7603305785123967,
1857
+ "count": 121
1858
  },
1859
  "UD": {
1860
+ "accuracy": 0.6962025316455697,
1861
+ "count": 79
1862
  }
1863
  }
1864
  },
1865
  "sub_B5": {
1866
+ "full_accuracy": 0.22,
1867
+ "digit_accuracy": 0.8114285714285714,
1868
+ "n_examples": 50,
1869
  "per_subtask": {
1870
  "MD": {
1871
  "accuracy": 1.0,
1872
+ "count": 50
1873
  },
1874
  "MB": {
1875
  "accuracy": 1.0,
1876
+ "count": 50
1877
  },
1878
  "UB": {
1879
+ "accuracy": 0.75,
1880
+ "count": 152
1881
  },
1882
  "UD": {
1883
+ "accuracy": 0.7142857142857143,
1884
+ "count": 98
1885
  }
1886
  }
1887
  }
1888
  },
1889
  "summary": {
1890
+ "overall_accuracy": 0.6226666666666667,
1891
+ "digit_accuracy": 0.8917142857142857,
1892
+ "total_examples": 1500,
1893
+ "n_splits": 24
1894
  }
1895
  }
1896
  }