amirali1985 commited on
Commit
245a004
·
verified ·
1 Parent(s): 92c7eac

Upload add_sub_sorl_v6_abs30_K1_25K_1L3H510d/metrics.json with huggingface_hub

Browse files
add_sub_sorl_v6_abs30_K1_25K_1L3H510d/metrics.json CHANGED
@@ -1712,525 +1712,567 @@
1712
  "K": null,
1713
  "mode": "sft",
1714
  "n_digits": 6,
1715
- "n_per_split": 100
1716
  },
1717
  "splits": {
1718
  "add_S0": {
1719
  "full_accuracy": 0.0,
1720
- "digit_accuracy": 0.08714285714285715,
1721
- "n_examples": 100,
1722
  "per_subtask": {
1723
  "SA": {
1724
- "accuracy": 0.09256198347107437,
1725
- "count": 605
1726
  },
1727
  "SS": {
1728
- "accuracy": 0.05263157894736842,
1729
- "count": 95
1730
  }
1731
  }
1732
  },
1733
  "add_S1": {
1734
  "full_accuracy": 0.0,
1735
- "digit_accuracy": 0.08571428571428572,
1736
- "n_examples": 100,
1737
  "per_subtask": {
1738
  "SA": {
1739
- "accuracy": 0.07352941176470588,
1740
- "count": 204
1741
  },
1742
  "SC": {
1743
- "accuracy": 0.07692307692307693,
1744
- "count": 169
1745
  },
1746
  "SS": {
1747
- "accuracy": 0.03225806451612903,
1748
- "count": 31
1749
  },
1750
  "UC": {
1751
- "accuracy": 0.10472972972972973,
1752
- "count": 296
1753
  }
1754
  }
1755
  },
1756
  "add_S2": {
1757
  "full_accuracy": 0.0,
1758
- "digit_accuracy": 0.09571428571428571,
1759
- "n_examples": 100,
1760
  "per_subtask": {
1761
  "SA": {
1762
- "accuracy": 0.10429447852760736,
1763
- "count": 163
1764
  },
1765
  "SC": {
1766
- "accuracy": 0.1076923076923077,
1767
- "count": 130
1768
  },
1769
  "SS": {
1770
- "accuracy": 0.06896551724137931,
1771
- "count": 87
1772
  },
1773
  "UC": {
1774
- "accuracy": 0.09852216748768473,
1775
- "count": 203
1776
  },
1777
  "US": {
1778
- "accuracy": 0.08547008547008547,
1779
- "count": 117
1780
  }
1781
  }
1782
  },
1783
  "add_S3": {
1784
  "full_accuracy": 0.0,
1785
- "digit_accuracy": 0.09428571428571429,
1786
- "n_examples": 100,
1787
  "per_subtask": {
1788
  "SA": {
1789
- "accuracy": 0.10743801652892562,
1790
- "count": 121
1791
  },
1792
  "SC": {
1793
- "accuracy": 0.11570247933884298,
1794
- "count": 121
1795
  },
1796
  "SS": {
1797
- "accuracy": 0.061224489795918366,
1798
- "count": 49
1799
  },
1800
  "UC": {
1801
- "accuracy": 0.11290322580645161,
1802
- "count": 186
1803
  },
1804
  "US": {
1805
- "accuracy": 0.06726457399103139,
1806
- "count": 223
1807
  }
1808
  }
1809
  },
1810
  "add_S4": {
1811
  "full_accuracy": 0.0,
1812
- "digit_accuracy": 0.07428571428571429,
1813
- "n_examples": 100,
1814
  "per_subtask": {
1815
  "SA": {
1816
- "accuracy": 0.0673076923076923,
1817
- "count": 104
1818
  },
1819
  "SC": {
1820
- "accuracy": 0.11320754716981132,
1821
- "count": 106
1822
  },
1823
  "SS": {
1824
- "accuracy": 0.043478260869565216,
1825
- "count": 23
1826
  },
1827
  "UC": {
1828
- "accuracy": 0.09375,
1829
- "count": 160
1830
  },
1831
  "US": {
1832
- "accuracy": 0.05537459283387622,
1833
- "count": 307
1834
  }
1835
  }
1836
  },
1837
  "add_S5": {
1838
  "full_accuracy": 0.0,
1839
- "digit_accuracy": 0.08,
1840
- "n_examples": 100,
1841
  "per_subtask": {
1842
  "SA": {
1843
- "accuracy": 0.05,
1844
- "count": 100
1845
  },
1846
  "SC": {
1847
- "accuracy": 0.14,
1848
- "count": 100
1849
  },
1850
  "UC": {
1851
- "accuracy": 0.11,
1852
- "count": 100
1853
  },
1854
  "US": {
1855
- "accuracy": 0.065,
1856
- "count": 400
1857
  }
1858
  }
1859
  },
1860
  "add_S6": {
1861
  "full_accuracy": 0.0,
1862
- "digit_accuracy": 0.07571428571428572,
1863
- "n_examples": 100,
1864
  "per_subtask": {
1865
  "SC": {
1866
- "accuracy": 0.03,
1867
- "count": 100
1868
  },
1869
  "UC": {
1870
- "accuracy": 0.15,
1871
- "count": 100
1872
  },
1873
  "US": {
1874
- "accuracy": 0.07,
1875
- "count": 500
1876
  }
1877
  }
1878
  },
1879
  "add_random": {
1880
  "full_accuracy": 0.0,
1881
- "digit_accuracy": 0.1,
1882
  "n_examples": 200,
1883
  "per_subtask": {
1884
  "SA": {
1885
- "accuracy": 0.09619686800894854,
1886
- "count": 447
1887
  },
1888
  "SC": {
1889
- "accuracy": 0.1,
1890
- "count": 320
1891
  },
1892
  "SS": {
1893
- "accuracy": 0.05357142857142857,
1894
- "count": 56
1895
  },
1896
  "UC": {
1897
- "accuracy": 0.11342155009451796,
1898
- "count": 529
1899
  },
1900
  "US": {
1901
- "accuracy": 0.041666666666666664,
1902
- "count": 48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1903
  }
1904
  }
1905
  },
1906
  "add_C3": {
1907
  "full_accuracy": 0.0,
1908
- "digit_accuracy": 0.09428571428571429,
1909
- "n_examples": 100,
1910
  "per_subtask": {
1911
  "SA": {
1912
- "accuracy": 0.10666666666666667,
1913
- "count": 300
1914
  },
1915
  "SC": {
1916
- "accuracy": 0.15,
1917
- "count": 100
1918
  },
1919
  "UC": {
1920
- "accuracy": 0.06735751295336788,
1921
- "count": 193
1922
  },
1923
  "US": {
1924
- "accuracy": 0.056074766355140186,
1925
- "count": 107
1926
  }
1927
  }
1928
  },
1929
  "add_C4": {
1930
  "full_accuracy": 0.0,
1931
- "digit_accuracy": 0.09428571428571429,
1932
- "n_examples": 100,
1933
  "per_subtask": {
1934
  "SA": {
1935
- "accuracy": 0.05,
1936
- "count": 200
1937
  },
1938
  "SC": {
1939
- "accuracy": 0.14,
1940
- "count": 100
1941
  },
1942
  "UC": {
1943
- "accuracy": 0.1171875,
1944
- "count": 256
1945
  },
1946
  "US": {
1947
- "accuracy": 0.08333333333333333,
1948
- "count": 144
1949
  }
1950
  }
1951
  },
1952
  "add_C5": {
1953
  "full_accuracy": 0.0,
1954
- "digit_accuracy": 0.06,
1955
- "n_examples": 100,
1956
  "per_subtask": {
1957
  "SA": {
1958
- "accuracy": 0.05,
1959
- "count": 100
1960
  },
1961
  "SC": {
1962
- "accuracy": 0.05,
1963
- "count": 100
1964
  },
1965
  "UC": {
1966
- "accuracy": 0.06862745098039216,
1967
- "count": 306
1968
  },
1969
  "US": {
1970
- "accuracy": 0.05670103092783505,
1971
- "count": 194
1972
  }
1973
  }
1974
  },
1975
  "add_C6": {
1976
  "full_accuracy": 0.0,
1977
- "digit_accuracy": 0.09714285714285714,
1978
- "n_examples": 100,
1979
  "per_subtask": {
1980
  "SC": {
1981
- "accuracy": 0.15,
1982
- "count": 100
1983
  },
1984
  "UC": {
1985
- "accuracy": 0.1092896174863388,
1986
- "count": 366
1987
  },
1988
  "US": {
1989
- "accuracy": 0.05555555555555555,
1990
- "count": 234
1991
  }
1992
  }
1993
  },
1994
  "sub_M0": {
1995
  "full_accuracy": 0.0,
1996
- "digit_accuracy": 0.06857142857142857,
1997
- "n_examples": 100,
1998
  "per_subtask": {
1999
  "MD": {
2000
- "accuracy": 0.07154742096505824,
2001
- "count": 601
2002
  },
2003
  "ME": {
2004
- "accuracy": 0.050505050505050504,
2005
- "count": 99
2006
  }
2007
  }
2008
  },
2009
  "sub_M1": {
2010
  "full_accuracy": 0.0,
2011
- "digit_accuracy": 0.10714285714285714,
2012
- "n_examples": 100,
2013
  "per_subtask": {
2014
  "MD": {
2015
- "accuracy": 0.08960573476702509,
2016
- "count": 279
2017
  },
2018
  "MB": {
2019
- "accuracy": 0.10344827586206896,
2020
- "count": 145
2021
  },
2022
  "ME": {
2023
- "accuracy": 0.08333333333333333,
2024
- "count": 24
2025
  },
2026
  "UB": {
2027
- "accuracy": 0.13095238095238096,
2028
- "count": 252
2029
  }
2030
  }
2031
  },
2032
  "sub_M2": {
2033
  "full_accuracy": 0.0,
2034
- "digit_accuracy": 0.08571428571428572,
2035
- "n_examples": 100,
2036
  "per_subtask": {
2037
  "MD": {
2038
- "accuracy": 0.06103286384976526,
2039
- "count": 213
2040
  },
2041
  "MB": {
2042
- "accuracy": 0.11504424778761062,
2043
- "count": 113
2044
  },
2045
  "ME": {
2046
- "accuracy": 0.09411764705882353,
2047
- "count": 85
2048
  },
2049
  "UB": {
2050
- "accuracy": 0.12154696132596685,
2051
- "count": 181
2052
  },
2053
  "UD": {
2054
- "accuracy": 0.037037037037037035,
2055
- "count": 108
2056
  }
2057
  }
2058
  },
2059
  "sub_M3": {
2060
  "full_accuracy": 0.0,
2061
- "digit_accuracy": 0.09857142857142857,
2062
- "n_examples": 100,
2063
  "per_subtask": {
2064
  "MD": {
2065
- "accuracy": 0.05027932960893855,
2066
- "count": 179
2067
  },
2068
  "MB": {
2069
- "accuracy": 0.1262135922330097,
2070
- "count": 103
2071
  },
2072
  "ME": {
2073
- "accuracy": 0.10714285714285714,
2074
- "count": 56
2075
  },
2076
  "UB": {
2077
- "accuracy": 0.14093959731543623,
2078
- "count": 149
2079
  },
2080
  "UD": {
2081
- "accuracy": 0.09389671361502347,
2082
- "count": 213
2083
  }
2084
  }
2085
  },
2086
  "sub_M4": {
2087
  "full_accuracy": 0.0,
2088
- "digit_accuracy": 0.06714285714285714,
2089
- "n_examples": 100,
2090
  "per_subtask": {
2091
  "MD": {
2092
- "accuracy": 0.05,
2093
- "count": 200
2094
  },
2095
  "MB": {
2096
- "accuracy": 0.12,
2097
- "count": 100
2098
  },
2099
  "UB": {
2100
  "accuracy": 0.08,
2101
- "count": 100
2102
  },
2103
  "UD": {
2104
- "accuracy": 0.056666666666666664,
2105
- "count": 300
2106
  }
2107
  }
2108
  },
2109
  "sub_M5": {
2110
  "full_accuracy": 0.0,
2111
- "digit_accuracy": 0.08857142857142856,
2112
- "n_examples": 100,
2113
  "per_subtask": {
2114
  "MD": {
2115
- "accuracy": 0.01,
2116
- "count": 100
2117
  },
2118
  "MB": {
2119
- "accuracy": 0.1,
2120
- "count": 100
2121
  },
2122
  "UB": {
2123
- "accuracy": 0.13,
2124
- "count": 100
2125
  },
2126
  "UD": {
2127
- "accuracy": 0.095,
2128
- "count": 400
2129
  }
2130
  }
2131
  },
2132
  "sub_random": {
2133
  "full_accuracy": 0.0,
2134
- "digit_accuracy": 0.085,
2135
  "n_examples": 200,
2136
  "per_subtask": {
2137
  "MD": {
2138
- "accuracy": 0.056666666666666664,
2139
- "count": 600
2140
  },
2141
  "MB": {
2142
- "accuracy": 0.1198501872659176,
2143
- "count": 267
2144
  },
2145
  "ME": {
2146
- "accuracy": 0.07547169811320754,
2147
  "count": 53
2148
  },
2149
  "UB": {
2150
- "accuracy": 0.1070615034168565,
2151
- "count": 439
2152
  },
2153
  "UD": {
2154
- "accuracy": 0.04878048780487805,
2155
- "count": 41
2156
  }
2157
  }
2158
  },
2159
  "sub_B3": {
2160
  "full_accuracy": 0.0,
2161
- "digit_accuracy": 0.08857142857142856,
2162
- "n_examples": 100,
2163
  "per_subtask": {
2164
  "MD": {
2165
- "accuracy": 0.07333333333333333,
2166
- "count": 300
2167
  },
2168
  "MB": {
2169
- "accuracy": 0.08,
2170
- "count": 100
2171
  },
2172
  "UB": {
2173
- "accuracy": 0.14213197969543148,
2174
- "count": 197
2175
  },
2176
  "UD": {
2177
- "accuracy": 0.038834951456310676,
2178
- "count": 103
2179
  }
2180
  }
2181
  },
2182
  "sub_B4": {
2183
  "full_accuracy": 0.0,
2184
- "digit_accuracy": 0.08857142857142856,
2185
- "n_examples": 100,
2186
  "per_subtask": {
2187
  "MD": {
2188
- "accuracy": 0.095,
2189
- "count": 200
2190
  },
2191
  "MB": {
2192
- "accuracy": 0.09,
2193
- "count": 100
2194
  },
2195
  "UB": {
2196
- "accuracy": 0.10931174089068826,
2197
- "count": 247
2198
  },
2199
  "UD": {
2200
- "accuracy": 0.0457516339869281,
2201
- "count": 153
2202
  }
2203
  }
2204
  },
2205
  "sub_B5": {
2206
  "full_accuracy": 0.0,
2207
- "digit_accuracy": 0.08571428571428572,
2208
- "n_examples": 100,
2209
  "per_subtask": {
2210
  "MD": {
2211
- "accuracy": 0.01,
2212
- "count": 100
2213
  },
2214
  "MB": {
2215
- "accuracy": 0.11,
2216
- "count": 100
2217
  },
2218
  "UB": {
2219
- "accuracy": 0.12416107382550336,
2220
- "count": 298
2221
  },
2222
  "UD": {
2223
- "accuracy": 0.054455445544554455,
2224
- "count": 202
2225
  }
2226
  }
2227
  }
2228
  },
2229
  "summary": {
2230
  "overall_accuracy": 0.0,
2231
- "digit_accuracy": 0.08678571428571429,
2232
- "total_examples": 2400,
2233
- "n_splits": 22
2234
  }
2235
  },
2236
  "sorl_eval": {
@@ -2239,525 +2281,567 @@
2239
  "K": 1,
2240
  "mode": "sorl",
2241
  "n_digits": 6,
2242
- "n_per_split": 100
2243
  },
2244
  "splits": {
2245
  "add_S0": {
2246
- "full_accuracy": 0.96,
2247
- "digit_accuracy": 0.9942857142857143,
2248
- "n_examples": 100,
2249
  "per_subtask": {
2250
  "SA": {
2251
- "accuracy": 0.9933884297520661,
2252
- "count": 605
2253
  },
2254
  "SS": {
2255
  "accuracy": 1.0,
2256
- "count": 95
2257
  }
2258
  }
2259
  },
2260
  "add_S1": {
2261
- "full_accuracy": 0.96,
2262
- "digit_accuracy": 0.9942857142857143,
2263
- "n_examples": 100,
2264
  "per_subtask": {
2265
  "SA": {
2266
- "accuracy": 0.9950980392156863,
2267
- "count": 204
2268
  },
2269
  "SC": {
2270
- "accuracy": 0.9940828402366864,
2271
- "count": 169
2272
  },
2273
  "SS": {
2274
  "accuracy": 1.0,
2275
- "count": 31
2276
  },
2277
  "UC": {
2278
- "accuracy": 0.9932432432432432,
2279
- "count": 296
2280
  }
2281
  }
2282
  },
2283
  "add_S2": {
2284
- "full_accuracy": 0.87,
2285
- "digit_accuracy": 0.9771428571428571,
2286
- "n_examples": 100,
2287
  "per_subtask": {
2288
  "SA": {
2289
- "accuracy": 0.9877300613496932,
2290
- "count": 163
2291
  },
2292
  "SC": {
2293
- "accuracy": 0.9923076923076923,
2294
- "count": 130
2295
  },
2296
  "SS": {
2297
- "accuracy": 0.9655172413793104,
2298
- "count": 87
2299
  },
2300
  "UC": {
2301
- "accuracy": 0.9507389162561576,
2302
- "count": 203
2303
  },
2304
  "US": {
2305
- "accuracy": 1.0,
2306
- "count": 117
2307
  }
2308
  }
2309
  },
2310
  "add_S3": {
2311
- "full_accuracy": 0.67,
2312
- "digit_accuracy": 0.9342857142857143,
2313
- "n_examples": 100,
2314
  "per_subtask": {
2315
  "SA": {
2316
  "accuracy": 1.0,
2317
- "count": 121
2318
  },
2319
  "SC": {
2320
  "accuracy": 1.0,
2321
- "count": 121
2322
  },
2323
  "SS": {
2324
  "accuracy": 1.0,
2325
- "count": 49
2326
  },
2327
  "UC": {
2328
- "accuracy": 0.8225806451612904,
2329
- "count": 186
2330
  },
2331
  "US": {
2332
- "accuracy": 0.9417040358744395,
2333
- "count": 223
2334
  }
2335
  }
2336
  },
2337
  "add_S4": {
2338
- "full_accuracy": 0.48,
2339
- "digit_accuracy": 0.8728571428571429,
2340
- "n_examples": 100,
2341
  "per_subtask": {
2342
  "SA": {
2343
  "accuracy": 1.0,
2344
- "count": 104
2345
  },
2346
  "SC": {
2347
  "accuracy": 1.0,
2348
- "count": 106
2349
  },
2350
  "SS": {
2351
  "accuracy": 1.0,
2352
- "count": 23
2353
  },
2354
  "UC": {
2355
- "accuracy": 0.71875,
2356
- "count": 160
2357
  },
2358
  "US": {
2359
- "accuracy": 0.8566775244299675,
2360
- "count": 307
2361
  }
2362
  }
2363
  },
2364
  "add_S5": {
2365
- "full_accuracy": 0.26,
2366
- "digit_accuracy": 0.6985714285714286,
2367
- "n_examples": 100,
2368
  "per_subtask": {
2369
  "SA": {
2370
  "accuracy": 1.0,
2371
- "count": 100
2372
  },
2373
  "SC": {
2374
  "accuracy": 1.0,
2375
- "count": 100
2376
  },
2377
  "UC": {
2378
- "accuracy": 0.47,
2379
- "count": 100
2380
  },
2381
  "US": {
2382
- "accuracy": 0.605,
2383
- "count": 400
2384
  }
2385
  }
2386
  },
2387
  "add_S6": {
2388
- "full_accuracy": 0.46,
2389
- "digit_accuracy": 0.71,
2390
- "n_examples": 100,
2391
  "per_subtask": {
2392
  "SC": {
2393
  "accuracy": 1.0,
2394
- "count": 100
2395
  },
2396
  "UC": {
2397
- "accuracy": 0.5,
2398
- "count": 100
2399
  },
2400
  "US": {
2401
- "accuracy": 0.694,
2402
- "count": 500
2403
  }
2404
  }
2405
  },
2406
  "add_random": {
2407
- "full_accuracy": 0.915,
2408
- "digit_accuracy": 0.9871428571428571,
2409
  "n_examples": 200,
2410
  "per_subtask": {
2411
  "SA": {
2412
- "accuracy": 0.9910514541387024,
2413
- "count": 447
2414
  },
2415
  "SC": {
2416
- "accuracy": 0.996875,
2417
- "count": 320
2418
  },
2419
  "SS": {
2420
  "accuracy": 1.0,
2421
- "count": 56
2422
  },
2423
  "UC": {
2424
- "accuracy": 0.9773156899810964,
2425
- "count": 529
2426
  },
2427
  "US": {
2428
- "accuracy": 0.9791666666666666,
2429
- "count": 48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2430
  }
2431
  }
2432
  },
2433
  "add_C3": {
2434
- "full_accuracy": 0.64,
2435
- "digit_accuracy": 0.9357142857142857,
2436
- "n_examples": 100,
2437
  "per_subtask": {
2438
  "SA": {
2439
- "accuracy": 0.9966666666666667,
2440
- "count": 300
2441
  },
2442
  "SC": {
2443
  "accuracy": 1.0,
2444
- "count": 100
2445
  },
2446
  "UC": {
2447
- "accuracy": 0.844559585492228,
2448
- "count": 193
2449
  },
2450
  "US": {
2451
- "accuracy": 0.8691588785046729,
2452
- "count": 107
2453
  }
2454
  }
2455
  },
2456
  "add_C4": {
2457
- "full_accuracy": 0.64,
2458
- "digit_accuracy": 0.9257142857142857,
2459
- "n_examples": 100,
2460
  "per_subtask": {
2461
  "SA": {
2462
  "accuracy": 1.0,
2463
- "count": 200
2464
  },
2465
  "SC": {
2466
  "accuracy": 1.0,
2467
- "count": 100
2468
  },
2469
  "UC": {
2470
- "accuracy": 0.875,
2471
- "count": 256
2472
  },
2473
  "US": {
2474
- "accuracy": 0.8611111111111112,
2475
- "count": 144
2476
  }
2477
  }
2478
  },
2479
  "add_C5": {
2480
- "full_accuracy": 0.62,
2481
- "digit_accuracy": 0.9114285714285715,
2482
- "n_examples": 100,
2483
  "per_subtask": {
2484
  "SA": {
2485
  "accuracy": 1.0,
2486
- "count": 100
2487
  },
2488
  "SC": {
2489
  "accuracy": 1.0,
2490
- "count": 100
2491
  },
2492
  "UC": {
2493
- "accuracy": 0.8660130718954249,
2494
- "count": 306
2495
  },
2496
  "US": {
2497
- "accuracy": 0.8917525773195877,
2498
- "count": 194
2499
  }
2500
  }
2501
  },
2502
  "add_C6": {
2503
- "full_accuracy": 0.7,
2504
- "digit_accuracy": 0.9428571428571428,
2505
- "n_examples": 100,
2506
  "per_subtask": {
2507
  "SC": {
2508
  "accuracy": 1.0,
2509
- "count": 100
2510
  },
2511
  "UC": {
2512
- "accuracy": 0.9180327868852459,
2513
- "count": 366
2514
  },
2515
  "US": {
2516
- "accuracy": 0.9572649572649573,
2517
- "count": 234
2518
  }
2519
  }
2520
  },
2521
  "sub_M0": {
2522
- "full_accuracy": 0.94,
2523
- "digit_accuracy": 0.9914285714285714,
2524
- "n_examples": 100,
2525
  "per_subtask": {
2526
  "MD": {
2527
- "accuracy": 0.9900166389351082,
2528
- "count": 601
2529
  },
2530
  "ME": {
2531
  "accuracy": 1.0,
2532
- "count": 99
2533
  }
2534
  }
2535
  },
2536
  "sub_M1": {
2537
- "full_accuracy": 0.92,
2538
- "digit_accuracy": 0.9885714285714285,
2539
- "n_examples": 100,
2540
  "per_subtask": {
2541
  "MD": {
2542
- "accuracy": 0.982078853046595,
2543
- "count": 279
2544
  },
2545
  "MB": {
2546
- "accuracy": 0.993103448275862,
2547
- "count": 145
2548
  },
2549
  "ME": {
2550
  "accuracy": 1.0,
2551
- "count": 24
2552
  },
2553
  "UB": {
2554
- "accuracy": 0.9920634920634921,
2555
- "count": 252
2556
  }
2557
  }
2558
  },
2559
  "sub_M2": {
2560
- "full_accuracy": 0.67,
2561
  "digit_accuracy": 0.9514285714285714,
2562
- "n_examples": 100,
2563
  "per_subtask": {
2564
  "MD": {
2565
- "accuracy": 0.9906103286384976,
2566
- "count": 213
2567
  },
2568
  "MB": {
2569
- "accuracy": 0.9911504424778761,
2570
- "count": 113
2571
  },
2572
  "ME": {
2573
  "accuracy": 1.0,
2574
- "count": 85
2575
  },
2576
  "UB": {
2577
- "accuracy": 0.8287292817679558,
2578
- "count": 181
2579
  },
2580
  "UD": {
2581
- "accuracy": 1.0,
2582
- "count": 108
2583
  }
2584
  }
2585
  },
2586
  "sub_M3": {
2587
- "full_accuracy": 0.26,
2588
- "digit_accuracy": 0.8557142857142858,
2589
- "n_examples": 100,
2590
  "per_subtask": {
2591
  "MD": {
2592
  "accuracy": 1.0,
2593
- "count": 179
2594
  },
2595
  "MB": {
2596
- "accuracy": 0.9805825242718447,
2597
- "count": 103
2598
  },
2599
  "ME": {
2600
  "accuracy": 1.0,
2601
- "count": 56
2602
  },
2603
  "UB": {
2604
- "accuracy": 0.5704697986577181,
2605
- "count": 149
2606
  },
2607
  "UD": {
2608
- "accuracy": 0.8356807511737089,
2609
- "count": 213
2610
  }
2611
  }
2612
  },
2613
  "sub_M4": {
2614
- "full_accuracy": 0.06,
2615
- "digit_accuracy": 0.7242857142857143,
2616
- "n_examples": 100,
2617
  "per_subtask": {
2618
  "MD": {
2619
  "accuracy": 1.0,
2620
- "count": 200
2621
  },
2622
  "MB": {
2623
  "accuracy": 1.0,
2624
- "count": 100
2625
  },
2626
  "UB": {
2627
- "accuracy": 0.31,
2628
- "count": 100
2629
  },
2630
  "UD": {
2631
- "accuracy": 0.5866666666666667,
2632
- "count": 300
2633
  }
2634
  }
2635
  },
2636
  "sub_M5": {
2637
- "full_accuracy": 0.01,
2638
- "digit_accuracy": 0.5328571428571428,
2639
- "n_examples": 100,
2640
  "per_subtask": {
2641
  "MD": {
2642
  "accuracy": 1.0,
2643
- "count": 100
2644
  },
2645
  "MB": {
2646
  "accuracy": 1.0,
2647
- "count": 100
2648
  },
2649
  "UB": {
2650
- "accuracy": 0.2,
2651
- "count": 100
2652
  },
2653
  "UD": {
2654
- "accuracy": 0.3825,
2655
- "count": 400
2656
  }
2657
  }
2658
  },
2659
  "sub_random": {
2660
- "full_accuracy": 0.905,
2661
- "digit_accuracy": 0.9857142857142858,
2662
  "n_examples": 200,
2663
  "per_subtask": {
2664
  "MD": {
2665
- "accuracy": 0.9983333333333333,
2666
- "count": 600
2667
  },
2668
  "MB": {
2669
- "accuracy": 0.9925093632958801,
2670
- "count": 267
2671
  },
2672
  "ME": {
2673
  "accuracy": 1.0,
2674
  "count": 53
2675
  },
2676
  "UB": {
2677
- "accuracy": 0.9635535307517085,
2678
- "count": 439
2679
  },
2680
  "UD": {
2681
- "accuracy": 0.975609756097561,
2682
- "count": 41
2683
  }
2684
  }
2685
  },
2686
  "sub_B3": {
2687
- "full_accuracy": 0.7,
2688
- "digit_accuracy": 0.9457142857142857,
2689
- "n_examples": 100,
2690
  "per_subtask": {
2691
  "MD": {
2692
- "accuracy": 1.0,
2693
- "count": 300
2694
  },
2695
  "MB": {
2696
  "accuracy": 1.0,
2697
- "count": 100
2698
  },
2699
  "UB": {
2700
- "accuracy": 0.8629441624365483,
2701
- "count": 197
2702
  },
2703
  "UD": {
2704
- "accuracy": 0.8932038834951457,
2705
- "count": 103
2706
  }
2707
  }
2708
  },
2709
  "sub_B4": {
2710
- "full_accuracy": 0.54,
2711
- "digit_accuracy": 0.9057142857142857,
2712
- "n_examples": 100,
2713
  "per_subtask": {
2714
  "MD": {
2715
  "accuracy": 1.0,
2716
- "count": 200
2717
  },
2718
  "MB": {
2719
- "accuracy": 0.99,
2720
- "count": 100
2721
  },
2722
  "UB": {
2723
- "accuracy": 0.8380566801619433,
2724
- "count": 247
2725
  },
2726
  "UD": {
2727
- "accuracy": 0.8366013071895425,
2728
- "count": 153
2729
  }
2730
  }
2731
  },
2732
  "sub_B5": {
2733
- "full_accuracy": 0.43,
2734
- "digit_accuracy": 0.8685714285714285,
2735
- "n_examples": 100,
2736
  "per_subtask": {
2737
  "MD": {
2738
  "accuracy": 1.0,
2739
- "count": 100
2740
  },
2741
  "MB": {
2742
  "accuracy": 1.0,
2743
- "count": 100
2744
  },
2745
  "UB": {
2746
- "accuracy": 0.8120805369127517,
2747
- "count": 298
2748
  },
2749
  "UD": {
2750
- "accuracy": 0.8217821782178217,
2751
- "count": 202
2752
  }
2753
  }
2754
  }
2755
  },
2756
  "summary": {
2757
- "overall_accuracy": 0.6429166666666667,
2758
- "digit_accuracy": 0.9002380952380953,
2759
- "total_examples": 2400,
2760
- "n_splits": 22
2761
  }
2762
  },
2763
  "sorl_overall_accuracy": 0.6429166666666667,
 
1712
  "K": null,
1713
  "mode": "sft",
1714
  "n_digits": 6,
1715
+ "n_per_split": 50
1716
  },
1717
  "splits": {
1718
  "add_S0": {
1719
  "full_accuracy": 0.0,
1720
+ "digit_accuracy": 0.08,
1721
+ "n_examples": 50,
1722
  "per_subtask": {
1723
  "SA": {
1724
+ "accuracy": 0.0847457627118644,
1725
+ "count": 295
1726
  },
1727
  "SS": {
1728
+ "accuracy": 0.05454545454545454,
1729
+ "count": 55
1730
  }
1731
  }
1732
  },
1733
  "add_S1": {
1734
  "full_accuracy": 0.0,
1735
+ "digit_accuracy": 0.06571428571428571,
1736
+ "n_examples": 50,
1737
  "per_subtask": {
1738
  "SA": {
1739
+ "accuracy": 0.06349206349206349,
1740
+ "count": 126
1741
  },
1742
  "SC": {
1743
+ "accuracy": 0.06329113924050633,
1744
+ "count": 79
1745
  },
1746
  "SS": {
1747
+ "accuracy": 0.0,
1748
+ "count": 21
1749
  },
1750
  "UC": {
1751
+ "accuracy": 0.08064516129032258,
1752
+ "count": 124
1753
  }
1754
  }
1755
  },
1756
  "add_S2": {
1757
  "full_accuracy": 0.0,
1758
+ "digit_accuracy": 0.11142857142857143,
1759
+ "n_examples": 50,
1760
  "per_subtask": {
1761
  "SA": {
1762
+ "accuracy": 0.12,
1763
+ "count": 75
1764
  },
1765
  "SC": {
1766
+ "accuracy": 0.16129032258064516,
1767
+ "count": 62
1768
  },
1769
  "SS": {
1770
+ "accuracy": 0.07692307692307693,
1771
+ "count": 39
1772
  },
1773
  "UC": {
1774
+ "accuracy": 0.0990990990990991,
1775
+ "count": 111
1776
  },
1777
  "US": {
1778
+ "accuracy": 0.09523809523809523,
1779
+ "count": 63
1780
  }
1781
  }
1782
  },
1783
  "add_S3": {
1784
  "full_accuracy": 0.0,
1785
+ "digit_accuracy": 0.08857142857142856,
1786
+ "n_examples": 50,
1787
  "per_subtask": {
1788
  "SA": {
1789
+ "accuracy": 0.11666666666666667,
1790
+ "count": 60
1791
  },
1792
  "SC": {
1793
+ "accuracy": 0.10526315789473684,
1794
+ "count": 57
1795
  },
1796
  "SS": {
1797
+ "accuracy": 0.05263157894736842,
1798
+ "count": 19
1799
  },
1800
  "UC": {
1801
+ "accuracy": 0.10576923076923077,
1802
+ "count": 104
1803
  },
1804
  "US": {
1805
+ "accuracy": 0.05454545454545454,
1806
+ "count": 110
1807
  }
1808
  }
1809
  },
1810
  "add_S4": {
1811
  "full_accuracy": 0.0,
1812
+ "digit_accuracy": 0.08285714285714285,
1813
+ "n_examples": 50,
1814
  "per_subtask": {
1815
  "SA": {
1816
+ "accuracy": 0.125,
1817
+ "count": 48
1818
  },
1819
  "SC": {
1820
+ "accuracy": 0.057692307692307696,
1821
+ "count": 52
1822
  },
1823
  "SS": {
1824
+ "accuracy": 0.14285714285714285,
1825
+ "count": 7
1826
  },
1827
  "UC": {
1828
+ "accuracy": 0.11235955056179775,
1829
+ "count": 89
1830
  },
1831
  "US": {
1832
+ "accuracy": 0.05844155844155844,
1833
+ "count": 154
1834
  }
1835
  }
1836
  },
1837
  "add_S5": {
1838
  "full_accuracy": 0.0,
1839
+ "digit_accuracy": 0.04857142857142857,
1840
+ "n_examples": 50,
1841
  "per_subtask": {
1842
  "SA": {
1843
+ "accuracy": 0.1,
1844
+ "count": 50
1845
  },
1846
  "SC": {
1847
+ "accuracy": 0.04,
1848
+ "count": 50
1849
  },
1850
  "UC": {
1851
+ "accuracy": 0.12,
1852
+ "count": 50
1853
  },
1854
  "US": {
1855
+ "accuracy": 0.02,
1856
+ "count": 200
1857
  }
1858
  }
1859
  },
1860
  "add_S6": {
1861
  "full_accuracy": 0.0,
1862
+ "digit_accuracy": 0.09428571428571429,
1863
+ "n_examples": 50,
1864
  "per_subtask": {
1865
  "SC": {
1866
+ "accuracy": 0.14,
1867
+ "count": 50
1868
  },
1869
  "UC": {
1870
+ "accuracy": 0.18,
1871
+ "count": 50
1872
  },
1873
  "US": {
1874
+ "accuracy": 0.068,
1875
+ "count": 250
1876
  }
1877
  }
1878
  },
1879
  "add_random": {
1880
  "full_accuracy": 0.0,
1881
+ "digit_accuracy": 0.09714285714285714,
1882
  "n_examples": 200,
1883
  "per_subtask": {
1884
  "SA": {
1885
+ "accuracy": 0.10672853828306264,
1886
+ "count": 431
1887
  },
1888
  "SC": {
1889
+ "accuracy": 0.10126582278481013,
1890
+ "count": 316
1891
  },
1892
  "SS": {
1893
+ "accuracy": 0.05128205128205128,
1894
+ "count": 39
1895
  },
1896
  "UC": {
1897
+ "accuracy": 0.09821428571428571,
1898
+ "count": 560
1899
  },
1900
  "US": {
1901
+ "accuracy": 0.018518518518518517,
1902
+ "count": 54
1903
+ }
1904
+ }
1905
+ },
1906
+ "add_C1": {
1907
+ "full_accuracy": 0.0,
1908
+ "digit_accuracy": 0.11714285714285715,
1909
+ "n_examples": 50,
1910
+ "per_subtask": {
1911
+ "SA": {
1912
+ "accuracy": 0.12,
1913
+ "count": 250
1914
+ },
1915
+ "SC": {
1916
+ "accuracy": 0.1,
1917
+ "count": 50
1918
+ },
1919
+ "UC": {
1920
+ "accuracy": 0.12,
1921
+ "count": 50
1922
+ }
1923
+ }
1924
+ },
1925
+ "add_C2": {
1926
+ "full_accuracy": 0.0,
1927
+ "digit_accuracy": 0.09142857142857143,
1928
+ "n_examples": 50,
1929
+ "per_subtask": {
1930
+ "SA": {
1931
+ "accuracy": 0.13,
1932
+ "count": 200
1933
+ },
1934
+ "SC": {
1935
+ "accuracy": 0.08,
1936
+ "count": 50
1937
+ },
1938
+ "UC": {
1939
+ "accuracy": 0.024096385542168676,
1940
+ "count": 83
1941
+ },
1942
+ "US": {
1943
+ "accuracy": 0.0,
1944
+ "count": 17
1945
  }
1946
  }
1947
  },
1948
  "add_C3": {
1949
  "full_accuracy": 0.0,
1950
+ "digit_accuracy": 0.11142857142857143,
1951
+ "n_examples": 50,
1952
  "per_subtask": {
1953
  "SA": {
1954
+ "accuracy": 0.14666666666666667,
1955
+ "count": 150
1956
  },
1957
  "SC": {
1958
+ "accuracy": 0.08,
1959
+ "count": 50
1960
  },
1961
  "UC": {
1962
+ "accuracy": 0.11,
1963
+ "count": 100
1964
  },
1965
  "US": {
1966
+ "accuracy": 0.04,
1967
+ "count": 50
1968
  }
1969
  }
1970
  },
1971
  "add_C4": {
1972
  "full_accuracy": 0.0,
1973
+ "digit_accuracy": 0.06571428571428571,
1974
+ "n_examples": 50,
1975
  "per_subtask": {
1976
  "SA": {
1977
+ "accuracy": 0.09,
1978
+ "count": 100
1979
  },
1980
  "SC": {
1981
+ "accuracy": 0.06,
1982
+ "count": 50
1983
  },
1984
  "UC": {
1985
+ "accuracy": 0.045454545454545456,
1986
+ "count": 132
1987
  },
1988
  "US": {
1989
+ "accuracy": 0.07352941176470588,
1990
+ "count": 68
1991
  }
1992
  }
1993
  },
1994
  "add_C5": {
1995
  "full_accuracy": 0.0,
1996
+ "digit_accuracy": 0.07142857142857142,
1997
+ "n_examples": 50,
1998
  "per_subtask": {
1999
  "SA": {
2000
+ "accuracy": 0.1,
2001
+ "count": 50
2002
  },
2003
  "SC": {
2004
+ "accuracy": 0.08,
2005
+ "count": 50
2006
  },
2007
  "UC": {
2008
+ "accuracy": 0.0958904109589041,
2009
+ "count": 146
2010
  },
2011
  "US": {
2012
+ "accuracy": 0.019230769230769232,
2013
+ "count": 104
2014
  }
2015
  }
2016
  },
2017
  "add_C6": {
2018
  "full_accuracy": 0.0,
2019
+ "digit_accuracy": 0.08285714285714285,
2020
+ "n_examples": 50,
2021
  "per_subtask": {
2022
  "SC": {
2023
+ "accuracy": 0.06,
2024
+ "count": 50
2025
  },
2026
  "UC": {
2027
+ "accuracy": 0.10052910052910052,
2028
+ "count": 189
2029
  },
2030
  "US": {
2031
+ "accuracy": 0.06306306306306306,
2032
+ "count": 111
2033
  }
2034
  }
2035
  },
2036
  "sub_M0": {
2037
  "full_accuracy": 0.0,
2038
+ "digit_accuracy": 0.06285714285714286,
2039
+ "n_examples": 50,
2040
  "per_subtask": {
2041
  "MD": {
2042
+ "accuracy": 0.0594059405940594,
2043
+ "count": 303
2044
  },
2045
  "ME": {
2046
+ "accuracy": 0.0851063829787234,
2047
+ "count": 47
2048
  }
2049
  }
2050
  },
2051
  "sub_M1": {
2052
  "full_accuracy": 0.0,
2053
+ "digit_accuracy": 0.08285714285714285,
2054
+ "n_examples": 50,
2055
  "per_subtask": {
2056
  "MD": {
2057
+ "accuracy": 0.09219858156028368,
2058
+ "count": 141
2059
  },
2060
  "MB": {
2061
+ "accuracy": 0.1111111111111111,
2062
+ "count": 72
2063
  },
2064
  "ME": {
2065
+ "accuracy": 0.05555555555555555,
2066
+ "count": 18
2067
  },
2068
  "UB": {
2069
+ "accuracy": 0.058823529411764705,
2070
+ "count": 119
2071
  }
2072
  }
2073
  },
2074
  "sub_M2": {
2075
  "full_accuracy": 0.0,
2076
+ "digit_accuracy": 0.09428571428571429,
2077
+ "n_examples": 50,
2078
  "per_subtask": {
2079
  "MD": {
2080
+ "accuracy": 0.07142857142857142,
2081
+ "count": 112
2082
  },
2083
  "MB": {
2084
+ "accuracy": 0.16981132075471697,
2085
+ "count": 53
2086
  },
2087
  "ME": {
2088
+ "accuracy": 0.10638297872340426,
2089
+ "count": 47
2090
  },
2091
  "UB": {
2092
+ "accuracy": 0.09411764705882353,
2093
+ "count": 85
2094
  },
2095
  "UD": {
2096
+ "accuracy": 0.05660377358490566,
2097
+ "count": 53
2098
  }
2099
  }
2100
  },
2101
  "sub_M3": {
2102
  "full_accuracy": 0.0,
2103
+ "digit_accuracy": 0.07714285714285714,
2104
+ "n_examples": 50,
2105
  "per_subtask": {
2106
  "MD": {
2107
+ "accuracy": 0.061855670103092786,
2108
+ "count": 97
2109
  },
2110
  "MB": {
2111
+ "accuracy": 0.0392156862745098,
2112
+ "count": 51
2113
  },
2114
  "ME": {
2115
+ "accuracy": 0.07407407407407407,
2116
+ "count": 27
2117
  },
2118
  "UB": {
2119
+ "accuracy": 0.10810810810810811,
2120
+ "count": 74
2121
  },
2122
  "UD": {
2123
+ "accuracy": 0.0891089108910891,
2124
+ "count": 101
2125
  }
2126
  }
2127
  },
2128
  "sub_M4": {
2129
  "full_accuracy": 0.0,
2130
+ "digit_accuracy": 0.06857142857142857,
2131
+ "n_examples": 50,
2132
  "per_subtask": {
2133
  "MD": {
2134
+ "accuracy": 0.03,
2135
+ "count": 100
2136
  },
2137
  "MB": {
2138
+ "accuracy": 0.2,
2139
+ "count": 50
2140
  },
2141
  "UB": {
2142
  "accuracy": 0.08,
2143
+ "count": 50
2144
  },
2145
  "UD": {
2146
+ "accuracy": 0.04666666666666667,
2147
+ "count": 150
2148
  }
2149
  }
2150
  },
2151
  "sub_M5": {
2152
  "full_accuracy": 0.0,
2153
+ "digit_accuracy": 0.054285714285714284,
2154
+ "n_examples": 50,
2155
  "per_subtask": {
2156
  "MD": {
2157
+ "accuracy": 0.08,
2158
+ "count": 50
2159
  },
2160
  "MB": {
2161
+ "accuracy": 0.02,
2162
+ "count": 50
2163
  },
2164
  "UB": {
2165
+ "accuracy": 0.12,
2166
+ "count": 50
2167
  },
2168
  "UD": {
2169
+ "accuracy": 0.04,
2170
+ "count": 200
2171
  }
2172
  }
2173
  },
2174
  "sub_random": {
2175
  "full_accuracy": 0.0,
2176
+ "digit_accuracy": 0.09142857142857143,
2177
  "n_examples": 200,
2178
  "per_subtask": {
2179
  "MD": {
2180
+ "accuracy": 0.07543859649122807,
2181
+ "count": 570
2182
  },
2183
  "MB": {
2184
+ "accuracy": 0.11191335740072202,
2185
+ "count": 277
2186
  },
2187
  "ME": {
2188
+ "accuracy": 0.018867924528301886,
2189
  "count": 53
2190
  },
2191
  "UB": {
2192
+ "accuracy": 0.10615711252653928,
2193
+ "count": 471
2194
  },
2195
  "UD": {
2196
+ "accuracy": 0.10344827586206896,
2197
+ "count": 29
2198
  }
2199
  }
2200
  },
2201
  "sub_B3": {
2202
  "full_accuracy": 0.0,
2203
+ "digit_accuracy": 0.07714285714285714,
2204
+ "n_examples": 50,
2205
  "per_subtask": {
2206
  "MD": {
2207
+ "accuracy": 0.05333333333333334,
2208
+ "count": 150
2209
  },
2210
  "MB": {
2211
+ "accuracy": 0.12,
2212
+ "count": 50
2213
  },
2214
  "UB": {
2215
+ "accuracy": 0.09900990099009901,
2216
+ "count": 101
2217
  },
2218
  "UD": {
2219
+ "accuracy": 0.061224489795918366,
2220
+ "count": 49
2221
  }
2222
  }
2223
  },
2224
  "sub_B4": {
2225
  "full_accuracy": 0.0,
2226
+ "digit_accuracy": 0.08571428571428572,
2227
+ "n_examples": 50,
2228
  "per_subtask": {
2229
  "MD": {
2230
+ "accuracy": 0.07,
2231
+ "count": 100
2232
  },
2233
  "MB": {
2234
+ "accuracy": 0.04,
2235
+ "count": 50
2236
  },
2237
  "UB": {
2238
+ "accuracy": 0.12396694214876033,
2239
+ "count": 121
2240
  },
2241
  "UD": {
2242
+ "accuracy": 0.0759493670886076,
2243
+ "count": 79
2244
  }
2245
  }
2246
  },
2247
  "sub_B5": {
2248
  "full_accuracy": 0.0,
2249
+ "digit_accuracy": 0.1,
2250
+ "n_examples": 50,
2251
  "per_subtask": {
2252
  "MD": {
2253
+ "accuracy": 0.04,
2254
+ "count": 50
2255
  },
2256
  "MB": {
2257
+ "accuracy": 0.08,
2258
+ "count": 50
2259
  },
2260
  "UB": {
2261
+ "accuracy": 0.1513157894736842,
2262
+ "count": 152
2263
  },
2264
  "UD": {
2265
+ "accuracy": 0.061224489795918366,
2266
+ "count": 98
2267
  }
2268
  }
2269
  }
2270
  },
2271
  "summary": {
2272
  "overall_accuracy": 0.0,
2273
+ "digit_accuracy": 0.08504761904761905,
2274
+ "total_examples": 1500,
2275
+ "n_splits": 24
2276
  }
2277
  },
2278
  "sorl_eval": {
 
2281
  "K": 1,
2282
  "mode": "sorl",
2283
  "n_digits": 6,
2284
+ "n_per_split": 50
2285
  },
2286
  "splits": {
2287
  "add_S0": {
2288
+ "full_accuracy": 0.92,
2289
+ "digit_accuracy": 0.9885714285714285,
2290
+ "n_examples": 50,
2291
  "per_subtask": {
2292
  "SA": {
2293
+ "accuracy": 0.9864406779661017,
2294
+ "count": 295
2295
  },
2296
  "SS": {
2297
  "accuracy": 1.0,
2298
+ "count": 55
2299
  }
2300
  }
2301
  },
2302
  "add_S1": {
2303
+ "full_accuracy": 0.92,
2304
+ "digit_accuracy": 0.9885714285714285,
2305
+ "n_examples": 50,
2306
  "per_subtask": {
2307
  "SA": {
2308
+ "accuracy": 0.9920634920634921,
2309
+ "count": 126
2310
  },
2311
  "SC": {
2312
+ "accuracy": 1.0,
2313
+ "count": 79
2314
  },
2315
  "SS": {
2316
  "accuracy": 1.0,
2317
+ "count": 21
2318
  },
2319
  "UC": {
2320
+ "accuracy": 0.9758064516129032,
2321
+ "count": 124
2322
  }
2323
  }
2324
  },
2325
  "add_S2": {
2326
+ "full_accuracy": 0.76,
2327
+ "digit_accuracy": 0.96,
2328
+ "n_examples": 50,
2329
  "per_subtask": {
2330
  "SA": {
2331
+ "accuracy": 0.9866666666666667,
2332
+ "count": 75
2333
  },
2334
  "SC": {
2335
+ "accuracy": 0.9838709677419355,
2336
+ "count": 62
2337
  },
2338
  "SS": {
2339
+ "accuracy": 1.0,
2340
+ "count": 39
2341
  },
2342
  "UC": {
2343
+ "accuracy": 0.9009009009009009,
2344
+ "count": 111
2345
  },
2346
  "US": {
2347
+ "accuracy": 0.9841269841269841,
2348
+ "count": 63
2349
  }
2350
  }
2351
  },
2352
  "add_S3": {
2353
+ "full_accuracy": 0.7,
2354
+ "digit_accuracy": 0.9457142857142857,
2355
+ "n_examples": 50,
2356
  "per_subtask": {
2357
  "SA": {
2358
  "accuracy": 1.0,
2359
+ "count": 60
2360
  },
2361
  "SC": {
2362
  "accuracy": 1.0,
2363
+ "count": 57
2364
  },
2365
  "SS": {
2366
  "accuracy": 1.0,
2367
+ "count": 19
2368
  },
2369
  "UC": {
2370
+ "accuracy": 0.8557692307692307,
2371
+ "count": 104
2372
  },
2373
  "US": {
2374
+ "accuracy": 0.9636363636363636,
2375
+ "count": 110
2376
  }
2377
  }
2378
  },
2379
  "add_S4": {
2380
+ "full_accuracy": 0.32,
2381
+ "digit_accuracy": 0.82,
2382
+ "n_examples": 50,
2383
  "per_subtask": {
2384
  "SA": {
2385
  "accuracy": 1.0,
2386
+ "count": 48
2387
  },
2388
  "SC": {
2389
  "accuracy": 1.0,
2390
+ "count": 52
2391
  },
2392
  "SS": {
2393
  "accuracy": 1.0,
2394
+ "count": 7
2395
  },
2396
  "UC": {
2397
+ "accuracy": 0.6292134831460674,
2398
+ "count": 89
2399
  },
2400
  "US": {
2401
+ "accuracy": 0.8051948051948052,
2402
+ "count": 154
2403
  }
2404
  }
2405
  },
2406
  "add_S5": {
2407
+ "full_accuracy": 0.28,
2408
+ "digit_accuracy": 0.7028571428571428,
2409
+ "n_examples": 50,
2410
  "per_subtask": {
2411
  "SA": {
2412
  "accuracy": 1.0,
2413
+ "count": 50
2414
  },
2415
  "SC": {
2416
  "accuracy": 1.0,
2417
+ "count": 50
2418
  },
2419
  "UC": {
2420
+ "accuracy": 0.46,
2421
+ "count": 50
2422
  },
2423
  "US": {
2424
+ "accuracy": 0.615,
2425
+ "count": 200
2426
  }
2427
  }
2428
  },
2429
  "add_S6": {
2430
+ "full_accuracy": 0.34,
2431
+ "digit_accuracy": 0.6371428571428571,
2432
+ "n_examples": 50,
2433
  "per_subtask": {
2434
  "SC": {
2435
  "accuracy": 1.0,
2436
+ "count": 50
2437
  },
2438
  "UC": {
2439
+ "accuracy": 0.36,
2440
+ "count": 50
2441
  },
2442
  "US": {
2443
+ "accuracy": 0.62,
2444
+ "count": 250
2445
  }
2446
  }
2447
  },
2448
  "add_random": {
2449
+ "full_accuracy": 0.95,
2450
+ "digit_accuracy": 0.9928571428571429,
2451
  "n_examples": 200,
2452
  "per_subtask": {
2453
  "SA": {
2454
+ "accuracy": 0.9930394431554525,
2455
+ "count": 431
2456
  },
2457
  "SC": {
2458
+ "accuracy": 0.9968354430379747,
2459
+ "count": 316
2460
  },
2461
  "SS": {
2462
  "accuracy": 1.0,
2463
+ "count": 39
2464
  },
2465
  "UC": {
2466
+ "accuracy": 0.9892857142857143,
2467
+ "count": 560
2468
  },
2469
  "US": {
2470
+ "accuracy": 1.0,
2471
+ "count": 54
2472
+ }
2473
+ }
2474
+ },
2475
+ "add_C1": {
2476
+ "full_accuracy": 0.9,
2477
+ "digit_accuracy": 0.9857142857142858,
2478
+ "n_examples": 50,
2479
+ "per_subtask": {
2480
+ "SA": {
2481
+ "accuracy": 1.0,
2482
+ "count": 250
2483
+ },
2484
+ "SC": {
2485
+ "accuracy": 1.0,
2486
+ "count": 50
2487
+ },
2488
+ "UC": {
2489
+ "accuracy": 0.9,
2490
+ "count": 50
2491
+ }
2492
+ }
2493
+ },
2494
+ "add_C2": {
2495
+ "full_accuracy": 0.82,
2496
+ "digit_accuracy": 0.96,
2497
+ "n_examples": 50,
2498
+ "per_subtask": {
2499
+ "SA": {
2500
+ "accuracy": 1.0,
2501
+ "count": 200
2502
+ },
2503
+ "SC": {
2504
+ "accuracy": 1.0,
2505
+ "count": 50
2506
+ },
2507
+ "UC": {
2508
+ "accuracy": 0.9036144578313253,
2509
+ "count": 83
2510
+ },
2511
+ "US": {
2512
+ "accuracy": 0.6470588235294118,
2513
+ "count": 17
2514
  }
2515
  }
2516
  },
2517
  "add_C3": {
2518
+ "full_accuracy": 0.58,
2519
+ "digit_accuracy": 0.9114285714285715,
2520
+ "n_examples": 50,
2521
  "per_subtask": {
2522
  "SA": {
2523
+ "accuracy": 1.0,
2524
+ "count": 150
2525
  },
2526
  "SC": {
2527
  "accuracy": 1.0,
2528
+ "count": 50
2529
  },
2530
  "UC": {
2531
+ "accuracy": 0.82,
2532
+ "count": 100
2533
  },
2534
  "US": {
2535
+ "accuracy": 0.74,
2536
+ "count": 50
2537
  }
2538
  }
2539
  },
2540
  "add_C4": {
2541
+ "full_accuracy": 0.66,
2542
+ "digit_accuracy": 0.9285714285714286,
2543
+ "n_examples": 50,
2544
  "per_subtask": {
2545
  "SA": {
2546
  "accuracy": 1.0,
2547
+ "count": 100
2548
  },
2549
  "SC": {
2550
  "accuracy": 1.0,
2551
+ "count": 50
2552
  },
2553
  "UC": {
2554
+ "accuracy": 0.8636363636363636,
2555
+ "count": 132
2556
  },
2557
  "US": {
2558
+ "accuracy": 0.8970588235294118,
2559
+ "count": 68
2560
  }
2561
  }
2562
  },
2563
  "add_C5": {
2564
+ "full_accuracy": 0.56,
2565
+ "digit_accuracy": 0.9028571428571428,
2566
+ "n_examples": 50,
2567
  "per_subtask": {
2568
  "SA": {
2569
  "accuracy": 1.0,
2570
+ "count": 50
2571
  },
2572
  "SC": {
2573
  "accuracy": 1.0,
2574
+ "count": 50
2575
  },
2576
  "UC": {
2577
+ "accuracy": 0.8493150684931506,
2578
+ "count": 146
2579
  },
2580
  "US": {
2581
+ "accuracy": 0.8846153846153846,
2582
+ "count": 104
2583
  }
2584
  }
2585
  },
2586
  "add_C6": {
2587
+ "full_accuracy": 0.62,
2588
+ "digit_accuracy": 0.9342857142857143,
2589
+ "n_examples": 50,
2590
  "per_subtask": {
2591
  "SC": {
2592
  "accuracy": 1.0,
2593
+ "count": 50
2594
  },
2595
  "UC": {
2596
+ "accuracy": 0.91005291005291,
2597
+ "count": 189
2598
  },
2599
  "US": {
2600
+ "accuracy": 0.9459459459459459,
2601
+ "count": 111
2602
  }
2603
  }
2604
  },
2605
  "sub_M0": {
2606
+ "full_accuracy": 0.96,
2607
+ "digit_accuracy": 0.9942857142857143,
2608
+ "n_examples": 50,
2609
  "per_subtask": {
2610
  "MD": {
2611
+ "accuracy": 0.9933993399339934,
2612
+ "count": 303
2613
  },
2614
  "ME": {
2615
  "accuracy": 1.0,
2616
+ "count": 47
2617
  }
2618
  }
2619
  },
2620
  "sub_M1": {
2621
+ "full_accuracy": 0.94,
2622
+ "digit_accuracy": 0.9914285714285714,
2623
+ "n_examples": 50,
2624
  "per_subtask": {
2625
  "MD": {
2626
+ "accuracy": 0.9929078014184397,
2627
+ "count": 141
2628
  },
2629
  "MB": {
2630
+ "accuracy": 0.9861111111111112,
2631
+ "count": 72
2632
  },
2633
  "ME": {
2634
  "accuracy": 1.0,
2635
+ "count": 18
2636
  },
2637
  "UB": {
2638
+ "accuracy": 0.9915966386554622,
2639
+ "count": 119
2640
  }
2641
  }
2642
  },
2643
  "sub_M2": {
2644
+ "full_accuracy": 0.7,
2645
  "digit_accuracy": 0.9514285714285714,
2646
+ "n_examples": 50,
2647
  "per_subtask": {
2648
  "MD": {
2649
+ "accuracy": 0.9910714285714286,
2650
+ "count": 112
2651
  },
2652
  "MB": {
2653
+ "accuracy": 0.9811320754716981,
2654
+ "count": 53
2655
  },
2656
  "ME": {
2657
  "accuracy": 1.0,
2658
+ "count": 47
2659
  },
2660
  "UB": {
2661
+ "accuracy": 0.8352941176470589,
2662
+ "count": 85
2663
  },
2664
  "UD": {
2665
+ "accuracy": 0.9811320754716981,
2666
+ "count": 53
2667
  }
2668
  }
2669
  },
2670
  "sub_M3": {
2671
+ "full_accuracy": 0.24,
2672
+ "digit_accuracy": 0.8542857142857143,
2673
+ "n_examples": 50,
2674
  "per_subtask": {
2675
  "MD": {
2676
  "accuracy": 1.0,
2677
+ "count": 97
2678
  },
2679
  "MB": {
2680
+ "accuracy": 0.9607843137254902,
2681
+ "count": 51
2682
  },
2683
  "ME": {
2684
  "accuracy": 1.0,
2685
+ "count": 27
2686
  },
2687
  "UB": {
2688
+ "accuracy": 0.5675675675675675,
2689
+ "count": 74
2690
  },
2691
  "UD": {
2692
+ "accuracy": 0.8316831683168316,
2693
+ "count": 101
2694
  }
2695
  }
2696
  },
2697
  "sub_M4": {
2698
+ "full_accuracy": 0.08,
2699
+ "digit_accuracy": 0.7257142857142858,
2700
+ "n_examples": 50,
2701
  "per_subtask": {
2702
  "MD": {
2703
  "accuracy": 1.0,
2704
+ "count": 100
2705
  },
2706
  "MB": {
2707
  "accuracy": 1.0,
2708
+ "count": 50
2709
  },
2710
  "UB": {
2711
+ "accuracy": 0.28,
2712
+ "count": 50
2713
  },
2714
  "UD": {
2715
+ "accuracy": 0.6,
2716
+ "count": 150
2717
  }
2718
  }
2719
  },
2720
  "sub_M5": {
2721
+ "full_accuracy": 0.02,
2722
+ "digit_accuracy": 0.5428571428571428,
2723
+ "n_examples": 50,
2724
  "per_subtask": {
2725
  "MD": {
2726
  "accuracy": 1.0,
2727
+ "count": 50
2728
  },
2729
  "MB": {
2730
  "accuracy": 1.0,
2731
+ "count": 50
2732
  },
2733
  "UB": {
2734
+ "accuracy": 0.36,
2735
+ "count": 50
2736
  },
2737
  "UD": {
2738
+ "accuracy": 0.36,
2739
+ "count": 200
2740
  }
2741
  }
2742
  },
2743
  "sub_random": {
2744
+ "full_accuracy": 0.895,
2745
+ "digit_accuracy": 0.9821428571428571,
2746
  "n_examples": 200,
2747
  "per_subtask": {
2748
  "MD": {
2749
+ "accuracy": 0.9859649122807017,
2750
+ "count": 570
2751
  },
2752
  "MB": {
2753
+ "accuracy": 0.9927797833935018,
2754
+ "count": 277
2755
  },
2756
  "ME": {
2757
  "accuracy": 1.0,
2758
  "count": 53
2759
  },
2760
  "UB": {
2761
+ "accuracy": 0.970276008492569,
2762
+ "count": 471
2763
  },
2764
  "UD": {
2765
+ "accuracy": 0.9655172413793104,
2766
+ "count": 29
2767
  }
2768
  }
2769
  },
2770
  "sub_B3": {
2771
+ "full_accuracy": 0.72,
2772
+ "digit_accuracy": 0.9485714285714286,
2773
+ "n_examples": 50,
2774
  "per_subtask": {
2775
  "MD": {
2776
+ "accuracy": 0.9933333333333333,
2777
+ "count": 150
2778
  },
2779
  "MB": {
2780
  "accuracy": 1.0,
2781
+ "count": 50
2782
  },
2783
  "UB": {
2784
+ "accuracy": 0.8910891089108911,
2785
+ "count": 101
2786
  },
2787
  "UD": {
2788
+ "accuracy": 0.8775510204081632,
2789
+ "count": 49
2790
  }
2791
  }
2792
  },
2793
  "sub_B4": {
2794
+ "full_accuracy": 0.38,
2795
+ "digit_accuracy": 0.8657142857142858,
2796
+ "n_examples": 50,
2797
  "per_subtask": {
2798
  "MD": {
2799
  "accuracy": 1.0,
2800
+ "count": 100
2801
  },
2802
  "MB": {
2803
+ "accuracy": 0.96,
2804
+ "count": 50
2805
  },
2806
  "UB": {
2807
+ "accuracy": 0.768595041322314,
2808
+ "count": 121
2809
  },
2810
  "UD": {
2811
+ "accuracy": 0.7848101265822784,
2812
+ "count": 79
2813
  }
2814
  }
2815
  },
2816
  "sub_B5": {
2817
+ "full_accuracy": 0.4,
2818
+ "digit_accuracy": 0.8657142857142858,
2819
+ "n_examples": 50,
2820
  "per_subtask": {
2821
  "MD": {
2822
  "accuracy": 1.0,
2823
+ "count": 50
2824
  },
2825
  "MB": {
2826
  "accuracy": 1.0,
2827
+ "count": 50
2828
  },
2829
  "UB": {
2830
+ "accuracy": 0.8223684210526315,
2831
+ "count": 152
2832
  },
2833
  "UD": {
2834
+ "accuracy": 0.7959183673469388,
2835
+ "count": 98
2836
  }
2837
  }
2838
  }
2839
  },
2840
  "summary": {
2841
+ "overall_accuracy": 0.6726666666666666,
2842
+ "digit_accuracy": 0.9100952380952381,
2843
+ "total_examples": 1500,
2844
+ "n_splits": 24
2845
  }
2846
  },
2847
  "sorl_overall_accuracy": 0.6429166666666667,