amirali1985 commited on
Commit
4ac5424
·
verified ·
1 Parent(s): e1570f9

Upload add_sub_sorl_v1_abs10_25K/metrics.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. add_sub_sorl_v1_abs10_25K/metrics.json +476 -346
add_sub_sorl_v1_abs10_25K/metrics.json CHANGED
@@ -610,502 +610,567 @@
610
  "K": null,
611
  "mode": "sft",
612
  "n_digits": 6,
613
- "n_per_split": 100
614
  },
615
  "splits": {
616
  "add_S0": {
617
- "full_accuracy": 0.55,
618
- "n_examples": 100,
 
619
  "per_subtask": {
620
  "SA": {
621
- "accuracy": 0.9190082644628099,
622
- "count": 605
623
  },
624
  "SS": {
625
- "accuracy": 0.9473684210526315,
626
- "count": 95
627
  }
628
  }
629
  },
630
  "add_S1": {
631
  "full_accuracy": 0.62,
632
- "n_examples": 100,
 
633
  "per_subtask": {
634
  "SA": {
635
- "accuracy": 0.9215686274509803,
636
- "count": 204
637
  },
638
  "SC": {
639
- "accuracy": 0.9467455621301775,
640
- "count": 169
641
  },
642
  "SS": {
643
- "accuracy": 0.9032258064516129,
644
- "count": 31
645
  },
646
  "UC": {
647
- "accuracy": 0.9391891891891891,
648
- "count": 296
649
  }
650
  }
651
  },
652
  "add_S2": {
653
- "full_accuracy": 0.54,
654
- "n_examples": 100,
 
655
  "per_subtask": {
656
  "SA": {
657
- "accuracy": 0.9447852760736196,
658
- "count": 163
659
  },
660
  "SC": {
661
- "accuracy": 0.8769230769230769,
662
- "count": 130
663
  },
664
  "SS": {
665
- "accuracy": 0.9310344827586207,
666
- "count": 87
667
  },
668
  "UC": {
669
- "accuracy": 0.8472906403940886,
670
- "count": 203
671
  },
672
  "US": {
673
- "accuracy": 0.9572649572649573,
674
- "count": 117
675
  }
676
  }
677
  },
678
  "add_S3": {
679
- "full_accuracy": 0.42,
680
- "n_examples": 100,
 
681
  "per_subtask": {
682
  "SA": {
683
- "accuracy": 0.9504132231404959,
684
- "count": 121
685
  },
686
  "SC": {
687
- "accuracy": 0.9338842975206612,
688
- "count": 121
689
  },
690
  "SS": {
691
- "accuracy": 0.8775510204081632,
692
- "count": 49
693
  },
694
  "UC": {
695
- "accuracy": 0.7956989247311828,
696
- "count": 186
697
  },
698
  "US": {
699
- "accuracy": 0.8609865470852018,
700
- "count": 223
701
  }
702
  }
703
  },
704
  "add_S4": {
705
  "full_accuracy": 0.32,
706
- "n_examples": 100,
 
707
  "per_subtask": {
708
  "SA": {
709
- "accuracy": 0.9519230769230769,
710
- "count": 104
711
  },
712
  "SC": {
713
- "accuracy": 0.8962264150943396,
714
- "count": 106
715
  },
716
  "SS": {
717
- "accuracy": 0.8695652173913043,
718
- "count": 23
719
  },
720
  "UC": {
721
- "accuracy": 0.70625,
722
- "count": 160
723
  },
724
  "US": {
725
- "accuracy": 0.6710097719869706,
726
- "count": 307
727
  }
728
  }
729
  },
730
  "add_S5": {
731
- "full_accuracy": 0.26,
732
- "n_examples": 100,
 
733
  "per_subtask": {
734
  "SA": {
735
  "accuracy": 1.0,
736
- "count": 100
737
  },
738
  "SC": {
739
- "accuracy": 0.93,
740
- "count": 100
741
  },
742
  "UC": {
743
- "accuracy": 0.47,
744
- "count": 100
745
  },
746
  "US": {
747
- "accuracy": 0.44,
748
- "count": 400
749
  }
750
  }
751
  },
752
  "add_S6": {
753
- "full_accuracy": 0.41,
754
- "n_examples": 100,
 
755
  "per_subtask": {
756
  "SC": {
757
  "accuracy": 1.0,
758
- "count": 100
759
  },
760
  "UC": {
761
- "accuracy": 0.43,
762
- "count": 100
763
  },
764
  "US": {
765
- "accuracy": 0.516,
766
- "count": 500
767
  }
768
  }
769
  },
770
  "add_random": {
771
- "full_accuracy": 0.7,
 
772
  "n_examples": 200,
773
  "per_subtask": {
774
  "SA": {
775
- "accuracy": 0.941834451901566,
776
- "count": 447
777
  },
778
  "SC": {
779
- "accuracy": 0.94375,
780
- "count": 320
781
  },
782
  "SS": {
783
- "accuracy": 0.9642857142857143,
784
- "count": 56
785
  },
786
  "UC": {
787
- "accuracy": 0.9640831758034026,
788
- "count": 529
789
  },
790
  "US": {
791
- "accuracy": 0.8125,
792
- "count": 48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
793
  }
794
  }
795
  },
796
  "add_C3": {
797
  "full_accuracy": 0.52,
798
- "n_examples": 100,
 
799
  "per_subtask": {
800
  "SA": {
801
- "accuracy": 0.99,
802
- "count": 300
803
  },
804
  "SC": {
805
- "accuracy": 0.99,
806
- "count": 100
807
  },
808
  "UC": {
809
- "accuracy": 0.7927461139896373,
810
- "count": 193
811
  },
812
  "US": {
813
- "accuracy": 0.7570093457943925,
814
- "count": 107
815
  }
816
  }
817
  },
818
  "add_C4": {
819
- "full_accuracy": 0.53,
820
- "n_examples": 100,
 
821
  "per_subtask": {
822
  "SA": {
823
  "accuracy": 0.99,
824
- "count": 200
825
  },
826
  "SC": {
827
- "accuracy": 0.95,
828
- "count": 100
829
  },
830
  "UC": {
831
- "accuracy": 0.8515625,
832
- "count": 256
833
  },
834
  "US": {
835
- "accuracy": 0.7916666666666666,
836
- "count": 144
837
  }
838
  }
839
  },
840
  "add_C5": {
841
- "full_accuracy": 0.47,
842
- "n_examples": 100,
 
843
  "per_subtask": {
844
  "SA": {
845
  "accuracy": 1.0,
846
- "count": 100
847
  },
848
  "SC": {
849
- "accuracy": 0.97,
850
- "count": 100
851
  },
852
  "UC": {
853
- "accuracy": 0.8235294117647058,
854
- "count": 306
855
  },
856
  "US": {
857
- "accuracy": 0.845360824742268,
858
- "count": 194
859
  }
860
  }
861
  },
862
  "add_C6": {
863
- "full_accuracy": 0.41,
864
- "n_examples": 100,
 
865
  "per_subtask": {
866
  "SC": {
867
  "accuracy": 1.0,
868
- "count": 100
869
  },
870
  "UC": {
871
- "accuracy": 0.825136612021858,
872
- "count": 366
873
  },
874
  "US": {
875
- "accuracy": 0.8803418803418803,
876
- "count": 234
877
  }
878
  }
879
  },
880
  "sub_M0": {
881
- "full_accuracy": 0.8,
882
- "n_examples": 100,
 
883
  "per_subtask": {
884
  "MD": {
885
- "accuracy": 0.9683860232945092,
886
- "count": 601
887
  },
888
  "ME": {
889
- "accuracy": 0.9797979797979798,
890
- "count": 99
891
  }
892
  }
893
  },
894
  "sub_M1": {
895
- "full_accuracy": 0.67,
896
- "n_examples": 100,
 
897
  "per_subtask": {
898
  "MD": {
899
- "accuracy": 0.982078853046595,
900
- "count": 279
901
  },
902
  "MB": {
903
- "accuracy": 0.9379310344827586,
904
- "count": 145
905
  },
906
  "ME": {
907
- "accuracy": 0.9583333333333334,
908
- "count": 24
909
  },
910
  "UB": {
911
- "accuracy": 0.9047619047619048,
912
- "count": 252
913
  }
914
  }
915
  },
916
  "sub_M2": {
917
- "full_accuracy": 0.38,
918
- "n_examples": 100,
 
919
  "per_subtask": {
920
  "MD": {
921
- "accuracy": 0.9906103286384976,
922
- "count": 213
923
  },
924
  "MB": {
925
- "accuracy": 0.9380530973451328,
926
- "count": 113
927
  },
928
  "ME": {
929
- "accuracy": 0.9882352941176471,
930
- "count": 85
931
  },
932
  "UB": {
933
- "accuracy": 0.6961325966850829,
934
- "count": 181
935
  },
936
  "UD": {
937
- "accuracy": 0.8981481481481481,
938
- "count": 108
939
  }
940
  }
941
  },
942
  "sub_M3": {
943
- "full_accuracy": 0.12,
944
- "n_examples": 100,
 
945
  "per_subtask": {
946
  "MD": {
947
- "accuracy": 1.0,
948
- "count": 179
949
  },
950
  "MB": {
951
- "accuracy": 0.9223300970873787,
952
- "count": 103
953
  },
954
  "ME": {
955
- "accuracy": 0.9821428571428571,
956
- "count": 56
957
  },
958
  "UB": {
959
- "accuracy": 0.48322147651006714,
960
- "count": 149
961
  },
962
  "UD": {
963
- "accuracy": 0.6291079812206573,
964
- "count": 213
965
  }
966
  }
967
  },
968
  "sub_M4": {
969
- "full_accuracy": 0.06,
970
- "n_examples": 100,
 
971
  "per_subtask": {
972
  "MD": {
973
  "accuracy": 1.0,
974
- "count": 200
975
  },
976
  "MB": {
977
- "accuracy": 0.99,
978
- "count": 100
979
  },
980
  "UB": {
981
- "accuracy": 0.3,
982
- "count": 100
983
  },
984
  "UD": {
985
- "accuracy": 0.38666666666666666,
986
- "count": 300
987
  }
988
  }
989
  },
990
  "sub_M5": {
991
- "full_accuracy": 0.03,
992
- "n_examples": 100,
 
993
  "per_subtask": {
994
  "MD": {
995
  "accuracy": 1.0,
996
- "count": 100
997
  },
998
  "MB": {
999
  "accuracy": 1.0,
1000
- "count": 100
1001
  },
1002
  "UB": {
1003
- "accuracy": 0.19,
1004
- "count": 100
1005
  },
1006
  "UD": {
1007
- "accuracy": 0.2775,
1008
- "count": 400
1009
  }
1010
  }
1011
  },
1012
  "sub_random": {
1013
- "full_accuracy": 0.655,
 
1014
  "n_examples": 200,
1015
  "per_subtask": {
1016
  "MD": {
1017
- "accuracy": 0.9833333333333333,
1018
- "count": 600
1019
  },
1020
  "MB": {
1021
- "accuracy": 0.9625468164794008,
1022
- "count": 267
1023
  },
1024
  "ME": {
1025
- "accuracy": 1.0,
1026
  "count": 53
1027
  },
1028
  "UB": {
1029
- "accuracy": 0.876993166287016,
1030
- "count": 439
1031
  },
1032
  "UD": {
1033
- "accuracy": 0.8780487804878049,
1034
- "count": 41
1035
  }
1036
  }
1037
  },
1038
  "sub_B3": {
1039
- "full_accuracy": 0.35,
1040
- "n_examples": 100,
 
1041
  "per_subtask": {
1042
  "MD": {
1043
- "accuracy": 0.99,
1044
- "count": 300
1045
  },
1046
  "MB": {
1047
- "accuracy": 0.95,
1048
- "count": 100
1049
  },
1050
  "UB": {
1051
- "accuracy": 0.6751269035532995,
1052
- "count": 197
1053
  },
1054
  "UD": {
1055
- "accuracy": 0.6893203883495146,
1056
- "count": 103
1057
  }
1058
  }
1059
  },
1060
  "sub_B4": {
1061
- "full_accuracy": 0.24,
1062
- "n_examples": 100,
 
1063
  "per_subtask": {
1064
  "MD": {
1065
- "accuracy": 0.995,
1066
- "count": 200
1067
  },
1068
  "MB": {
1069
  "accuracy": 0.94,
1070
- "count": 100
1071
  },
1072
  "UB": {
1073
- "accuracy": 0.7206477732793523,
1074
- "count": 247
1075
  },
1076
  "UD": {
1077
- "accuracy": 0.6209150326797386,
1078
- "count": 153
1079
  }
1080
  }
1081
  },
1082
  "sub_B5": {
1083
- "full_accuracy": 0.21,
1084
- "n_examples": 100,
 
1085
  "per_subtask": {
1086
  "MD": {
1087
  "accuracy": 1.0,
1088
- "count": 100
1089
  },
1090
  "MB": {
1091
  "accuracy": 1.0,
1092
- "count": 100
1093
  },
1094
  "UB": {
1095
- "accuracy": 0.7013422818791947,
1096
- "count": 298
1097
  },
1098
  "UD": {
1099
- "accuracy": 0.6782178217821783,
1100
- "count": 202
1101
  }
1102
  }
1103
  }
1104
  },
1105
  "summary": {
1106
- "overall_accuracy": 0.4425,
1107
- "total_examples": 2400,
1108
- "n_splits": 22
 
1109
  }
1110
  },
1111
  "sorl_eval": {
@@ -1114,416 +1179,477 @@
1114
  "K": 4,
1115
  "mode": "sorl",
1116
  "n_digits": 6,
1117
- "n_per_split": 100
1118
  },
1119
  "splits": {
1120
  "add_S0": {
1121
  "full_accuracy": 1.0,
1122
- "n_examples": 100,
 
1123
  "per_subtask": {
1124
  "SA": {
1125
  "accuracy": 1.0,
1126
- "count": 605
1127
  },
1128
  "SS": {
1129
  "accuracy": 1.0,
1130
- "count": 95
1131
  }
1132
  }
1133
  },
1134
  "add_S1": {
1135
  "full_accuracy": 1.0,
1136
- "n_examples": 100,
 
1137
  "per_subtask": {
1138
  "SA": {
1139
  "accuracy": 1.0,
1140
- "count": 204
1141
  },
1142
  "SC": {
1143
  "accuracy": 1.0,
1144
- "count": 169
1145
  },
1146
  "SS": {
1147
  "accuracy": 1.0,
1148
- "count": 31
1149
  },
1150
  "UC": {
1151
  "accuracy": 1.0,
1152
- "count": 296
1153
  }
1154
  }
1155
  },
1156
  "add_S2": {
1157
  "full_accuracy": 1.0,
1158
- "n_examples": 100,
 
1159
  "per_subtask": {
1160
  "SA": {
1161
  "accuracy": 1.0,
1162
- "count": 163
1163
  },
1164
  "SC": {
1165
  "accuracy": 1.0,
1166
- "count": 130
1167
  },
1168
  "SS": {
1169
  "accuracy": 1.0,
1170
- "count": 87
1171
  },
1172
  "UC": {
1173
  "accuracy": 1.0,
1174
- "count": 203
1175
  },
1176
  "US": {
1177
  "accuracy": 1.0,
1178
- "count": 117
1179
  }
1180
  }
1181
  },
1182
  "add_S3": {
1183
  "full_accuracy": 1.0,
1184
- "n_examples": 100,
 
1185
  "per_subtask": {
1186
  "SA": {
1187
  "accuracy": 1.0,
1188
- "count": 121
1189
  },
1190
  "SC": {
1191
  "accuracy": 1.0,
1192
- "count": 121
1193
  },
1194
  "SS": {
1195
  "accuracy": 1.0,
1196
- "count": 49
1197
  },
1198
  "UC": {
1199
  "accuracy": 1.0,
1200
- "count": 186
1201
  },
1202
  "US": {
1203
  "accuracy": 1.0,
1204
- "count": 223
1205
  }
1206
  }
1207
  },
1208
  "add_S4": {
1209
- "full_accuracy": 0.95,
1210
- "n_examples": 100,
 
1211
  "per_subtask": {
1212
  "SA": {
1213
  "accuracy": 1.0,
1214
- "count": 104
1215
  },
1216
  "SC": {
1217
  "accuracy": 1.0,
1218
- "count": 106
1219
  },
1220
  "SS": {
1221
  "accuracy": 1.0,
1222
- "count": 23
1223
  },
1224
  "UC": {
1225
- "accuracy": 0.96875,
1226
- "count": 160
1227
  },
1228
  "US": {
1229
  "accuracy": 1.0,
1230
- "count": 307
1231
  }
1232
  }
1233
  },
1234
  "add_S5": {
1235
- "full_accuracy": 0.79,
1236
- "n_examples": 100,
 
1237
  "per_subtask": {
1238
  "SA": {
1239
  "accuracy": 1.0,
1240
- "count": 100
1241
  },
1242
  "SC": {
1243
  "accuracy": 1.0,
1244
- "count": 100
1245
  },
1246
  "UC": {
1247
- "accuracy": 0.8,
1248
- "count": 100
1249
  },
1250
  "US": {
1251
  "accuracy": 0.99,
1252
- "count": 400
1253
  }
1254
  }
1255
  },
1256
  "add_S6": {
1257
- "full_accuracy": 0.97,
1258
- "n_examples": 100,
 
1259
  "per_subtask": {
1260
  "SC": {
1261
  "accuracy": 1.0,
1262
- "count": 100
1263
  },
1264
  "UC": {
1265
- "accuracy": 0.97,
1266
- "count": 100
1267
  },
1268
  "US": {
1269
- "accuracy": 0.992,
1270
- "count": 500
1271
  }
1272
  }
1273
  },
1274
  "add_random": {
1275
  "full_accuracy": 1.0,
 
1276
  "n_examples": 200,
1277
  "per_subtask": {
1278
  "SA": {
1279
  "accuracy": 1.0,
1280
- "count": 447
1281
  },
1282
  "SC": {
1283
  "accuracy": 1.0,
1284
- "count": 320
1285
  },
1286
  "SS": {
1287
  "accuracy": 1.0,
1288
- "count": 56
1289
  },
1290
  "UC": {
1291
  "accuracy": 1.0,
1292
- "count": 529
1293
  },
1294
  "US": {
1295
  "accuracy": 1.0,
1296
- "count": 48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1297
  }
1298
  }
1299
  },
1300
  "add_C3": {
1301
- "full_accuracy": 0.99,
1302
- "n_examples": 100,
 
1303
  "per_subtask": {
1304
  "SA": {
1305
  "accuracy": 1.0,
1306
- "count": 300
1307
  },
1308
  "SC": {
1309
  "accuracy": 1.0,
1310
- "count": 100
1311
  },
1312
  "UC": {
1313
- "accuracy": 0.9948186528497409,
1314
- "count": 193
1315
  },
1316
  "US": {
1317
  "accuracy": 1.0,
1318
- "count": 107
1319
  }
1320
  }
1321
  },
1322
  "add_C4": {
1323
- "full_accuracy": 0.99,
1324
- "n_examples": 100,
 
1325
  "per_subtask": {
1326
  "SA": {
1327
  "accuracy": 1.0,
1328
- "count": 200
1329
  },
1330
  "SC": {
1331
  "accuracy": 1.0,
1332
- "count": 100
1333
  },
1334
  "UC": {
1335
- "accuracy": 0.99609375,
1336
- "count": 256
1337
  },
1338
  "US": {
1339
  "accuracy": 1.0,
1340
- "count": 144
1341
  }
1342
  }
1343
  },
1344
  "add_C5": {
1345
- "full_accuracy": 0.98,
1346
- "n_examples": 100,
 
1347
  "per_subtask": {
1348
  "SA": {
1349
  "accuracy": 1.0,
1350
- "count": 100
1351
  },
1352
  "SC": {
1353
  "accuracy": 1.0,
1354
- "count": 100
1355
  },
1356
  "UC": {
1357
- "accuracy": 0.9934640522875817,
1358
- "count": 306
1359
  },
1360
  "US": {
1361
  "accuracy": 1.0,
1362
- "count": 194
1363
  }
1364
  }
1365
  },
1366
  "add_C6": {
1367
  "full_accuracy": 1.0,
1368
- "n_examples": 100,
 
1369
  "per_subtask": {
1370
  "SC": {
1371
  "accuracy": 1.0,
1372
- "count": 100
1373
  },
1374
  "UC": {
1375
  "accuracy": 1.0,
1376
- "count": 366
1377
  },
1378
  "US": {
1379
  "accuracy": 1.0,
1380
- "count": 234
1381
  }
1382
  }
1383
  },
1384
  "sub_M0": {
1385
  "full_accuracy": 1.0,
1386
- "n_examples": 100,
 
1387
  "per_subtask": {
1388
  "MD": {
1389
  "accuracy": 1.0,
1390
- "count": 601
1391
  },
1392
  "ME": {
1393
  "accuracy": 1.0,
1394
- "count": 99
1395
  }
1396
  }
1397
  },
1398
  "sub_M1": {
1399
- "full_accuracy": 1.0,
1400
- "n_examples": 100,
 
1401
  "per_subtask": {
1402
  "MD": {
1403
  "accuracy": 1.0,
1404
- "count": 279
1405
  },
1406
  "MB": {
1407
  "accuracy": 1.0,
1408
- "count": 145
1409
  },
1410
  "ME": {
1411
  "accuracy": 1.0,
1412
- "count": 24
1413
  },
1414
  "UB": {
1415
- "accuracy": 1.0,
1416
- "count": 252
1417
  }
1418
  }
1419
  },
1420
  "sub_M2": {
1421
  "full_accuracy": 1.0,
1422
- "n_examples": 100,
 
1423
  "per_subtask": {
1424
  "MD": {
1425
  "accuracy": 1.0,
1426
- "count": 213
1427
  },
1428
  "MB": {
1429
  "accuracy": 1.0,
1430
- "count": 113
1431
  },
1432
  "ME": {
1433
  "accuracy": 1.0,
1434
- "count": 85
1435
  },
1436
  "UB": {
1437
  "accuracy": 1.0,
1438
- "count": 181
1439
  },
1440
  "UD": {
1441
  "accuracy": 1.0,
1442
- "count": 108
1443
  }
1444
  }
1445
  },
1446
  "sub_M3": {
1447
  "full_accuracy": 1.0,
1448
- "n_examples": 100,
 
1449
  "per_subtask": {
1450
  "MD": {
1451
  "accuracy": 1.0,
1452
- "count": 179
1453
  },
1454
  "MB": {
1455
  "accuracy": 1.0,
1456
- "count": 103
1457
  },
1458
  "ME": {
1459
  "accuracy": 1.0,
1460
- "count": 56
1461
  },
1462
  "UB": {
1463
  "accuracy": 1.0,
1464
- "count": 149
1465
  },
1466
  "UD": {
1467
  "accuracy": 1.0,
1468
- "count": 213
1469
  }
1470
  }
1471
  },
1472
  "sub_M4": {
1473
- "full_accuracy": 0.85,
1474
- "n_examples": 100,
 
1475
  "per_subtask": {
1476
  "MD": {
1477
  "accuracy": 1.0,
1478
- "count": 200
1479
  },
1480
  "MB": {
1481
  "accuracy": 1.0,
1482
- "count": 100
1483
  },
1484
  "UB": {
1485
- "accuracy": 0.85,
1486
- "count": 100
1487
  },
1488
  "UD": {
1489
  "accuracy": 1.0,
1490
- "count": 300
1491
  }
1492
  }
1493
  },
1494
  "sub_M5": {
1495
- "full_accuracy": 0.79,
1496
- "n_examples": 100,
 
1497
  "per_subtask": {
1498
  "MD": {
1499
  "accuracy": 1.0,
1500
- "count": 100
1501
  },
1502
  "MB": {
1503
  "accuracy": 1.0,
1504
- "count": 100
1505
  },
1506
  "UB": {
1507
- "accuracy": 0.82,
1508
- "count": 100
1509
  },
1510
  "UD": {
1511
- "accuracy": 0.99,
1512
- "count": 400
1513
  }
1514
  }
1515
  },
1516
  "sub_random": {
1517
  "full_accuracy": 1.0,
 
1518
  "n_examples": 200,
1519
  "per_subtask": {
1520
  "MD": {
1521
  "accuracy": 1.0,
1522
- "count": 600
1523
  },
1524
  "MB": {
1525
  "accuracy": 1.0,
1526
- "count": 267
1527
  },
1528
  "ME": {
1529
  "accuracy": 1.0,
@@ -1531,85 +1657,89 @@
1531
  },
1532
  "UB": {
1533
  "accuracy": 1.0,
1534
- "count": 439
1535
  },
1536
  "UD": {
1537
  "accuracy": 1.0,
1538
- "count": 41
1539
  }
1540
  }
1541
  },
1542
  "sub_B3": {
1543
- "full_accuracy": 1.0,
1544
- "n_examples": 100,
 
1545
  "per_subtask": {
1546
  "MD": {
1547
  "accuracy": 1.0,
1548
- "count": 300
1549
  },
1550
  "MB": {
1551
  "accuracy": 1.0,
1552
- "count": 100
1553
  },
1554
  "UB": {
1555
- "accuracy": 1.0,
1556
- "count": 197
1557
  },
1558
  "UD": {
1559
  "accuracy": 1.0,
1560
- "count": 103
1561
  }
1562
  }
1563
  },
1564
  "sub_B4": {
1565
  "full_accuracy": 0.98,
1566
- "n_examples": 100,
 
1567
  "per_subtask": {
1568
  "MD": {
1569
  "accuracy": 1.0,
1570
- "count": 200
1571
  },
1572
  "MB": {
1573
  "accuracy": 1.0,
1574
- "count": 100
1575
  },
1576
  "UB": {
1577
- "accuracy": 0.9919028340080972,
1578
- "count": 247
1579
  },
1580
  "UD": {
1581
  "accuracy": 1.0,
1582
- "count": 153
1583
  }
1584
  }
1585
  },
1586
  "sub_B5": {
1587
- "full_accuracy": 1.0,
1588
- "n_examples": 100,
 
1589
  "per_subtask": {
1590
  "MD": {
1591
  "accuracy": 1.0,
1592
- "count": 100
1593
  },
1594
  "MB": {
1595
  "accuracy": 1.0,
1596
- "count": 100
1597
  },
1598
  "UB": {
1599
- "accuracy": 1.0,
1600
- "count": 298
1601
  },
1602
  "UD": {
1603
  "accuracy": 1.0,
1604
- "count": 202
1605
  }
1606
  }
1607
  }
1608
  },
1609
  "summary": {
1610
- "overall_accuracy": 0.9704166666666667,
1611
- "total_examples": 2400,
1612
- "n_splits": 22
 
1613
  }
1614
  },
1615
  "sorl_overall_accuracy": 0.9704166666666667,
 
610
  "K": null,
611
  "mode": "sft",
612
  "n_digits": 6,
613
+ "n_per_split": 50
614
  },
615
  "splits": {
616
  "add_S0": {
617
+ "full_accuracy": 0.48,
618
+ "digit_accuracy": 0.9085714285714286,
619
+ "n_examples": 50,
620
  "per_subtask": {
621
  "SA": {
622
+ "accuracy": 0.9050847457627119,
623
+ "count": 295
624
  },
625
  "SS": {
626
+ "accuracy": 0.9272727272727272,
627
+ "count": 55
628
  }
629
  }
630
  },
631
  "add_S1": {
632
  "full_accuracy": 0.62,
633
+ "digit_accuracy": 0.9314285714285714,
634
+ "n_examples": 50,
635
  "per_subtask": {
636
  "SA": {
637
+ "accuracy": 0.9365079365079365,
638
+ "count": 126
639
  },
640
  "SC": {
641
+ "accuracy": 0.9493670886075949,
642
+ "count": 79
643
  },
644
  "SS": {
645
+ "accuracy": 0.9523809523809523,
646
+ "count": 21
647
  },
648
  "UC": {
649
+ "accuracy": 0.9112903225806451,
650
+ "count": 124
651
  }
652
  }
653
  },
654
  "add_S2": {
655
+ "full_accuracy": 0.28,
656
+ "digit_accuracy": 0.8371428571428572,
657
+ "n_examples": 50,
658
  "per_subtask": {
659
  "SA": {
660
+ "accuracy": 0.92,
661
+ "count": 75
662
  },
663
  "SC": {
664
+ "accuracy": 0.8387096774193549,
665
+ "count": 62
666
  },
667
  "SS": {
668
+ "accuracy": 0.6410256410256411,
669
+ "count": 39
670
  },
671
  "UC": {
672
+ "accuracy": 0.8198198198198198,
673
+ "count": 111
674
  },
675
  "US": {
676
+ "accuracy": 0.8888888888888888,
677
+ "count": 63
678
  }
679
  }
680
  },
681
  "add_S3": {
682
+ "full_accuracy": 0.46,
683
+ "digit_accuracy": 0.8571428571428571,
684
+ "n_examples": 50,
685
  "per_subtask": {
686
  "SA": {
687
+ "accuracy": 0.9333333333333333,
688
+ "count": 60
689
  },
690
  "SC": {
691
+ "accuracy": 0.8245614035087719,
692
+ "count": 57
693
  },
694
  "SS": {
695
+ "accuracy": 0.8421052631578947,
696
+ "count": 19
697
  },
698
  "UC": {
699
+ "accuracy": 0.8461538461538461,
700
+ "count": 104
701
  },
702
  "US": {
703
+ "accuracy": 0.8454545454545455,
704
+ "count": 110
705
  }
706
  }
707
  },
708
  "add_S4": {
709
  "full_accuracy": 0.32,
710
+ "digit_accuracy": 0.7428571428571429,
711
+ "n_examples": 50,
712
  "per_subtask": {
713
  "SA": {
714
+ "accuracy": 1.0,
715
+ "count": 48
716
  },
717
  "SC": {
718
+ "accuracy": 0.8846153846153846,
719
+ "count": 52
720
  },
721
  "SS": {
722
+ "accuracy": 0.7142857142857143,
723
+ "count": 7
724
  },
725
  "UC": {
726
+ "accuracy": 0.6629213483146067,
727
+ "count": 89
728
  },
729
  "US": {
730
+ "accuracy": 0.6623376623376623,
731
+ "count": 154
732
  }
733
  }
734
  },
735
  "add_S5": {
736
+ "full_accuracy": 0.34,
737
+ "digit_accuracy": 0.6285714285714286,
738
+ "n_examples": 50,
739
  "per_subtask": {
740
  "SA": {
741
  "accuracy": 1.0,
742
+ "count": 50
743
  },
744
  "SC": {
745
+ "accuracy": 0.94,
746
+ "count": 50
747
  },
748
  "UC": {
749
+ "accuracy": 0.5,
750
+ "count": 50
751
  },
752
  "US": {
753
+ "accuracy": 0.49,
754
+ "count": 200
755
  }
756
  }
757
  },
758
  "add_S6": {
759
+ "full_accuracy": 0.36,
760
+ "digit_accuracy": 0.5114285714285715,
761
+ "n_examples": 50,
762
  "per_subtask": {
763
  "SC": {
764
  "accuracy": 1.0,
765
+ "count": 50
766
  },
767
  "UC": {
768
+ "accuracy": 0.38,
769
+ "count": 50
770
  },
771
  "US": {
772
+ "accuracy": 0.44,
773
+ "count": 250
774
  }
775
  }
776
  },
777
  "add_random": {
778
+ "full_accuracy": 0.715,
779
+ "digit_accuracy": 0.9485714285714286,
780
  "n_examples": 200,
781
  "per_subtask": {
782
  "SA": {
783
+ "accuracy": 0.9559164733178654,
784
+ "count": 431
785
  },
786
  "SC": {
787
+ "accuracy": 0.9651898734177216,
788
+ "count": 316
789
  },
790
  "SS": {
791
+ "accuracy": 0.9230769230769231,
792
+ "count": 39
793
  },
794
  "UC": {
795
+ "accuracy": 0.9357142857142857,
796
+ "count": 560
797
  },
798
  "US": {
799
+ "accuracy": 0.9444444444444444,
800
+ "count": 54
801
+ }
802
+ }
803
+ },
804
+ "add_C1": {
805
+ "full_accuracy": 0.66,
806
+ "digit_accuracy": 0.94,
807
+ "n_examples": 50,
808
+ "per_subtask": {
809
+ "SA": {
810
+ "accuracy": 0.992,
811
+ "count": 250
812
+ },
813
+ "SC": {
814
+ "accuracy": 0.9,
815
+ "count": 50
816
+ },
817
+ "UC": {
818
+ "accuracy": 0.72,
819
+ "count": 50
820
+ }
821
+ }
822
+ },
823
+ "add_C2": {
824
+ "full_accuracy": 0.78,
825
+ "digit_accuracy": 0.96,
826
+ "n_examples": 50,
827
+ "per_subtask": {
828
+ "SA": {
829
+ "accuracy": 0.995,
830
+ "count": 200
831
+ },
832
+ "SC": {
833
+ "accuracy": 0.98,
834
+ "count": 50
835
+ },
836
+ "UC": {
837
+ "accuracy": 0.9156626506024096,
838
+ "count": 83
839
+ },
840
+ "US": {
841
+ "accuracy": 0.7058823529411765,
842
+ "count": 17
843
  }
844
  }
845
  },
846
  "add_C3": {
847
  "full_accuracy": 0.52,
848
+ "digit_accuracy": 0.8828571428571429,
849
+ "n_examples": 50,
850
  "per_subtask": {
851
  "SA": {
852
+ "accuracy": 0.9866666666666667,
853
+ "count": 150
854
  },
855
  "SC": {
856
+ "accuracy": 0.86,
857
+ "count": 50
858
  },
859
  "UC": {
860
+ "accuracy": 0.84,
861
+ "count": 100
862
  },
863
  "US": {
864
+ "accuracy": 0.68,
865
+ "count": 50
866
  }
867
  }
868
  },
869
  "add_C4": {
870
+ "full_accuracy": 0.56,
871
+ "digit_accuracy": 0.9057142857142857,
872
+ "n_examples": 50,
873
  "per_subtask": {
874
  "SA": {
875
  "accuracy": 0.99,
876
+ "count": 100
877
  },
878
  "SC": {
879
+ "accuracy": 1.0,
880
+ "count": 50
881
  },
882
  "UC": {
883
+ "accuracy": 0.8257575757575758,
884
+ "count": 132
885
  },
886
  "US": {
887
+ "accuracy": 0.8676470588235294,
888
+ "count": 68
889
  }
890
  }
891
  },
892
  "add_C5": {
893
+ "full_accuracy": 0.46,
894
+ "digit_accuracy": 0.8971428571428571,
895
+ "n_examples": 50,
896
  "per_subtask": {
897
  "SA": {
898
  "accuracy": 1.0,
899
+ "count": 50
900
  },
901
  "SC": {
902
+ "accuracy": 0.96,
903
+ "count": 50
904
  },
905
  "UC": {
906
+ "accuracy": 0.8356164383561644,
907
+ "count": 146
908
  },
909
  "US": {
910
+ "accuracy": 0.9038461538461539,
911
+ "count": 104
912
  }
913
  }
914
  },
915
  "add_C6": {
916
+ "full_accuracy": 0.48,
917
+ "digit_accuracy": 0.8714285714285714,
918
+ "n_examples": 50,
919
  "per_subtask": {
920
  "SC": {
921
  "accuracy": 1.0,
922
+ "count": 50
923
  },
924
  "UC": {
925
+ "accuracy": 0.8518518518518519,
926
+ "count": 189
927
  },
928
  "US": {
929
+ "accuracy": 0.8468468468468469,
930
+ "count": 111
931
  }
932
  }
933
  },
934
  "sub_M0": {
935
+ "full_accuracy": 0.92,
936
+ "digit_accuracy": 0.9885714285714285,
937
+ "n_examples": 50,
938
  "per_subtask": {
939
  "MD": {
940
+ "accuracy": 0.9867986798679867,
941
+ "count": 303
942
  },
943
  "ME": {
944
+ "accuracy": 1.0,
945
+ "count": 47
946
  }
947
  }
948
  },
949
  "sub_M1": {
950
+ "full_accuracy": 0.6,
951
+ "digit_accuracy": 0.9314285714285714,
952
+ "n_examples": 50,
953
  "per_subtask": {
954
  "MD": {
955
+ "accuracy": 0.9645390070921985,
956
+ "count": 141
957
  },
958
  "MB": {
959
+ "accuracy": 0.9027777777777778,
960
+ "count": 72
961
  },
962
  "ME": {
963
+ "accuracy": 1.0,
964
+ "count": 18
965
  },
966
  "UB": {
967
+ "accuracy": 0.8991596638655462,
968
+ "count": 119
969
  }
970
  }
971
  },
972
  "sub_M2": {
973
+ "full_accuracy": 0.32,
974
+ "digit_accuracy": 0.86,
975
+ "n_examples": 50,
976
  "per_subtask": {
977
  "MD": {
978
+ "accuracy": 0.9732142857142857,
979
+ "count": 112
980
  },
981
  "MB": {
982
+ "accuracy": 0.9056603773584906,
983
+ "count": 53
984
  },
985
  "ME": {
986
+ "accuracy": 1.0,
987
+ "count": 47
988
  },
989
  "UB": {
990
+ "accuracy": 0.611764705882353,
991
+ "count": 85
992
  },
993
  "UD": {
994
+ "accuracy": 0.8490566037735849,
995
+ "count": 53
996
  }
997
  }
998
  },
999
  "sub_M3": {
1000
+ "full_accuracy": 0.1,
1001
+ "digit_accuracy": 0.7514285714285714,
1002
+ "n_examples": 50,
1003
  "per_subtask": {
1004
  "MD": {
1005
+ "accuracy": 0.9896907216494846,
1006
+ "count": 97
1007
  },
1008
  "MB": {
1009
+ "accuracy": 0.9803921568627451,
1010
+ "count": 51
1011
  },
1012
  "ME": {
1013
+ "accuracy": 1.0,
1014
+ "count": 27
1015
  },
1016
  "UB": {
1017
+ "accuracy": 0.47297297297297297,
1018
+ "count": 74
1019
  },
1020
  "UD": {
1021
+ "accuracy": 0.5445544554455446,
1022
+ "count": 101
1023
  }
1024
  }
1025
  },
1026
  "sub_M4": {
1027
+ "full_accuracy": 0.04,
1028
+ "digit_accuracy": 0.6457142857142857,
1029
+ "n_examples": 50,
1030
  "per_subtask": {
1031
  "MD": {
1032
  "accuracy": 1.0,
1033
+ "count": 100
1034
  },
1035
  "MB": {
1036
+ "accuracy": 0.98,
1037
+ "count": 50
1038
  },
1039
  "UB": {
1040
+ "accuracy": 0.28,
1041
+ "count": 50
1042
  },
1043
  "UD": {
1044
+ "accuracy": 0.42,
1045
+ "count": 150
1046
  }
1047
  }
1048
  },
1049
  "sub_M5": {
1050
+ "full_accuracy": 0.06,
1051
+ "digit_accuracy": 0.48,
1052
+ "n_examples": 50,
1053
  "per_subtask": {
1054
  "MD": {
1055
  "accuracy": 1.0,
1056
+ "count": 50
1057
  },
1058
  "MB": {
1059
  "accuracy": 1.0,
1060
+ "count": 50
1061
  },
1062
  "UB": {
1063
+ "accuracy": 0.26,
1064
+ "count": 50
1065
  },
1066
  "UD": {
1067
+ "accuracy": 0.275,
1068
+ "count": 200
1069
  }
1070
  }
1071
  },
1072
  "sub_random": {
1073
+ "full_accuracy": 0.635,
1074
+ "digit_accuracy": 0.9385714285714286,
1075
  "n_examples": 200,
1076
  "per_subtask": {
1077
  "MD": {
1078
+ "accuracy": 0.9789473684210527,
1079
+ "count": 570
1080
  },
1081
  "MB": {
1082
+ "accuracy": 0.9566787003610109,
1083
+ "count": 277
1084
  },
1085
  "ME": {
1086
+ "accuracy": 0.9811320754716981,
1087
  "count": 53
1088
  },
1089
  "UB": {
1090
+ "accuracy": 0.8747346072186837,
1091
+ "count": 471
1092
  },
1093
  "UD": {
1094
+ "accuracy": 0.9310344827586207,
1095
+ "count": 29
1096
  }
1097
  }
1098
  },
1099
  "sub_B3": {
1100
+ "full_accuracy": 0.4,
1101
+ "digit_accuracy": 0.8685714285714285,
1102
+ "n_examples": 50,
1103
  "per_subtask": {
1104
  "MD": {
1105
+ "accuracy": 0.9866666666666667,
1106
+ "count": 150
1107
  },
1108
  "MB": {
1109
+ "accuracy": 0.98,
1110
+ "count": 50
1111
  },
1112
  "UB": {
1113
+ "accuracy": 0.7128712871287128,
1114
+ "count": 101
1115
  },
1116
  "UD": {
1117
+ "accuracy": 0.7142857142857143,
1118
+ "count": 49
1119
  }
1120
  }
1121
  },
1122
  "sub_B4": {
1123
+ "full_accuracy": 0.28,
1124
+ "digit_accuracy": 0.8228571428571428,
1125
+ "n_examples": 50,
1126
  "per_subtask": {
1127
  "MD": {
1128
+ "accuracy": 1.0,
1129
+ "count": 100
1130
  },
1131
  "MB": {
1132
  "accuracy": 0.94,
1133
+ "count": 50
1134
  },
1135
  "UB": {
1136
+ "accuracy": 0.7520661157024794,
1137
+ "count": 121
1138
  },
1139
  "UD": {
1140
+ "accuracy": 0.6329113924050633,
1141
+ "count": 79
1142
  }
1143
  }
1144
  },
1145
  "sub_B5": {
1146
+ "full_accuracy": 0.22,
1147
+ "digit_accuracy": 0.76,
1148
+ "n_examples": 50,
1149
  "per_subtask": {
1150
  "MD": {
1151
  "accuracy": 1.0,
1152
+ "count": 50
1153
  },
1154
  "MB": {
1155
  "accuracy": 1.0,
1156
+ "count": 50
1157
  },
1158
  "UB": {
1159
+ "accuracy": 0.7302631578947368,
1160
+ "count": 152
1161
  },
1162
  "UD": {
1163
+ "accuracy": 0.5612244897959183,
1164
+ "count": 98
1165
  }
1166
  }
1167
  }
1168
  },
1169
  "summary": {
1170
+ "overall_accuracy": 0.4886666666666667,
1171
+ "digit_accuracy": 0.851047619047619,
1172
+ "total_examples": 1500,
1173
+ "n_splits": 24
1174
  }
1175
  },
1176
  "sorl_eval": {
 
1179
  "K": 4,
1180
  "mode": "sorl",
1181
  "n_digits": 6,
1182
+ "n_per_split": 50
1183
  },
1184
  "splits": {
1185
  "add_S0": {
1186
  "full_accuracy": 1.0,
1187
+ "digit_accuracy": 1.0,
1188
+ "n_examples": 50,
1189
  "per_subtask": {
1190
  "SA": {
1191
  "accuracy": 1.0,
1192
+ "count": 295
1193
  },
1194
  "SS": {
1195
  "accuracy": 1.0,
1196
+ "count": 55
1197
  }
1198
  }
1199
  },
1200
  "add_S1": {
1201
  "full_accuracy": 1.0,
1202
+ "digit_accuracy": 1.0,
1203
+ "n_examples": 50,
1204
  "per_subtask": {
1205
  "SA": {
1206
  "accuracy": 1.0,
1207
+ "count": 126
1208
  },
1209
  "SC": {
1210
  "accuracy": 1.0,
1211
+ "count": 79
1212
  },
1213
  "SS": {
1214
  "accuracy": 1.0,
1215
+ "count": 21
1216
  },
1217
  "UC": {
1218
  "accuracy": 1.0,
1219
+ "count": 124
1220
  }
1221
  }
1222
  },
1223
  "add_S2": {
1224
  "full_accuracy": 1.0,
1225
+ "digit_accuracy": 1.0,
1226
+ "n_examples": 50,
1227
  "per_subtask": {
1228
  "SA": {
1229
  "accuracy": 1.0,
1230
+ "count": 75
1231
  },
1232
  "SC": {
1233
  "accuracy": 1.0,
1234
+ "count": 62
1235
  },
1236
  "SS": {
1237
  "accuracy": 1.0,
1238
+ "count": 39
1239
  },
1240
  "UC": {
1241
  "accuracy": 1.0,
1242
+ "count": 111
1243
  },
1244
  "US": {
1245
  "accuracy": 1.0,
1246
+ "count": 63
1247
  }
1248
  }
1249
  },
1250
  "add_S3": {
1251
  "full_accuracy": 1.0,
1252
+ "digit_accuracy": 1.0,
1253
+ "n_examples": 50,
1254
  "per_subtask": {
1255
  "SA": {
1256
  "accuracy": 1.0,
1257
+ "count": 60
1258
  },
1259
  "SC": {
1260
  "accuracy": 1.0,
1261
+ "count": 57
1262
  },
1263
  "SS": {
1264
  "accuracy": 1.0,
1265
+ "count": 19
1266
  },
1267
  "UC": {
1268
  "accuracy": 1.0,
1269
+ "count": 104
1270
  },
1271
  "US": {
1272
  "accuracy": 1.0,
1273
+ "count": 110
1274
  }
1275
  }
1276
  },
1277
  "add_S4": {
1278
+ "full_accuracy": 0.98,
1279
+ "digit_accuracy": 0.9971428571428571,
1280
+ "n_examples": 50,
1281
  "per_subtask": {
1282
  "SA": {
1283
  "accuracy": 1.0,
1284
+ "count": 48
1285
  },
1286
  "SC": {
1287
  "accuracy": 1.0,
1288
+ "count": 52
1289
  },
1290
  "SS": {
1291
  "accuracy": 1.0,
1292
+ "count": 7
1293
  },
1294
  "UC": {
1295
+ "accuracy": 0.9887640449438202,
1296
+ "count": 89
1297
  },
1298
  "US": {
1299
  "accuracy": 1.0,
1300
+ "count": 154
1301
  }
1302
  }
1303
  },
1304
  "add_S5": {
1305
+ "full_accuracy": 0.74,
1306
+ "digit_accuracy": 0.96,
1307
+ "n_examples": 50,
1308
  "per_subtask": {
1309
  "SA": {
1310
  "accuracy": 1.0,
1311
+ "count": 50
1312
  },
1313
  "SC": {
1314
  "accuracy": 1.0,
1315
+ "count": 50
1316
  },
1317
  "UC": {
1318
+ "accuracy": 0.76,
1319
+ "count": 50
1320
  },
1321
  "US": {
1322
  "accuracy": 0.99,
1323
+ "count": 200
1324
  }
1325
  }
1326
  },
1327
  "add_S6": {
1328
+ "full_accuracy": 0.94,
1329
+ "digit_accuracy": 0.9857142857142858,
1330
+ "n_examples": 50,
1331
  "per_subtask": {
1332
  "SC": {
1333
  "accuracy": 1.0,
1334
+ "count": 50
1335
  },
1336
  "UC": {
1337
+ "accuracy": 0.96,
1338
+ "count": 50
1339
  },
1340
  "US": {
1341
+ "accuracy": 0.988,
1342
+ "count": 250
1343
  }
1344
  }
1345
  },
1346
  "add_random": {
1347
  "full_accuracy": 1.0,
1348
+ "digit_accuracy": 1.0,
1349
  "n_examples": 200,
1350
  "per_subtask": {
1351
  "SA": {
1352
  "accuracy": 1.0,
1353
+ "count": 431
1354
  },
1355
  "SC": {
1356
  "accuracy": 1.0,
1357
+ "count": 316
1358
  },
1359
  "SS": {
1360
  "accuracy": 1.0,
1361
+ "count": 39
1362
  },
1363
  "UC": {
1364
  "accuracy": 1.0,
1365
+ "count": 560
1366
  },
1367
  "US": {
1368
  "accuracy": 1.0,
1369
+ "count": 54
1370
+ }
1371
+ }
1372
+ },
1373
+ "add_C1": {
1374
+ "full_accuracy": 1.0,
1375
+ "digit_accuracy": 1.0,
1376
+ "n_examples": 50,
1377
+ "per_subtask": {
1378
+ "SA": {
1379
+ "accuracy": 1.0,
1380
+ "count": 250
1381
+ },
1382
+ "SC": {
1383
+ "accuracy": 1.0,
1384
+ "count": 50
1385
+ },
1386
+ "UC": {
1387
+ "accuracy": 1.0,
1388
+ "count": 50
1389
+ }
1390
+ }
1391
+ },
1392
+ "add_C2": {
1393
+ "full_accuracy": 1.0,
1394
+ "digit_accuracy": 1.0,
1395
+ "n_examples": 50,
1396
+ "per_subtask": {
1397
+ "SA": {
1398
+ "accuracy": 1.0,
1399
+ "count": 200
1400
+ },
1401
+ "SC": {
1402
+ "accuracy": 1.0,
1403
+ "count": 50
1404
+ },
1405
+ "UC": {
1406
+ "accuracy": 1.0,
1407
+ "count": 83
1408
+ },
1409
+ "US": {
1410
+ "accuracy": 1.0,
1411
+ "count": 17
1412
  }
1413
  }
1414
  },
1415
  "add_C3": {
1416
+ "full_accuracy": 1.0,
1417
+ "digit_accuracy": 1.0,
1418
+ "n_examples": 50,
1419
  "per_subtask": {
1420
  "SA": {
1421
  "accuracy": 1.0,
1422
+ "count": 150
1423
  },
1424
  "SC": {
1425
  "accuracy": 1.0,
1426
+ "count": 50
1427
  },
1428
  "UC": {
1429
+ "accuracy": 1.0,
1430
+ "count": 100
1431
  },
1432
  "US": {
1433
  "accuracy": 1.0,
1434
+ "count": 50
1435
  }
1436
  }
1437
  },
1438
  "add_C4": {
1439
+ "full_accuracy": 1.0,
1440
+ "digit_accuracy": 1.0,
1441
+ "n_examples": 50,
1442
  "per_subtask": {
1443
  "SA": {
1444
  "accuracy": 1.0,
1445
+ "count": 100
1446
  },
1447
  "SC": {
1448
  "accuracy": 1.0,
1449
+ "count": 50
1450
  },
1451
  "UC": {
1452
+ "accuracy": 1.0,
1453
+ "count": 132
1454
  },
1455
  "US": {
1456
  "accuracy": 1.0,
1457
+ "count": 68
1458
  }
1459
  }
1460
  },
1461
  "add_C5": {
1462
+ "full_accuracy": 0.96,
1463
+ "digit_accuracy": 0.9942857142857143,
1464
+ "n_examples": 50,
1465
  "per_subtask": {
1466
  "SA": {
1467
  "accuracy": 1.0,
1468
+ "count": 50
1469
  },
1470
  "SC": {
1471
  "accuracy": 1.0,
1472
+ "count": 50
1473
  },
1474
  "UC": {
1475
+ "accuracy": 0.9863013698630136,
1476
+ "count": 146
1477
  },
1478
  "US": {
1479
  "accuracy": 1.0,
1480
+ "count": 104
1481
  }
1482
  }
1483
  },
1484
  "add_C6": {
1485
  "full_accuracy": 1.0,
1486
+ "digit_accuracy": 1.0,
1487
+ "n_examples": 50,
1488
  "per_subtask": {
1489
  "SC": {
1490
  "accuracy": 1.0,
1491
+ "count": 50
1492
  },
1493
  "UC": {
1494
  "accuracy": 1.0,
1495
+ "count": 189
1496
  },
1497
  "US": {
1498
  "accuracy": 1.0,
1499
+ "count": 111
1500
  }
1501
  }
1502
  },
1503
  "sub_M0": {
1504
  "full_accuracy": 1.0,
1505
+ "digit_accuracy": 1.0,
1506
+ "n_examples": 50,
1507
  "per_subtask": {
1508
  "MD": {
1509
  "accuracy": 1.0,
1510
+ "count": 303
1511
  },
1512
  "ME": {
1513
  "accuracy": 1.0,
1514
+ "count": 47
1515
  }
1516
  }
1517
  },
1518
  "sub_M1": {
1519
+ "full_accuracy": 0.98,
1520
+ "digit_accuracy": 0.9971428571428571,
1521
+ "n_examples": 50,
1522
  "per_subtask": {
1523
  "MD": {
1524
  "accuracy": 1.0,
1525
+ "count": 141
1526
  },
1527
  "MB": {
1528
  "accuracy": 1.0,
1529
+ "count": 72
1530
  },
1531
  "ME": {
1532
  "accuracy": 1.0,
1533
+ "count": 18
1534
  },
1535
  "UB": {
1536
+ "accuracy": 0.9915966386554622,
1537
+ "count": 119
1538
  }
1539
  }
1540
  },
1541
  "sub_M2": {
1542
  "full_accuracy": 1.0,
1543
+ "digit_accuracy": 1.0,
1544
+ "n_examples": 50,
1545
  "per_subtask": {
1546
  "MD": {
1547
  "accuracy": 1.0,
1548
+ "count": 112
1549
  },
1550
  "MB": {
1551
  "accuracy": 1.0,
1552
+ "count": 53
1553
  },
1554
  "ME": {
1555
  "accuracy": 1.0,
1556
+ "count": 47
1557
  },
1558
  "UB": {
1559
  "accuracy": 1.0,
1560
+ "count": 85
1561
  },
1562
  "UD": {
1563
  "accuracy": 1.0,
1564
+ "count": 53
1565
  }
1566
  }
1567
  },
1568
  "sub_M3": {
1569
  "full_accuracy": 1.0,
1570
+ "digit_accuracy": 1.0,
1571
+ "n_examples": 50,
1572
  "per_subtask": {
1573
  "MD": {
1574
  "accuracy": 1.0,
1575
+ "count": 97
1576
  },
1577
  "MB": {
1578
  "accuracy": 1.0,
1579
+ "count": 51
1580
  },
1581
  "ME": {
1582
  "accuracy": 1.0,
1583
+ "count": 27
1584
  },
1585
  "UB": {
1586
  "accuracy": 1.0,
1587
+ "count": 74
1588
  },
1589
  "UD": {
1590
  "accuracy": 1.0,
1591
+ "count": 101
1592
  }
1593
  }
1594
  },
1595
  "sub_M4": {
1596
+ "full_accuracy": 0.82,
1597
+ "digit_accuracy": 0.9742857142857143,
1598
+ "n_examples": 50,
1599
  "per_subtask": {
1600
  "MD": {
1601
  "accuracy": 1.0,
1602
+ "count": 100
1603
  },
1604
  "MB": {
1605
  "accuracy": 1.0,
1606
+ "count": 50
1607
  },
1608
  "UB": {
1609
+ "accuracy": 0.82,
1610
+ "count": 50
1611
  },
1612
  "UD": {
1613
  "accuracy": 1.0,
1614
+ "count": 150
1615
  }
1616
  }
1617
  },
1618
  "sub_M5": {
1619
+ "full_accuracy": 0.72,
1620
+ "digit_accuracy": 0.96,
1621
+ "n_examples": 50,
1622
  "per_subtask": {
1623
  "MD": {
1624
  "accuracy": 1.0,
1625
+ "count": 50
1626
  },
1627
  "MB": {
1628
  "accuracy": 1.0,
1629
+ "count": 50
1630
  },
1631
  "UB": {
1632
+ "accuracy": 0.78,
1633
+ "count": 50
1634
  },
1635
  "UD": {
1636
+ "accuracy": 0.985,
1637
+ "count": 200
1638
  }
1639
  }
1640
  },
1641
  "sub_random": {
1642
  "full_accuracy": 1.0,
1643
+ "digit_accuracy": 1.0,
1644
  "n_examples": 200,
1645
  "per_subtask": {
1646
  "MD": {
1647
  "accuracy": 1.0,
1648
+ "count": 570
1649
  },
1650
  "MB": {
1651
  "accuracy": 1.0,
1652
+ "count": 277
1653
  },
1654
  "ME": {
1655
  "accuracy": 1.0,
 
1657
  },
1658
  "UB": {
1659
  "accuracy": 1.0,
1660
+ "count": 471
1661
  },
1662
  "UD": {
1663
  "accuracy": 1.0,
1664
+ "count": 29
1665
  }
1666
  }
1667
  },
1668
  "sub_B3": {
1669
+ "full_accuracy": 0.98,
1670
+ "digit_accuracy": 0.9971428571428571,
1671
+ "n_examples": 50,
1672
  "per_subtask": {
1673
  "MD": {
1674
  "accuracy": 1.0,
1675
+ "count": 150
1676
  },
1677
  "MB": {
1678
  "accuracy": 1.0,
1679
+ "count": 50
1680
  },
1681
  "UB": {
1682
+ "accuracy": 0.9900990099009901,
1683
+ "count": 101
1684
  },
1685
  "UD": {
1686
  "accuracy": 1.0,
1687
+ "count": 49
1688
  }
1689
  }
1690
  },
1691
  "sub_B4": {
1692
  "full_accuracy": 0.98,
1693
+ "digit_accuracy": 0.9971428571428571,
1694
+ "n_examples": 50,
1695
  "per_subtask": {
1696
  "MD": {
1697
  "accuracy": 1.0,
1698
+ "count": 100
1699
  },
1700
  "MB": {
1701
  "accuracy": 1.0,
1702
+ "count": 50
1703
  },
1704
  "UB": {
1705
+ "accuracy": 0.9917355371900827,
1706
+ "count": 121
1707
  },
1708
  "UD": {
1709
  "accuracy": 1.0,
1710
+ "count": 79
1711
  }
1712
  }
1713
  },
1714
  "sub_B5": {
1715
+ "full_accuracy": 0.98,
1716
+ "digit_accuracy": 0.9971428571428571,
1717
+ "n_examples": 50,
1718
  "per_subtask": {
1719
  "MD": {
1720
  "accuracy": 1.0,
1721
+ "count": 50
1722
  },
1723
  "MB": {
1724
  "accuracy": 1.0,
1725
+ "count": 50
1726
  },
1727
  "UB": {
1728
+ "accuracy": 0.993421052631579,
1729
+ "count": 152
1730
  },
1731
  "UD": {
1732
  "accuracy": 1.0,
1733
+ "count": 98
1734
  }
1735
  }
1736
  }
1737
  },
1738
  "summary": {
1739
+ "overall_accuracy": 0.9693333333333334,
1740
+ "digit_accuracy": 0.9953333333333333,
1741
+ "total_examples": 1500,
1742
+ "n_splits": 24
1743
  }
1744
  },
1745
  "sorl_overall_accuracy": 0.9704166666666667,