amirali1985 commited on
Commit
2746b8d
·
verified ·
1 Parent(s): 9e87639

Upload add_sub_baseline_25K_1L2H256d/metrics.json with huggingface_hub

Browse files
add_sub_baseline_25K_1L2H256d/metrics.json CHANGED
@@ -706,502 +706,567 @@
706
  "K": null,
707
  "mode": "sft",
708
  "n_digits": 6,
709
- "n_per_split": 100
710
  },
711
  "splits": {
712
  "add_S0": {
713
- "full_accuracy": 0.42,
714
- "n_examples": 100,
 
715
  "per_subtask": {
716
  "SA": {
717
- "accuracy": 0.8628099173553719,
718
- "count": 605
719
  },
720
  "SS": {
721
- "accuracy": 0.9473684210526315,
722
- "count": 95
723
  }
724
  }
725
  },
726
  "add_S1": {
727
- "full_accuracy": 0.01,
728
- "n_examples": 100,
 
729
  "per_subtask": {
730
  "SA": {
731
- "accuracy": 0.9019607843137255,
732
- "count": 204
733
  },
734
  "SC": {
735
- "accuracy": 0.7692307692307693,
736
- "count": 169
737
  },
738
  "SS": {
739
- "accuracy": 0.8709677419354839,
740
- "count": 31
741
  },
742
  "UC": {
743
- "accuracy": 0.3783783783783784,
744
- "count": 296
745
  }
746
  }
747
  },
748
  "add_S2": {
749
- "full_accuracy": 0.11,
750
- "n_examples": 100,
 
751
  "per_subtask": {
752
  "SA": {
753
- "accuracy": 0.9447852760736196,
754
- "count": 163
755
  },
756
  "SC": {
757
- "accuracy": 0.7307692307692307,
758
- "count": 130
759
  },
760
  "SS": {
761
- "accuracy": 0.8275862068965517,
762
- "count": 87
763
  },
764
  "UC": {
765
- "accuracy": 0.4433497536945813,
766
- "count": 203
767
  },
768
  "US": {
769
- "accuracy": 0.5213675213675214,
770
- "count": 117
771
  }
772
  }
773
  },
774
  "add_S3": {
775
- "full_accuracy": 0.07,
776
- "n_examples": 100,
 
777
  "per_subtask": {
778
  "SA": {
779
- "accuracy": 0.9173553719008265,
780
- "count": 121
781
  },
782
  "SC": {
783
- "accuracy": 0.7107438016528925,
784
- "count": 121
785
  },
786
  "SS": {
787
- "accuracy": 0.9183673469387755,
788
- "count": 49
789
  },
790
  "UC": {
791
- "accuracy": 0.3655913978494624,
792
- "count": 186
793
  },
794
  "US": {
795
- "accuracy": 0.2062780269058296,
796
- "count": 223
797
  }
798
  }
799
  },
800
  "add_S4": {
801
- "full_accuracy": 0.1,
802
- "n_examples": 100,
 
803
  "per_subtask": {
804
  "SA": {
805
- "accuracy": 0.9519230769230769,
806
- "count": 104
807
  },
808
  "SC": {
809
- "accuracy": 0.7358490566037735,
810
- "count": 106
811
  },
812
  "SS": {
813
- "accuracy": 0.9130434782608695,
814
- "count": 23
815
  },
816
  "UC": {
817
- "accuracy": 0.46875,
818
- "count": 160
819
  },
820
  "US": {
821
- "accuracy": 0.20846905537459284,
822
- "count": 307
823
  }
824
  }
825
  },
826
  "add_S5": {
827
- "full_accuracy": 0.17,
828
- "n_examples": 100,
 
829
  "per_subtask": {
830
  "SA": {
831
- "accuracy": 0.99,
832
- "count": 100
833
  },
834
  "SC": {
835
- "accuracy": 0.62,
836
- "count": 100
837
  },
838
  "UC": {
839
- "accuracy": 0.36,
840
- "count": 100
841
  },
842
  "US": {
843
  "accuracy": 0.19,
844
- "count": 400
845
  }
846
  }
847
  },
848
  "add_S6": {
849
- "full_accuracy": 0.22,
850
- "n_examples": 100,
 
851
  "per_subtask": {
852
  "SC": {
853
- "accuracy": 0.67,
854
- "count": 100
855
  },
856
  "UC": {
857
- "accuracy": 0.38,
858
- "count": 100
859
  },
860
  "US": {
861
- "accuracy": 0.324,
862
- "count": 500
863
  }
864
  }
865
  },
866
  "add_random": {
867
- "full_accuracy": 0.025,
 
868
  "n_examples": 200,
869
  "per_subtask": {
870
  "SA": {
871
- "accuracy": 0.8814317673378076,
872
- "count": 447
873
  },
874
  "SC": {
875
- "accuracy": 0.7875,
876
- "count": 320
877
  },
878
  "SS": {
879
- "accuracy": 0.9285714285714286,
880
- "count": 56
881
  },
882
  "UC": {
883
- "accuracy": 0.3516068052930057,
884
- "count": 529
885
  },
886
  "US": {
887
- "accuracy": 0.3333333333333333,
888
- "count": 48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
889
  }
890
  }
891
  },
892
  "add_C3": {
893
- "full_accuracy": 0.07,
894
- "n_examples": 100,
 
895
  "per_subtask": {
896
  "SA": {
897
- "accuracy": 0.9266666666666666,
898
- "count": 300
899
  },
900
  "SC": {
901
- "accuracy": 0.76,
902
- "count": 100
903
  },
904
  "UC": {
905
- "accuracy": 0.29015544041450775,
906
- "count": 193
907
  },
908
  "US": {
909
- "accuracy": 0.3644859813084112,
910
- "count": 107
911
  }
912
  }
913
  },
914
  "add_C4": {
915
- "full_accuracy": 0.05,
916
- "n_examples": 100,
 
917
  "per_subtask": {
918
  "SA": {
919
- "accuracy": 0.94,
920
- "count": 200
921
  },
922
  "SC": {
923
- "accuracy": 0.73,
924
- "count": 100
925
  },
926
  "UC": {
927
- "accuracy": 0.3125,
928
- "count": 256
929
  },
930
  "US": {
931
- "accuracy": 0.3819444444444444,
932
- "count": 144
933
  }
934
  }
935
  },
936
  "add_C5": {
937
- "full_accuracy": 0.01,
938
- "n_examples": 100,
 
939
  "per_subtask": {
940
  "SA": {
941
- "accuracy": 0.96,
942
- "count": 100
943
  },
944
  "SC": {
945
- "accuracy": 0.71,
946
- "count": 100
947
  },
948
  "UC": {
949
- "accuracy": 0.35294117647058826,
950
- "count": 306
951
  },
952
  "US": {
953
- "accuracy": 0.32989690721649484,
954
- "count": 194
955
  }
956
  }
957
  },
958
  "add_C6": {
959
  "full_accuracy": 0.0,
960
- "n_examples": 100,
 
961
  "per_subtask": {
962
  "SC": {
963
- "accuracy": 0.7,
964
- "count": 100
965
  },
966
  "UC": {
967
- "accuracy": 0.4453551912568306,
968
- "count": 366
969
  },
970
  "US": {
971
- "accuracy": 0.4444444444444444,
972
- "count": 234
973
  }
974
  }
975
  },
976
  "sub_M0": {
977
- "full_accuracy": 0.25,
978
- "n_examples": 100,
 
979
  "per_subtask": {
980
  "MD": {
981
- "accuracy": 0.8419301164725458,
982
- "count": 601
983
  },
984
  "ME": {
985
- "accuracy": 0.9595959595959596,
986
- "count": 99
987
  }
988
  }
989
  },
990
  "sub_M1": {
991
- "full_accuracy": 0.02,
992
- "n_examples": 100,
 
993
  "per_subtask": {
994
  "MD": {
995
- "accuracy": 0.8781362007168458,
996
- "count": 279
997
  },
998
  "MB": {
999
- "accuracy": 0.7448275862068966,
1000
- "count": 145
1001
  },
1002
  "ME": {
1003
- "accuracy": 0.9166666666666666,
1004
- "count": 24
1005
  },
1006
  "UB": {
1007
- "accuracy": 0.2896825396825397,
1008
- "count": 252
1009
  }
1010
  }
1011
  },
1012
  "sub_M2": {
1013
- "full_accuracy": 0.01,
1014
- "n_examples": 100,
 
1015
  "per_subtask": {
1016
  "MD": {
1017
- "accuracy": 0.9436619718309859,
1018
- "count": 213
1019
  },
1020
  "MB": {
1021
- "accuracy": 0.5752212389380531,
1022
- "count": 113
1023
  },
1024
  "ME": {
1025
- "accuracy": 0.9529411764705882,
1026
- "count": 85
1027
  },
1028
  "UB": {
1029
- "accuracy": 0.281767955801105,
1030
- "count": 181
1031
  },
1032
  "UD": {
1033
- "accuracy": 0.18518518518518517,
1034
- "count": 108
1035
  }
1036
  }
1037
  },
1038
  "sub_M3": {
1039
  "full_accuracy": 0.0,
1040
- "n_examples": 100,
 
1041
  "per_subtask": {
1042
  "MD": {
1043
- "accuracy": 0.9664804469273743,
1044
- "count": 179
1045
  },
1046
  "MB": {
1047
- "accuracy": 0.5242718446601942,
1048
- "count": 103
1049
  },
1050
  "ME": {
1051
- "accuracy": 0.9642857142857143,
1052
- "count": 56
1053
  },
1054
  "UB": {
1055
- "accuracy": 0.2953020134228188,
1056
- "count": 149
1057
  },
1058
  "UD": {
1059
- "accuracy": 0.08450704225352113,
1060
- "count": 213
1061
  }
1062
  }
1063
  },
1064
  "sub_M4": {
1065
  "full_accuracy": 0.0,
1066
- "n_examples": 100,
 
1067
  "per_subtask": {
1068
  "MD": {
1069
- "accuracy": 0.89,
1070
- "count": 200
1071
  },
1072
  "MB": {
1073
- "accuracy": 0.75,
1074
- "count": 100
1075
  },
1076
  "UB": {
1077
- "accuracy": 0.29,
1078
- "count": 100
1079
  },
1080
  "UD": {
1081
- "accuracy": 0.006666666666666667,
1082
- "count": 300
1083
  }
1084
  }
1085
  },
1086
  "sub_M5": {
1087
  "full_accuracy": 0.0,
1088
- "n_examples": 100,
 
1089
  "per_subtask": {
1090
  "MD": {
1091
  "accuracy": 1.0,
1092
- "count": 100
1093
  },
1094
  "MB": {
1095
- "accuracy": 0.74,
1096
- "count": 100
1097
  },
1098
  "UB": {
1099
- "accuracy": 0.35,
1100
- "count": 100
1101
  },
1102
  "UD": {
1103
- "accuracy": 0.025,
1104
- "count": 400
1105
  }
1106
  }
1107
  },
1108
  "sub_random": {
1109
- "full_accuracy": 0.04,
 
1110
  "n_examples": 200,
1111
  "per_subtask": {
1112
  "MD": {
1113
- "accuracy": 0.875,
1114
- "count": 600
1115
  },
1116
  "MB": {
1117
- "accuracy": 0.6629213483146067,
1118
- "count": 267
1119
  },
1120
  "ME": {
1121
- "accuracy": 0.9811320754716981,
1122
  "count": 53
1123
  },
1124
  "UB": {
1125
- "accuracy": 0.2437357630979499,
1126
- "count": 439
1127
  },
1128
  "UD": {
1129
- "accuracy": 0.12195121951219512,
1130
- "count": 41
1131
  }
1132
  }
1133
  },
1134
  "sub_B3": {
1135
- "full_accuracy": 0.02,
1136
- "n_examples": 100,
 
1137
  "per_subtask": {
1138
  "MD": {
1139
- "accuracy": 0.87,
1140
- "count": 300
1141
  },
1142
  "MB": {
1143
- "accuracy": 0.75,
1144
- "count": 100
1145
  },
1146
  "UB": {
1147
- "accuracy": 0.26903553299492383,
1148
- "count": 197
1149
  },
1150
  "UD": {
1151
- "accuracy": 0.11650485436893204,
1152
- "count": 103
1153
  }
1154
  }
1155
  },
1156
  "sub_B4": {
1157
  "full_accuracy": 0.0,
1158
- "n_examples": 100,
 
1159
  "per_subtask": {
1160
  "MD": {
1161
- "accuracy": 0.915,
1162
- "count": 200
1163
  },
1164
  "MB": {
1165
- "accuracy": 0.79,
1166
- "count": 100
1167
  },
1168
  "UB": {
1169
- "accuracy": 0.291497975708502,
1170
- "count": 247
1171
  },
1172
  "UD": {
1173
- "accuracy": 0.10457516339869281,
1174
- "count": 153
1175
  }
1176
  }
1177
  },
1178
  "sub_B5": {
1179
  "full_accuracy": 0.0,
1180
- "n_examples": 100,
 
1181
  "per_subtask": {
1182
  "MD": {
1183
  "accuracy": 1.0,
1184
- "count": 100
1185
  },
1186
  "MB": {
1187
  "accuracy": 0.76,
1188
- "count": 100
1189
  },
1190
  "UB": {
1191
- "accuracy": 0.29194630872483224,
1192
- "count": 298
1193
  },
1194
  "UD": {
1195
- "accuracy": 0.16336633663366337,
1196
- "count": 202
1197
  }
1198
  }
1199
  }
1200
  },
1201
  "summary": {
1202
- "overall_accuracy": 0.06916666666666667,
1203
- "total_examples": 2400,
1204
- "n_splits": 22
 
1205
  }
1206
  }
1207
  }
 
706
  "K": null,
707
  "mode": "sft",
708
  "n_digits": 6,
709
+ "n_per_split": 50
710
  },
711
  "splits": {
712
  "add_S0": {
713
+ "full_accuracy": 0.48,
714
+ "digit_accuracy": 0.8971428571428571,
715
+ "n_examples": 50,
716
  "per_subtask": {
717
  "SA": {
718
+ "accuracy": 0.8813559322033898,
719
+ "count": 295
720
  },
721
  "SS": {
722
+ "accuracy": 0.9818181818181818,
723
+ "count": 55
724
  }
725
  }
726
  },
727
  "add_S1": {
728
+ "full_accuracy": 0.06,
729
+ "digit_accuracy": 0.6514285714285715,
730
+ "n_examples": 50,
731
  "per_subtask": {
732
  "SA": {
733
+ "accuracy": 0.8968253968253969,
734
+ "count": 126
735
  },
736
  "SC": {
737
+ "accuracy": 0.7848101265822784,
738
+ "count": 79
739
  },
740
  "SS": {
741
+ "accuracy": 0.7619047619047619,
742
+ "count": 21
743
  },
744
  "UC": {
745
+ "accuracy": 0.29838709677419356,
746
+ "count": 124
747
  }
748
  }
749
  },
750
  "add_S2": {
751
+ "full_accuracy": 0.08,
752
+ "digit_accuracy": 0.64,
753
+ "n_examples": 50,
754
  "per_subtask": {
755
  "SA": {
756
+ "accuracy": 0.8666666666666667,
757
+ "count": 75
758
  },
759
  "SC": {
760
+ "accuracy": 0.7096774193548387,
761
+ "count": 62
762
  },
763
  "SS": {
764
+ "accuracy": 0.8461538461538461,
765
+ "count": 39
766
  },
767
  "UC": {
768
+ "accuracy": 0.44144144144144143,
769
+ "count": 111
770
  },
771
  "US": {
772
+ "accuracy": 0.5238095238095238,
773
+ "count": 63
774
  }
775
  }
776
  },
777
  "add_S3": {
778
+ "full_accuracy": 0.08,
779
+ "digit_accuracy": 0.5142857142857142,
780
+ "n_examples": 50,
781
  "per_subtask": {
782
  "SA": {
783
+ "accuracy": 0.9333333333333333,
784
+ "count": 60
785
  },
786
  "SC": {
787
+ "accuracy": 0.7543859649122807,
788
+ "count": 57
789
  },
790
  "SS": {
791
+ "accuracy": 0.8421052631578947,
792
+ "count": 19
793
  },
794
  "UC": {
795
+ "accuracy": 0.375,
796
+ "count": 104
797
  },
798
  "US": {
799
+ "accuracy": 0.23636363636363636,
800
+ "count": 110
801
  }
802
  }
803
  },
804
  "add_S4": {
805
+ "full_accuracy": 0.06,
806
+ "digit_accuracy": 0.44857142857142857,
807
+ "n_examples": 50,
808
  "per_subtask": {
809
  "SA": {
810
+ "accuracy": 0.9583333333333334,
811
+ "count": 48
812
  },
813
  "SC": {
814
+ "accuracy": 0.8076923076923077,
815
+ "count": 52
816
  },
817
  "SS": {
818
+ "accuracy": 1.0,
819
+ "count": 7
820
  },
821
  "UC": {
822
+ "accuracy": 0.3595505617977528,
823
+ "count": 89
824
  },
825
  "US": {
826
+ "accuracy": 0.19480519480519481,
827
+ "count": 154
828
  }
829
  }
830
  },
831
  "add_S5": {
832
+ "full_accuracy": 0.1,
833
+ "digit_accuracy": 0.37714285714285717,
834
+ "n_examples": 50,
835
  "per_subtask": {
836
  "SA": {
837
+ "accuracy": 0.98,
838
+ "count": 50
839
  },
840
  "SC": {
841
+ "accuracy": 0.56,
842
+ "count": 50
843
  },
844
  "UC": {
845
+ "accuracy": 0.34,
846
+ "count": 50
847
  },
848
  "US": {
849
  "accuracy": 0.19,
850
+ "count": 200
851
  }
852
  }
853
  },
854
  "add_S6": {
855
+ "full_accuracy": 0.28,
856
+ "digit_accuracy": 0.4342857142857143,
857
+ "n_examples": 50,
858
  "per_subtask": {
859
  "SC": {
860
+ "accuracy": 0.6,
861
+ "count": 50
862
  },
863
  "UC": {
864
+ "accuracy": 0.46,
865
+ "count": 50
866
  },
867
  "US": {
868
+ "accuracy": 0.396,
869
+ "count": 250
870
  }
871
  }
872
  },
873
  "add_random": {
874
+ "full_accuracy": 0.05,
875
+ "digit_accuracy": 0.635,
876
  "n_examples": 200,
877
  "per_subtask": {
878
  "SA": {
879
+ "accuracy": 0.8723897911832946,
880
+ "count": 431
881
  },
882
  "SC": {
883
+ "accuracy": 0.7974683544303798,
884
+ "count": 316
885
  },
886
  "SS": {
887
+ "accuracy": 0.9230769230769231,
888
+ "count": 39
889
  },
890
  "UC": {
891
+ "accuracy": 0.36607142857142855,
892
+ "count": 560
893
  },
894
  "US": {
895
+ "accuracy": 0.37037037037037035,
896
+ "count": 54
897
+ }
898
+ }
899
+ },
900
+ "add_C1": {
901
+ "full_accuracy": 0.16,
902
+ "digit_accuracy": 0.8114285714285714,
903
+ "n_examples": 50,
904
+ "per_subtask": {
905
+ "SA": {
906
+ "accuracy": 0.928,
907
+ "count": 250
908
+ },
909
+ "SC": {
910
+ "accuracy": 0.78,
911
+ "count": 50
912
+ },
913
+ "UC": {
914
+ "accuracy": 0.26,
915
+ "count": 50
916
+ }
917
+ }
918
+ },
919
+ "add_C2": {
920
+ "full_accuracy": 0.08,
921
+ "digit_accuracy": 0.74,
922
+ "n_examples": 50,
923
+ "per_subtask": {
924
+ "SA": {
925
+ "accuracy": 0.94,
926
+ "count": 200
927
+ },
928
+ "SC": {
929
+ "accuracy": 0.74,
930
+ "count": 50
931
+ },
932
+ "UC": {
933
+ "accuracy": 0.3132530120481928,
934
+ "count": 83
935
+ },
936
+ "US": {
937
+ "accuracy": 0.47058823529411764,
938
+ "count": 17
939
  }
940
  }
941
  },
942
  "add_C3": {
943
+ "full_accuracy": 0.02,
944
+ "digit_accuracy": 0.6457142857142857,
945
+ "n_examples": 50,
946
  "per_subtask": {
947
  "SA": {
948
+ "accuracy": 0.9133333333333333,
949
+ "count": 150
950
  },
951
  "SC": {
952
+ "accuracy": 0.8,
953
+ "count": 50
954
  },
955
  "UC": {
956
+ "accuracy": 0.33,
957
+ "count": 100
958
  },
959
  "US": {
960
+ "accuracy": 0.32,
961
+ "count": 50
962
  }
963
  }
964
  },
965
  "add_C4": {
966
+ "full_accuracy": 0.02,
967
+ "digit_accuracy": 0.5885714285714285,
968
+ "n_examples": 50,
969
  "per_subtask": {
970
  "SA": {
971
+ "accuracy": 0.99,
972
+ "count": 100
973
  },
974
  "SC": {
975
+ "accuracy": 0.76,
976
+ "count": 50
977
  },
978
  "UC": {
979
+ "accuracy": 0.3484848484848485,
980
+ "count": 132
981
  },
982
  "US": {
983
+ "accuracy": 0.3382352941176471,
984
+ "count": 68
985
  }
986
  }
987
  },
988
  "add_C5": {
989
+ "full_accuracy": 0.0,
990
+ "digit_accuracy": 0.4657142857142857,
991
+ "n_examples": 50,
992
  "per_subtask": {
993
  "SA": {
994
+ "accuracy": 0.94,
995
+ "count": 50
996
  },
997
  "SC": {
998
+ "accuracy": 0.72,
999
+ "count": 50
1000
  },
1001
  "UC": {
1002
+ "accuracy": 0.3835616438356164,
1003
+ "count": 146
1004
  },
1005
  "US": {
1006
+ "accuracy": 0.23076923076923078,
1007
+ "count": 104
1008
  }
1009
  }
1010
  },
1011
  "add_C6": {
1012
  "full_accuracy": 0.0,
1013
+ "digit_accuracy": 0.44571428571428573,
1014
+ "n_examples": 50,
1015
  "per_subtask": {
1016
  "SC": {
1017
+ "accuracy": 0.64,
1018
+ "count": 50
1019
  },
1020
  "UC": {
1021
+ "accuracy": 0.4444444444444444,
1022
+ "count": 189
1023
  },
1024
  "US": {
1025
+ "accuracy": 0.36036036036036034,
1026
+ "count": 111
1027
  }
1028
  }
1029
  },
1030
  "sub_M0": {
1031
+ "full_accuracy": 0.26,
1032
+ "digit_accuracy": 0.8657142857142858,
1033
+ "n_examples": 50,
1034
  "per_subtask": {
1035
  "MD": {
1036
+ "accuracy": 0.8481848184818482,
1037
+ "count": 303
1038
  },
1039
  "ME": {
1040
+ "accuracy": 0.9787234042553191,
1041
+ "count": 47
1042
  }
1043
  }
1044
  },
1045
  "sub_M1": {
1046
+ "full_accuracy": 0.0,
1047
+ "digit_accuracy": 0.6371428571428571,
1048
+ "n_examples": 50,
1049
  "per_subtask": {
1050
  "MD": {
1051
+ "accuracy": 0.8652482269503546,
1052
+ "count": 141
1053
  },
1054
  "MB": {
1055
+ "accuracy": 0.6527777777777778,
1056
+ "count": 72
1057
  },
1058
  "ME": {
1059
+ "accuracy": 0.9444444444444444,
1060
+ "count": 18
1061
  },
1062
  "UB": {
1063
+ "accuracy": 0.31092436974789917,
1064
+ "count": 119
1065
  }
1066
  }
1067
  },
1068
  "sub_M2": {
1069
+ "full_accuracy": 0.0,
1070
+ "digit_accuracy": 0.62,
1071
+ "n_examples": 50,
1072
  "per_subtask": {
1073
  "MD": {
1074
+ "accuracy": 0.9285714285714286,
1075
+ "count": 112
1076
  },
1077
  "MB": {
1078
+ "accuracy": 0.6981132075471698,
1079
+ "count": 53
1080
  },
1081
  "ME": {
1082
+ "accuracy": 0.9361702127659575,
1083
+ "count": 47
1084
  },
1085
  "UB": {
1086
+ "accuracy": 0.2823529411764706,
1087
+ "count": 85
1088
  },
1089
  "UD": {
1090
+ "accuracy": 0.1509433962264151,
1091
+ "count": 53
1092
  }
1093
  }
1094
  },
1095
  "sub_M3": {
1096
  "full_accuracy": 0.0,
1097
+ "digit_accuracy": 0.48857142857142855,
1098
+ "n_examples": 50,
1099
  "per_subtask": {
1100
  "MD": {
1101
+ "accuracy": 0.9381443298969072,
1102
+ "count": 97
1103
  },
1104
  "MB": {
1105
+ "accuracy": 0.5686274509803921,
1106
+ "count": 51
1107
  },
1108
  "ME": {
1109
+ "accuracy": 1.0,
1110
+ "count": 27
1111
  },
1112
  "UB": {
1113
+ "accuracy": 0.1891891891891892,
1114
+ "count": 74
1115
  },
1116
  "UD": {
1117
+ "accuracy": 0.09900990099009901,
1118
+ "count": 101
1119
  }
1120
  }
1121
  },
1122
  "sub_M4": {
1123
  "full_accuracy": 0.0,
1124
+ "digit_accuracy": 0.4342857142857143,
1125
+ "n_examples": 50,
1126
  "per_subtask": {
1127
  "MD": {
1128
+ "accuracy": 0.88,
1129
+ "count": 100
1130
  },
1131
  "MB": {
1132
+ "accuracy": 0.68,
1133
+ "count": 50
1134
  },
1135
  "UB": {
1136
+ "accuracy": 0.4,
1137
+ "count": 50
1138
  },
1139
  "UD": {
1140
+ "accuracy": 0.06666666666666667,
1141
+ "count": 150
1142
  }
1143
  }
1144
  },
1145
  "sub_M5": {
1146
  "full_accuracy": 0.0,
1147
+ "digit_accuracy": 0.30857142857142855,
1148
+ "n_examples": 50,
1149
  "per_subtask": {
1150
  "MD": {
1151
  "accuracy": 1.0,
1152
+ "count": 50
1153
  },
1154
  "MB": {
1155
+ "accuracy": 0.78,
1156
+ "count": 50
1157
  },
1158
  "UB": {
1159
+ "accuracy": 0.38,
1160
+ "count": 50
1161
  },
1162
  "UD": {
1163
+ "accuracy": 0.0,
1164
+ "count": 200
1165
  }
1166
  }
1167
  },
1168
  "sub_random": {
1169
+ "full_accuracy": 0.02,
1170
+ "digit_accuracy": 0.62,
1171
  "n_examples": 200,
1172
  "per_subtask": {
1173
  "MD": {
1174
+ "accuracy": 0.8807017543859649,
1175
+ "count": 570
1176
  },
1177
  "MB": {
1178
+ "accuracy": 0.6534296028880866,
1179
+ "count": 277
1180
  },
1181
  "ME": {
1182
+ "accuracy": 0.9245283018867925,
1183
  "count": 53
1184
  },
1185
  "UB": {
1186
+ "accuracy": 0.2823779193205945,
1187
+ "count": 471
1188
  },
1189
  "UD": {
1190
+ "accuracy": 0.10344827586206896,
1191
+ "count": 29
1192
  }
1193
  }
1194
  },
1195
  "sub_B3": {
1196
+ "full_accuracy": 0.0,
1197
+ "digit_accuracy": 0.5657142857142857,
1198
+ "n_examples": 50,
1199
  "per_subtask": {
1200
  "MD": {
1201
+ "accuracy": 0.8733333333333333,
1202
+ "count": 150
1203
  },
1204
  "MB": {
1205
+ "accuracy": 0.68,
1206
+ "count": 50
1207
  },
1208
  "UB": {
1209
+ "accuracy": 0.21782178217821782,
1210
+ "count": 101
1211
  },
1212
  "UD": {
1213
+ "accuracy": 0.22448979591836735,
1214
+ "count": 49
1215
  }
1216
  }
1217
  },
1218
  "sub_B4": {
1219
  "full_accuracy": 0.0,
1220
+ "digit_accuracy": 0.52,
1221
+ "n_examples": 50,
1222
  "per_subtask": {
1223
  "MD": {
1224
+ "accuracy": 0.96,
1225
+ "count": 100
1226
  },
1227
  "MB": {
1228
+ "accuracy": 0.74,
1229
+ "count": 50
1230
  },
1231
  "UB": {
1232
+ "accuracy": 0.30578512396694213,
1233
+ "count": 121
1234
  },
1235
  "UD": {
1236
+ "accuracy": 0.1518987341772152,
1237
+ "count": 79
1238
  }
1239
  }
1240
  },
1241
  "sub_B5": {
1242
  "full_accuracy": 0.0,
1243
+ "digit_accuracy": 0.4142857142857143,
1244
+ "n_examples": 50,
1245
  "per_subtask": {
1246
  "MD": {
1247
  "accuracy": 1.0,
1248
+ "count": 50
1249
  },
1250
  "MB": {
1251
  "accuracy": 0.76,
1252
+ "count": 50
1253
  },
1254
  "UB": {
1255
+ "accuracy": 0.2565789473684211,
1256
+ "count": 152
1257
  },
1258
  "UD": {
1259
+ "accuracy": 0.1836734693877551,
1260
+ "count": 98
1261
  }
1262
  }
1263
  }
1264
  },
1265
  "summary": {
1266
+ "overall_accuracy": 0.06533333333333333,
1267
+ "digit_accuracy": 0.5842857142857143,
1268
+ "total_examples": 1500,
1269
+ "n_splits": 24
1270
  }
1271
  }
1272
  }