amirali1985 commited on
Commit
1059cd1
·
verified ·
1 Parent(s): 2746b8d

Upload add_sub_baseline_25K_2L1H128d/metrics.json with huggingface_hub

Browse files
add_sub_baseline_25K_2L1H128d/metrics.json CHANGED
@@ -706,502 +706,567 @@
706
  "K": null,
707
  "mode": "sft",
708
  "n_digits": 6,
709
- "n_per_split": 100
710
  },
711
  "splits": {
712
  "add_S0": {
713
- "full_accuracy": 0.03,
714
- "n_examples": 100,
 
715
  "per_subtask": {
716
  "SA": {
717
- "accuracy": 0.6066115702479339,
718
- "count": 605
719
  },
720
  "SS": {
721
- "accuracy": 0.7684210526315789,
722
- "count": 95
723
  }
724
  }
725
  },
726
  "add_S1": {
727
- "full_accuracy": 0.01,
728
- "n_examples": 100,
 
729
  "per_subtask": {
730
  "SA": {
731
- "accuracy": 0.6323529411764706,
732
- "count": 204
733
  },
734
  "SC": {
735
- "accuracy": 0.591715976331361,
736
- "count": 169
737
  },
738
  "SS": {
739
- "accuracy": 0.8387096774193549,
740
- "count": 31
741
  },
742
  "UC": {
743
- "accuracy": 0.5202702702702703,
744
- "count": 296
745
  }
746
  }
747
  },
748
  "add_S2": {
749
- "full_accuracy": 0.04,
750
- "n_examples": 100,
 
751
  "per_subtask": {
752
  "SA": {
753
- "accuracy": 0.6441717791411042,
754
- "count": 163
755
  },
756
  "SC": {
757
- "accuracy": 0.6307692307692307,
758
- "count": 130
759
  },
760
  "SS": {
761
- "accuracy": 0.5632183908045977,
762
- "count": 87
763
  },
764
  "UC": {
765
- "accuracy": 0.5665024630541872,
766
- "count": 203
767
  },
768
  "US": {
769
- "accuracy": 0.5897435897435898,
770
- "count": 117
771
  }
772
  }
773
  },
774
  "add_S3": {
775
- "full_accuracy": 0.03,
776
- "n_examples": 100,
 
777
  "per_subtask": {
778
  "SA": {
779
- "accuracy": 0.6942148760330579,
780
- "count": 121
781
  },
782
  "SC": {
783
- "accuracy": 0.6859504132231405,
784
- "count": 121
785
  },
786
  "SS": {
787
- "accuracy": 0.5306122448979592,
788
- "count": 49
789
  },
790
  "UC": {
791
- "accuracy": 0.42473118279569894,
792
- "count": 186
793
  },
794
  "US": {
795
- "accuracy": 0.6278026905829597,
796
- "count": 223
797
  }
798
  }
799
  },
800
  "add_S4": {
801
- "full_accuracy": 0.11,
802
- "n_examples": 100,
 
803
  "per_subtask": {
804
  "SA": {
805
- "accuracy": 0.7307692307692307,
806
- "count": 104
807
  },
808
  "SC": {
809
- "accuracy": 0.6415094339622641,
810
- "count": 106
811
  },
812
  "SS": {
813
- "accuracy": 0.7391304347826086,
814
- "count": 23
815
  },
816
  "UC": {
817
- "accuracy": 0.525,
818
- "count": 160
819
  },
820
  "US": {
821
- "accuracy": 0.5635179153094463,
822
- "count": 307
823
  }
824
  }
825
  },
826
  "add_S5": {
827
- "full_accuracy": 0.13,
828
- "n_examples": 100,
 
829
  "per_subtask": {
830
  "SA": {
831
- "accuracy": 0.69,
832
- "count": 100
833
  },
834
  "SC": {
835
- "accuracy": 0.56,
836
- "count": 100
837
  },
838
  "UC": {
839
- "accuracy": 0.39,
840
- "count": 100
841
  },
842
  "US": {
843
- "accuracy": 0.445,
844
- "count": 400
845
  }
846
  }
847
  },
848
  "add_S6": {
849
- "full_accuracy": 0.27,
850
- "n_examples": 100,
 
851
  "per_subtask": {
852
  "SC": {
853
- "accuracy": 0.63,
854
- "count": 100
855
  },
856
  "UC": {
857
- "accuracy": 0.32,
858
- "count": 100
859
  },
860
  "US": {
861
- "accuracy": 0.378,
862
- "count": 500
863
  }
864
  }
865
  },
866
  "add_random": {
867
- "full_accuracy": 0.015,
 
868
  "n_examples": 200,
869
  "per_subtask": {
870
  "SA": {
871
- "accuracy": 0.6219239373601789,
872
- "count": 447
873
  },
874
  "SC": {
875
- "accuracy": 0.640625,
876
- "count": 320
877
  },
878
  "SS": {
879
- "accuracy": 0.75,
880
- "count": 56
881
  },
882
  "UC": {
883
- "accuracy": 0.5633270321361059,
884
- "count": 529
885
  },
886
  "US": {
887
- "accuracy": 0.7083333333333334,
888
- "count": 48
889
  }
890
  }
891
  },
892
- "add_C3": {
893
- "full_accuracy": 0.03,
894
- "n_examples": 100,
 
895
  "per_subtask": {
896
  "SA": {
897
- "accuracy": 0.67,
898
- "count": 300
899
  },
900
  "SC": {
901
- "accuracy": 0.59,
902
- "count": 100
903
  },
904
  "UC": {
905
- "accuracy": 0.42487046632124353,
906
- "count": 193
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
907
  },
908
  "US": {
909
- "accuracy": 0.5981308411214953,
910
- "count": 107
911
  }
912
  }
913
  },
914
- "add_C4": {
915
- "full_accuracy": 0.03,
916
- "n_examples": 100,
 
917
  "per_subtask": {
918
  "SA": {
919
- "accuracy": 0.665,
920
- "count": 200
921
  },
922
  "SC": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
923
  "accuracy": 0.71,
924
  "count": 100
925
  },
 
 
 
 
926
  "UC": {
927
- "accuracy": 0.4296875,
928
- "count": 256
929
  },
930
  "US": {
931
- "accuracy": 0.7430555555555556,
932
- "count": 144
933
  }
934
  }
935
  },
936
  "add_C5": {
937
- "full_accuracy": 0.01,
938
- "n_examples": 100,
 
939
  "per_subtask": {
940
  "SA": {
941
- "accuracy": 0.71,
942
- "count": 100
943
  },
944
  "SC": {
945
- "accuracy": 0.69,
946
- "count": 100
947
  },
948
  "UC": {
949
- "accuracy": 0.4150326797385621,
950
- "count": 306
951
  },
952
  "US": {
953
- "accuracy": 0.7164948453608248,
954
- "count": 194
955
  }
956
  }
957
  },
958
  "add_C6": {
959
  "full_accuracy": 0.0,
960
- "n_examples": 100,
 
961
  "per_subtask": {
962
  "SC": {
963
- "accuracy": 0.7,
964
- "count": 100
965
  },
966
  "UC": {
967
- "accuracy": 0.4918032786885246,
968
- "count": 366
969
  },
970
  "US": {
971
- "accuracy": 0.8589743589743589,
972
- "count": 234
973
  }
974
  }
975
  },
976
  "sub_M0": {
977
  "full_accuracy": 0.04,
978
- "n_examples": 100,
 
979
  "per_subtask": {
980
  "MD": {
981
- "accuracy": 0.6156405990016639,
982
- "count": 601
983
  },
984
  "ME": {
985
- "accuracy": 0.98989898989899,
986
- "count": 99
987
  }
988
  }
989
  },
990
  "sub_M1": {
991
- "full_accuracy": 0.03,
992
- "n_examples": 100,
 
993
  "per_subtask": {
994
  "MD": {
995
- "accuracy": 0.6630824372759857,
996
- "count": 279
997
  },
998
  "MB": {
999
- "accuracy": 0.7172413793103448,
1000
- "count": 145
1001
  },
1002
  "ME": {
1003
- "accuracy": 0.9583333333333334,
1004
- "count": 24
1005
  },
1006
  "UB": {
1007
- "accuracy": 0.4722222222222222,
1008
- "count": 252
1009
  }
1010
  }
1011
  },
1012
  "sub_M2": {
1013
- "full_accuracy": 0.06,
1014
- "n_examples": 100,
 
1015
  "per_subtask": {
1016
  "MD": {
1017
- "accuracy": 0.7981220657276995,
1018
- "count": 213
1019
  },
1020
  "MB": {
1021
- "accuracy": 0.6460176991150443,
1022
- "count": 113
1023
  },
1024
  "ME": {
1025
- "accuracy": 0.9764705882352941,
1026
- "count": 85
1027
  },
1028
  "UB": {
1029
- "accuracy": 0.4530386740331492,
1030
- "count": 181
1031
  },
1032
  "UD": {
1033
- "accuracy": 0.5092592592592593,
1034
- "count": 108
1035
  }
1036
  }
1037
  },
1038
  "sub_M3": {
1039
  "full_accuracy": 0.0,
1040
- "n_examples": 100,
 
1041
  "per_subtask": {
1042
  "MD": {
1043
- "accuracy": 0.8491620111731844,
1044
- "count": 179
1045
  },
1046
  "MB": {
1047
- "accuracy": 0.6504854368932039,
1048
- "count": 103
1049
  },
1050
  "ME": {
1051
  "accuracy": 1.0,
1052
- "count": 56
1053
  },
1054
  "UB": {
1055
- "accuracy": 0.5167785234899329,
1056
- "count": 149
1057
  },
1058
  "UD": {
1059
- "accuracy": 0.04225352112676056,
1060
- "count": 213
1061
  }
1062
  }
1063
  },
1064
  "sub_M4": {
1065
  "full_accuracy": 0.0,
1066
- "n_examples": 100,
 
1067
  "per_subtask": {
1068
  "MD": {
1069
- "accuracy": 0.68,
1070
- "count": 200
1071
  },
1072
  "MB": {
1073
- "accuracy": 0.69,
1074
- "count": 100
1075
  },
1076
  "UB": {
1077
- "accuracy": 0.54,
1078
- "count": 100
1079
  },
1080
  "UD": {
1081
- "accuracy": 0.013333333333333334,
1082
- "count": 300
1083
  }
1084
  }
1085
  },
1086
  "sub_M5": {
1087
  "full_accuracy": 0.0,
1088
- "n_examples": 100,
 
1089
  "per_subtask": {
1090
  "MD": {
1091
  "accuracy": 1.0,
1092
- "count": 100
1093
  },
1094
  "MB": {
1095
- "accuracy": 0.68,
1096
- "count": 100
1097
  },
1098
  "UB": {
1099
- "accuracy": 0.74,
1100
- "count": 100
1101
  },
1102
  "UD": {
1103
  "accuracy": 0.0,
1104
- "count": 400
1105
  }
1106
  }
1107
  },
1108
  "sub_random": {
1109
- "full_accuracy": 0.03,
 
1110
  "n_examples": 200,
1111
  "per_subtask": {
1112
  "MD": {
1113
- "accuracy": 0.65,
1114
- "count": 600
1115
  },
1116
  "MB": {
1117
- "accuracy": 0.7265917602996255,
1118
- "count": 267
1119
  },
1120
  "ME": {
1121
- "accuracy": 0.9811320754716981,
1122
  "count": 53
1123
  },
1124
  "UB": {
1125
- "accuracy": 0.40774487471526194,
1126
- "count": 439
1127
  },
1128
  "UD": {
1129
- "accuracy": 0.5853658536585366,
1130
- "count": 41
1131
  }
1132
  }
1133
  },
1134
  "sub_B3": {
1135
- "full_accuracy": 0.0,
1136
- "n_examples": 100,
 
1137
  "per_subtask": {
1138
  "MD": {
1139
- "accuracy": 0.6466666666666666,
1140
- "count": 300
1141
  },
1142
  "MB": {
1143
- "accuracy": 0.74,
1144
- "count": 100
1145
  },
1146
  "UB": {
1147
- "accuracy": 0.4365482233502538,
1148
- "count": 197
1149
  },
1150
  "UD": {
1151
- "accuracy": 0.24271844660194175,
1152
- "count": 103
1153
  }
1154
  }
1155
  },
1156
  "sub_B4": {
1157
  "full_accuracy": 0.0,
1158
- "n_examples": 100,
 
1159
  "per_subtask": {
1160
  "MD": {
1161
- "accuracy": 0.705,
1162
- "count": 200
1163
  },
1164
  "MB": {
1165
- "accuracy": 0.75,
1166
- "count": 100
1167
  },
1168
  "UB": {
1169
- "accuracy": 0.42105263157894735,
1170
- "count": 247
1171
  },
1172
  "UD": {
1173
- "accuracy": 0.13725490196078433,
1174
- "count": 153
1175
  }
1176
  }
1177
  },
1178
  "sub_B5": {
1179
  "full_accuracy": 0.0,
1180
- "n_examples": 100,
 
1181
  "per_subtask": {
1182
  "MD": {
1183
  "accuracy": 1.0,
1184
- "count": 100
1185
  },
1186
  "MB": {
1187
- "accuracy": 0.69,
1188
- "count": 100
1189
  },
1190
  "UB": {
1191
- "accuracy": 0.436241610738255,
1192
- "count": 298
1193
  },
1194
  "UD": {
1195
- "accuracy": 0.13366336633663367,
1196
- "count": 202
1197
  }
1198
  }
1199
  }
1200
  },
1201
  "summary": {
1202
- "overall_accuracy": 0.03791666666666667,
1203
- "total_examples": 2400,
1204
- "n_splits": 22
 
1205
  }
1206
  }
1207
  }
 
706
  "K": null,
707
  "mode": "sft",
708
  "n_digits": 6,
709
+ "n_per_split": 50
710
  },
711
  "splits": {
712
  "add_S0": {
713
+ "full_accuracy": 0.04,
714
+ "digit_accuracy": 0.6485714285714286,
715
+ "n_examples": 50,
716
  "per_subtask": {
717
  "SA": {
718
+ "accuracy": 0.6372881355932203,
719
+ "count": 295
720
  },
721
  "SS": {
722
+ "accuracy": 0.7090909090909091,
723
+ "count": 55
724
  }
725
  }
726
  },
727
  "add_S1": {
728
+ "full_accuracy": 0.0,
729
+ "digit_accuracy": 0.5885714285714285,
730
+ "n_examples": 50,
731
  "per_subtask": {
732
  "SA": {
733
+ "accuracy": 0.6349206349206349,
734
+ "count": 126
735
  },
736
  "SC": {
737
+ "accuracy": 0.620253164556962,
738
+ "count": 79
739
  },
740
  "SS": {
741
+ "accuracy": 0.6666666666666666,
742
+ "count": 21
743
  },
744
  "UC": {
745
+ "accuracy": 0.5080645161290323,
746
+ "count": 124
747
  }
748
  }
749
  },
750
  "add_S2": {
751
+ "full_accuracy": 0.02,
752
+ "digit_accuracy": 0.56,
753
+ "n_examples": 50,
754
  "per_subtask": {
755
  "SA": {
756
+ "accuracy": 0.6,
757
+ "count": 75
758
  },
759
  "SC": {
760
+ "accuracy": 0.46774193548387094,
761
+ "count": 62
762
  },
763
  "SS": {
764
+ "accuracy": 0.46153846153846156,
765
+ "count": 39
766
  },
767
  "UC": {
768
+ "accuracy": 0.5405405405405406,
769
+ "count": 111
770
  },
771
  "US": {
772
+ "accuracy": 0.6984126984126984,
773
+ "count": 63
774
  }
775
  }
776
  },
777
  "add_S3": {
778
+ "full_accuracy": 0.02,
779
+ "digit_accuracy": 0.6085714285714285,
780
+ "n_examples": 50,
781
  "per_subtask": {
782
  "SA": {
783
+ "accuracy": 0.6166666666666667,
784
+ "count": 60
785
  },
786
  "SC": {
787
+ "accuracy": 0.5964912280701754,
788
+ "count": 57
789
  },
790
  "SS": {
791
+ "accuracy": 0.42105263157894735,
792
+ "count": 19
793
  },
794
  "UC": {
795
+ "accuracy": 0.47115384615384615,
796
+ "count": 104
797
  },
798
  "US": {
799
+ "accuracy": 0.7727272727272727,
800
+ "count": 110
801
  }
802
  }
803
  },
804
  "add_S4": {
805
+ "full_accuracy": 0.08,
806
+ "digit_accuracy": 0.5942857142857143,
807
+ "n_examples": 50,
808
  "per_subtask": {
809
  "SA": {
810
+ "accuracy": 0.6458333333333334,
811
+ "count": 48
812
  },
813
  "SC": {
814
+ "accuracy": 0.6730769230769231,
815
+ "count": 52
816
  },
817
  "SS": {
818
+ "accuracy": 0.8571428571428571,
819
+ "count": 7
820
  },
821
  "UC": {
822
+ "accuracy": 0.5056179775280899,
823
+ "count": 89
824
  },
825
  "US": {
826
+ "accuracy": 0.5909090909090909,
827
+ "count": 154
828
  }
829
  }
830
  },
831
  "add_S5": {
832
+ "full_accuracy": 0.16,
833
+ "digit_accuracy": 0.4685714285714286,
834
+ "n_examples": 50,
835
  "per_subtask": {
836
  "SA": {
837
+ "accuracy": 0.7,
838
+ "count": 50
839
  },
840
  "SC": {
841
+ "accuracy": 0.7,
842
+ "count": 50
843
  },
844
  "UC": {
845
+ "accuracy": 0.38,
846
+ "count": 50
847
  },
848
  "US": {
849
+ "accuracy": 0.375,
850
+ "count": 200
851
  }
852
  }
853
  },
854
  "add_S6": {
855
+ "full_accuracy": 0.28,
856
+ "digit_accuracy": 0.3942857142857143,
857
+ "n_examples": 50,
858
  "per_subtask": {
859
  "SC": {
860
+ "accuracy": 0.5,
861
+ "count": 50
862
  },
863
  "UC": {
864
+ "accuracy": 0.34,
865
+ "count": 50
866
  },
867
  "US": {
868
+ "accuracy": 0.384,
869
+ "count": 250
870
  }
871
  }
872
  },
873
  "add_random": {
874
+ "full_accuracy": 0.02,
875
+ "digit_accuracy": 0.5921428571428572,
876
  "n_examples": 200,
877
  "per_subtask": {
878
  "SA": {
879
+ "accuracy": 0.5986078886310905,
880
+ "count": 431
881
  },
882
  "SC": {
883
+ "accuracy": 0.6360759493670886,
884
+ "count": 316
885
  },
886
  "SS": {
887
+ "accuracy": 0.7948717948717948,
888
+ "count": 39
889
  },
890
  "UC": {
891
+ "accuracy": 0.5232142857142857,
892
+ "count": 560
893
  },
894
  "US": {
895
+ "accuracy": 0.8518518518518519,
896
+ "count": 54
897
  }
898
  }
899
  },
900
+ "add_C1": {
901
+ "full_accuracy": 0.0,
902
+ "digit_accuracy": 0.5714285714285714,
903
+ "n_examples": 50,
904
  "per_subtask": {
905
  "SA": {
906
+ "accuracy": 0.596,
907
+ "count": 250
908
  },
909
  "SC": {
910
+ "accuracy": 0.62,
911
+ "count": 50
912
  },
913
  "UC": {
914
+ "accuracy": 0.4,
915
+ "count": 50
916
+ }
917
+ }
918
+ },
919
+ "add_C2": {
920
+ "full_accuracy": 0.02,
921
+ "digit_accuracy": 0.5628571428571428,
922
+ "n_examples": 50,
923
+ "per_subtask": {
924
+ "SA": {
925
+ "accuracy": 0.64,
926
+ "count": 200
927
+ },
928
+ "SC": {
929
+ "accuracy": 0.54,
930
+ "count": 50
931
+ },
932
+ "UC": {
933
+ "accuracy": 0.3855421686746988,
934
+ "count": 83
935
  },
936
  "US": {
937
+ "accuracy": 0.5882352941176471,
938
+ "count": 17
939
  }
940
  }
941
  },
942
+ "add_C3": {
943
+ "full_accuracy": 0.0,
944
+ "digit_accuracy": 0.5428571428571428,
945
+ "n_examples": 50,
946
  "per_subtask": {
947
  "SA": {
948
+ "accuracy": 0.64,
949
+ "count": 150
950
  },
951
  "SC": {
952
+ "accuracy": 0.56,
953
+ "count": 50
954
+ },
955
+ "UC": {
956
+ "accuracy": 0.35,
957
+ "count": 100
958
+ },
959
+ "US": {
960
+ "accuracy": 0.62,
961
+ "count": 50
962
+ }
963
+ }
964
+ },
965
+ "add_C4": {
966
+ "full_accuracy": 0.06,
967
+ "digit_accuracy": 0.5971428571428572,
968
+ "n_examples": 50,
969
+ "per_subtask": {
970
+ "SA": {
971
  "accuracy": 0.71,
972
  "count": 100
973
  },
974
+ "SC": {
975
+ "accuracy": 0.66,
976
+ "count": 50
977
+ },
978
  "UC": {
979
+ "accuracy": 0.4090909090909091,
980
+ "count": 132
981
  },
982
  "US": {
983
+ "accuracy": 0.75,
984
+ "count": 68
985
  }
986
  }
987
  },
988
  "add_C5": {
989
+ "full_accuracy": 0.02,
990
+ "digit_accuracy": 0.6428571428571429,
991
+ "n_examples": 50,
992
  "per_subtask": {
993
  "SA": {
994
+ "accuracy": 0.68,
995
+ "count": 50
996
  },
997
  "SC": {
998
+ "accuracy": 0.68,
999
+ "count": 50
1000
  },
1001
  "UC": {
1002
+ "accuracy": 0.4520547945205479,
1003
+ "count": 146
1004
  },
1005
  "US": {
1006
+ "accuracy": 0.875,
1007
+ "count": 104
1008
  }
1009
  }
1010
  },
1011
  "add_C6": {
1012
  "full_accuracy": 0.0,
1013
+ "digit_accuracy": 0.5657142857142857,
1014
+ "n_examples": 50,
1015
  "per_subtask": {
1016
  "SC": {
1017
+ "accuracy": 0.66,
1018
+ "count": 50
1019
  },
1020
  "UC": {
1021
+ "accuracy": 0.5026455026455027,
1022
+ "count": 189
1023
  },
1024
  "US": {
1025
+ "accuracy": 0.6306306306306306,
1026
+ "count": 111
1027
  }
1028
  }
1029
  },
1030
  "sub_M0": {
1031
  "full_accuracy": 0.04,
1032
+ "digit_accuracy": 0.6571428571428571,
1033
+ "n_examples": 50,
1034
  "per_subtask": {
1035
  "MD": {
1036
+ "accuracy": 0.6039603960396039,
1037
+ "count": 303
1038
  },
1039
  "ME": {
1040
+ "accuracy": 1.0,
1041
+ "count": 47
1042
  }
1043
  }
1044
  },
1045
  "sub_M1": {
1046
+ "full_accuracy": 0.0,
1047
+ "digit_accuracy": 0.6542857142857142,
1048
+ "n_examples": 50,
1049
  "per_subtask": {
1050
  "MD": {
1051
+ "accuracy": 0.7092198581560284,
1052
+ "count": 141
1053
  },
1054
  "MB": {
1055
+ "accuracy": 0.7083333333333334,
1056
+ "count": 72
1057
  },
1058
  "ME": {
1059
+ "accuracy": 0.9444444444444444,
1060
+ "count": 18
1061
  },
1062
  "UB": {
1063
+ "accuracy": 0.5126050420168067,
1064
+ "count": 119
1065
  }
1066
  }
1067
  },
1068
  "sub_M2": {
1069
+ "full_accuracy": 0.04,
1070
+ "digit_accuracy": 0.6828571428571428,
1071
+ "n_examples": 50,
1072
  "per_subtask": {
1073
  "MD": {
1074
+ "accuracy": 0.8214285714285714,
1075
+ "count": 112
1076
  },
1077
  "MB": {
1078
+ "accuracy": 0.7169811320754716,
1079
+ "count": 53
1080
  },
1081
  "ME": {
1082
+ "accuracy": 0.9148936170212766,
1083
+ "count": 47
1084
  },
1085
  "UB": {
1086
+ "accuracy": 0.4470588235294118,
1087
+ "count": 85
1088
  },
1089
  "UD": {
1090
+ "accuracy": 0.5283018867924528,
1091
+ "count": 53
1092
  }
1093
  }
1094
  },
1095
  "sub_M3": {
1096
  "full_accuracy": 0.0,
1097
+ "digit_accuracy": 0.5085714285714286,
1098
+ "n_examples": 50,
1099
  "per_subtask": {
1100
  "MD": {
1101
+ "accuracy": 0.8144329896907216,
1102
+ "count": 97
1103
  },
1104
  "MB": {
1105
+ "accuracy": 0.6274509803921569,
1106
+ "count": 51
1107
  },
1108
  "ME": {
1109
  "accuracy": 1.0,
1110
+ "count": 27
1111
  },
1112
  "UB": {
1113
+ "accuracy": 0.5135135135135135,
1114
+ "count": 74
1115
  },
1116
  "UD": {
1117
+ "accuracy": 0.019801980198019802,
1118
+ "count": 101
1119
  }
1120
  }
1121
  },
1122
  "sub_M4": {
1123
  "full_accuracy": 0.0,
1124
+ "digit_accuracy": 0.3485714285714286,
1125
+ "n_examples": 50,
1126
  "per_subtask": {
1127
  "MD": {
1128
+ "accuracy": 0.64,
1129
+ "count": 100
1130
  },
1131
  "MB": {
1132
+ "accuracy": 0.58,
1133
+ "count": 50
1134
  },
1135
  "UB": {
1136
+ "accuracy": 0.56,
1137
+ "count": 50
1138
  },
1139
  "UD": {
1140
+ "accuracy": 0.006666666666666667,
1141
+ "count": 150
1142
  }
1143
  }
1144
  },
1145
  "sub_M5": {
1146
  "full_accuracy": 0.0,
1147
+ "digit_accuracy": 0.34,
1148
+ "n_examples": 50,
1149
  "per_subtask": {
1150
  "MD": {
1151
  "accuracy": 1.0,
1152
+ "count": 50
1153
  },
1154
  "MB": {
1155
+ "accuracy": 0.74,
1156
+ "count": 50
1157
  },
1158
  "UB": {
1159
+ "accuracy": 0.64,
1160
+ "count": 50
1161
  },
1162
  "UD": {
1163
  "accuracy": 0.0,
1164
+ "count": 200
1165
  }
1166
  }
1167
  },
1168
  "sub_random": {
1169
+ "full_accuracy": 0.035,
1170
+ "digit_accuracy": 0.6064285714285714,
1171
  "n_examples": 200,
1172
  "per_subtask": {
1173
  "MD": {
1174
+ "accuracy": 0.6543859649122807,
1175
+ "count": 570
1176
  },
1177
  "MB": {
1178
+ "accuracy": 0.7075812274368231,
1179
+ "count": 277
1180
  },
1181
  "ME": {
1182
+ "accuracy": 0.9245283018867925,
1183
  "count": 53
1184
  },
1185
  "UB": {
1186
+ "accuracy": 0.46709129511677283,
1187
+ "count": 471
1188
  },
1189
  "UD": {
1190
+ "accuracy": 0.3793103448275862,
1191
+ "count": 29
1192
  }
1193
  }
1194
  },
1195
  "sub_B3": {
1196
+ "full_accuracy": 0.02,
1197
+ "digit_accuracy": 0.5485714285714286,
1198
+ "n_examples": 50,
1199
  "per_subtask": {
1200
  "MD": {
1201
+ "accuracy": 0.6333333333333333,
1202
+ "count": 150
1203
  },
1204
  "MB": {
1205
+ "accuracy": 0.82,
1206
+ "count": 50
1207
  },
1208
  "UB": {
1209
+ "accuracy": 0.45544554455445546,
1210
+ "count": 101
1211
  },
1212
  "UD": {
1213
+ "accuracy": 0.20408163265306123,
1214
+ "count": 49
1215
  }
1216
  }
1217
  },
1218
  "sub_B4": {
1219
  "full_accuracy": 0.0,
1220
+ "digit_accuracy": 0.52,
1221
+ "n_examples": 50,
1222
  "per_subtask": {
1223
  "MD": {
1224
+ "accuracy": 0.77,
1225
+ "count": 100
1226
  },
1227
  "MB": {
1228
+ "accuracy": 0.78,
1229
+ "count": 50
1230
  },
1231
  "UB": {
1232
+ "accuracy": 0.4214876033057851,
1233
+ "count": 121
1234
  },
1235
  "UD": {
1236
+ "accuracy": 0.189873417721519,
1237
+ "count": 79
1238
  }
1239
  }
1240
  },
1241
  "sub_B5": {
1242
  "full_accuracy": 0.0,
1243
+ "digit_accuracy": 0.5057142857142857,
1244
+ "n_examples": 50,
1245
  "per_subtask": {
1246
  "MD": {
1247
  "accuracy": 1.0,
1248
+ "count": 50
1249
  },
1250
  "MB": {
1251
+ "accuracy": 0.76,
1252
+ "count": 50
1253
  },
1254
  "UB": {
1255
+ "accuracy": 0.46710526315789475,
1256
+ "count": 152
1257
  },
1258
  "UD": {
1259
+ "accuracy": 0.1836734693877551,
1260
+ "count": 98
1261
  }
1262
  }
1263
  }
1264
  },
1265
  "summary": {
1266
+ "overall_accuracy": 0.034,
1267
+ "digit_accuracy": 0.5633333333333334,
1268
+ "total_examples": 1500,
1269
+ "n_splits": 24
1270
  }
1271
  }
1272
  }